3 """@namespace IMP.pmi1.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
50 self.proteins = sorted(self.query.keys())
51 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
59 np = prots_uniq.count(p)
60 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
61 prmts = list(itertools.permutations(copies, len(copies)))
64 self.Product = list(itertools.product(*P.values()))
72 torder = sum([list(i)
for i
in self.Product[0]], [])
75 if self.weights
is not None:
76 weights += [i
for i
in self.weights[t]]
79 self.rmsd = 10000000000.
80 for comb
in self.Product:
82 order = sum([list(i)
for i
in comb], [])
92 if self.weights
is not None:
103 from scipy.spatial.distance
import cdist
111 torder = sum([list(i)
for i
in self.Product[0]], [])
115 self.rmsd, Transformation = 10000000000.,
''
118 for comb
in self.Product:
119 order = sum([list(i)
for i
in comb], [])
125 if len(template_xyz) != len(query_xyz):
126 raise ValueError(
'''the number of coordinates
127 in template and query does not match!''')
132 query_xyz_tr = [transformation.get_transformed(n)
136 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
139 Transformation = transformation
142 return (self.rmsd, Transformation)
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148 'a..2':np.array([np.array([1.,1.,])]),
149 'a..3':np.array([np.array([-2.,1.])]),
150 'b':np.array([np.array([0.,-1.])]),
151 'c..1':np.array([np.array([-1.,-1.])]),
152 'c..2':np.array([np.array([1.,-1.])]),
153 'd':np.array([np.array([0.,0.])]),
154 'e':np.array([np.array([0.,1.])])}
156 Ali = Alignment(Proteins, Proteins)
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
164 class Violations(object):
168 self.violation_thresholds = {}
169 self.violation_counts = {}
171 data = open(filename)
176 d = d.strip().split()
177 self.violation_thresholds[d[0]] = float(d[1])
179 def get_number_violated_restraints(self, rsts_dict):
181 for rst
in self.violation_thresholds:
182 if rst
not in rsts_dict:
184 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
186 if rst
not in self.violation_counts:
187 self.violation_counts[rst] = 1
189 self.violation_counts[rst] += 1
195 """A class to cluster structures.
196 Uses scipy's cdist function to compute distance matrices
197 and sklearn's kmeans clustering module.
201 @param rmsd_weights Flat list of weights for each particle
205 from mpi4py
import MPI
206 self.comm = MPI.COMM_WORLD
207 self.rank = self.comm.Get_rank()
208 self.number_of_processes = self.comm.size
210 self.number_of_processes = 1
213 self.structure_cluster_ids =
None
214 self.tmpl_coords =
None
215 self.rmsd_weights=rmsd_weights
217 def set_template(self, part_coords):
219 self.tmpl_coords = part_coords
222 """Add coordinates for a single model."""
224 self.all_coords[frame] = Coords
226 def dist_matrix(self):
228 self.model_list_names = list(self.all_coords.keys())
229 self.model_indexes = list(range(len(self.model_list_names)))
230 self.model_indexes_dict = dict(
231 list(zip(self.model_list_names, self.model_indexes)))
232 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
234 my_model_indexes_unique_pairs = IMP.pmi1.tools.chunk_list_into_segments(
235 model_indexes_unique_pairs,
236 self.number_of_processes)[self.rank]
238 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
240 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
242 my_model_indexes_unique_pairs)
244 if self.number_of_processes > 1:
247 pickable_transformations = self.get_pickable_transformation_distance_dict(
250 pickable_transformations)
251 self.set_transformation_distance_dict_from_pickable(
252 pickable_transformations)
254 self.raw_distance_matrix = np.zeros(
255 (len(self.model_list_names), len(self.model_list_names)))
256 for item
in raw_distance_dict:
258 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
261 def get_dist_matrix(self):
262 return self.raw_distance_matrix
265 """Run K-means clustering
266 @param number_of_clusters Num means
267 @param seed the random seed
269 from sklearn.cluster
import KMeans
274 kmeans = KMeans(n_clusters=number_of_clusters)
277 kmeans = KMeans(k=number_of_clusters)
278 kmeans.fit_predict(self.raw_distance_matrix)
280 self.structure_cluster_ids = kmeans.labels_
282 def get_pickable_transformation_distance_dict(self):
283 pickable_transformations = {}
284 for label
in self.transformation_distance_dict:
285 tr = self.transformation_distance_dict[label]
286 trans = tuple(tr.get_translation())
287 rot = tuple(tr.get_rotation().get_quaternion())
288 pickable_transformations[label] = (rot, trans)
289 return pickable_transformations
291 def set_transformation_distance_dict_from_pickable(
293 pickable_transformations):
294 self.transformation_distance_dict = {}
295 for label
in pickable_transformations:
296 tr = pickable_transformations[label]
299 self.transformation_distance_dict[
302 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
304 outf = open(file_name +
".data",
'wb')
310 pickable_transformations = self.get_pickable_transformation_distance_dict(
313 (self.structure_cluster_ids,
314 self.model_list_names,
315 pickable_transformations),
318 np.save(file_name +
".npy", self.raw_distance_matrix)
320 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
323 inputf = open(file_name +
".data",
'rb')
324 (self.structure_cluster_ids, self.model_list_names,
325 pickable_transformations) = pickle.load(inputf)
328 self.raw_distance_matrix = np.load(file_name +
".npy")
330 self.set_transformation_distance_dict_from_pickable(
331 pickable_transformations)
332 self.model_indexes = list(range(len(self.model_list_names)))
333 self.model_indexes_dict = dict(
334 list(zip(self.model_list_names, self.model_indexes)))
336 def plot_matrix(self, figurename="clustermatrix.pdf"):
337 import matplotlib
as mpl
339 import matplotlib.pylab
as pl
340 from scipy.cluster
import hierarchy
as hrc
342 fig = pl.figure(figsize=(10,8))
343 ax = fig.add_subplot(212)
344 dendrogram = hrc.dendrogram(
345 hrc.linkage(self.raw_distance_matrix),
348 leaves_order = dendrogram[
'leaves']
349 ax.set_xlabel(
'Model')
350 ax.set_ylabel(
'RMSD [Angstroms]')
352 ax2 = fig.add_subplot(221)
354 self.raw_distance_matrix[leaves_order,
357 interpolation=
'nearest')
358 cb = fig.colorbar(cax)
359 cb.set_label(
'RMSD [Angstroms]')
360 ax2.set_xlabel(
'Model')
361 ax2.set_ylabel(
'Model')
363 pl.savefig(figurename, dpi=300)
366 def get_model_index_from_name(self, name):
367 return self.model_indexes_dict[name]
369 def get_cluster_labels(self):
371 return list(set(self.structure_cluster_ids))
373 def get_number_of_clusters(self):
374 return len(self.get_cluster_labels())
376 def get_cluster_label_indexes(self, label):
378 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
381 def get_cluster_label_names(self, label):
383 [self.model_list_names[i]
384 for i
in self.get_cluster_label_indexes(label)]
387 def get_cluster_label_average_rmsd(self, label):
389 indexes = self.get_cluster_label_indexes(label)
392 sub_distance_matrix = self.raw_distance_matrix[
393 indexes, :][:, indexes]
394 average_rmsd = np.sum(sub_distance_matrix) / \
395 (len(sub_distance_matrix)
396 ** 2 - len(sub_distance_matrix))
401 def get_cluster_label_size(self, label):
402 return len(self.get_cluster_label_indexes(label))
404 def get_transformation_to_first_member(
408 reference = self.get_cluster_label_indexes(cluster_label)[0]
409 return self.transformation_distance_dict[(reference, structure_index)]
411 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
413 model_list_names = list(all_coords.keys())
414 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415 raw_distance_dict = {}
416 transformation_distance_dict = {}
417 if template_coords
is None:
421 alignment_template_protein_names = list(template_coords.keys())
423 for (f1, f2)
in list_of_pairs:
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in rmsd_protein_names])
434 for pr
in rmsd_protein_names:
435 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
437 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
438 rmsd = Ali.get_rmsd()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in alignment_template_protein_names])
446 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447 for pr
in alignment_template_protein_names])
450 template_rmsd, transformation = Ali.align()
455 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456 for pr
in rmsd_protein_names])
458 for pr
in rmsd_protein_names:
459 coords_f2[pr] = [transformation.get_transformed(
460 i)
for i
in all_coords[model_list_names[f2]][pr]]
462 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
463 rmsd = Ali.get_rmsd()
465 raw_distance_dict[(f1, f2)] = rmsd
466 raw_distance_dict[(f2, f1)] = rmsd
467 transformation_distance_dict[(f1, f2)] = transformation
468 transformation_distance_dict[(f2, f1)] = transformation
470 return raw_distance_dict, transformation_distance_dict
474 """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475 To be used with pmi2 hierarchies. Can be used for instance as follows:
477 rmsd=IMP.pmi1.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478 output_objects.append(rmsd)
480 before shuffling the coordinates
483 def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
485 @param hier0 first input hierarchy
486 @param hier1 second input hierarchy
487 @param molname the names of the molecules used for the RMSD
488 @dynamic0 if True stores the decorators XYZ and coordinates of hier0 can be update. If false coordinates are static (stored in Vector3Ds)
489 and will never be updated
490 @dynamic1 same as above
491 metric what metric should be used
495 self.dynamic0=dynamic0
496 self.dynamic1=dynamic1
501 """return data structure for the RMSD calculation"""
507 if name
not in molnames:
514 sel=
IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515 parts=sel.get_selected_particles()
517 mol_coords[mol].append(
IMP.core.XYZ(parts[0]).get_coordinates())
521 moldict[name].append(mol)
524 return moldict, mol_coords, mol_XYZs
526 def get_rmsd_and_assigments(self):
532 for molname, ref_mols
in self.moldict1.items():
534 for ref_mol
in ref_mols:
536 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
538 coord1=self.molcoords1[ref_mol]
543 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
545 for rmf_mol
in rmf_mols:
547 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
549 coord0=self.molcoords0[rmf_mol]
553 rmf_mols_list.append(rmf_mols)
555 rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
558 for n, (ref_mol,rmf_mol)
in enumerate(zip(ref_mols,rmf_mols_best_order)):
559 best_assignments.append((rmf_mol,ref_mol))
561 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
563 coord0=self.molcoords0[rmf_mol]
566 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
568 coord1=self.molcoords1[ref_mol]
569 rmsd_pair=self.metric(coord1, coord0)
570 N=len(self.molcoords1[ref_mol])
572 total_rmsd+=rmsd_pair*rmsd_pair*N
573 rmsd_dict[ref_mol]=rmsd_pair
574 total_rmsd = sqrt(total_rmsd/total_N)
575 return total_rmsd,best_assignments
578 """Returns output for IMP.pmi1.output.Output object"""
579 total_rmsd,best_assignments=self.get_rmsd_and_assigments()
582 for rmf_mol,ref_mol
in best_assignments:
583 ref_name=ref_mol.get_name()
585 rmf_name=rmf_mol.get_name()
587 assignments_out.append(rmf_name+
"."+str(rmf_copy)+
"->"+ref_name+
"."+str(ref_copy))
588 return {
"RMSD_"+self.label:str(total_rmsd),
"RMSD_assignments_"+self.label:str(assignments_out)}
594 """A class to evaluate the precision of an ensemble.
596 Also can evaluate the cross-precision of multiple ensembles.
597 Supports MPI for coordinate reading.
598 Recommended procedure:
599 -# initialize object and pass the selection for evaluating precision
600 -# call add_structures() to read in the data (specify group name)
601 -# call get_precision() to evaluate inter/intra precision
602 -# call get_rmsf() to evaluate within-group fluctuations
606 selection_dictionary={}):
608 @param model The IMP Model
609 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610 part of the hier name)
611 @param selection_dictionary Dictionary where keys are names for
612 selections and values are selection tuples for scoring
613 precision. "All" is automatically made as well
616 from mpi4py
import MPI
617 self.comm = MPI.COMM_WORLD
618 self.rank = self.comm.Get_rank()
619 self.number_of_processes = self.comm.size
621 self.number_of_processes = 1
624 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
625 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
626 self.style =
'pairwise_drmsd_k'
627 self.structures_dictionary = {}
628 self.reference_structures_dictionary = {}
630 self.protein_names =
None
631 self.len_particles_resolution_one =
None
633 self.rmf_names_frames = {}
634 self.reference_rmf_names_frames =
None
635 self.reference_structure =
None
636 self.reference_prot =
None
637 self.selection_dictionary = selection_dictionary
638 self.threshold = 40.0
639 self.residue_particle_index_map =
None
641 if resolution
in [1,10]:
642 self.resolution = resolution
644 raise KeyError(
"Currently only allow resolution 1 or 10")
646 def _get_structure(self,rmf_frame_index,rmf_name):
647 """Read an RMF file and return the particles"""
648 rh = RMF.open_rmf_file_read_only(rmf_name)
650 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
654 print(
"linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
659 if self.resolution==1:
661 elif self.resolution==10:
664 protein_names = list(particle_dict.keys())
665 particles_resolution_one = []
666 for k
in particle_dict:
667 particles_resolution_one += (particle_dict[k])
669 if self.protein_names==
None:
670 self.protein_names = protein_names
672 if self.protein_names!=protein_names:
673 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
675 if self.len_particles_resolution_one==
None:
676 self.len_particles_resolution_one = len(particles_resolution_one)
678 if self.len_particles_resolution_one!=len(particles_resolution_one):
679 raise ValueError(
"the new coordinate set is not compatible with the previous one")
681 return particles_resolution_one
687 setup_index_map=
False):
688 """ Read a structure into the ensemble and store (as coordinates).
689 @param rmf_name The name of the RMF file
690 @param rmf_frame_index The frame to read
691 @param structure_set_name Name for the set that includes this structure
693 @param setup_index_map if requested, set up a dictionary to help
698 if structure_set_name
in self.structures_dictionary:
699 cdict = self.structures_dictionary[structure_set_name]
700 rmflist = self.rmf_names_frames[structure_set_name]
702 self.structures_dictionary[structure_set_name]={}
703 self.rmf_names_frames[structure_set_name]=[]
704 cdict = self.structures_dictionary[structure_set_name]
705 rmflist = self.rmf_names_frames[structure_set_name]
709 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
711 print(
"something wrong with the rmf")
713 self.selection_dictionary.update({
"All":self.protein_names})
715 for selection_name
in self.selection_dictionary:
716 selection_tuple = self.selection_dictionary[selection_name]
717 coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718 if selection_name
not in cdict:
719 cdict[selection_name] = [coords]
721 cdict[selection_name].append(coords)
723 rmflist.append((rmf_name,rmf_frame_index))
727 self.residue_particle_index_map = {}
728 for prot_name
in self.protein_names:
729 self.residue_particle_index_map[prot_name] = \
730 self._get_residue_particle_index_map(
732 particles_resolution_one,self.prots[0])
735 rmf_name_frame_tuples,
737 """Read a list of RMFs, supports parallel
738 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739 @param structure_set_name Name this set of structures (e.g. "cluster.1")
743 my_rmf_name_frame_tuples=IMP.pmi1.tools.chunk_list_into_segments(
744 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
747 rmf_frame_index=tup[1]
749 if self.residue_particle_index_map
is None:
750 setup_index_map =
True
752 setup_index_map =
False
759 if self.number_of_processes > 1:
762 self.comm.send(self.structures_dictionary, dest=0, tag=11)
764 for i
in range(1, self.number_of_processes):
765 data_tmp = self.comm.recv(source=i, tag=11)
766 for key
in self.structures_dictionary:
767 self.structures_dictionary[key].update(data_tmp[key])
768 for i
in range(1, self.number_of_processes):
769 self.comm.send(self.structures_dictionary, dest=i, tag=11)
771 self.structures_dictionary = self.comm.recv(source=0, tag=11)
773 def _get_residue_particle_index_map(self,prot_name,structure,hier):
775 residue_particle_index_map = []
777 all_selected_particles = s.get_selected_particles()
778 intersection = list(set(all_selected_particles) & set(structure))
779 sorted_intersection = IMP.pmi1.tools.sort_by_residues(intersection)
780 for p
in sorted_intersection:
782 return residue_particle_index_map
785 def _select_coordinates(self,tuple_selections,structure,prot):
786 selected_coordinates=[]
787 for t
in tuple_selections:
788 if type(t)==tuple
and len(t)==3:
790 all_selected_particles = s.get_selected_particles()
791 intersection = list(set(all_selected_particles) & set(structure))
792 sorted_intersection = IMP.pmi1.tools.sort_by_residues(intersection)
793 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
794 selected_coordinates += cc
797 all_selected_particles = s.get_selected_particles()
798 intersection = list(set(all_selected_particles) & set(structure))
799 sorted_intersection = IMP.pmi1.tools.sort_by_residues(intersection)
800 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
801 selected_coordinates += cc
803 raise ValueError(
"Selection error")
804 return selected_coordinates
806 def set_threshold(self,threshold):
807 self.threshold = threshold
809 def _get_distance(self,
815 """ Compute distance between structures with various metrics """
816 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
817 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
821 if self.style==
'pairwise_drmsd_k':
823 if self.style==
'pairwise_drms_k':
825 if self.style==
'pairwise_drmsd_Q':
828 if self.style==
'pairwise_rmsd':
832 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
833 selection_name,index1,index2):
834 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
835 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
840 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
849 selection_keywords=
None):
850 """ Evaluate the precision of two named structure groups. Supports MPI.
851 When the structure_set_name1 is different from the structure_set_name2,
852 this evaluates the cross-precision (average pairwise distances).
853 @param outfile Name of the precision output file
854 @param structure_set_name1 string name of the first structure set
855 @param structure_set_name2 string name of the second structure set
856 @param skip analyze every (skip) structure for the distance matrix calculation
857 @param selection_keywords Specify the selection name you want to calculate on.
858 By default this is computed for everything you provided in the constructor,
859 plus all the subunits together.
861 if selection_keywords
is None:
862 sel_keys = list(self.selection_dictionary.keys())
864 for k
in selection_keywords:
865 if k
not in self.selection_dictionary:
866 raise KeyError(
"you are trying to find named selection " \
867 + k +
" which was not requested in the constructor")
868 sel_keys = selection_keywords
870 if outfile
is not None:
871 of = open(outfile,
"w")
873 for selection_name
in sel_keys:
874 number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
875 number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
877 structure_pointers_1 = list(range(0,number_of_structures_1,skip))
878 structure_pointers_2 = list(range(0,number_of_structures_2,skip))
879 pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
880 if len(pair_combination_list)==0:
881 raise ValueError(
"no structure selected. Check the skip parameter.")
884 my_pair_combination_list = IMP.pmi1.tools.chunk_list_into_segments(
885 pair_combination_list,self.number_of_processes)[self.rank]
886 my_length = len(my_pair_combination_list)
887 for n,pair
in enumerate(my_pair_combination_list):
888 progression = int(float(n)/my_length*100.0)
889 distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
890 selection_name,pair[0],pair[1])
891 if self.number_of_processes > 1:
896 if structure_set_name1==structure_set_name2:
897 structure_pointers = structure_pointers_1
898 number_of_structures = number_of_structures_1
903 distances_to_structure = {}
904 distances_to_structure_normalization = {}
906 for n
in structure_pointers:
907 distances_to_structure[n] = 0.0
908 distances_to_structure_normalization[n]=0
911 distance += distances[k]
912 distances_to_structure[k[0]] += distances[k]
913 distances_to_structure[k[1]] += distances[k]
914 distances_to_structure_normalization[k[0]] += 1
915 distances_to_structure_normalization[k[1]] += 1
917 for n
in structure_pointers:
918 distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
920 min_distance = min([distances_to_structure[n]
for n
in distances_to_structure])
921 centroid_index = [k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
922 centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
924 centroid_distance = 0.0
926 for n
in range(number_of_structures):
927 dist = self._get_distance(structure_set_name1,structure_set_name1,
928 selection_name,centroid_index,n)
929 centroid_distance += dist
930 distance_list.append(dist)
933 centroid_distance /= number_of_structures
935 if outfile
is not None:
936 of.write(str(selection_name)+
" "+structure_set_name1+
937 " average centroid distance "+str(centroid_distance)+
"\n")
938 of.write(str(selection_name)+
" "+structure_set_name1+
939 " centroid index "+str(centroid_index)+
"\n")
940 of.write(str(selection_name)+
" "+structure_set_name1+
941 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
942 of.write(str(selection_name)+
" "+structure_set_name1+
943 " median centroid distance "+str(np.median(distance_list))+
"\n")
945 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
946 if outfile
is not None:
947 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
948 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
949 if outfile
is not None:
951 return centroid_index
957 set_plot_yaxis_range=
None):
958 """ Calculate the residue mean square fluctuations (RMSF).
959 Automatically outputs as data file and pdf
960 @param structure_set_name Which structure set to calculate RMSF for
961 @param outdir Where to write the files
962 @param skip Skip this number of structures
963 @param set_plot_yaxis_range In case you need to change the plot
971 for sel_name
in self.protein_names:
972 self.selection_dictionary.update({sel_name:[sel_name]})
974 number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
978 rpim = self.residue_particle_index_map[sel_name]
979 outfile = outdir+
"/rmsf."+sel_name+
".dat"
980 of = open(outfile,
"w")
981 residue_distances = {}
983 for index
in range(number_of_structures):
984 distances = self._get_particle_distances(structure_set_name,
987 centroid_index,index)
988 for nblock,block
in enumerate(rpim):
989 for residue_number
in block:
990 residue_nblock[residue_number] = nblock
991 if residue_number
not in residue_distances:
992 residue_distances[residue_number] = [distances[nblock]]
994 residue_distances[residue_number].append(distances[nblock])
998 for rn
in residue_distances:
1000 rmsf = np.std(residue_distances[rn])
1002 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
1004 IMP.pmi1.output.plot_xy_data(residues,rmsfs,title=sel_name,
1005 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
1006 set_plot_yaxis_range=set_plot_yaxis_range,
1007 xlabel=
'Residue Number',ylabel=
'Standard error')
1012 """Read in a structure used for reference computation.
1013 Needed before calling get_average_distance_wrt_reference_structure()
1014 @param rmf_name The RMF file to read the reference
1015 @param rmf_frame_index The index in that file
1017 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1018 self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1020 for selection_name
in self.selection_dictionary:
1021 selection_tuple = self.selection_dictionary[selection_name]
1022 coords = self._select_coordinates(selection_tuple,
1023 particles_resolution_one,self.prots[0])
1024 self.reference_structures_dictionary[selection_name] = coords
1027 """First align then calculate RMSD
1028 @param structure_set_name: the name of the structure set
1029 @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
1030 @return: for each structure in the structure set, returns the rmsd
1032 if self.reference_structures_dictionary=={}:
1033 print(
"Cannot compute until you set a reference structure")
1036 align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1037 align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1038 transformations = []
1039 for c
in align_coordinates:
1041 transformation = Ali.align()[1]
1042 transformations.append(transformation)
1043 for selection_name
in self.selection_dictionary:
1044 reference_coordinates = self.reference_structures_dictionary[selection_name]
1047 for n,sc
in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1050 distances.append(distance)
1051 print(selection_name,
"average rmsd",sum(distances)/len(distances),
"median",self._get_median(distances),
"minimum distance",min(distances))
1053 def _get_median(self,list_of_values):
1054 return np.median(np.array(list_of_values))
1057 """Compare the structure set to the reference structure.
1058 @param structure_set_name The structure set to compute this on
1059 @note First call set_reference_structure()
1062 if self.reference_structures_dictionary=={}:
1063 print(
"Cannot compute until you set a reference structure")
1065 for selection_name
in self.selection_dictionary:
1066 reference_coordinates = self.reference_structures_dictionary[selection_name]
1069 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
1071 if self.style==
'pairwise_drmsd_k':
1073 if self.style==
'pairwise_drms_k':
1075 if self.style==
'pairwise_drmsd_Q':
1077 if self.style==
'pairwise_rmsd':
1079 distances.append(distance)
1081 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances),
'nframes',len(distances))
1082 ret[selection_name] = {
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
1085 def get_coordinates(self):
1088 def set_precision_style(self, style):
1089 if style
in self.styles:
1092 raise ValueError(
"No such style")
1096 """Compute mean density maps from structures.
1098 Keeps a dictionary of density maps,
1099 keys are in the custom ranges. When you call add_subunits_density, it adds
1100 particle coordinates to the existing density maps.
1103 def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1105 @param custom_ranges Required. It's a dictionary, keys are the
1106 density component names, values are selection tuples
1107 e.g. {'kin28':[['kin28',1,-1]],
1108 'density_name_1' :[('ccl1')],
1109 'density_name_2' :[(1,142,'tfb3d1'),
1110 (143,700,'tfb3d2')],
1111 @param representation PMI representation, for doing selections.
1112 Not needed if you only pass hierarchies
1113 @param resolution The MRC resolution of the output map (in Angstrom unit)
1114 @param voxel The voxel size for the output map (lower is slower)
1117 self.representation = representation
1118 self.MRCresolution = resolution
1121 self.count_models = 0.0
1122 self.custom_ranges = custom_ranges
1125 """Add a frame to the densities.
1126 @param hierarchy Optionally read the hierarchy from somewhere.
1127 If not passed, will just read the representation.
1129 self.count_models += 1.0
1133 all_particles_by_resolution = []
1134 for name
in part_dict:
1135 all_particles_by_resolution += part_dict[name]
1137 for density_name
in self.custom_ranges:
1140 all_particles_by_segments = []
1142 for seg
in self.custom_ranges[density_name]:
1145 parts += IMP.tools.select_by_tuple(self.representation,
1146 seg, resolution=1, name_is_ambiguous=
False)
1149 for h
in hierarchy.get_children():
1153 if type(seg) == str:
1155 elif type(seg) == tuple
and len(seg) == 2:
1157 hierarchy, molecule=seg[0],copy_index=seg[1])
1158 elif type(seg) == tuple
and len(seg) == 3:
1160 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1161 elif type(seg) == tuple
and len(seg) == 4:
1163 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1165 raise Exception(
'could not understand selection tuple '+str(seg))
1167 all_particles_by_segments += s.get_selected_particles()
1170 set(all_particles_by_segments) & set(all_particles_by_resolution))
1171 self._create_density_from_particles(parts, density_name)
1173 def normalize_density(self):
1176 def _create_density_from_particles(self, ps, name,
1177 kernel_type=
'GAUSSIAN'):
1178 '''Internal function for adding to densities.
1179 pass XYZR particles with mass and create a density from them.
1180 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1182 'GAUSSIAN': IMP.em.GAUSSIAN,
1183 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1184 'SPHERE': IMP.em.SPHERE}
1188 dmap.set_was_used(
True)
1189 if name
not in self.densities:
1190 self.densities[name] = dmap
1196 dmap3.set_was_used(
True)
1198 dmap3.add(self.densities[name])
1199 self.densities[name] = dmap3
1201 def get_density_keys(self):
1202 return list(self.densities.keys())
1205 """Get the current density for some component name"""
1206 if name
not in self.densities:
1209 return self.densities[name]
1211 def write_mrc(self, path="./",suffix=None):
1213 for density_name
in self.densities:
1214 self.densities[density_name].
multiply(1. / self.count_models)
1216 name=path +
"/" + density_name +
".mrc"
1218 name=path +
"/" + density_name +
"." + suffix +
".mrc"
1219 path, file = os.path.split(name)
1220 if not os.path.exists(path):
1223 except OSError
as e:
1224 if e.errno != errno.EEXIST:
1227 self.densities[density_name],name,
1231 class GetContactMap(object):
1234 self.distance = distance
1235 self.contactmap =
''
1242 def set_prot(self, prot):
1243 from scipy.spatial.distance
import cdist
1252 for name
in particles_dictionary:
1253 residue_indexes = []
1254 for p
in particles_dictionary[name]:
1258 if len(residue_indexes) != 0:
1259 self.protnames.append(name)
1261 def get_subunit_coords(self, frame, align=0):
1262 from scipy.spatial.distance
import cdist
1266 test, testr = [], []
1267 for part
in self.prot.get_children():
1270 for chl
in part.get_children():
1276 SortedSegments.append((chl, startres))
1277 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1279 for sgmnt
in SortedSegments:
1282 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1285 radii.append(p.get_radius())
1287 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1290 .get_residue_indexes()[0])
1291 namelist.append(new_name)
1292 self.expanded[new_name] = len(
1294 if part.get_name()
not in self.resmap:
1295 self.resmap[part.get_name()] = {}
1297 self.resmap[part.get_name()][res] = new_name
1299 coords = np.array(coords)
1300 radii = np.array(radii)
1301 if len(self.namelist) == 0:
1302 self.namelist = namelist
1303 self.contactmap = np.zeros((len(coords), len(coords)))
1304 distances = cdist(coords, coords)
1305 distances = (distances - radii).T - radii
1306 distances = distances <= self.distance
1307 self.contactmap += distances
1312 identification_string=
'ISDCrossLinkMS_Distance_'):
1315 data = open(filename)
1316 D = data.readlines()
1320 if identification_string
in d:
1327 t1, t2 = (d[0], d[1]), (d[1], d[0])
1328 if t1
not in self.XL:
1329 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1330 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1332 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1333 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1335 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1339 L = sum(self.expanded.values())
1340 proteins = self.protnames
1345 proteins = [p.get_name()
for p
in self.prot.get_children()]
1347 for p1
in range(len(proteins)):
1348 for p2
in range(p1, len(proteins)):
1350 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1351 pn1, pn2 = proteins[p1], proteins[p2]
1352 mtr = np.zeros((pl1 + 1, pl2 + 1))
1353 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1354 for i1
in range(1, pl1 + 1):
1355 for i2
in range(1, pl2 + 1):
1357 r1 = K.index(self.resmap[pn1][i1])
1358 r2 = K.index(self.resmap[pn2][i2])
1360 mtr[i1 - 1, i2 - 1] = r
1362 missing.append((pn1, pn2, i1, i2))
1364 Matrices[(pn1, pn2)] = mtr
1369 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1372 for p1
in range(len(proteins)):
1373 for p2
in range(p1, len(proteins)):
1375 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1376 pn1, pn2 = proteins[p1], proteins[p2]
1377 mtr = np.zeros((pl1 + 1, pl2 + 1))
1380 xls = self.XL[(pn1, pn2)]
1383 xls = self.XL[(pn2, pn1)]
1388 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1389 for xl1, xl2
in xls:
1391 print(
'X' * 10, xl1, xl2)
1394 print(
'X' * 10, xl1, xl2)
1396 mtr[xl1 - 1, xl2 - 1] = 100
1398 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1399 for xl1, xl2
in xls:
1401 print(
'X' * 10, xl1, xl2)
1404 print(
'X' * 10, xl1, xl2)
1406 mtr[xl2 - 1, xl1 - 1] = 100
1408 print(
'No cross links between: ', pn1, pn2)
1409 Matrices_xl[(pn1, pn2)] = mtr
1425 for i, c
in enumerate(C):
1426 cl = max(self.resmap[c].keys())
1431 R.append(R[-1] + cl)
1436 import matplotlib
as mpl
1438 import matplotlib.pyplot
as plt
1439 import matplotlib.gridspec
as gridspec
1440 import scipy.sparse
as sparse
1443 gs = gridspec.GridSpec(len(W), len(W),
1448 for x1, r1
in enumerate(R):
1453 for x2, r2
in enumerate(R):
1459 ax = plt.subplot(gs[cnt])
1462 mtr = Matrices[(C[x1], C[x2])]
1464 mtr = Matrices[(C[x2], C[x1])].T
1468 interpolation=
'nearest',
1470 vmax=log(mtr.max())
if mtr.max() > 0.
else 1.)
1475 mtr = Matrices_xl[(C[x1], C[x2])]
1477 mtr = Matrices_xl[(C[x2], C[x1])].T
1479 sparse.csr_matrix(mtr),
1489 ax.set_ylabel(C[x1], rotation=90)
1491 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1499 def get_hiers_from_rmf(model, frame_number, rmf_file):
1501 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1504 rh = RMF.open_rmf_file_read_only(rmf_file)
1509 print(
"Unable to open rmf file %s" % (rmf_file))
1516 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1523 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1524 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1525 rh = RMF.open_rmf_file_read_only(rmf_file)
1530 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1537 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1539 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1542 rh = RMF.open_rmf_file_read_only(rmf_file)
1548 print(
"Unable to open rmf file %s" % (rmf_file))
1553 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1560 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1561 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1562 rh = RMF.open_rmf_file_read_only(rmf_file)
1568 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1576 """Get particles at res 1, or any beads, based on the name.
1577 No Representation is needed. This is mainly used when the hierarchy
1578 is read from an RMF file.
1579 @return a dictionary of component names and their particles
1580 \note If the root node is named "System" or is a "State", do proper selection.
1585 for c
in prot.get_children():
1588 for s
in c.get_children():
1589 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1591 if "Beads" in s.get_name():
1595 for name
in particle_dict:
1596 particle_dict[name] = IMP.pmi1.tools.sort_by_residues(
1597 list(set(particle_dict[name]) & set(allparticles)))
1598 return particle_dict
1601 """Get particles at res 10, or any beads, based on the name.
1602 No Representation is needed.
1603 This is mainly used when the hierarchy is read from an RMF file.
1604 @return a dictionary of component names and their particles
1605 \note If the root node is named "System" or is a "State", do proper selection.
1609 for c
in prot.get_children():
1612 for s
in c.get_children():
1613 if "_Res:10" in s.get_name():
1615 if "Beads" in s.get_name():
1618 for name
in particle_dict:
1619 particle_dict[name] = IMP.pmi1.tools.sort_by_residues(
1620 list(set(particle_dict[name]) & set(allparticles)))
1621 return particle_dict
1623 def select_by_tuple(first_res_last_res_name_tuple):
1624 first_res = first_res_last_res_hier_tuple[0]
1625 last_res = first_res_last_res_hier_tuple[1]
1626 name = first_res_last_res_hier_tuple[2]
1629 """Visualization of crosslinks"""
1631 self.crosslinks = []
1632 self.external_csv_data =
None
1633 self.crosslinkedprots = set()
1634 self.mindist = +10000000.0
1635 self.maxdist = -10000000.0
1636 self.contactmap =
None
1638 def set_hierarchy(self, prot):
1639 self.prot_length_dict = {}
1640 self.model=prot.get_model()
1642 for i
in prot.get_children():
1644 residue_indexes = []
1648 if len(residue_indexes) != 0:
1649 self.prot_length_dict[name] = max(residue_indexes)
1651 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1652 from scipy.spatial.distance
import cdist
1654 rh= RMF.open_rmf_file_read_only(rmf_name)
1657 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1668 self.index_dictionary = {}
1670 for name
in particles_dictionary:
1671 residue_indexes = []
1672 for p
in particles_dictionary[name]:
1677 if len(residue_indexes) != 0:
1679 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1682 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1684 radii.append(d.get_radius())
1685 if name
not in self.index_dictionary:
1686 self.index_dictionary[name] = [resindex]
1688 self.index_dictionary[name].append(resindex)
1691 coords = np.array(coords)
1692 radii = np.array(radii)
1694 distances = cdist(coords, coords)
1695 distances = (distances - radii).T - radii
1697 distances = np.where(distances <= 20.0, 1.0, 0)
1698 if self.contactmap
is None:
1699 self.contactmap = np.zeros((len(coords), len(coords)))
1700 self.contactmap += distances
1702 for prot
in prots: IMP.atom.destroy(prot)
1705 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1708 filter_rmf_file_names=
None,
1709 external_csv_data_file=
None,
1710 external_csv_data_file_unique_id_key=
"Unique ID"):
1719 keys = po.get_keys()
1721 xl_keys = [k
for k
in keys
if search_label
in k]
1723 if filter_rmf_file_names
is not None:
1724 rmf_file_key=
"local_rmf_file_name"
1725 fs = po.get_fields(xl_keys+[rmf_file_key])
1727 fs = po.get_fields(xl_keys)
1730 self.cross_link_frequency = {}
1734 self.cross_link_distances = {}
1738 self.cross_link_distances_unique = {}
1740 if not external_csv_data_file
is None:
1743 self.external_csv_data = {}
1744 xldb = IMP.pmi1.tools.get_db_from_csv(external_csv_data_file)
1747 self.external_csv_data[
1748 xl[external_csv_data_file_unique_id_key]] = xl
1753 cross_link_frequency_list = []
1755 self.unique_cross_link_list = []
1759 keysplit = key.replace(
1768 if filter_label!=
None:
1769 if filter_label
not in keysplit:
continue
1772 r1 = int(keysplit[5])
1774 r2 = int(keysplit[7])
1777 confidence = keysplit[12]
1781 unique_identifier = keysplit[3]
1783 unique_identifier =
'0'
1785 r1 = int(keysplit[mapping[
"Residue1"]])
1786 c1 = keysplit[mapping[
"Protein1"]]
1787 r2 = int(keysplit[mapping[
"Residue2"]])
1788 c2 = keysplit[mapping[
"Protein2"]]
1790 confidence = keysplit[mapping[
"Confidence"]]
1794 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1796 unique_identifier =
'0'
1798 self.crosslinkedprots.add(c1)
1799 self.crosslinkedprots.add(c2)
1805 if filter_rmf_file_names
is not None:
1806 for n,d
in enumerate(fs[key]):
1807 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1808 dists.append(float(d))
1810 dists=[float(f)
for f
in fs[key]]
1815 mdist = self.median(dists)
1817 stdv = np.std(np.array(dists))
1818 if self.mindist > mdist:
1819 self.mindist = mdist
1820 if self.maxdist < mdist:
1821 self.maxdist = mdist
1825 if not self.external_csv_data
is None:
1826 sample = self.external_csv_data[unique_identifier][
"Sample"]
1830 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1831 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1832 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1833 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1835 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1836 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1837 cross_link_frequency_list.append((r1, c1, r2, c2))
1838 cross_link_frequency_list.append((r2, c2, r1, c1))
1839 self.unique_cross_link_list.append(
1840 (r1, c1, r2, c2,mdist))
1842 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1843 self.cross_link_distances[(
1849 confidence)] = dists
1850 self.cross_link_distances[(
1856 confidence)] = dists
1857 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1859 self.cross_link_distances[(
1865 confidence)] += dists
1866 self.cross_link_distances[(
1872 confidence)] += dists
1874 self.crosslinks.append(
1884 self.crosslinks.append(
1895 self.cross_link_frequency_inverted = {}
1896 for xl
in self.unique_cross_link_list:
1897 (r1, c1, r2, c2, mdist) = xl
1898 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1899 if frequency
not in self.cross_link_frequency_inverted:
1900 self.cross_link_frequency_inverted[
1901 frequency] = [(r1, c1, r2, c2)]
1903 self.cross_link_frequency_inverted[
1904 frequency].append((r1, c1, r2, c2))
1908 def median(self, mylist):
1909 sorts = sorted(mylist)
1915 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1916 return sorts[length / 2]
1918 def set_threshold(self,threshold):
1919 self.threshold=threshold
1921 def set_tolerance(self,tolerance):
1922 self.tolerance=tolerance
1924 def colormap(self, dist):
1925 if dist < self.threshold - self.tolerance:
1927 elif dist >= self.threshold + self.tolerance:
1932 def write_cross_link_database(self, filename, format='csv'):
1936 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1937 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1939 if not self.external_csv_data
is None:
1940 keys = list(self.external_csv_data.keys())
1941 innerkeys = list(self.external_csv_data[keys[0]].keys())
1943 fieldnames += innerkeys
1945 dw = csv.DictWriter(
1949 fieldnames=fieldnames)
1951 for xl
in self.crosslinks:
1952 (r1, c1, r2, c2, mdist, stdv, confidence,
1953 unique_identifier, descriptor) = xl
1954 if descriptor ==
'original':
1956 outdict[
"Unique ID"] = unique_identifier
1957 outdict[
"Protein1"] = c1
1958 outdict[
"Protein2"] = c2
1959 outdict[
"Residue1"] = r1
1960 outdict[
"Residue2"] = r2
1961 outdict[
"Median Distance"] = mdist
1962 outdict[
"Standard Deviation"] = stdv
1963 outdict[
"Confidence"] = confidence
1964 outdict[
"Frequency"] = self.cross_link_frequency[
1967 arrangement =
"Intra"
1969 arrangement =
"Inter"
1970 outdict[
"Arrangement"] = arrangement
1971 if not self.external_csv_data
is None:
1972 outdict.update(self.external_csv_data[unique_identifier])
1974 dw.writerow(outdict)
1976 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1977 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
1978 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
1979 gap_between_components=0,
1980 rename_protein_map=
None):
1994 import matplotlib
as mpl
1996 import matplotlib.pyplot
as plt
1997 import matplotlib.cm
as cm
1999 fig = plt.figure(figsize=(10, 10))
2000 ax = fig.add_subplot(111)
2006 if prot_listx
is None:
2008 prot_listx = list(self.crosslinkedprots)
2010 prot_listx = list(self.prot_length_dict.keys())
2013 nresx = gap_between_components + \
2014 sum([self.prot_length_dict[name]
2015 + gap_between_components
for name
in prot_listx])
2019 if prot_listy
is None:
2021 prot_listy = list(self.crosslinkedprots)
2023 prot_listy = list(self.prot_length_dict.keys())
2026 nresy = gap_between_components + \
2027 sum([self.prot_length_dict[name]
2028 + gap_between_components
for name
in prot_listy])
2033 res = gap_between_components
2034 for prot
in prot_listx:
2035 resoffsetx[prot] = res
2036 res += self.prot_length_dict[prot]
2038 res += gap_between_components
2042 res = gap_between_components
2043 for prot
in prot_listy:
2044 resoffsety[prot] = res
2045 res += self.prot_length_dict[prot]
2047 res += gap_between_components
2049 resoffsetdiagonal = {}
2050 res = gap_between_components
2051 for prot
in IMP.pmi1.tools.OrderedSet(prot_listx + prot_listy):
2052 resoffsetdiagonal[prot] = res
2053 res += self.prot_length_dict[prot]
2054 res += gap_between_components
2060 for n, prot
in enumerate(prot_listx):
2061 res = resoffsetx[prot]
2063 for proty
in prot_listy:
2064 resy = resoffsety[proty]
2065 endy = resendy[proty]
2066 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2067 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2068 xticks.append((float(res) + float(end)) / 2)
2069 if rename_protein_map
is not None:
2070 if prot
in rename_protein_map:
2071 prot=rename_protein_map[prot]
2072 xlabels.append(prot)
2076 for n, prot
in enumerate(prot_listy):
2077 res = resoffsety[prot]
2079 for protx
in prot_listx:
2080 resx = resoffsetx[protx]
2081 endx = resendx[protx]
2082 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2083 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2084 yticks.append((float(res) + float(end)) / 2)
2085 if rename_protein_map
is not None:
2086 if prot
in rename_protein_map:
2087 prot=rename_protein_map[prot]
2088 ylabels.append(prot)
2091 print(prot_listx, prot_listy)
2093 if not self.contactmap
is None:
2094 import matplotlib.cm
as cm
2095 tmp_array = np.zeros((nresx, nresy))
2097 for px
in prot_listx:
2099 for py
in prot_listy:
2101 resx = resoffsety[px]
2102 lengx = resendx[px] - 1
2103 resy = resoffsety[py]
2104 lengy = resendy[py] - 1
2105 indexes_x = self.index_dictionary[px]
2106 minx = min(indexes_x)
2107 maxx = max(indexes_x)
2108 indexes_y = self.index_dictionary[py]
2109 miny = min(indexes_y)
2110 maxy = max(indexes_y)
2112 print(px, py, minx, maxx, miny, maxy)
2117 resy:lengy] = self.contactmap[
2124 ax.imshow(tmp_array,
2127 interpolation=
'nearest')
2129 ax.set_xticks(xticks)
2130 ax.set_xticklabels(xlabels, rotation=90)
2131 ax.set_yticks(yticks)
2132 ax.set_yticklabels(ylabels)
2133 ax.set_xlim(0,nresx)
2134 ax.set_ylim(0,nresy)
2139 already_added_xls = []
2141 for xl
in self.crosslinks:
2143 (r1, c1, r2, c2, mdist, stdv, confidence,
2144 unique_identifier, descriptor) = xl
2146 if confidence_classes
is not None:
2147 if confidence
not in confidence_classes:
2151 pos1 = r1 + resoffsetx[c1]
2155 pos2 = r2 + resoffsety[c2]
2159 if not filter
is None:
2160 xldb = self.external_csv_data[unique_identifier]
2161 xldb.update({
"Distance": mdist})
2162 xldb.update({
"Distance_stdv": stdv})
2164 if filter[1] ==
">":
2165 if float(xldb[filter[0]]) <= float(filter[2]):
2168 if filter[1] ==
"<":
2169 if float(xldb[filter[0]]) >= float(filter[2]):
2172 if filter[1] ==
"==":
2173 if float(xldb[filter[0]]) != float(filter[2]):
2179 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2180 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2182 if layout ==
'lowerdiagonal':
2183 if pos_for_diagonal1 <= pos_for_diagonal2:
2185 if layout ==
'upperdiagonal':
2186 if pos_for_diagonal1 >= pos_for_diagonal2:
2189 already_added_xls.append((r1, c1, r2, c2))
2191 if not no_confidence_info:
2192 if confidence ==
'0.01':
2193 markersize = 14 * scale_symbol_size
2194 elif confidence ==
'0.05':
2195 markersize = 9 * scale_symbol_size
2196 elif confidence ==
'0.1':
2197 markersize = 6 * scale_symbol_size
2199 markersize = 15 * scale_symbol_size
2201 markersize = 5 * scale_symbol_size
2203 if not no_dist_info:
2204 color = self.colormap(mdist)
2214 markersize=markersize)
2218 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2220 [i.set_linewidth(2.0)
for i
in ax.spines.values()]
2225 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2229 def get_frequency_statistics(self, prot_list,
2232 violated_histogram = {}
2233 satisfied_histogram = {}
2234 unique_cross_links = []
2236 for xl
in self.unique_cross_link_list:
2237 (r1, c1, r2, c2, mdist) = xl
2240 if prot_list2
is None:
2241 if not c1
in prot_list:
2243 if not c2
in prot_list:
2246 if c1
in prot_list
and c2
in prot_list2:
2248 elif c1
in prot_list2
and c2
in prot_list:
2253 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2255 if (r1, c1, r2, c2)
not in unique_cross_links:
2257 if frequency
not in violated_histogram:
2258 violated_histogram[frequency] = 1
2260 violated_histogram[frequency] += 1
2262 if frequency
not in satisfied_histogram:
2263 satisfied_histogram[frequency] = 1
2265 satisfied_histogram[frequency] += 1
2266 unique_cross_links.append((r1, c1, r2, c2))
2267 unique_cross_links.append((r2, c2, r1, c1))
2269 print(
"# satisfied")
2271 total_number_of_crosslinks=0
2273 for i
in satisfied_histogram:
2277 if i
in violated_histogram:
2278 print(i, violated_histogram[i]+satisfied_histogram[i])
2280 print(i, satisfied_histogram[i])
2281 total_number_of_crosslinks+=i*satisfied_histogram[i]
2285 for i
in violated_histogram:
2286 print(i, violated_histogram[i])
2287 total_number_of_crosslinks+=i*violated_histogram[i]
2289 print(total_number_of_crosslinks)
2293 def print_cross_link_binary_symbols(self, prot_list,
2296 confidence_list = []
2297 for xl
in self.crosslinks:
2298 (r1, c1, r2, c2, mdist, stdv, confidence,
2299 unique_identifier, descriptor) = xl
2301 if prot_list2
is None:
2302 if not c1
in prot_list:
2304 if not c2
in prot_list:
2307 if c1
in prot_list
and c2
in prot_list2:
2309 elif c1
in prot_list2
and c2
in prot_list:
2314 if descriptor !=
"original":
2317 confidence_list.append(confidence)
2319 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2320 tmp_dist_binary = []
2323 tmp_dist_binary.append(1)
2325 tmp_dist_binary.append(0)
2326 tmp_matrix.append(tmp_dist_binary)
2328 matrix = list(zip(*tmp_matrix))
2330 satisfied_high_sum = 0
2331 satisfied_mid_sum = 0
2332 satisfied_low_sum = 0
2333 total_satisfied_sum = 0
2334 for k, m
in enumerate(matrix):
2343 for n, b
in enumerate(m):
2344 if confidence_list[n] ==
"0.01":
2348 satisfied_high_sum += 1
2349 elif confidence_list[n] ==
"0.05":
2353 satisfied_mid_sum += 1
2354 elif confidence_list[n] ==
"0.1":
2358 satisfied_low_sum += 1
2360 total_satisfied += 1
2361 total_satisfied_sum += 1
2363 print(k, satisfied_high, total_high)
2364 print(k, satisfied_mid, total_mid)
2365 print(k, satisfied_low, total_low)
2366 print(k, total_satisfied, total)
2367 print(float(satisfied_high_sum) / len(matrix))
2368 print(float(satisfied_mid_sum) / len(matrix))
2369 print(float(satisfied_low_sum) / len(matrix))
2372 def get_unique_crosslinks_statistics(self, prot_list,
2385 satisfied_string = []
2386 for xl
in self.crosslinks:
2387 (r1, c1, r2, c2, mdist, stdv, confidence,
2388 unique_identifier, descriptor) = xl
2390 if prot_list2
is None:
2391 if not c1
in prot_list:
2393 if not c2
in prot_list:
2396 if c1
in prot_list
and c2
in prot_list2:
2398 elif c1
in prot_list2
and c2
in prot_list:
2403 if descriptor !=
"original":
2407 if confidence ==
"0.01":
2411 if confidence ==
"0.05":
2415 if confidence ==
"0.1":
2420 satisfied_string.append(1)
2422 satisfied_string.append(0)
2424 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2425 tmp_dist_binary = []
2428 tmp_dist_binary.append(1)
2430 tmp_dist_binary.append(0)
2431 tmp_matrix.append(tmp_dist_binary)
2433 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2434 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2435 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2436 print(
"total", total)
2438 matrix = list(zip(*tmp_matrix))
2439 satisfied_models = 0
2441 for b
in satisfied_string:
2448 all_satisfied =
True
2450 for n, b
in enumerate(m):
2455 if b == 1
and satisfied_string[n] == 1:
2457 elif b == 1
and satisfied_string[n] == 0:
2459 elif b == 0
and satisfied_string[n] == 0:
2461 elif b == 0
and satisfied_string[n] == 1:
2462 all_satisfied =
False
2464 satisfied_models += 1
2466 print(satstr, all_satisfied)
2467 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2469 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2472 import matplotlib
as mpl
2474 import matplotlib.pylab
as pl
2477 for kw
in self.cross_link_distances_unique:
2478 (r1, c1, r2, c2) = kw
2479 dists = self.cross_link_distances_unique[kw]
2481 if prot_list2
is None:
2482 if not c1
in prot_list:
2484 if not c2
in prot_list:
2487 if c1
in prot_list
and c2
in prot_list2:
2489 elif c1
in prot_list2
and c2
in prot_list:
2494 dists.append(sum(dists))
2495 tmp_matrix.append(dists)
2497 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2500 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2502 for i
in range(len(tmp_matrix)):
2503 for k
in range(len(tmp_matrix[i]) - 1):
2504 matrix[i][k] = tmp_matrix[i][k]
2509 ax = fig.add_subplot(211)
2511 cax = ax.imshow(matrix, interpolation=
'nearest')
2515 pl.savefig(figurename, dpi=300)
2524 arrangement=
"inter",
2525 confidence_input=
"None"):
2528 for xl
in self.cross_link_distances:
2529 (r1, c1, r2, c2, mdist, confidence) = xl
2530 if c1
in prots1
and c2
in prots2:
2531 if arrangement ==
"inter" and c1 == c2:
2533 if arrangement ==
"intra" and c1 != c2:
2535 if confidence_input == confidence:
2536 label = str(c1) +
":" + str(r1) + \
2537 "-" + str(c2) +
":" + str(r2)
2538 values = self.cross_link_distances[xl]
2539 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2540 data.append((label, values, mdist, frequency))
2542 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2543 sort_by_dist = list(zip(*sort_by_dist))
2544 values = sort_by_dist[1]
2545 positions = list(range(len(values)))
2546 labels = sort_by_dist[0]
2547 frequencies = list(map(float, sort_by_dist[3]))
2548 frequencies = [f * 10.0
for f
in frequencies]
2550 nchunks = int(float(len(values)) / nxl_per_row)
2551 values_chunks = IMP.pmi1.tools.chunk_list_into_segments(values, nchunks)
2552 positions_chunks = IMP.pmi1.tools.chunk_list_into_segments(
2555 frequencies_chunks = IMP.pmi1.tools.chunk_list_into_segments(
2558 labels_chunks = IMP.pmi1.tools.chunk_list_into_segments(labels, nchunks)
2560 for n, v
in enumerate(values_chunks):
2561 p = positions_chunks[n]
2562 f = frequencies_chunks[n]
2563 l = labels_chunks[n]
2565 filename +
"." + str(n), v, p, f,
2566 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2568 def crosslink_distance_histogram(self, filename,
2571 confidence_classes=
None,
2577 if prot_list
is None:
2578 prot_list = list(self.prot_length_dict.keys())
2581 for xl
in self.crosslinks:
2582 (r1, c1, r2, c2, mdist, stdv, confidence,
2583 unique_identifier, descriptor) = xl
2585 if not confidence_classes
is None:
2586 if confidence
not in confidence_classes:
2589 if prot_list2
is None:
2590 if not c1
in prot_list:
2592 if not c2
in prot_list:
2595 if c1
in prot_list
and c2
in prot_list2:
2597 elif c1
in prot_list2
and c2
in prot_list:
2602 distances.append(mdist)
2605 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2606 bins=bins, color=color,
2608 reference_xline=35.0,
2609 yplotrange=yplotrange, normalized=normalized)
2611 def scatter_plot_xl_features(self, filename,
2617 reference_ylines=
None,
2618 distance_color=
True,
2620 import matplotlib
as mpl
2622 import matplotlib.pyplot
as plt
2623 import matplotlib.cm
as cm
2625 fig = plt.figure(figsize=(10, 10))
2626 ax = fig.add_subplot(111)
2628 for xl
in self.crosslinks:
2629 (r1, c1, r2, c2, mdist, stdv, confidence,
2630 unique_identifier, arrangement) = xl
2632 if prot_list2
is None:
2633 if not c1
in prot_list:
2635 if not c2
in prot_list:
2638 if c1
in prot_list
and c2
in prot_list2:
2640 elif c1
in prot_list2
and c2
in prot_list:
2645 xldb = self.external_csv_data[unique_identifier]
2646 xldb.update({
"Distance": mdist})
2647 xldb.update({
"Distance_stdv": stdv})
2649 xvalue = float(xldb[feature1])
2650 yvalue = float(xldb[feature2])
2653 color = self.colormap(mdist)
2657 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2659 if not yplotrange
is None:
2660 ax.set_ylim(yplotrange)
2661 if not reference_ylines
is None:
2662 for rl
in reference_ylines:
2663 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2666 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def get_moldict_coorddict
return data structure for the RMSD calculation
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
Performs alignment and RMSD calculation for two sets of coordinates.
Visualization of crosslinks.
A class to cluster structures.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Legacy PMI1 module to represent, score, sample and analyze models.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
def do_cluster
Run K-means clustering.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Tools for clustering and cluster analysis.
static Molecule setup_particle(Model *m, ParticleIndex pi)
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
def plot_field_histogram
Plot a list of histograms from a value list.
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_subunits_density
Add a frame to the densities.
A class to evaluate the precision of an ensemble.
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
def fill
Add coordinates for a single model.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_density
Get the current density for some component name.
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def add_structure
Read a structure into the ensemble and store (as coordinates).
def get_precision
Evaluate the precision of two named structure groups.
def add_structures
Read a list of RMFs, supports parallel.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
A class for reading stat files (either rmf or ascii v1 and v2)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
Compute mean density maps from structures.
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_output
Returns output for IMP.pmi1.output.Output object.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.