3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
50 self.proteins = sorted(self.query.keys())
51 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
59 np = prots_uniq.count(p)
60 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
61 prmts = list(itertools.permutations(copies, len(copies)))
64 self.Product = list(itertools.product(*P.values()))
72 torder = sum([list(i)
for i
in self.Product[0]], [])
75 if self.weights
is not None:
76 weights += [i
for i
in self.weights[t]]
79 self.rmsd = 10000000000.
80 for comb
in self.Product:
82 order = sum([list(i)
for i
in comb], [])
92 if self.weights
is not None:
103 from scipy.spatial.distance
import cdist
111 torder = sum([list(i)
for i
in self.Product[0]], [])
115 self.rmsd, Transformation = 10000000000.,
''
118 for comb
in self.Product:
119 order = sum([list(i)
for i
in comb], [])
125 if len(template_xyz) != len(query_xyz):
126 raise ValueError(
'''the number of coordinates
127 in template and query does not match!''')
132 query_xyz_tr = [transformation.get_transformed(n)
136 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
139 Transformation = transformation
142 return (self.rmsd, Transformation)
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148 'a..2':np.array([np.array([1.,1.,])]),
149 'a..3':np.array([np.array([-2.,1.])]),
150 'b':np.array([np.array([0.,-1.])]),
151 'c..1':np.array([np.array([-1.,-1.])]),
152 'c..2':np.array([np.array([1.,-1.])]),
153 'd':np.array([np.array([0.,0.])]),
154 'e':np.array([np.array([0.,1.])])}
156 Ali = Alignment(Proteins, Proteins)
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
164 class Violations(object):
168 self.violation_thresholds = {}
169 self.violation_counts = {}
171 data = open(filename)
176 d = d.strip().split()
177 self.violation_thresholds[d[0]] = float(d[1])
179 def get_number_violated_restraints(self, rsts_dict):
181 for rst
in self.violation_thresholds:
182 if rst
not in rsts_dict:
184 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
186 if rst
not in self.violation_counts:
187 self.violation_counts[rst] = 1
189 self.violation_counts[rst] += 1
195 """A class to cluster structures.
196 Uses scipy's cdist function to compute distance matrices
197 and sklearn's kmeans clustering module.
201 @param rmsd_weights Flat list of weights for each particle
205 from mpi4py
import MPI
206 self.comm = MPI.COMM_WORLD
207 self.rank = self.comm.Get_rank()
208 self.number_of_processes = self.comm.size
210 self.number_of_processes = 1
213 self.structure_cluster_ids =
None
214 self.tmpl_coords =
None
215 self.rmsd_weights=rmsd_weights
217 def set_template(self, part_coords):
219 self.tmpl_coords = part_coords
222 """Add coordinates for a single model."""
224 self.all_coords[frame] = Coords
226 def dist_matrix(self):
228 self.model_list_names = list(self.all_coords.keys())
229 self.model_indexes = list(range(len(self.model_list_names)))
230 self.model_indexes_dict = dict(
231 list(zip(self.model_list_names, self.model_indexes)))
232 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
234 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235 model_indexes_unique_pairs,
236 self.number_of_processes)[self.rank]
238 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
240 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
242 my_model_indexes_unique_pairs)
244 if self.number_of_processes > 1:
247 pickable_transformations = self.get_pickable_transformation_distance_dict(
250 pickable_transformations)
251 self.set_transformation_distance_dict_from_pickable(
252 pickable_transformations)
254 self.raw_distance_matrix = np.zeros(
255 (len(self.model_list_names), len(self.model_list_names)))
256 for item
in raw_distance_dict:
258 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
261 def get_dist_matrix(self):
262 return self.raw_distance_matrix
265 """Run K-means clustering
266 @param number_of_clusters Num means
267 @param seed the random seed
269 from sklearn.cluster
import KMeans
274 kmeans = KMeans(n_clusters=number_of_clusters)
277 kmeans = KMeans(k=number_of_clusters)
278 kmeans.fit_predict(self.raw_distance_matrix)
280 self.structure_cluster_ids = kmeans.labels_
282 def get_pickable_transformation_distance_dict(self):
283 pickable_transformations = {}
284 for label
in self.transformation_distance_dict:
285 tr = self.transformation_distance_dict[label]
286 trans = tuple(tr.get_translation())
287 rot = tuple(tr.get_rotation().get_quaternion())
288 pickable_transformations[label] = (rot, trans)
289 return pickable_transformations
291 def set_transformation_distance_dict_from_pickable(
293 pickable_transformations):
294 self.transformation_distance_dict = {}
295 for label
in pickable_transformations:
296 tr = pickable_transformations[label]
299 self.transformation_distance_dict[
302 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
304 outf = open(file_name +
".data",
'wb')
310 pickable_transformations = self.get_pickable_transformation_distance_dict(
313 (self.structure_cluster_ids,
314 self.model_list_names,
315 pickable_transformations),
318 np.save(file_name +
".npy", self.raw_distance_matrix)
320 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
323 inputf = open(file_name +
".data",
'rb')
324 (self.structure_cluster_ids, self.model_list_names,
325 pickable_transformations) = pickle.load(inputf)
328 self.raw_distance_matrix = np.load(file_name +
".npy")
330 self.set_transformation_distance_dict_from_pickable(
331 pickable_transformations)
332 self.model_indexes = list(range(len(self.model_list_names)))
333 self.model_indexes_dict = dict(
334 list(zip(self.model_list_names, self.model_indexes)))
336 def plot_matrix(self, figurename="clustermatrix.pdf"):
337 import matplotlib
as mpl
339 import matplotlib.pylab
as pl
340 from scipy.cluster
import hierarchy
as hrc
342 fig = pl.figure(figsize=(10,8))
343 ax = fig.add_subplot(212)
344 dendrogram = hrc.dendrogram(
345 hrc.linkage(self.raw_distance_matrix),
348 leaves_order = dendrogram[
'leaves']
349 ax.set_xlabel(
'Model')
350 ax.set_ylabel(
'RMSD [Angstroms]')
352 ax2 = fig.add_subplot(221)
354 self.raw_distance_matrix[leaves_order,
357 interpolation=
'nearest')
358 cb = fig.colorbar(cax)
359 cb.set_label(
'RMSD [Angstroms]')
360 ax2.set_xlabel(
'Model')
361 ax2.set_ylabel(
'Model')
363 pl.savefig(figurename, dpi=300)
366 def get_model_index_from_name(self, name):
367 return self.model_indexes_dict[name]
369 def get_cluster_labels(self):
371 return list(set(self.structure_cluster_ids))
373 def get_number_of_clusters(self):
374 return len(self.get_cluster_labels())
376 def get_cluster_label_indexes(self, label):
378 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
381 def get_cluster_label_names(self, label):
383 [self.model_list_names[i]
384 for i
in self.get_cluster_label_indexes(label)]
387 def get_cluster_label_average_rmsd(self, label):
389 indexes = self.get_cluster_label_indexes(label)
392 sub_distance_matrix = self.raw_distance_matrix[
393 indexes, :][:, indexes]
394 average_rmsd = np.sum(sub_distance_matrix) / \
395 (len(sub_distance_matrix)
396 ** 2 - len(sub_distance_matrix))
401 def get_cluster_label_size(self, label):
402 return len(self.get_cluster_label_indexes(label))
404 def get_transformation_to_first_member(
408 reference = self.get_cluster_label_indexes(cluster_label)[0]
409 return self.transformation_distance_dict[(reference, structure_index)]
411 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
413 model_list_names = list(all_coords.keys())
414 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415 raw_distance_dict = {}
416 transformation_distance_dict = {}
417 if template_coords
is None:
421 alignment_template_protein_names = list(template_coords.keys())
423 for (f1, f2)
in list_of_pairs:
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in rmsd_protein_names])
434 for pr
in rmsd_protein_names:
435 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
437 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
438 rmsd = Ali.get_rmsd()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in alignment_template_protein_names])
446 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447 for pr
in alignment_template_protein_names])
450 template_rmsd, transformation = Ali.align()
455 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456 for pr
in rmsd_protein_names])
458 for pr
in rmsd_protein_names:
459 coords_f2[pr] = [transformation.get_transformed(
460 i)
for i
in all_coords[model_list_names[f2]][pr]]
462 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
463 rmsd = Ali.get_rmsd()
465 raw_distance_dict[(f1, f2)] = rmsd
466 raw_distance_dict[(f2, f1)] = rmsd
467 transformation_distance_dict[(f1, f2)] = transformation
468 transformation_distance_dict[(f2, f1)] = transformation
470 return raw_distance_dict, transformation_distance_dict
474 """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475 To be used with pmi2 hierarchies. Can be used for instance as follows:
477 rmsd=IMP.pmi.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478 output_objects.append(rmsd)
480 before shuffling the coordinates
483 def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
485 @param hier0 first input hierarchy
486 @param hier1 second input hierarchy
487 @param molname the names of the molecules used for the RMSD
488 @dynamic0 if True stores the decorators XYZ and coordinates of hier0 can be update. If false coordinates are static (stored in Vector3Ds)
489 and will never be updated
490 @dynamic1 same as above
491 metric what metric should be used
495 self.dynamic0=dynamic0
496 self.dynamic1=dynamic1
501 """return data structure for the RMSD calculation"""
507 if name
not in molnames:
514 sel=
IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515 parts=sel.get_selected_particles()
517 mol_coords[mol].append(
IMP.core.XYZ(parts[0]).get_coordinates())
521 moldict[name].append(mol)
524 return moldict, mol_coords, mol_XYZs
526 def get_rmsd_and_assigments(self):
532 for molname, ref_mols
in self.moldict1.items():
534 for ref_mol
in ref_mols:
536 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
538 coord1=self.molcoords1[ref_mol]
543 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
545 for rmf_mol
in rmf_mols:
547 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
549 coord0=self.molcoords0[rmf_mol]
553 rmf_mols_list.append(rmf_mols)
555 rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
558 for n, (ref_mol,rmf_mol)
in enumerate(zip(ref_mols,rmf_mols_best_order)):
559 best_assignments.append((rmf_mol,ref_mol))
561 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
563 coord0=self.molcoords0[rmf_mol]
566 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
568 coord1=self.molcoords1[ref_mol]
569 rmsd_pair=self.metric(coord1, coord0)
570 N=len(self.molcoords1[ref_mol])
572 total_rmsd+=rmsd_pair*rmsd_pair*N
573 rmsd_dict[ref_mol]=rmsd_pair
574 total_rmsd = sqrt(total_rmsd/total_N)
575 return total_rmsd,best_assignments
578 """Returns output for IMP.pmi.output.Output object"""
579 total_rmsd,best_assignments=self.get_rmsd_and_assigments()
582 for rmf_mol,ref_mol
in best_assignments:
583 ref_name=ref_mol.get_name()
585 rmf_name=rmf_mol.get_name()
587 assignments_out.append(rmf_name+
"."+str(rmf_copy)+
"->"+ref_name+
"."+str(ref_copy))
588 return {
"RMSD_"+self.label:str(total_rmsd),
"RMSD_assignments_"+self.label:str(assignments_out)}
594 """A class to evaluate the precision of an ensemble.
596 Also can evaluate the cross-precision of multiple ensembles.
597 Supports MPI for coordinate reading.
598 Recommended procedure:
599 -# initialize object and pass the selection for evaluating precision
600 -# call add_structures() to read in the data (specify group name)
601 -# call get_precision() to evaluate inter/intra precision
602 -# call get_rmsf() to evaluate within-group fluctuations
606 selection_dictionary={}):
608 @param model The IMP Model
609 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610 part of the hier name)
611 @param selection_dictionary Dictionary where keys are names for
612 selections and values are selection tuples for scoring
613 precision. "All" is automatically made as well
616 from mpi4py
import MPI
617 self.comm = MPI.COMM_WORLD
618 self.rank = self.comm.Get_rank()
619 self.number_of_processes = self.comm.size
621 self.number_of_processes = 1
624 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
625 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
626 self.style =
'pairwise_drmsd_k'
627 self.structures_dictionary = {}
628 self.reference_structures_dictionary = {}
630 self.protein_names =
None
631 self.len_particles_resolution_one =
None
633 self.rmf_names_frames = {}
634 self.reference_rmf_names_frames =
None
635 self.reference_structure =
None
636 self.reference_prot =
None
637 self.selection_dictionary = selection_dictionary
638 self.threshold = 40.0
639 self.residue_particle_index_map =
None
641 if resolution
in [1,10]:
642 self.resolution = resolution
644 raise KeyError(
"Currently only allow resolution 1 or 10")
646 def _get_structure(self,rmf_frame_index,rmf_name):
647 """Read an RMF file and return the particles"""
648 rh = RMF.open_rmf_file_read_only(rmf_name)
650 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
654 print(
"linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
659 if self.resolution==1:
661 elif self.resolution==10:
664 protein_names = list(particle_dict.keys())
665 particles_resolution_one = []
666 for k
in particle_dict:
667 particles_resolution_one += (particle_dict[k])
669 if self.protein_names==
None:
670 self.protein_names = protein_names
672 if self.protein_names!=protein_names:
673 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
675 if self.len_particles_resolution_one==
None:
676 self.len_particles_resolution_one = len(particles_resolution_one)
678 if self.len_particles_resolution_one!=len(particles_resolution_one):
679 raise ValueError(
"the new coordinate set is not compatible with the previous one")
681 return particles_resolution_one
687 setup_index_map=
False):
688 """ Read a structure into the ensemble and store (as coordinates).
689 @param rmf_name The name of the RMF file
690 @param rmf_frame_index The frame to read
691 @param structure_set_name Name for the set that includes this structure
693 @param setup_index_map if requested, set up a dictionary to help
698 if structure_set_name
in self.structures_dictionary:
699 cdict = self.structures_dictionary[structure_set_name]
700 rmflist = self.rmf_names_frames[structure_set_name]
702 self.structures_dictionary[structure_set_name]={}
703 self.rmf_names_frames[structure_set_name]=[]
704 cdict = self.structures_dictionary[structure_set_name]
705 rmflist = self.rmf_names_frames[structure_set_name]
709 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
711 print(
"something wrong with the rmf")
713 self.selection_dictionary.update({
"All":self.protein_names})
715 for selection_name
in self.selection_dictionary:
716 selection_tuple = self.selection_dictionary[selection_name]
717 coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718 if selection_name
not in cdict:
719 cdict[selection_name] = [coords]
721 cdict[selection_name].append(coords)
723 rmflist.append((rmf_name,rmf_frame_index))
727 self.residue_particle_index_map = {}
728 for prot_name
in self.protein_names:
729 self.residue_particle_index_map[prot_name] = \
730 self._get_residue_particle_index_map(
732 particles_resolution_one,self.prots[0])
735 rmf_name_frame_tuples,
737 """Read a list of RMFs, supports parallel
738 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739 @param structure_set_name Name this set of structures (e.g. "cluster.1")
743 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
744 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
747 rmf_frame_index=tup[1]
749 if self.residue_particle_index_map
is None:
750 setup_index_map =
True
752 setup_index_map =
False
759 if self.number_of_processes > 1:
762 self.comm.send(self.structures_dictionary, dest=0, tag=11)
764 for i
in range(1, self.number_of_processes):
765 data_tmp = self.comm.recv(source=i, tag=11)
766 for key
in self.structures_dictionary:
767 self.structures_dictionary[key].update(data_tmp[key])
768 for i
in range(1, self.number_of_processes):
769 self.comm.send(self.structures_dictionary, dest=i, tag=11)
771 self.structures_dictionary = self.comm.recv(source=0, tag=11)
773 def _get_residue_particle_index_map(self,prot_name,structure,hier):
775 residue_particle_index_map = []
781 all_selected_particles = s.get_selected_particles()
782 intersection = list(set(all_selected_particles) & set(structure))
783 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
784 for p
in sorted_intersection:
786 return residue_particle_index_map
789 def _select_coordinates(self,tuple_selections,structure,prot):
790 selected_coordinates=[]
791 for t
in tuple_selections:
792 if type(t)==tuple
and len(t)==3:
798 all_selected_particles = s.get_selected_particles()
799 intersection = list(set(all_selected_particles) & set(structure))
800 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
801 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
802 selected_coordinates += cc
808 all_selected_particles = s.get_selected_particles()
809 intersection = list(set(all_selected_particles) & set(structure))
810 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
811 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
812 selected_coordinates += cc
814 raise ValueError(
"Selection error")
815 return selected_coordinates
817 def set_threshold(self,threshold):
818 self.threshold = threshold
820 def _get_distance(self,
826 """ Compute distance between structures with various metrics """
827 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
828 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
832 if self.style==
'pairwise_drmsd_k':
834 if self.style==
'pairwise_drms_k':
836 if self.style==
'pairwise_drmsd_Q':
839 if self.style==
'pairwise_rmsd':
843 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
844 selection_name,index1,index2):
845 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
846 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
851 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
860 selection_keywords=
None):
861 """ Evaluate the precision of two named structure groups. Supports MPI.
862 When the structure_set_name1 is different from the structure_set_name2,
863 this evaluates the cross-precision (average pairwise distances).
864 @param outfile Name of the precision output file
865 @param structure_set_name1 string name of the first structure set
866 @param structure_set_name2 string name of the second structure set
867 @param skip analyze every (skip) structure for the distance matrix calculation
868 @param selection_keywords Specify the selection name you want to calculate on.
869 By default this is computed for everything you provided in the constructor,
870 plus all the subunits together.
872 if selection_keywords
is None:
873 sel_keys = list(self.selection_dictionary.keys())
875 for k
in selection_keywords:
876 if k
not in self.selection_dictionary:
877 raise KeyError(
"you are trying to find named selection " \
878 + k +
" which was not requested in the constructor")
879 sel_keys = selection_keywords
881 if outfile
is not None:
882 of = open(outfile,
"w")
884 for selection_name
in sel_keys:
885 number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
886 number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
888 structure_pointers_1 = list(range(0,number_of_structures_1,skip))
889 structure_pointers_2 = list(range(0,number_of_structures_2,skip))
890 pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
891 if len(pair_combination_list)==0:
892 raise ValueError(
"no structure selected. Check the skip parameter.")
895 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
896 pair_combination_list,self.number_of_processes)[self.rank]
897 my_length = len(my_pair_combination_list)
898 for n,pair
in enumerate(my_pair_combination_list):
899 progression = int(float(n)/my_length*100.0)
900 distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
901 selection_name,pair[0],pair[1])
902 if self.number_of_processes > 1:
907 if structure_set_name1==structure_set_name2:
908 structure_pointers = structure_pointers_1
909 number_of_structures = number_of_structures_1
914 distances_to_structure = {}
915 distances_to_structure_normalization = {}
917 for n
in structure_pointers:
918 distances_to_structure[n] = 0.0
919 distances_to_structure_normalization[n]=0
922 distance += distances[k]
923 distances_to_structure[k[0]] += distances[k]
924 distances_to_structure[k[1]] += distances[k]
925 distances_to_structure_normalization[k[0]] += 1
926 distances_to_structure_normalization[k[1]] += 1
928 for n
in structure_pointers:
929 distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
931 min_distance = min([distances_to_structure[n]
for n
in distances_to_structure])
932 centroid_index = [k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
933 centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
935 centroid_distance = 0.0
937 for n
in range(number_of_structures):
938 dist = self._get_distance(structure_set_name1,structure_set_name1,
939 selection_name,centroid_index,n)
940 centroid_distance += dist
941 distance_list.append(dist)
944 centroid_distance /= number_of_structures
946 if outfile
is not None:
947 of.write(str(selection_name)+
" "+structure_set_name1+
948 " average centroid distance "+str(centroid_distance)+
"\n")
949 of.write(str(selection_name)+
" "+structure_set_name1+
950 " centroid index "+str(centroid_index)+
"\n")
951 of.write(str(selection_name)+
" "+structure_set_name1+
952 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
953 of.write(str(selection_name)+
" "+structure_set_name1+
954 " median centroid distance "+str(np.median(distance_list))+
"\n")
956 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
957 if outfile
is not None:
958 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
959 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
960 if outfile
is not None:
962 return centroid_index
968 set_plot_yaxis_range=
None):
969 """ Calculate the residue mean square fluctuations (RMSF).
970 Automatically outputs as data file and pdf
971 @param structure_set_name Which structure set to calculate RMSF for
972 @param outdir Where to write the files
973 @param skip Skip this number of structures
974 @param set_plot_yaxis_range In case you need to change the plot
982 for sel_name
in self.protein_names:
983 self.selection_dictionary.update({sel_name:[sel_name]})
985 number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
989 rpim = self.residue_particle_index_map[sel_name]
990 outfile = outdir+
"/rmsf."+sel_name+
".dat"
991 of = open(outfile,
"w")
992 residue_distances = {}
994 for index
in range(number_of_structures):
995 distances = self._get_particle_distances(structure_set_name,
998 centroid_index,index)
999 for nblock,block
in enumerate(rpim):
1000 for residue_number
in block:
1001 residue_nblock[residue_number] = nblock
1002 if residue_number
not in residue_distances:
1003 residue_distances[residue_number] = [distances[nblock]]
1005 residue_distances[residue_number].append(distances[nblock])
1009 for rn
in residue_distances:
1011 rmsf = np.std(residue_distances[rn])
1013 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
1015 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
1016 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
1017 set_plot_yaxis_range=set_plot_yaxis_range,
1018 xlabel=
'Residue Number',ylabel=
'Standard error')
1023 """Read in a structure used for reference computation.
1024 Needed before calling get_average_distance_wrt_reference_structure()
1025 @param rmf_name The RMF file to read the reference
1026 @param rmf_frame_index The index in that file
1028 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1029 self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1031 for selection_name
in self.selection_dictionary:
1032 selection_tuple = self.selection_dictionary[selection_name]
1033 coords = self._select_coordinates(selection_tuple,
1034 particles_resolution_one,self.prots[0])
1035 self.reference_structures_dictionary[selection_name] = coords
1038 """First align then calculate RMSD
1039 @param structure_set_name: the name of the structure set
1040 @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
1041 @return: for each structure in the structure set, returns the rmsd
1043 if self.reference_structures_dictionary=={}:
1044 print(
"Cannot compute until you set a reference structure")
1047 align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1048 align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1049 transformations = []
1050 for c
in align_coordinates:
1052 transformation = Ali.align()[1]
1053 transformations.append(transformation)
1054 for selection_name
in self.selection_dictionary:
1055 reference_coordinates = self.reference_structures_dictionary[selection_name]
1058 for n,sc
in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1061 distances.append(distance)
1062 print(selection_name,
"average rmsd",sum(distances)/len(distances),
"median",self._get_median(distances),
"minimum distance",min(distances))
1064 def _get_median(self,list_of_values):
1065 return np.median(np.array(list_of_values))
1068 """Compare the structure set to the reference structure.
1069 @param structure_set_name The structure set to compute this on
1070 @note First call set_reference_structure()
1073 if self.reference_structures_dictionary=={}:
1074 print(
"Cannot compute until you set a reference structure")
1076 for selection_name
in self.selection_dictionary:
1077 reference_coordinates = self.reference_structures_dictionary[selection_name]
1080 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
1082 if self.style==
'pairwise_drmsd_k':
1084 if self.style==
'pairwise_drms_k':
1086 if self.style==
'pairwise_drmsd_Q':
1088 if self.style==
'pairwise_rmsd':
1090 distances.append(distance)
1092 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances),
'nframes',len(distances))
1093 ret[selection_name] = {
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
1096 def get_coordinates(self):
1099 def set_precision_style(self, style):
1100 if style
in self.styles:
1103 raise ValueError(
"No such style")
1107 """Compute mean density maps from structures.
1109 Keeps a dictionary of density maps,
1110 keys are in the custom ranges. When you call add_subunits_density, it adds
1111 particle coordinates to the existing density maps.
1114 def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1116 @param custom_ranges Required. It's a dictionary, keys are the
1117 density component names, values are selection tuples
1118 e.g. {'kin28':[['kin28',1,-1]],
1119 'density_name_1' :[('ccl1')],
1120 'density_name_2' :[(1,142,'tfb3d1'),
1121 (143,700,'tfb3d2')],
1122 @param representation PMI representation, for doing selections.
1123 Not needed if you only pass hierarchies
1124 @param resolution The MRC resolution of the output map (in Angstrom unit)
1125 @param voxel The voxel size for the output map (lower is slower)
1128 self.representation = representation
1129 self.MRCresolution = resolution
1132 self.count_models = 0.0
1133 self.custom_ranges = custom_ranges
1136 """Add a frame to the densities.
1137 @param hierarchy Optionally read the hierarchy from somewhere.
1138 If not passed, will just read the representation.
1140 self.count_models += 1.0
1144 all_particles_by_resolution = []
1145 for name
in part_dict:
1146 all_particles_by_resolution += part_dict[name]
1148 for density_name
in self.custom_ranges:
1151 all_particles_by_segments = []
1153 for seg
in self.custom_ranges[density_name]:
1156 parts += IMP.tools.select_by_tuple(self.representation,
1157 seg, resolution=1, name_is_ambiguous=
False)
1161 for h
in hierarchy.get_children():
1165 if type(seg) == str:
1167 elif type(seg) == tuple
and len(seg) == 2:
1169 hierarchy, molecule=seg[0],copy_index=seg[1])
1170 elif type(seg) == tuple
and len(seg) == 3:
1172 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1173 elif type(seg) == tuple
and len(seg) == 4:
1175 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1177 raise Exception(
'could not understand selection tuple '+str(seg))
1179 all_particles_by_segments += s.get_selected_particles()
1182 parts = all_particles_by_segments
1185 set(all_particles_by_segments) & set(all_particles_by_resolution))
1186 self._create_density_from_particles(parts, density_name)
1188 def normalize_density(self):
1191 def _create_density_from_particles(self, ps, name,
1192 kernel_type=
'GAUSSIAN'):
1193 '''Internal function for adding to densities.
1194 pass XYZR particles with mass and create a density from them.
1195 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1197 'GAUSSIAN': IMP.em.GAUSSIAN,
1198 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1199 'SPHERE': IMP.em.SPHERE}
1203 dmap.set_was_used(
True)
1204 if name
not in self.densities:
1205 self.densities[name] = dmap
1211 dmap3.set_was_used(
True)
1213 dmap3.add(self.densities[name])
1214 self.densities[name] = dmap3
1216 def get_density_keys(self):
1217 return list(self.densities.keys())
1220 """Get the current density for some component name"""
1221 if name
not in self.densities:
1224 return self.densities[name]
1226 def write_mrc(self, path="./",suffix=None):
1228 for density_name
in self.densities:
1229 self.densities[density_name].
multiply(1. / self.count_models)
1231 name=path +
"/" + density_name +
".mrc"
1233 name=path +
"/" + density_name +
"." + suffix +
".mrc"
1234 path, file = os.path.split(name)
1235 if not os.path.exists(path):
1238 except OSError
as e:
1239 if e.errno != errno.EEXIST:
1242 self.densities[density_name],name,
1246 class GetContactMap(object):
1249 self.distance = distance
1250 self.contactmap =
''
1257 def set_prot(self, prot):
1258 from scipy.spatial.distance
import cdist
1267 for name
in particles_dictionary:
1268 residue_indexes = []
1269 for p
in particles_dictionary[name]:
1273 if len(residue_indexes) != 0:
1274 self.protnames.append(name)
1276 def get_subunit_coords(self, frame, align=0):
1277 from scipy.spatial.distance
import cdist
1281 test, testr = [], []
1282 for part
in self.prot.get_children():
1285 for chl
in part.get_children():
1291 SortedSegments.append((chl, startres))
1292 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1294 for sgmnt
in SortedSegments:
1297 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1300 radii.append(p.get_radius())
1302 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1305 .get_residue_indexes()[0])
1306 namelist.append(new_name)
1307 self.expanded[new_name] = len(
1309 if part.get_name()
not in self.resmap:
1310 self.resmap[part.get_name()] = {}
1312 self.resmap[part.get_name()][res] = new_name
1314 coords = np.array(coords)
1315 radii = np.array(radii)
1316 if len(self.namelist) == 0:
1317 self.namelist = namelist
1318 self.contactmap = np.zeros((len(coords), len(coords)))
1319 distances = cdist(coords, coords)
1320 distances = (distances - radii).T - radii
1321 distances = distances <= self.distance
1322 self.contactmap += distances
1327 identification_string=
'ISDCrossLinkMS_Distance_'):
1330 data = open(filname)
1331 D = data.readlines()
1335 if identification_string
in d:
1342 t1, t2 = (d[0], d[1]), (d[1], d[0])
1343 if t1
not in self.XL:
1344 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1345 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1347 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1348 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1350 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1354 L = sum(self.expanded.values())
1355 proteins = self.protnames
1360 proteins = [p.get_name()
for p
in self.prot.get_children()]
1362 for p1
in range(len(proteins)):
1363 for p2
in range(p1, len(proteins)):
1365 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1366 pn1, pn2 = proteins[p1], proteins[p2]
1367 mtr = np.zeros((pl1 + 1, pl2 + 1))
1368 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1369 for i1
in range(1, pl1 + 1):
1370 for i2
in range(1, pl2 + 1):
1372 r1 = K.index(self.resmap[pn1][i1])
1373 r2 = K.index(self.resmap[pn2][i2])
1375 mtr[i1 - 1, i2 - 1] = r
1377 missing.append((pn1, pn2, i1, i2))
1379 Matrices[(pn1, pn2)] = mtr
1384 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1387 for p1
in range(len(proteins)):
1388 for p2
in range(p1, len(proteins)):
1390 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1391 pn1, pn2 = proteins[p1], proteins[p2]
1392 mtr = np.zeros((pl1 + 1, pl2 + 1))
1395 xls = self.XL[(pn1, pn2)]
1398 xls = self.XL[(pn2, pn1)]
1403 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1404 for xl1, xl2
in xls:
1406 print(
'X' * 10, xl1, xl2)
1409 print(
'X' * 10, xl1, xl2)
1411 mtr[xl1 - 1, xl2 - 1] = 100
1413 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1414 for xl1, xl2
in xls:
1416 print(
'X' * 10, xl1, xl2)
1419 print(
'X' * 10, xl1, xl2)
1421 mtr[xl2 - 1, xl1 - 1] = 100
1423 print(
'No cross links between: ', pn1, pn2)
1424 Matrices_xl[(pn1, pn2)] = mtr
1440 for i, c
in enumerate(C):
1441 cl = max(self.resmap[c].keys())
1446 R.append(R[-1] + cl)
1451 import matplotlib
as mpl
1453 import matplotlib.pyplot
as plt
1454 import matplotlib.gridspec
as gridspec
1455 import scipy.sparse
as sparse
1458 gs = gridspec.GridSpec(len(W), len(W),
1463 for x1, r1
in enumerate(R):
1468 for x2, r2
in enumerate(R):
1474 ax = plt.subplot(gs[cnt])
1477 mtr = Matrices[(C[x1], C[x2])]
1479 mtr = Matrices[(C[x2], C[x1])].T
1483 interpolation=
'nearest',
1485 vmax=log(mtr.max()))
1490 mtr = Matrices_xl[(C[x1], C[x2])]
1492 mtr = Matrices_xl[(C[x2], C[x1])].T
1494 sparse.csr_matrix(mtr),
1504 ax.set_ylabel(C[x1], rotation=90)
1506 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1514 def get_hiers_from_rmf(model, frame_number, rmf_file):
1516 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1519 rh = RMF.open_rmf_file_read_only(rmf_file)
1524 print(
"Unable to open rmf file %s" % (rmf_file))
1531 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1538 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1539 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1540 rh = RMF.open_rmf_file_read_only(rmf_file)
1545 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1552 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1554 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1557 rh = RMF.open_rmf_file_read_only(rmf_file)
1563 print(
"Unable to open rmf file %s" % (rmf_file))
1568 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1575 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1576 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1577 rh = RMF.open_rmf_file_read_only(rmf_file)
1583 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1591 """Get particles at res 1, or any beads, based on the name.
1592 No Representation is needed. This is mainly used when the hierarchy
1593 is read from an RMF file.
1594 @return a dictionary of component names and their particles
1595 \note If the root node is named "System" or is a "State", do proper selection.
1601 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1603 particle_dict[mol.get_name()] = sel.get_selected_particles()
1606 for c
in prot.get_children():
1609 for s
in c.get_children():
1610 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1612 if "Beads" in s.get_name():
1616 for name
in particle_dict:
1617 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1618 list(set(particle_dict[name]) & set(allparticles)))
1619 return particle_dict
1622 """Get particles at res 10, or any beads, based on the name.
1623 No Representation is needed.
1624 This is mainly used when the hierarchy is read from an RMF file.
1625 @return a dictionary of component names and their particles
1626 \note If the root node is named "System" or is a "State", do proper selection.
1631 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1633 particle_dict[mol.get_name()] = sel.get_selected_particles()
1636 for c
in prot.get_children():
1639 for s
in c.get_children():
1640 if "_Res:10" in s.get_name():
1642 if "Beads" in s.get_name():
1645 for name
in particle_dict:
1646 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1647 list(set(particle_dict[name]) & set(allparticles)))
1648 return particle_dict
1650 def select_by_tuple(first_res_last_res_name_tuple):
1651 first_res = first_res_last_res_hier_tuple[0]
1652 last_res = first_res_last_res_hier_tuple[1]
1653 name = first_res_last_res_hier_tuple[2]
1656 """Visualization of crosslinks"""
1658 self.crosslinks = []
1659 self.external_csv_data =
None
1660 self.crosslinkedprots = set()
1661 self.mindist = +10000000.0
1662 self.maxdist = -10000000.0
1663 self.contactmap =
None
1665 def set_hierarchy(self, prot):
1666 self.prot_length_dict = {}
1667 self.model=prot.get_model()
1669 for i
in prot.get_children():
1671 residue_indexes = []
1675 if len(residue_indexes) != 0:
1676 self.prot_length_dict[name] = max(residue_indexes)
1678 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1679 from scipy.spatial.distance
import cdist
1681 rh= RMF.open_rmf_file_read_only(rmf_name)
1684 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1695 self.index_dictionary = {}
1697 for name
in particles_dictionary:
1698 residue_indexes = []
1699 for p
in particles_dictionary[name]:
1704 if len(residue_indexes) != 0:
1706 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1709 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1711 radii.append(d.get_radius())
1712 if name
not in self.index_dictionary:
1713 self.index_dictionary[name] = [resindex]
1715 self.index_dictionary[name].append(resindex)
1718 coords = np.array(coords)
1719 radii = np.array(radii)
1721 distances = cdist(coords, coords)
1722 distances = (distances - radii).T - radii
1724 distances = np.where(distances <= 20.0, 1.0, 0)
1725 if self.contactmap
is None:
1726 self.contactmap = np.zeros((len(coords), len(coords)))
1727 self.contactmap += distances
1729 for prot
in prots: IMP.atom.destroy(prot)
1732 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1735 filter_rmf_file_names=
None,
1736 external_csv_data_file=
None,
1737 external_csv_data_file_unique_id_key=
"Unique ID"):
1746 keys = po.get_keys()
1748 xl_keys = [k
for k
in keys
if search_label
in k]
1750 if filter_rmf_file_names
is not None:
1751 rmf_file_key=
"local_rmf_file_name"
1752 fs = po.get_fields(xl_keys+[rmf_file_key])
1754 fs = po.get_fields(xl_keys)
1757 self.cross_link_frequency = {}
1761 self.cross_link_distances = {}
1765 self.cross_link_distances_unique = {}
1767 if not external_csv_data_file
is None:
1770 self.external_csv_data = {}
1771 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1774 self.external_csv_data[
1775 xl[external_csv_data_file_unique_id_key]] = xl
1780 cross_link_frequency_list = []
1782 self.unique_cross_link_list = []
1786 keysplit = key.replace(
1795 if filter_label!=
None:
1796 if filter_label
not in keysplit:
continue
1799 r1 = int(keysplit[5])
1801 r2 = int(keysplit[7])
1804 confidence = keysplit[12]
1808 unique_identifier = keysplit[3]
1810 unique_identifier =
'0'
1812 r1 = int(keysplit[mapping[
"Residue1"]])
1813 c1 = keysplit[mapping[
"Protein1"]]
1814 r2 = int(keysplit[mapping[
"Residue2"]])
1815 c2 = keysplit[mapping[
"Protein2"]]
1817 confidence = keysplit[mapping[
"Confidence"]]
1821 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1823 unique_identifier =
'0'
1825 self.crosslinkedprots.add(c1)
1826 self.crosslinkedprots.add(c2)
1832 if filter_rmf_file_names
is not None:
1833 for n,d
in enumerate(fs[key]):
1834 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1835 dists.append(float(d))
1837 dists=[float(f)
for f
in fs[key]]
1842 mdist = self.median(dists)
1844 stdv = np.std(np.array(dists))
1845 if self.mindist > mdist:
1846 self.mindist = mdist
1847 if self.maxdist < mdist:
1848 self.maxdist = mdist
1852 if not self.external_csv_data
is None:
1853 sample = self.external_csv_data[unique_identifier][
"Sample"]
1857 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1858 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1859 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1860 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1862 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1863 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1864 cross_link_frequency_list.append((r1, c1, r2, c2))
1865 cross_link_frequency_list.append((r2, c2, r1, c1))
1866 self.unique_cross_link_list.append(
1867 (r1, c1, r2, c2,mdist))
1869 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1870 self.cross_link_distances[(
1876 confidence)] = dists
1877 self.cross_link_distances[(
1883 confidence)] = dists
1884 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1886 self.cross_link_distances[(
1892 confidence)] += dists
1893 self.cross_link_distances[(
1899 confidence)] += dists
1901 self.crosslinks.append(
1911 self.crosslinks.append(
1922 self.cross_link_frequency_inverted = {}
1923 for xl
in self.unique_cross_link_list:
1924 (r1, c1, r2, c2, mdist) = xl
1925 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1926 if frequency
not in self.cross_link_frequency_inverted:
1927 self.cross_link_frequency_inverted[
1928 frequency] = [(r1, c1, r2, c2)]
1930 self.cross_link_frequency_inverted[
1931 frequency].append((r1, c1, r2, c2))
1935 def median(self, mylist):
1936 sorts = sorted(mylist)
1942 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1943 return sorts[length / 2]
1945 def set_threshold(self,threshold):
1946 self.threshold=threshold
1948 def set_tolerance(self,tolerance):
1949 self.tolerance=tolerance
1951 def colormap(self, dist):
1952 if dist < self.threshold - self.tolerance:
1954 elif dist >= self.threshold + self.tolerance:
1959 def write_cross_link_database(self, filename, format='csv'):
1963 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1964 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1966 if not self.external_csv_data
is None:
1967 keys = list(self.external_csv_data.keys())
1968 innerkeys = list(self.external_csv_data[keys[0]].keys())
1970 fieldnames += innerkeys
1972 dw = csv.DictWriter(
1976 fieldnames=fieldnames)
1978 for xl
in self.crosslinks:
1979 (r1, c1, r2, c2, mdist, stdv, confidence,
1980 unique_identifier, descriptor) = xl
1981 if descriptor ==
'original':
1983 outdict[
"Unique ID"] = unique_identifier
1984 outdict[
"Protein1"] = c1
1985 outdict[
"Protein2"] = c2
1986 outdict[
"Residue1"] = r1
1987 outdict[
"Residue2"] = r2
1988 outdict[
"Median Distance"] = mdist
1989 outdict[
"Standard Deviation"] = stdv
1990 outdict[
"Confidence"] = confidence
1991 outdict[
"Frequency"] = self.cross_link_frequency[
1994 arrangement =
"Intra"
1996 arrangement =
"Inter"
1997 outdict[
"Arrangement"] = arrangement
1998 if not self.external_csv_data
is None:
1999 outdict.update(self.external_csv_data[unique_identifier])
2001 dw.writerow(outdict)
2003 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
2004 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
2005 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
2006 gap_between_components=0,
2007 rename_protein_map=
None):
2021 import matplotlib
as mpl
2023 import matplotlib.pyplot
as plt
2024 import matplotlib.cm
as cm
2026 fig = plt.figure(figsize=(10, 10))
2027 ax = fig.add_subplot(111)
2033 if prot_listx
is None:
2035 prot_listx = list(self.crosslinkedprots)
2037 prot_listx = list(self.prot_length_dict.keys())
2040 nresx = gap_between_components + \
2041 sum([self.prot_length_dict[name]
2042 + gap_between_components
for name
in prot_listx])
2046 if prot_listy
is None:
2048 prot_listy = list(self.crosslinkedprots)
2050 prot_listy = list(self.prot_length_dict.keys())
2053 nresy = gap_between_components + \
2054 sum([self.prot_length_dict[name]
2055 + gap_between_components
for name
in prot_listy])
2060 res = gap_between_components
2061 for prot
in prot_listx:
2062 resoffsetx[prot] = res
2063 res += self.prot_length_dict[prot]
2065 res += gap_between_components
2069 res = gap_between_components
2070 for prot
in prot_listy:
2071 resoffsety[prot] = res
2072 res += self.prot_length_dict[prot]
2074 res += gap_between_components
2076 resoffsetdiagonal = {}
2077 res = gap_between_components
2078 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2079 resoffsetdiagonal[prot] = res
2080 res += self.prot_length_dict[prot]
2081 res += gap_between_components
2087 for n, prot
in enumerate(prot_listx):
2088 res = resoffsetx[prot]
2090 for proty
in prot_listy:
2091 resy = resoffsety[proty]
2092 endy = resendy[proty]
2093 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2094 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2095 xticks.append((float(res) + float(end)) / 2)
2096 if rename_protein_map
is not None:
2097 if prot
in rename_protein_map:
2098 prot=rename_protein_map[prot]
2099 xlabels.append(prot)
2103 for n, prot
in enumerate(prot_listy):
2104 res = resoffsety[prot]
2106 for protx
in prot_listx:
2107 resx = resoffsetx[protx]
2108 endx = resendx[protx]
2109 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2110 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2111 yticks.append((float(res) + float(end)) / 2)
2112 if rename_protein_map
is not None:
2113 if prot
in rename_protein_map:
2114 prot=rename_protein_map[prot]
2115 ylabels.append(prot)
2118 print(prot_listx, prot_listy)
2120 if not self.contactmap
is None:
2121 import matplotlib.cm
as cm
2122 tmp_array = np.zeros((nresx, nresy))
2124 for px
in prot_listx:
2126 for py
in prot_listy:
2128 resx = resoffsety[px]
2129 lengx = resendx[px] - 1
2130 resy = resoffsety[py]
2131 lengy = resendy[py] - 1
2132 indexes_x = self.index_dictionary[px]
2133 minx = min(indexes_x)
2134 maxx = max(indexes_x)
2135 indexes_y = self.index_dictionary[py]
2136 miny = min(indexes_y)
2137 maxy = max(indexes_y)
2139 print(px, py, minx, maxx, miny, maxy)
2144 resy:lengy] = self.contactmap[
2151 ax.imshow(tmp_array,
2154 interpolation=
'nearest')
2156 ax.set_xticks(xticks)
2157 ax.set_xticklabels(xlabels, rotation=90)
2158 ax.set_yticks(yticks)
2159 ax.set_yticklabels(ylabels)
2160 ax.set_xlim(0,nresx)
2161 ax.set_ylim(0,nresy)
2166 already_added_xls = []
2168 for xl
in self.crosslinks:
2170 (r1, c1, r2, c2, mdist, stdv, confidence,
2171 unique_identifier, descriptor) = xl
2173 if confidence_classes
is not None:
2174 if confidence
not in confidence_classes:
2178 pos1 = r1 + resoffsetx[c1]
2182 pos2 = r2 + resoffsety[c2]
2186 if not filter
is None:
2187 xldb = self.external_csv_data[unique_identifier]
2188 xldb.update({
"Distance": mdist})
2189 xldb.update({
"Distance_stdv": stdv})
2191 if filter[1] ==
">":
2192 if float(xldb[filter[0]]) <= float(filter[2]):
2195 if filter[1] ==
"<":
2196 if float(xldb[filter[0]]) >= float(filter[2]):
2199 if filter[1] ==
"==":
2200 if float(xldb[filter[0]]) != float(filter[2]):
2206 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2207 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2209 if layout ==
'lowerdiagonal':
2210 if pos_for_diagonal1 <= pos_for_diagonal2:
2212 if layout ==
'upperdiagonal':
2213 if pos_for_diagonal1 >= pos_for_diagonal2:
2216 already_added_xls.append((r1, c1, r2, c2))
2218 if not no_confidence_info:
2219 if confidence ==
'0.01':
2220 markersize = 14 * scale_symbol_size
2221 elif confidence ==
'0.05':
2222 markersize = 9 * scale_symbol_size
2223 elif confidence ==
'0.1':
2224 markersize = 6 * scale_symbol_size
2226 markersize = 15 * scale_symbol_size
2228 markersize = 5 * scale_symbol_size
2230 if not no_dist_info:
2231 color = self.colormap(mdist)
2241 markersize=markersize)
2245 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2247 [i.set_linewidth(2.0)
for i
in ax.spines.values()]
2252 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2256 def get_frequency_statistics(self, prot_list,
2259 violated_histogram = {}
2260 satisfied_histogram = {}
2261 unique_cross_links = []
2263 for xl
in self.unique_cross_link_list:
2264 (r1, c1, r2, c2, mdist) = xl
2267 if prot_list2
is None:
2268 if not c1
in prot_list:
2270 if not c2
in prot_list:
2273 if c1
in prot_list
and c2
in prot_list2:
2275 elif c1
in prot_list2
and c2
in prot_list:
2280 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2282 if (r1, c1, r2, c2)
not in unique_cross_links:
2284 if frequency
not in violated_histogram:
2285 violated_histogram[frequency] = 1
2287 violated_histogram[frequency] += 1
2289 if frequency
not in satisfied_histogram:
2290 satisfied_histogram[frequency] = 1
2292 satisfied_histogram[frequency] += 1
2293 unique_cross_links.append((r1, c1, r2, c2))
2294 unique_cross_links.append((r2, c2, r1, c1))
2296 print(
"# satisfied")
2298 total_number_of_crosslinks=0
2300 for i
in satisfied_histogram:
2304 if i
in violated_histogram:
2305 print(i, violated_histogram[i]+satisfied_histogram[i])
2307 print(i, satisfied_histogram[i])
2308 total_number_of_crosslinks+=i*satisfied_histogram[i]
2312 for i
in violated_histogram:
2313 print(i, violated_histogram[i])
2314 total_number_of_crosslinks+=i*violated_histogram[i]
2316 print(total_number_of_crosslinks)
2320 def print_cross_link_binary_symbols(self, prot_list,
2323 confidence_list = []
2324 for xl
in self.crosslinks:
2325 (r1, c1, r2, c2, mdist, stdv, confidence,
2326 unique_identifier, descriptor) = xl
2328 if prot_list2
is None:
2329 if not c1
in prot_list:
2331 if not c2
in prot_list:
2334 if c1
in prot_list
and c2
in prot_list2:
2336 elif c1
in prot_list2
and c2
in prot_list:
2341 if descriptor !=
"original":
2344 confidence_list.append(confidence)
2346 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2347 tmp_dist_binary = []
2350 tmp_dist_binary.append(1)
2352 tmp_dist_binary.append(0)
2353 tmp_matrix.append(tmp_dist_binary)
2355 matrix = list(zip(*tmp_matrix))
2357 satisfied_high_sum = 0
2358 satisfied_mid_sum = 0
2359 satisfied_low_sum = 0
2360 total_satisfied_sum = 0
2361 for k, m
in enumerate(matrix):
2370 for n, b
in enumerate(m):
2371 if confidence_list[n] ==
"0.01":
2375 satisfied_high_sum += 1
2376 elif confidence_list[n] ==
"0.05":
2380 satisfied_mid_sum += 1
2381 elif confidence_list[n] ==
"0.1":
2385 satisfied_low_sum += 1
2387 total_satisfied += 1
2388 total_satisfied_sum += 1
2390 print(k, satisfied_high, total_high)
2391 print(k, satisfied_mid, total_mid)
2392 print(k, satisfied_low, total_low)
2393 print(k, total_satisfied, total)
2394 print(float(satisfied_high_sum) / len(matrix))
2395 print(float(satisfied_mid_sum) / len(matrix))
2396 print(float(satisfied_low_sum) / len(matrix))
2399 def get_unique_crosslinks_statistics(self, prot_list,
2412 satisfied_string = []
2413 for xl
in self.crosslinks:
2414 (r1, c1, r2, c2, mdist, stdv, confidence,
2415 unique_identifier, descriptor) = xl
2417 if prot_list2
is None:
2418 if not c1
in prot_list:
2420 if not c2
in prot_list:
2423 if c1
in prot_list
and c2
in prot_list2:
2425 elif c1
in prot_list2
and c2
in prot_list:
2430 if descriptor !=
"original":
2434 if confidence ==
"0.01":
2438 if confidence ==
"0.05":
2442 if confidence ==
"0.1":
2447 satisfied_string.append(1)
2449 satisfied_string.append(0)
2451 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2452 tmp_dist_binary = []
2455 tmp_dist_binary.append(1)
2457 tmp_dist_binary.append(0)
2458 tmp_matrix.append(tmp_dist_binary)
2460 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2461 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2462 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2463 print(
"total", total)
2465 matrix = list(zip(*tmp_matrix))
2466 satisfied_models = 0
2468 for b
in satisfied_string:
2475 all_satisfied =
True
2477 for n, b
in enumerate(m):
2482 if b == 1
and satisfied_string[n] == 1:
2484 elif b == 1
and satisfied_string[n] == 0:
2486 elif b == 0
and satisfied_string[n] == 0:
2488 elif b == 0
and satisfied_string[n] == 1:
2489 all_satisfied =
False
2491 satisfied_models += 1
2493 print(satstr, all_satisfied)
2494 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2496 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2499 import matplotlib
as mpl
2501 import matplotlib.pylab
as pl
2504 for kw
in self.cross_link_distances_unique:
2505 (r1, c1, r2, c2) = kw
2506 dists = self.cross_link_distances_unique[kw]
2508 if prot_list2
is None:
2509 if not c1
in prot_list:
2511 if not c2
in prot_list:
2514 if c1
in prot_list
and c2
in prot_list2:
2516 elif c1
in prot_list2
and c2
in prot_list:
2521 dists.append(sum(dists))
2522 tmp_matrix.append(dists)
2524 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2527 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2529 for i
in range(len(tmp_matrix)):
2530 for k
in range(len(tmp_matrix[i]) - 1):
2531 matrix[i][k] = tmp_matrix[i][k]
2536 ax = fig.add_subplot(211)
2538 cax = ax.imshow(matrix, interpolation=
'nearest')
2542 pl.savefig(figurename, dpi=300)
2551 arrangement=
"inter",
2552 confidence_input=
"None"):
2555 for xl
in self.cross_link_distances:
2556 (r1, c1, r2, c2, mdist, confidence) = xl
2557 if c1
in prots1
and c2
in prots2:
2558 if arrangement ==
"inter" and c1 == c2:
2560 if arrangement ==
"intra" and c1 != c2:
2562 if confidence_input == confidence:
2563 label = str(c1) +
":" + str(r1) + \
2564 "-" + str(c2) +
":" + str(r2)
2565 values = self.cross_link_distances[xl]
2566 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2567 data.append((label, values, mdist, frequency))
2569 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2570 sort_by_dist = list(zip(*sort_by_dist))
2571 values = sort_by_dist[1]
2572 positions = list(range(len(values)))
2573 labels = sort_by_dist[0]
2574 frequencies = list(map(float, sort_by_dist[3]))
2575 frequencies = [f * 10.0
for f
in frequencies]
2577 nchunks = int(float(len(values)) / nxl_per_row)
2578 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2579 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2582 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2585 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2587 for n, v
in enumerate(values_chunks):
2588 p = positions_chunks[n]
2589 f = frequencies_chunks[n]
2590 l = labels_chunks[n]
2592 filename +
"." + str(n), v, p, f,
2593 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2595 def crosslink_distance_histogram(self, filename,
2598 confidence_classes=
None,
2604 if prot_list
is None:
2605 prot_list = list(self.prot_length_dict.keys())
2608 for xl
in self.crosslinks:
2609 (r1, c1, r2, c2, mdist, stdv, confidence,
2610 unique_identifier, descriptor) = xl
2612 if not confidence_classes
is None:
2613 if confidence
not in confidence_classes:
2616 if prot_list2
is None:
2617 if not c1
in prot_list:
2619 if not c2
in prot_list:
2622 if c1
in prot_list
and c2
in prot_list2:
2624 elif c1
in prot_list2
and c2
in prot_list:
2629 distances.append(mdist)
2632 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2633 bins=bins, color=color,
2635 reference_xline=35.0,
2636 yplotrange=yplotrange, normalized=normalized)
2638 def scatter_plot_xl_features(self, filename,
2644 reference_ylines=
None,
2645 distance_color=
True,
2647 import matplotlib
as mpl
2649 import matplotlib.pyplot
as plt
2650 import matplotlib.cm
as cm
2652 fig = plt.figure(figsize=(10, 10))
2653 ax = fig.add_subplot(111)
2655 for xl
in self.crosslinks:
2656 (r1, c1, r2, c2, mdist, stdv, confidence,
2657 unique_identifier, arrangement) = xl
2659 if prot_list2
is None:
2660 if not c1
in prot_list:
2662 if not c2
in prot_list:
2665 if c1
in prot_list
and c2
in prot_list2:
2667 elif c1
in prot_list2
and c2
in prot_list:
2672 xldb = self.external_csv_data[unique_identifier]
2673 xldb.update({
"Distance": mdist})
2674 xldb.update({
"Distance_stdv": stdv})
2676 xvalue = float(xldb[feature1])
2677 yvalue = float(xldb[feature2])
2680 color = self.colormap(mdist)
2684 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2686 if not yplotrange
is None:
2687 ax.set_ylim(yplotrange)
2688 if not reference_ylines
is None:
2689 for rl
in reference_ylines:
2690 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2693 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of crosslinks.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
static Molecule setup_particle(Model *m, ParticleIndex pi)
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.