3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
50 self.proteins = sorted(self.query.keys())
51 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
59 np = prots_uniq.count(p)
60 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
61 prmts = list(itertools.permutations(copies, len(copies)))
64 self.Product = list(itertools.product(*P.values()))
72 torder = sum([list(i)
for i
in self.Product[0]], [])
75 if self.weights
is not None:
76 weights += [i
for i
in self.weights[t]]
79 self.rmsd = 10000000000.
80 for comb
in self.Product:
82 order = sum([list(i)
for i
in comb], [])
92 if self.weights
is not None:
103 from scipy.spatial.distance
import cdist
111 torder = sum([list(i)
for i
in self.Product[0]], [])
115 self.rmsd, Transformation = 10000000000.,
''
118 for comb
in self.Product:
119 order = sum([list(i)
for i
in comb], [])
125 if len(template_xyz) != len(query_xyz):
126 raise ValueError(
'''the number of coordinates
127 in template and query does not match!''')
132 query_xyz_tr = [transformation.get_transformed(n)
136 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
139 Transformation = transformation
142 return (self.rmsd, Transformation)
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148 'a..2':np.array([np.array([1.,1.,])]),
149 'a..3':np.array([np.array([-2.,1.])]),
150 'b':np.array([np.array([0.,-1.])]),
151 'c..1':np.array([np.array([-1.,-1.])]),
152 'c..2':np.array([np.array([1.,-1.])]),
153 'd':np.array([np.array([0.,0.])]),
154 'e':np.array([np.array([0.,1.])])}
156 Ali = Alignment(Proteins, Proteins)
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
164 class Violations(object):
168 self.violation_thresholds = {}
169 self.violation_counts = {}
171 data = open(filename)
176 d = d.strip().split()
177 self.violation_thresholds[d[0]] = float(d[1])
179 def get_number_violated_restraints(self, rsts_dict):
181 for rst
in self.violation_thresholds:
182 if rst
not in rsts_dict:
184 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
186 if rst
not in self.violation_counts:
187 self.violation_counts[rst] = 1
189 self.violation_counts[rst] += 1
195 """A class to cluster structures.
196 Uses scipy's cdist function to compute distance matrices
197 and sklearn's kmeans clustering module.
201 @param rmsd_weights Flat list of weights for each particle
205 from mpi4py
import MPI
206 self.comm = MPI.COMM_WORLD
207 self.rank = self.comm.Get_rank()
208 self.number_of_processes = self.comm.size
210 self.number_of_processes = 1
213 self.structure_cluster_ids =
None
214 self.tmpl_coords =
None
215 self.rmsd_weights=rmsd_weights
217 def set_template(self, part_coords):
219 self.tmpl_coords = part_coords
222 """Add coordinates for a single model."""
224 self.all_coords[frame] = Coords
226 def dist_matrix(self):
228 self.model_list_names = list(self.all_coords.keys())
229 self.model_indexes = list(range(len(self.model_list_names)))
230 self.model_indexes_dict = dict(
231 list(zip(self.model_list_names, self.model_indexes)))
232 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
234 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235 model_indexes_unique_pairs,
236 self.number_of_processes)[self.rank]
238 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
240 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
242 my_model_indexes_unique_pairs)
244 if self.number_of_processes > 1:
247 pickable_transformations = self.get_pickable_transformation_distance_dict(
250 pickable_transformations)
251 self.set_transformation_distance_dict_from_pickable(
252 pickable_transformations)
254 self.raw_distance_matrix = np.zeros(
255 (len(self.model_list_names), len(self.model_list_names)))
256 for item
in raw_distance_dict:
258 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
261 def get_dist_matrix(self):
262 return self.raw_distance_matrix
265 """Run K-means clustering
266 @param number_of_clusters Num means
267 @param seed the random seed
269 from sklearn.cluster
import KMeans
274 kmeans = KMeans(n_clusters=number_of_clusters)
277 kmeans = KMeans(k=number_of_clusters)
278 kmeans.fit_predict(self.raw_distance_matrix)
280 self.structure_cluster_ids = kmeans.labels_
282 def get_pickable_transformation_distance_dict(self):
283 pickable_transformations = {}
284 for label
in self.transformation_distance_dict:
285 tr = self.transformation_distance_dict[label]
286 trans = tuple(tr.get_translation())
287 rot = tuple(tr.get_rotation().get_quaternion())
288 pickable_transformations[label] = (rot, trans)
289 return pickable_transformations
291 def set_transformation_distance_dict_from_pickable(
293 pickable_transformations):
294 self.transformation_distance_dict = {}
295 for label
in pickable_transformations:
296 tr = pickable_transformations[label]
299 self.transformation_distance_dict[
302 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
304 outf = open(file_name +
".data",
'wb')
310 pickable_transformations = self.get_pickable_transformation_distance_dict(
313 (self.structure_cluster_ids,
314 self.model_list_names,
315 pickable_transformations),
318 np.save(file_name +
".npy", self.raw_distance_matrix)
320 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
323 inputf = open(file_name +
".data",
'rb')
324 (self.structure_cluster_ids, self.model_list_names,
325 pickable_transformations) = pickle.load(inputf)
328 self.raw_distance_matrix = np.load(file_name +
".npy")
330 self.set_transformation_distance_dict_from_pickable(
331 pickable_transformations)
332 self.model_indexes = list(range(len(self.model_list_names)))
333 self.model_indexes_dict = dict(
334 list(zip(self.model_list_names, self.model_indexes)))
336 def plot_matrix(self, figurename="clustermatrix.pdf"):
337 import matplotlib
as mpl
339 import matplotlib.pylab
as pl
340 from scipy.cluster
import hierarchy
as hrc
342 fig = pl.figure(figsize=(10,8))
343 ax = fig.add_subplot(212)
344 dendrogram = hrc.dendrogram(
345 hrc.linkage(self.raw_distance_matrix),
348 leaves_order = dendrogram[
'leaves']
349 ax.set_xlabel(
'Model')
350 ax.set_ylabel(
'RMSD [Angstroms]')
352 ax2 = fig.add_subplot(221)
354 self.raw_distance_matrix[leaves_order,
357 interpolation=
'nearest')
358 cb = fig.colorbar(cax)
359 cb.set_label(
'RMSD [Angstroms]')
360 ax2.set_xlabel(
'Model')
361 ax2.set_ylabel(
'Model')
363 pl.savefig(figurename, dpi=300)
366 def get_model_index_from_name(self, name):
367 return self.model_indexes_dict[name]
369 def get_cluster_labels(self):
371 return list(set(self.structure_cluster_ids))
373 def get_number_of_clusters(self):
374 return len(self.get_cluster_labels())
376 def get_cluster_label_indexes(self, label):
378 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
381 def get_cluster_label_names(self, label):
383 [self.model_list_names[i]
384 for i
in self.get_cluster_label_indexes(label)]
387 def get_cluster_label_average_rmsd(self, label):
389 indexes = self.get_cluster_label_indexes(label)
392 sub_distance_matrix = self.raw_distance_matrix[
393 indexes, :][:, indexes]
394 average_rmsd = np.sum(sub_distance_matrix) / \
395 (len(sub_distance_matrix)
396 ** 2 - len(sub_distance_matrix))
401 def get_cluster_label_size(self, label):
402 return len(self.get_cluster_label_indexes(label))
404 def get_transformation_to_first_member(
408 reference = self.get_cluster_label_indexes(cluster_label)[0]
409 return self.transformation_distance_dict[(reference, structure_index)]
411 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
413 model_list_names = list(all_coords.keys())
414 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415 raw_distance_dict = {}
416 transformation_distance_dict = {}
417 if template_coords
is None:
421 alignment_template_protein_names = list(template_coords.keys())
423 for (f1, f2)
in list_of_pairs:
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in rmsd_protein_names])
434 for pr
in rmsd_protein_names:
435 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
437 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
438 rmsd = Ali.get_rmsd()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in alignment_template_protein_names])
446 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447 for pr
in alignment_template_protein_names])
450 template_rmsd, transformation = Ali.align()
455 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456 for pr
in rmsd_protein_names])
458 for pr
in rmsd_protein_names:
459 coords_f2[pr] = [transformation.get_transformed(
460 i)
for i
in all_coords[model_list_names[f2]][pr]]
462 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
463 rmsd = Ali.get_rmsd()
465 raw_distance_dict[(f1, f2)] = rmsd
466 raw_distance_dict[(f2, f1)] = rmsd
467 transformation_distance_dict[(f1, f2)] = transformation
468 transformation_distance_dict[(f2, f1)] = transformation
470 return raw_distance_dict, transformation_distance_dict
474 """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475 To be used with pmi2 hierarchies. Can be used for instance as follows:
477 rmsd=IMP.pmi.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478 output_objects.append(rmsd)
480 before shuffling the coordinates
483 def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
485 @param hier0 first input hierarchy
486 @param hier1 second input hierarchy
487 @param molname the names of the molecules used for the RMSD
488 @dynamic0 if True stores the decorators XYZ and coordinates of hier0 can be update. If false coordinates are static (stored in Vector3Ds)
489 and will never be updated
490 @dynamic1 same as above
491 metric what metric should be used
495 self.dynamic0=dynamic0
496 self.dynamic1=dynamic1
501 """return data structure for the RMSD calculation"""
507 if name
not in molnames:
514 sel=
IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515 parts=sel.get_selected_particles()
517 mol_coords[mol].append(
IMP.core.XYZ(parts[0]).get_coordinates())
521 moldict[name].append(mol)
524 return moldict, mol_coords, mol_XYZs
526 def get_rmsd_and_assigments(self):
532 for molname, ref_mols
in self.moldict1.items():
534 for ref_mol
in ref_mols:
536 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
538 coord1=self.molcoords1[ref_mol]
543 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
545 for rmf_mol
in rmf_mols:
547 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
549 coord0=self.molcoords0[rmf_mol]
553 rmf_mols_list.append(rmf_mols)
555 rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
558 for n, (ref_mol,rmf_mol)
in enumerate(zip(ref_mols,rmf_mols_best_order)):
559 best_assignments.append((rmf_mol,ref_mol))
561 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
563 coord0=self.molcoords0[rmf_mol]
566 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
568 coord1=self.molcoords1[ref_mol]
569 rmsd_pair=self.metric(coord1, coord0)
570 N=len(self.molcoords1[ref_mol])
572 total_rmsd+=rmsd_pair*rmsd_pair*N
573 rmsd_dict[ref_mol]=rmsd_pair
574 total_rmsd = sqrt(total_rmsd/total_N)
575 return total_rmsd,best_assignments
578 """Returns output for IMP.pmi.output.Output object"""
579 total_rmsd,best_assignments=self.get_rmsd_and_assigments()
582 for rmf_mol,ref_mol
in best_assignments:
583 ref_name=ref_mol.get_name()
585 rmf_name=rmf_mol.get_name()
587 assignments_out.append(rmf_name+
"."+str(rmf_copy)+
"->"+ref_name+
"."+str(ref_copy))
588 return {
"RMSD_"+self.label:str(total_rmsd),
"RMSD_assignments_"+self.label:str(assignments_out)}
594 """A class to evaluate the precision of an ensemble.
596 Also can evaluate the cross-precision of multiple ensembles.
597 Supports MPI for coordinate reading.
598 Recommended procedure:
599 -# initialize object and pass the selection for evaluating precision
600 -# call add_structures() to read in the data (specify group name)
601 -# call get_precision() to evaluate inter/intra precision
602 -# call get_rmsf() to evaluate within-group fluctuations
606 selection_dictionary={}):
608 @param model The IMP Model
609 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610 part of the hier name)
611 @param selection_dictionary Dictionary where keys are names for
612 selections and values are selection tuples for scoring
613 precision. "All" is automatically made as well
616 from mpi4py
import MPI
617 self.comm = MPI.COMM_WORLD
618 self.rank = self.comm.Get_rank()
619 self.number_of_processes = self.comm.size
621 self.number_of_processes = 1
624 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
625 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
626 self.style =
'pairwise_drmsd_k'
627 self.structures_dictionary = {}
628 self.reference_structures_dictionary = {}
630 self.protein_names =
None
631 self.len_particles_resolution_one =
None
633 self.rmf_names_frames = {}
634 self.reference_rmf_names_frames =
None
635 self.reference_structure =
None
636 self.reference_prot =
None
637 self.selection_dictionary = selection_dictionary
638 self.threshold = 40.0
639 self.residue_particle_index_map =
None
641 if resolution
in [1,10]:
642 self.resolution = resolution
644 raise KeyError(
"Currently only allow resolution 1 or 10")
646 def _get_structure(self,rmf_frame_index,rmf_name):
647 """Read an RMF file and return the particles"""
648 rh = RMF.open_rmf_file_read_only(rmf_name)
650 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
654 print(
"linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
659 if self.resolution==1:
661 elif self.resolution==10:
664 protein_names = list(particle_dict.keys())
665 particles_resolution_one = []
666 for k
in particle_dict:
667 particles_resolution_one += (particle_dict[k])
669 if self.protein_names==
None:
670 self.protein_names = protein_names
672 if self.protein_names!=protein_names:
673 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
675 if self.len_particles_resolution_one==
None:
676 self.len_particles_resolution_one = len(particles_resolution_one)
678 if self.len_particles_resolution_one!=len(particles_resolution_one):
679 raise ValueError(
"the new coordinate set is not compatible with the previous one")
681 return particles_resolution_one
687 setup_index_map=
False):
688 """ Read a structure into the ensemble and store (as coordinates).
689 @param rmf_name The name of the RMF file
690 @param rmf_frame_index The frame to read
691 @param structure_set_name Name for the set that includes this structure
693 @param setup_index_map if requested, set up a dictionary to help
698 if structure_set_name
in self.structures_dictionary:
699 cdict = self.structures_dictionary[structure_set_name]
700 rmflist = self.rmf_names_frames[structure_set_name]
702 self.structures_dictionary[structure_set_name]={}
703 self.rmf_names_frames[structure_set_name]=[]
704 cdict = self.structures_dictionary[structure_set_name]
705 rmflist = self.rmf_names_frames[structure_set_name]
709 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
711 print(
"something wrong with the rmf")
713 self.selection_dictionary.update({
"All":self.protein_names})
715 for selection_name
in self.selection_dictionary:
716 selection_tuple = self.selection_dictionary[selection_name]
717 coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718 if selection_name
not in cdict:
719 cdict[selection_name] = [coords]
721 cdict[selection_name].append(coords)
723 rmflist.append((rmf_name,rmf_frame_index))
727 self.residue_particle_index_map = {}
728 for prot_name
in self.protein_names:
729 self.residue_particle_index_map[prot_name] = \
730 self._get_residue_particle_index_map(
732 particles_resolution_one,self.prots[0])
735 rmf_name_frame_tuples,
737 """Read a list of RMFs, supports parallel
738 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739 @param structure_set_name Name this set of structures (e.g. "cluster.1")
743 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
744 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
747 rmf_frame_index=tup[1]
749 if self.residue_particle_index_map
is None:
750 setup_index_map =
True
752 setup_index_map =
False
759 if self.number_of_processes > 1:
762 self.comm.send(self.structures_dictionary, dest=0, tag=11)
764 for i
in range(1, self.number_of_processes):
765 data_tmp = self.comm.recv(source=i, tag=11)
766 for key
in self.structures_dictionary:
767 self.structures_dictionary[key].update(data_tmp[key])
768 for i
in range(1, self.number_of_processes):
769 self.comm.send(self.structures_dictionary, dest=i, tag=11)
771 self.structures_dictionary = self.comm.recv(source=0, tag=11)
773 def _get_residue_particle_index_map(self,prot_name,structure,hier):
775 residue_particle_index_map = []
781 all_selected_particles = s.get_selected_particles()
782 intersection = list(set(all_selected_particles) & set(structure))
783 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
784 for p
in sorted_intersection:
786 return residue_particle_index_map
789 def _select_coordinates(self,tuple_selections,structure,prot):
790 selected_coordinates=[]
791 for t
in tuple_selections:
792 if type(t)==tuple
and len(t)==3:
798 all_selected_particles = s.get_selected_particles()
799 intersection = list(set(all_selected_particles) & set(structure))
800 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
801 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
802 selected_coordinates += cc
808 all_selected_particles = s.get_selected_particles()
809 intersection = list(set(all_selected_particles) & set(structure))
810 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
811 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
812 selected_coordinates += cc
814 raise ValueError(
"Selection error")
815 return selected_coordinates
817 def set_threshold(self,threshold):
818 self.threshold = threshold
820 def _get_distance(self,
826 """ Compute distance between structures with various metrics """
827 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
828 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
832 if self.style==
'pairwise_drmsd_k':
834 if self.style==
'pairwise_drms_k':
836 if self.style==
'pairwise_drmsd_Q':
839 if self.style==
'pairwise_rmsd':
843 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
844 selection_name,index1,index2):
845 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
846 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
851 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
860 selection_keywords=
None):
861 """ Evaluate the precision of two named structure groups. Supports MPI.
862 When the structure_set_name1 is different from the structure_set_name2,
863 this evaluates the cross-precision (average pairwise distances).
864 @param outfile Name of the precision output file
865 @param structure_set_name1 string name of the first structure set
866 @param structure_set_name2 string name of the second structure set
867 @param skip analyze every (skip) structure for the distance matrix calculation
868 @param selection_keywords Specify the selection name you want to calculate on.
869 By default this is computed for everything you provided in the constructor,
870 plus all the subunits together.
872 if selection_keywords
is None:
873 sel_keys = list(self.selection_dictionary.keys())
875 for k
in selection_keywords:
876 if k
not in self.selection_dictionary:
877 raise KeyError(
"you are trying to find named selection " \
878 + k +
" which was not requested in the constructor")
879 sel_keys = selection_keywords
881 if outfile
is not None:
882 of = open(outfile,
"w")
884 for selection_name
in sel_keys:
885 number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
886 number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
888 structure_pointers_1 = list(range(0,number_of_structures_1,skip))
889 structure_pointers_2 = list(range(0,number_of_structures_2,skip))
890 pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
891 if len(pair_combination_list)==0:
892 raise ValueError(
"no structure selected. Check the skip parameter.")
895 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
896 pair_combination_list,self.number_of_processes)[self.rank]
897 my_length = len(my_pair_combination_list)
898 for n,pair
in enumerate(my_pair_combination_list):
899 progression = int(float(n)/my_length*100.0)
900 distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
901 selection_name,pair[0],pair[1])
902 if self.number_of_processes > 1:
907 if structure_set_name1==structure_set_name2:
908 structure_pointers = structure_pointers_1
909 number_of_structures = number_of_structures_1
914 distances_to_structure = {}
915 distances_to_structure_normalization = {}
917 for n
in structure_pointers:
918 distances_to_structure[n] = 0.0
919 distances_to_structure_normalization[n]=0
922 distance += distances[k]
923 distances_to_structure[k[0]] += distances[k]
924 distances_to_structure[k[1]] += distances[k]
925 distances_to_structure_normalization[k[0]] += 1
926 distances_to_structure_normalization[k[1]] += 1
928 for n
in structure_pointers:
929 distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
931 min_distance = min([distances_to_structure[n]
for n
in distances_to_structure])
932 centroid_index = [k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
933 centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
935 centroid_distance = 0.0
937 for n
in range(number_of_structures):
938 dist = self._get_distance(structure_set_name1,structure_set_name1,
939 selection_name,centroid_index,n)
940 centroid_distance += dist
941 distance_list.append(dist)
944 centroid_distance /= number_of_structures
946 if outfile
is not None:
947 of.write(str(selection_name)+
" "+structure_set_name1+
948 " average centroid distance "+str(centroid_distance)+
"\n")
949 of.write(str(selection_name)+
" "+structure_set_name1+
950 " centroid index "+str(centroid_index)+
"\n")
951 of.write(str(selection_name)+
" "+structure_set_name1+
952 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
953 of.write(str(selection_name)+
" "+structure_set_name1+
954 " median centroid distance "+str(np.median(distance_list))+
"\n")
956 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
957 if outfile
is not None:
958 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
959 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
960 if outfile
is not None:
962 return centroid_index
968 set_plot_yaxis_range=
None):
969 """ Calculate the residue mean square fluctuations (RMSF).
970 Automatically outputs as data file and pdf
971 @param structure_set_name Which structure set to calculate RMSF for
972 @param outdir Where to write the files
973 @param skip Skip this number of structures
974 @param set_plot_yaxis_range In case you need to change the plot
982 for sel_name
in self.protein_names:
983 self.selection_dictionary.update({sel_name:[sel_name]})
985 number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
989 rpim = self.residue_particle_index_map[sel_name]
990 outfile = outdir+
"/rmsf."+sel_name+
".dat"
991 of = open(outfile,
"w")
992 residue_distances = {}
994 for index
in range(number_of_structures):
995 distances = self._get_particle_distances(structure_set_name,
998 centroid_index,index)
999 for nblock,block
in enumerate(rpim):
1000 for residue_number
in block:
1001 residue_nblock[residue_number] = nblock
1002 if residue_number
not in residue_distances:
1003 residue_distances[residue_number] = [distances[nblock]]
1005 residue_distances[residue_number].append(distances[nblock])
1009 for rn
in residue_distances:
1011 rmsf = np.std(residue_distances[rn])
1013 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
1015 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
1016 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
1017 set_plot_yaxis_range=set_plot_yaxis_range,
1018 xlabel=
'Residue Number',ylabel=
'Standard error')
1023 """Read in a structure used for reference computation.
1024 Needed before calling get_average_distance_wrt_reference_structure()
1025 @param rmf_name The RMF file to read the reference
1026 @param rmf_frame_index The index in that file
1028 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1029 self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1031 for selection_name
in self.selection_dictionary:
1032 selection_tuple = self.selection_dictionary[selection_name]
1033 coords = self._select_coordinates(selection_tuple,
1034 particles_resolution_one,self.prots[0])
1035 self.reference_structures_dictionary[selection_name] = coords
1038 """First align then calculate RMSD
1039 @param structure_set_name: the name of the structure set
1040 @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
1041 @return: for each structure in the structure set, returns the rmsd
1043 if self.reference_structures_dictionary=={}:
1044 print(
"Cannot compute until you set a reference structure")
1047 align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1048 align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1049 transformations = []
1050 for c
in align_coordinates:
1052 transformation = Ali.align()[1]
1053 transformations.append(transformation)
1054 for selection_name
in self.selection_dictionary:
1055 reference_coordinates = self.reference_structures_dictionary[selection_name]
1058 for n,sc
in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1061 distances.append(distance)
1062 print(selection_name,
"average rmsd",sum(distances)/len(distances),
"median",self._get_median(distances),
"minimum distance",min(distances))
1064 def _get_median(self,list_of_values):
1065 return np.median(np.array(list_of_values))
1068 """Compare the structure set to the reference structure.
1069 @param structure_set_name The structure set to compute this on
1070 @note First call set_reference_structure()
1073 if self.reference_structures_dictionary=={}:
1074 print(
"Cannot compute until you set a reference structure")
1076 for selection_name
in self.selection_dictionary:
1077 reference_coordinates = self.reference_structures_dictionary[selection_name]
1080 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
1082 if self.style==
'pairwise_drmsd_k':
1084 if self.style==
'pairwise_drms_k':
1086 if self.style==
'pairwise_drmsd_Q':
1088 if self.style==
'pairwise_rmsd':
1090 distances.append(distance)
1092 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances),
'nframes',len(distances))
1093 ret[selection_name] = {
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
1096 def get_coordinates(self):
1099 def set_precision_style(self, style):
1100 if style
in self.styles:
1103 raise ValueError(
"No such style")
1107 """Compute mean density maps from structures.
1109 Keeps a dictionary of density maps,
1110 keys are in the custom ranges. When you call add_subunits_density, it adds
1111 particle coordinates to the existing density maps.
1114 def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1116 @param custom_ranges Required. It's a dictionary, keys are the
1117 density component names, values are selection tuples
1118 e.g. {'kin28':[['kin28',1,-1]],
1119 'density_name_1' :[('ccl1')],
1120 'density_name_2' :[(1,142,'tfb3d1'),
1121 (143,700,'tfb3d2')],
1122 @param representation PMI representation, for doing selections.
1123 Not needed if you only pass hierarchies
1124 @param resolution The MRC resolution of the output map (in Angstrom unit)
1125 @param voxel The voxel size for the output map (lower is slower)
1128 self.representation = representation
1129 self.MRCresolution = resolution
1132 self.count_models = 0.0
1133 self.custom_ranges = custom_ranges
1136 """Add a frame to the densities.
1137 @param hierarchy Optionally read the hierarchy from somewhere.
1138 If not passed, will just read the representation.
1140 self.count_models += 1.0
1144 all_particles_by_resolution = []
1145 for name
in part_dict:
1146 all_particles_by_resolution += part_dict[name]
1148 for density_name
in self.custom_ranges:
1151 all_particles_by_segments = []
1153 for seg
in self.custom_ranges[density_name]:
1156 parts += IMP.tools.select_by_tuple(self.representation,
1157 seg, resolution=1, name_is_ambiguous=
False)
1161 for h
in hierarchy.get_children():
1165 if type(seg) == str:
1167 elif type(seg) == tuple
and len(seg) == 2:
1169 hierarchy, molecule=seg[0],copy_index=seg[1])
1170 elif type(seg) == tuple
and len(seg) == 3:
1172 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1173 elif type(seg) == tuple
and len(seg) == 4:
1175 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1177 raise Exception(
'could not understand selection tuple '+str(seg))
1179 all_particles_by_segments += s.get_selected_particles()
1182 parts = all_particles_by_segments
1185 set(all_particles_by_segments) & set(all_particles_by_resolution))
1186 self._create_density_from_particles(parts, density_name)
1188 def normalize_density(self):
1191 def _create_density_from_particles(self, ps, name,
1192 kernel_type=
'GAUSSIAN'):
1193 '''Internal function for adding to densities.
1194 pass XYZR particles with mass and create a density from them.
1195 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1197 'GAUSSIAN': IMP.em.GAUSSIAN,
1198 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1199 'SPHERE': IMP.em.SPHERE}
1203 dmap.set_was_used(
True)
1204 if name
not in self.densities:
1205 self.densities[name] = dmap
1211 dmap3.set_was_used(
True)
1213 dmap3.add(self.densities[name])
1214 self.densities[name] = dmap3
1216 def get_density_keys(self):
1217 return list(self.densities.keys())
1220 """Get the current density for some component name"""
1221 if name
not in self.densities:
1224 return self.densities[name]
1226 def write_mrc(self, path="./"):
1227 for density_name
in self.densities:
1228 self.densities[density_name].
multiply(1. / self.count_models)
1230 self.densities[density_name],
1231 path +
"/" + density_name +
".mrc",
1235 class GetContactMap(object):
1238 self.distance = distance
1239 self.contactmap =
''
1246 def set_prot(self, prot):
1247 from scipy.spatial.distance
import cdist
1256 for name
in particles_dictionary:
1257 residue_indexes = []
1258 for p
in particles_dictionary[name]:
1262 if len(residue_indexes) != 0:
1263 self.protnames.append(name)
1265 def get_subunit_coords(self, frame, align=0):
1266 from scipy.spatial.distance
import cdist
1270 test, testr = [], []
1271 for part
in self.prot.get_children():
1274 for chl
in part.get_children():
1280 SortedSegments.append((chl, startres))
1281 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1283 for sgmnt
in SortedSegments:
1286 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1289 radii.append(p.get_radius())
1291 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1294 .get_residue_indexes()[0])
1295 namelist.append(new_name)
1296 self.expanded[new_name] = len(
1298 if part.get_name()
not in self.resmap:
1299 self.resmap[part.get_name()] = {}
1301 self.resmap[part.get_name()][res] = new_name
1303 coords = np.array(coords)
1304 radii = np.array(radii)
1305 if len(self.namelist) == 0:
1306 self.namelist = namelist
1307 self.contactmap = np.zeros((len(coords), len(coords)))
1308 distances = cdist(coords, coords)
1309 distances = (distances - radii).T - radii
1310 distances = distances <= self.distance
1311 self.contactmap += distances
1316 identification_string=
'ISDCrossLinkMS_Distance_'):
1319 data = open(filname)
1320 D = data.readlines()
1324 if identification_string
in d:
1331 t1, t2 = (d[0], d[1]), (d[1], d[0])
1332 if t1
not in self.XL:
1333 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1334 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1336 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1337 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1339 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1343 L = sum(self.expanded.values())
1344 proteins = self.protnames
1349 proteins = [p.get_name()
for p
in self.prot.get_children()]
1351 for p1
in range(len(proteins)):
1352 for p2
in range(p1, len(proteins)):
1354 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1355 pn1, pn2 = proteins[p1], proteins[p2]
1356 mtr = np.zeros((pl1 + 1, pl2 + 1))
1357 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1358 for i1
in range(1, pl1 + 1):
1359 for i2
in range(1, pl2 + 1):
1361 r1 = K.index(self.resmap[pn1][i1])
1362 r2 = K.index(self.resmap[pn2][i2])
1364 mtr[i1 - 1, i2 - 1] = r
1366 missing.append((pn1, pn2, i1, i2))
1368 Matrices[(pn1, pn2)] = mtr
1373 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1376 for p1
in range(len(proteins)):
1377 for p2
in range(p1, len(proteins)):
1379 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1380 pn1, pn2 = proteins[p1], proteins[p2]
1381 mtr = np.zeros((pl1 + 1, pl2 + 1))
1384 xls = self.XL[(pn1, pn2)]
1387 xls = self.XL[(pn2, pn1)]
1392 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1393 for xl1, xl2
in xls:
1395 print(
'X' * 10, xl1, xl2)
1398 print(
'X' * 10, xl1, xl2)
1400 mtr[xl1 - 1, xl2 - 1] = 100
1402 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1403 for xl1, xl2
in xls:
1405 print(
'X' * 10, xl1, xl2)
1408 print(
'X' * 10, xl1, xl2)
1410 mtr[xl2 - 1, xl1 - 1] = 100
1412 print(
'No cross links between: ', pn1, pn2)
1413 Matrices_xl[(pn1, pn2)] = mtr
1429 for i, c
in enumerate(C):
1430 cl = max(self.resmap[c].keys())
1435 R.append(R[-1] + cl)
1440 import matplotlib
as mpl
1442 import matplotlib.pyplot
as plt
1443 import matplotlib.gridspec
as gridspec
1444 import scipy.sparse
as sparse
1447 gs = gridspec.GridSpec(len(W), len(W),
1452 for x1, r1
in enumerate(R):
1457 for x2, r2
in enumerate(R):
1463 ax = plt.subplot(gs[cnt])
1466 mtr = Matrices[(C[x1], C[x2])]
1468 mtr = Matrices[(C[x2], C[x1])].T
1472 interpolation=
'nearest',
1474 vmax=log(mtr.max()))
1479 mtr = Matrices_xl[(C[x1], C[x2])]
1481 mtr = Matrices_xl[(C[x2], C[x1])].T
1483 sparse.csr_matrix(mtr),
1493 ax.set_ylabel(C[x1], rotation=90)
1495 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1503 def get_hiers_from_rmf(model, frame_number, rmf_file):
1505 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1508 rh = RMF.open_rmf_file_read_only(rmf_file)
1513 print(
"Unable to open rmf file %s" % (rmf_file))
1520 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1527 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1528 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1529 rh = RMF.open_rmf_file_read_only(rmf_file)
1534 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1541 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1543 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1546 rh = RMF.open_rmf_file_read_only(rmf_file)
1552 print(
"Unable to open rmf file %s" % (rmf_file))
1557 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1564 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1565 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1566 rh = RMF.open_rmf_file_read_only(rmf_file)
1572 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1579 def get_hiers_from_rmf(model, frame_number, rmf_file):
1580 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1583 rh = RMF.open_rmf_file_read_only(rmf_file)
1588 print(
"Unable to open rmf file %s" % (rmf_file))
1595 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1603 """Get particles at res 1, or any beads, based on the name.
1604 No Representation is needed. This is mainly used when the hierarchy
1605 is read from an RMF file.
1606 @return a dictionary of component names and their particles
1607 \note If the root node is named "System" or is a "State", do proper selection.
1613 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1615 particle_dict[mol.get_name()] = sel.get_selected_particles()
1618 for c
in prot.get_children():
1621 for s
in c.get_children():
1622 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1624 if "Beads" in s.get_name():
1628 for name
in particle_dict:
1629 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1630 list(set(particle_dict[name]) & set(allparticles)))
1631 return particle_dict
1634 """Get particles at res 10, or any beads, based on the name.
1635 No Representation is needed.
1636 This is mainly used when the hierarchy is read from an RMF file.
1637 @return a dictionary of component names and their particles
1638 \note If the root node is named "System" or is a "State", do proper selection.
1643 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1645 particle_dict[mol.get_name()] = sel.get_selected_particles()
1648 for c
in prot.get_children():
1651 for s
in c.get_children():
1652 if "_Res:10" in s.get_name():
1654 if "Beads" in s.get_name():
1657 for name
in particle_dict:
1658 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1659 list(set(particle_dict[name]) & set(allparticles)))
1660 return particle_dict
1662 def select_by_tuple(first_res_last_res_name_tuple):
1663 first_res = first_res_last_res_hier_tuple[0]
1664 last_res = first_res_last_res_hier_tuple[1]
1665 name = first_res_last_res_hier_tuple[2]
1668 """Visualization of crosslinks"""
1670 self.crosslinks = []
1671 self.external_csv_data =
None
1672 self.crosslinkedprots = set()
1673 self.mindist = +10000000.0
1674 self.maxdist = -10000000.0
1675 self.contactmap =
None
1677 def set_hierarchy(self, prot):
1678 self.prot_length_dict = {}
1679 self.model=prot.get_model()
1681 for i
in prot.get_children():
1683 residue_indexes = []
1687 if len(residue_indexes) != 0:
1688 self.prot_length_dict[name] = max(residue_indexes)
1690 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1691 from scipy.spatial.distance
import cdist
1693 rh= RMF.open_rmf_file_read_only(rmf_name)
1696 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1707 self.index_dictionary = {}
1709 for name
in particles_dictionary:
1710 residue_indexes = []
1711 for p
in particles_dictionary[name]:
1716 if len(residue_indexes) != 0:
1718 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1721 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1723 radii.append(d.get_radius())
1724 if name
not in self.index_dictionary:
1725 self.index_dictionary[name] = [resindex]
1727 self.index_dictionary[name].append(resindex)
1730 coords = np.array(coords)
1731 radii = np.array(radii)
1733 distances = cdist(coords, coords)
1734 distances = (distances - radii).T - radii
1736 distances = np.where(distances <= 20.0, 1.0, 0)
1737 if self.contactmap
is None:
1738 self.contactmap = np.zeros((len(coords), len(coords)))
1739 self.contactmap += distances
1741 for prot
in prots: IMP.atom.destroy(prot)
1744 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1747 filter_rmf_file_names=
None,
1748 external_csv_data_file=
None,
1749 external_csv_data_file_unique_id_key=
"Unique ID"):
1758 keys = po.get_keys()
1760 xl_keys = [k
for k
in keys
if search_label
in k]
1762 if filter_rmf_file_names
is not None:
1763 rmf_file_key=
"local_rmf_file_name"
1764 fs = po.get_fields(xl_keys+[rmf_file_key])
1766 fs = po.get_fields(xl_keys)
1769 self.cross_link_frequency = {}
1773 self.cross_link_distances = {}
1777 self.cross_link_distances_unique = {}
1779 if not external_csv_data_file
is None:
1782 self.external_csv_data = {}
1783 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1786 self.external_csv_data[
1787 xl[external_csv_data_file_unique_id_key]] = xl
1792 cross_link_frequency_list = []
1794 self.unique_cross_link_list = []
1798 keysplit = key.replace(
1807 if filter_label!=
None:
1808 if filter_label
not in keysplit:
continue
1811 r1 = int(keysplit[5])
1813 r2 = int(keysplit[7])
1816 confidence = keysplit[12]
1820 unique_identifier = keysplit[3]
1822 unique_identifier =
'0'
1824 r1 = int(keysplit[mapping[
"Residue1"]])
1825 c1 = keysplit[mapping[
"Protein1"]]
1826 r2 = int(keysplit[mapping[
"Residue2"]])
1827 c2 = keysplit[mapping[
"Protein2"]]
1829 confidence = keysplit[mapping[
"Confidence"]]
1833 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1835 unique_identifier =
'0'
1837 self.crosslinkedprots.add(c1)
1838 self.crosslinkedprots.add(c2)
1844 if filter_rmf_file_names
is not None:
1845 for n,d
in enumerate(fs[key]):
1846 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1847 dists.append(float(d))
1849 dists=[float(f)
for f
in fs[key]]
1854 mdist = self.median(dists)
1856 stdv = np.std(np.array(dists))
1857 if self.mindist > mdist:
1858 self.mindist = mdist
1859 if self.maxdist < mdist:
1860 self.maxdist = mdist
1864 if not self.external_csv_data
is None:
1865 sample = self.external_csv_data[unique_identifier][
"Sample"]
1869 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1870 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1871 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1872 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1874 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1875 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1876 cross_link_frequency_list.append((r1, c1, r2, c2))
1877 cross_link_frequency_list.append((r2, c2, r1, c1))
1878 self.unique_cross_link_list.append(
1879 (r1, c1, r2, c2,mdist))
1881 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1882 self.cross_link_distances[(
1888 confidence)] = dists
1889 self.cross_link_distances[(
1895 confidence)] = dists
1896 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1898 self.cross_link_distances[(
1904 confidence)] += dists
1905 self.cross_link_distances[(
1911 confidence)] += dists
1913 self.crosslinks.append(
1923 self.crosslinks.append(
1934 self.cross_link_frequency_inverted = {}
1935 for xl
in self.unique_cross_link_list:
1936 (r1, c1, r2, c2, mdist) = xl
1937 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1938 if frequency
not in self.cross_link_frequency_inverted:
1939 self.cross_link_frequency_inverted[
1940 frequency] = [(r1, c1, r2, c2)]
1942 self.cross_link_frequency_inverted[
1943 frequency].append((r1, c1, r2, c2))
1947 def median(self, mylist):
1948 sorts = sorted(mylist)
1954 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1955 return sorts[length / 2]
1957 def set_threshold(self,threshold):
1958 self.threshold=threshold
1960 def set_tolerance(self,tolerance):
1961 self.tolerance=tolerance
1963 def colormap(self, dist):
1964 if dist < self.threshold - self.tolerance:
1966 elif dist >= self.threshold + self.tolerance:
1971 def write_cross_link_database(self, filename, format='csv'):
1975 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1976 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1978 if not self.external_csv_data
is None:
1979 keys = list(self.external_csv_data.keys())
1980 innerkeys = list(self.external_csv_data[keys[0]].keys())
1982 fieldnames += innerkeys
1984 dw = csv.DictWriter(
1988 fieldnames=fieldnames)
1990 for xl
in self.crosslinks:
1991 (r1, c1, r2, c2, mdist, stdv, confidence,
1992 unique_identifier, descriptor) = xl
1993 if descriptor ==
'original':
1995 outdict[
"Unique ID"] = unique_identifier
1996 outdict[
"Protein1"] = c1
1997 outdict[
"Protein2"] = c2
1998 outdict[
"Residue1"] = r1
1999 outdict[
"Residue2"] = r2
2000 outdict[
"Median Distance"] = mdist
2001 outdict[
"Standard Deviation"] = stdv
2002 outdict[
"Confidence"] = confidence
2003 outdict[
"Frequency"] = self.cross_link_frequency[
2006 arrangement =
"Intra"
2008 arrangement =
"Inter"
2009 outdict[
"Arrangement"] = arrangement
2010 if not self.external_csv_data
is None:
2011 outdict.update(self.external_csv_data[unique_identifier])
2013 dw.writerow(outdict)
2015 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
2016 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
2017 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
2018 gap_between_components=0,
2019 rename_protein_map=
None):
2033 import matplotlib
as mpl
2035 import matplotlib.pyplot
as plt
2036 import matplotlib.cm
as cm
2038 fig = plt.figure(figsize=(10, 10))
2039 ax = fig.add_subplot(111)
2045 if prot_listx
is None:
2047 prot_listx = list(self.crosslinkedprots)
2049 prot_listx = list(self.prot_length_dict.keys())
2052 nresx = gap_between_components + \
2053 sum([self.prot_length_dict[name]
2054 + gap_between_components
for name
in prot_listx])
2058 if prot_listy
is None:
2060 prot_listy = list(self.crosslinkedprots)
2062 prot_listy = list(self.prot_length_dict.keys())
2065 nresy = gap_between_components + \
2066 sum([self.prot_length_dict[name]
2067 + gap_between_components
for name
in prot_listy])
2072 res = gap_between_components
2073 for prot
in prot_listx:
2074 resoffsetx[prot] = res
2075 res += self.prot_length_dict[prot]
2077 res += gap_between_components
2081 res = gap_between_components
2082 for prot
in prot_listy:
2083 resoffsety[prot] = res
2084 res += self.prot_length_dict[prot]
2086 res += gap_between_components
2088 resoffsetdiagonal = {}
2089 res = gap_between_components
2090 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2091 resoffsetdiagonal[prot] = res
2092 res += self.prot_length_dict[prot]
2093 res += gap_between_components
2099 for n, prot
in enumerate(prot_listx):
2100 res = resoffsetx[prot]
2102 for proty
in prot_listy:
2103 resy = resoffsety[proty]
2104 endy = resendy[proty]
2105 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2106 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2107 xticks.append((float(res) + float(end)) / 2)
2108 if rename_protein_map
is not None:
2109 if prot
in rename_protein_map:
2110 prot=rename_protein_map[prot]
2111 xlabels.append(prot)
2115 for n, prot
in enumerate(prot_listy):
2116 res = resoffsety[prot]
2118 for protx
in prot_listx:
2119 resx = resoffsetx[protx]
2120 endx = resendx[protx]
2121 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2122 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2123 yticks.append((float(res) + float(end)) / 2)
2124 if rename_protein_map
is not None:
2125 if prot
in rename_protein_map:
2126 prot=rename_protein_map[prot]
2127 ylabels.append(prot)
2130 print(prot_listx, prot_listy)
2132 if not self.contactmap
is None:
2133 import matplotlib.cm
as cm
2134 tmp_array = np.zeros((nresx, nresy))
2136 for px
in prot_listx:
2138 for py
in prot_listy:
2140 resx = resoffsety[px]
2141 lengx = resendx[px] - 1
2142 resy = resoffsety[py]
2143 lengy = resendy[py] - 1
2144 indexes_x = self.index_dictionary[px]
2145 minx = min(indexes_x)
2146 maxx = max(indexes_x)
2147 indexes_y = self.index_dictionary[py]
2148 miny = min(indexes_y)
2149 maxy = max(indexes_y)
2151 print(px, py, minx, maxx, miny, maxy)
2156 resy:lengy] = self.contactmap[
2163 ax.imshow(tmp_array,
2166 interpolation=
'nearest')
2168 ax.set_xticks(xticks)
2169 ax.set_xticklabels(xlabels, rotation=90)
2170 ax.set_yticks(yticks)
2171 ax.set_yticklabels(ylabels)
2172 ax.set_xlim(0,nresx)
2173 ax.set_ylim(0,nresy)
2178 already_added_xls = []
2180 for xl
in self.crosslinks:
2182 (r1, c1, r2, c2, mdist, stdv, confidence,
2183 unique_identifier, descriptor) = xl
2185 if confidence_classes
is not None:
2186 if confidence
not in confidence_classes:
2190 pos1 = r1 + resoffsetx[c1]
2194 pos2 = r2 + resoffsety[c2]
2198 if not filter
is None:
2199 xldb = self.external_csv_data[unique_identifier]
2200 xldb.update({
"Distance": mdist})
2201 xldb.update({
"Distance_stdv": stdv})
2203 if filter[1] ==
">":
2204 if float(xldb[filter[0]]) <= float(filter[2]):
2207 if filter[1] ==
"<":
2208 if float(xldb[filter[0]]) >= float(filter[2]):
2211 if filter[1] ==
"==":
2212 if float(xldb[filter[0]]) != float(filter[2]):
2218 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2219 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2221 if layout ==
'lowerdiagonal':
2222 if pos_for_diagonal1 <= pos_for_diagonal2:
2224 if layout ==
'upperdiagonal':
2225 if pos_for_diagonal1 >= pos_for_diagonal2:
2228 already_added_xls.append((r1, c1, r2, c2))
2230 if not no_confidence_info:
2231 if confidence ==
'0.01':
2232 markersize = 14 * scale_symbol_size
2233 elif confidence ==
'0.05':
2234 markersize = 9 * scale_symbol_size
2235 elif confidence ==
'0.1':
2236 markersize = 6 * scale_symbol_size
2238 markersize = 15 * scale_symbol_size
2240 markersize = 5 * scale_symbol_size
2242 if not no_dist_info:
2243 color = self.colormap(mdist)
2253 markersize=markersize)
2257 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2259 [i.set_linewidth(2.0)
for i
in ax.spines.values()]
2264 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2268 def get_frequency_statistics(self, prot_list,
2271 violated_histogram = {}
2272 satisfied_histogram = {}
2273 unique_cross_links = []
2275 for xl
in self.unique_cross_link_list:
2276 (r1, c1, r2, c2, mdist) = xl
2279 if prot_list2
is None:
2280 if not c1
in prot_list:
2282 if not c2
in prot_list:
2285 if c1
in prot_list
and c2
in prot_list2:
2287 elif c1
in prot_list2
and c2
in prot_list:
2292 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2294 if (r1, c1, r2, c2)
not in unique_cross_links:
2296 if frequency
not in violated_histogram:
2297 violated_histogram[frequency] = 1
2299 violated_histogram[frequency] += 1
2301 if frequency
not in satisfied_histogram:
2302 satisfied_histogram[frequency] = 1
2304 satisfied_histogram[frequency] += 1
2305 unique_cross_links.append((r1, c1, r2, c2))
2306 unique_cross_links.append((r2, c2, r1, c1))
2308 print(
"# satisfied")
2310 total_number_of_crosslinks=0
2312 for i
in satisfied_histogram:
2316 if i
in violated_histogram:
2317 print(i, violated_histogram[i]+satisfied_histogram[i])
2319 print(i, satisfied_histogram[i])
2320 total_number_of_crosslinks+=i*satisfied_histogram[i]
2324 for i
in violated_histogram:
2325 print(i, violated_histogram[i])
2326 total_number_of_crosslinks+=i*violated_histogram[i]
2328 print(total_number_of_crosslinks)
2332 def print_cross_link_binary_symbols(self, prot_list,
2335 confidence_list = []
2336 for xl
in self.crosslinks:
2337 (r1, c1, r2, c2, mdist, stdv, confidence,
2338 unique_identifier, descriptor) = xl
2340 if prot_list2
is None:
2341 if not c1
in prot_list:
2343 if not c2
in prot_list:
2346 if c1
in prot_list
and c2
in prot_list2:
2348 elif c1
in prot_list2
and c2
in prot_list:
2353 if descriptor !=
"original":
2356 confidence_list.append(confidence)
2358 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2359 tmp_dist_binary = []
2362 tmp_dist_binary.append(1)
2364 tmp_dist_binary.append(0)
2365 tmp_matrix.append(tmp_dist_binary)
2367 matrix = list(zip(*tmp_matrix))
2369 satisfied_high_sum = 0
2370 satisfied_mid_sum = 0
2371 satisfied_low_sum = 0
2372 total_satisfied_sum = 0
2373 for k, m
in enumerate(matrix):
2382 for n, b
in enumerate(m):
2383 if confidence_list[n] ==
"0.01":
2387 satisfied_high_sum += 1
2388 elif confidence_list[n] ==
"0.05":
2392 satisfied_mid_sum += 1
2393 elif confidence_list[n] ==
"0.1":
2397 satisfied_low_sum += 1
2399 total_satisfied += 1
2400 total_satisfied_sum += 1
2402 print(k, satisfied_high, total_high)
2403 print(k, satisfied_mid, total_mid)
2404 print(k, satisfied_low, total_low)
2405 print(k, total_satisfied, total)
2406 print(float(satisfied_high_sum) / len(matrix))
2407 print(float(satisfied_mid_sum) / len(matrix))
2408 print(float(satisfied_low_sum) / len(matrix))
2411 def get_unique_crosslinks_statistics(self, prot_list,
2424 satisfied_string = []
2425 for xl
in self.crosslinks:
2426 (r1, c1, r2, c2, mdist, stdv, confidence,
2427 unique_identifier, descriptor) = xl
2429 if prot_list2
is None:
2430 if not c1
in prot_list:
2432 if not c2
in prot_list:
2435 if c1
in prot_list
and c2
in prot_list2:
2437 elif c1
in prot_list2
and c2
in prot_list:
2442 if descriptor !=
"original":
2446 if confidence ==
"0.01":
2450 if confidence ==
"0.05":
2454 if confidence ==
"0.1":
2459 satisfied_string.append(1)
2461 satisfied_string.append(0)
2463 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2464 tmp_dist_binary = []
2467 tmp_dist_binary.append(1)
2469 tmp_dist_binary.append(0)
2470 tmp_matrix.append(tmp_dist_binary)
2472 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2473 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2474 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2475 print(
"total", total)
2477 matrix = list(zip(*tmp_matrix))
2478 satisfied_models = 0
2480 for b
in satisfied_string:
2487 all_satisfied =
True
2489 for n, b
in enumerate(m):
2494 if b == 1
and satisfied_string[n] == 1:
2496 elif b == 1
and satisfied_string[n] == 0:
2498 elif b == 0
and satisfied_string[n] == 0:
2500 elif b == 0
and satisfied_string[n] == 1:
2501 all_satisfied =
False
2503 satisfied_models += 1
2505 print(satstr, all_satisfied)
2506 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2508 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2511 import matplotlib
as mpl
2513 import matplotlib.pylab
as pl
2516 for kw
in self.cross_link_distances_unique:
2517 (r1, c1, r2, c2) = kw
2518 dists = self.cross_link_distances_unique[kw]
2520 if prot_list2
is None:
2521 if not c1
in prot_list:
2523 if not c2
in prot_list:
2526 if c1
in prot_list
and c2
in prot_list2:
2528 elif c1
in prot_list2
and c2
in prot_list:
2533 dists.append(sum(dists))
2534 tmp_matrix.append(dists)
2536 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2539 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2541 for i
in range(len(tmp_matrix)):
2542 for k
in range(len(tmp_matrix[i]) - 1):
2543 matrix[i][k] = tmp_matrix[i][k]
2548 ax = fig.add_subplot(211)
2550 cax = ax.imshow(matrix, interpolation=
'nearest')
2554 pl.savefig(figurename, dpi=300)
2563 arrangement=
"inter",
2564 confidence_input=
"None"):
2567 for xl
in self.cross_link_distances:
2568 (r1, c1, r2, c2, mdist, confidence) = xl
2569 if c1
in prots1
and c2
in prots2:
2570 if arrangement ==
"inter" and c1 == c2:
2572 if arrangement ==
"intra" and c1 != c2:
2574 if confidence_input == confidence:
2575 label = str(c1) +
":" + str(r1) + \
2576 "-" + str(c2) +
":" + str(r2)
2577 values = self.cross_link_distances[xl]
2578 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2579 data.append((label, values, mdist, frequency))
2581 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2582 sort_by_dist = list(zip(*sort_by_dist))
2583 values = sort_by_dist[1]
2584 positions = list(range(len(values)))
2585 labels = sort_by_dist[0]
2586 frequencies = list(map(float, sort_by_dist[3]))
2587 frequencies = [f * 10.0
for f
in frequencies]
2589 nchunks = int(float(len(values)) / nxl_per_row)
2590 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2591 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2594 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2597 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2599 for n, v
in enumerate(values_chunks):
2600 p = positions_chunks[n]
2601 f = frequencies_chunks[n]
2602 l = labels_chunks[n]
2604 filename +
"." + str(n), v, p, f,
2605 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2607 def crosslink_distance_histogram(self, filename,
2610 confidence_classes=
None,
2616 if prot_list
is None:
2617 prot_list = list(self.prot_length_dict.keys())
2620 for xl
in self.crosslinks:
2621 (r1, c1, r2, c2, mdist, stdv, confidence,
2622 unique_identifier, descriptor) = xl
2624 if not confidence_classes
is None:
2625 if confidence
not in confidence_classes:
2628 if prot_list2
is None:
2629 if not c1
in prot_list:
2631 if not c2
in prot_list:
2634 if c1
in prot_list
and c2
in prot_list2:
2636 elif c1
in prot_list2
and c2
in prot_list:
2641 distances.append(mdist)
2644 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2645 bins=bins, color=color,
2647 reference_xline=35.0,
2648 yplotrange=yplotrange, normalized=normalized)
2650 def scatter_plot_xl_features(self, filename,
2656 reference_ylines=
None,
2657 distance_color=
True,
2659 import matplotlib
as mpl
2661 import matplotlib.pyplot
as plt
2662 import matplotlib.cm
as cm
2664 fig = plt.figure(figsize=(10, 10))
2665 ax = fig.add_subplot(111)
2667 for xl
in self.crosslinks:
2668 (r1, c1, r2, c2, mdist, stdv, confidence,
2669 unique_identifier, arrangement) = xl
2671 if prot_list2
is None:
2672 if not c1
in prot_list:
2674 if not c2
in prot_list:
2677 if c1
in prot_list
and c2
in prot_list2:
2679 elif c1
in prot_list2
and c2
in prot_list:
2684 xldb = self.external_csv_data[unique_identifier]
2685 xldb.update({
"Distance": mdist})
2686 xldb.update({
"Distance_stdv": stdv})
2688 xvalue = float(xldb[feature1])
2689 yvalue = float(xldb[feature2])
2692 color = self.colormap(mdist)
2696 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2698 if not yplotrange
is None:
2699 ax.set_ylim(yplotrange)
2700 if not reference_ylines
is None:
2701 for rl
in reference_ylines:
2702 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2705 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of crosslinks.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
static Molecule setup_particle(Model *m, ParticleIndex pi)
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.