3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
50 self.proteins = sorted(self.query.keys())
51 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
59 np = prots_uniq.count(p)
60 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
61 prmts = list(itertools.permutations(copies, len(copies)))
64 self.Product = list(itertools.product(*P.values()))
72 torder = sum([list(i)
for i
in self.Product[0]], [])
75 if self.weights
is not None:
76 weights += [i
for i
in self.weights[t]]
79 self.rmsd = 10000000000.
80 for comb
in self.Product:
82 order = sum([list(i)
for i
in comb], [])
92 if self.weights
is not None:
103 from scipy.spatial.distance
import cdist
111 torder = sum([list(i)
for i
in self.Product[0]], [])
115 self.rmsd, Transformation = 10000000000.,
''
118 for comb
in self.Product:
119 order = sum([list(i)
for i
in comb], [])
125 if len(template_xyz) != len(query_xyz):
126 raise ValueError(
'''the number of coordinates
127 in template and query does not match!''')
132 query_xyz_tr = [transformation.get_transformed(n)
136 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
139 Transformation = transformation
142 return (self.rmsd, Transformation)
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148 'a..2':np.array([np.array([1.,1.,])]),
149 'a..3':np.array([np.array([-2.,1.])]),
150 'b':np.array([np.array([0.,-1.])]),
151 'c..1':np.array([np.array([-1.,-1.])]),
152 'c..2':np.array([np.array([1.,-1.])]),
153 'd':np.array([np.array([0.,0.])]),
154 'e':np.array([np.array([0.,1.])])}
156 Ali = Alignment(Proteins, Proteins)
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
164 class Violations(object):
168 self.violation_thresholds = {}
169 self.violation_counts = {}
171 data = open(filename)
176 d = d.strip().split()
177 self.violation_thresholds[d[0]] = float(d[1])
179 def get_number_violated_restraints(self, rsts_dict):
181 for rst
in self.violation_thresholds:
182 if rst
not in rsts_dict:
184 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
186 if rst
not in self.violation_counts:
187 self.violation_counts[rst] = 1
189 self.violation_counts[rst] += 1
195 """A class to cluster structures.
196 Uses scipy's cdist function to compute distance matrices
197 and sklearn's kmeans clustering module.
201 @param rmsd_weights Flat list of weights for each particle
205 from mpi4py
import MPI
206 self.comm = MPI.COMM_WORLD
207 self.rank = self.comm.Get_rank()
208 self.number_of_processes = self.comm.size
210 self.number_of_processes = 1
213 self.structure_cluster_ids =
None
214 self.tmpl_coords =
None
215 self.rmsd_weights=rmsd_weights
217 def set_template(self, part_coords):
219 self.tmpl_coords = part_coords
222 """Add coordinates for a single model."""
224 self.all_coords[frame] = Coords
226 def dist_matrix(self):
228 self.model_list_names = list(self.all_coords.keys())
229 self.model_indexes = list(range(len(self.model_list_names)))
230 self.model_indexes_dict = dict(
231 list(zip(self.model_list_names, self.model_indexes)))
232 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
234 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235 model_indexes_unique_pairs,
236 self.number_of_processes)[self.rank]
238 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
240 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
242 my_model_indexes_unique_pairs)
244 if self.number_of_processes > 1:
247 pickable_transformations = self.get_pickable_transformation_distance_dict(
250 pickable_transformations)
251 self.set_transformation_distance_dict_from_pickable(
252 pickable_transformations)
254 self.raw_distance_matrix = np.zeros(
255 (len(self.model_list_names), len(self.model_list_names)))
256 for item
in raw_distance_dict:
258 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
261 def get_dist_matrix(self):
262 return self.raw_distance_matrix
265 """Run K-means clustering
266 @param number_of_clusters Num means
267 @param seed the random seed
269 from sklearn.cluster
import KMeans
274 kmeans = KMeans(n_clusters=number_of_clusters)
277 kmeans = KMeans(k=number_of_clusters)
278 kmeans.fit_predict(self.raw_distance_matrix)
280 self.structure_cluster_ids = kmeans.labels_
282 def get_pickable_transformation_distance_dict(self):
283 pickable_transformations = {}
284 for label
in self.transformation_distance_dict:
285 tr = self.transformation_distance_dict[label]
286 trans = tuple(tr.get_translation())
287 rot = tuple(tr.get_rotation().get_quaternion())
288 pickable_transformations[label] = (rot, trans)
289 return pickable_transformations
291 def set_transformation_distance_dict_from_pickable(
293 pickable_transformations):
294 self.transformation_distance_dict = {}
295 for label
in pickable_transformations:
296 tr = pickable_transformations[label]
299 self.transformation_distance_dict[
302 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
304 outf = open(file_name +
".data",
'wb')
310 pickable_transformations = self.get_pickable_transformation_distance_dict(
313 (self.structure_cluster_ids,
314 self.model_list_names,
315 pickable_transformations),
318 np.save(file_name +
".npy", self.raw_distance_matrix)
320 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
323 inputf = open(file_name +
".data",
'rb')
324 (self.structure_cluster_ids, self.model_list_names,
325 pickable_transformations) = pickle.load(inputf)
328 self.raw_distance_matrix = np.load(file_name +
".npy")
330 self.set_transformation_distance_dict_from_pickable(
331 pickable_transformations)
332 self.model_indexes = list(range(len(self.model_list_names)))
333 self.model_indexes_dict = dict(
334 list(zip(self.model_list_names, self.model_indexes)))
336 def plot_matrix(self, figurename="clustermatrix.pdf"):
337 import matplotlib
as mpl
339 import matplotlib.pylab
as pl
340 from scipy.cluster
import hierarchy
as hrc
342 fig = pl.figure(figsize=(10,8))
343 ax = fig.add_subplot(212)
344 dendrogram = hrc.dendrogram(
345 hrc.linkage(self.raw_distance_matrix),
348 leaves_order = dendrogram[
'leaves']
349 ax.set_xlabel(
'Model')
350 ax.set_ylabel(
'RMSD [Angstroms]')
352 ax2 = fig.add_subplot(221)
354 self.raw_distance_matrix[leaves_order,
357 interpolation=
'nearest')
358 cb = fig.colorbar(cax)
359 cb.set_label(
'RMSD [Angstroms]')
360 ax2.set_xlabel(
'Model')
361 ax2.set_ylabel(
'Model')
363 pl.savefig(figurename, dpi=300)
366 def get_model_index_from_name(self, name):
367 return self.model_indexes_dict[name]
369 def get_cluster_labels(self):
371 return list(set(self.structure_cluster_ids))
373 def get_number_of_clusters(self):
374 return len(self.get_cluster_labels())
376 def get_cluster_label_indexes(self, label):
378 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
381 def get_cluster_label_names(self, label):
383 [self.model_list_names[i]
384 for i
in self.get_cluster_label_indexes(label)]
387 def get_cluster_label_average_rmsd(self, label):
389 indexes = self.get_cluster_label_indexes(label)
392 sub_distance_matrix = self.raw_distance_matrix[
393 indexes, :][:, indexes]
394 average_rmsd = np.sum(sub_distance_matrix) / \
395 (len(sub_distance_matrix)
396 ** 2 - len(sub_distance_matrix))
401 def get_cluster_label_size(self, label):
402 return len(self.get_cluster_label_indexes(label))
404 def get_transformation_to_first_member(
408 reference = self.get_cluster_label_indexes(cluster_label)[0]
409 return self.transformation_distance_dict[(reference, structure_index)]
411 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
413 model_list_names = list(all_coords.keys())
414 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415 raw_distance_dict = {}
416 transformation_distance_dict = {}
417 if template_coords
is None:
421 alignment_template_protein_names = list(template_coords.keys())
423 for (f1, f2)
in list_of_pairs:
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in rmsd_protein_names])
434 for pr
in rmsd_protein_names:
435 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
437 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
438 rmsd = Ali.get_rmsd()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in alignment_template_protein_names])
446 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447 for pr
in alignment_template_protein_names])
450 template_rmsd, transformation = Ali.align()
455 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456 for pr
in rmsd_protein_names])
458 for pr
in rmsd_protein_names:
459 coords_f2[pr] = [transformation.get_transformed(
460 i)
for i
in all_coords[model_list_names[f2]][pr]]
462 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
463 rmsd = Ali.get_rmsd()
465 raw_distance_dict[(f1, f2)] = rmsd
466 raw_distance_dict[(f2, f1)] = rmsd
467 transformation_distance_dict[(f1, f2)] = transformation
468 transformation_distance_dict[(f2, f1)] = transformation
470 return raw_distance_dict, transformation_distance_dict
474 """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475 To be used with pmi2 hierarchies. Can be used for instance as follows:
477 rmsd=IMP.pmi.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478 output_objects.append(rmsd)
480 before shuffling the coordinates
483 def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
485 @param hier0 first input hierarchy
486 @param hier1 second input hierarchy
487 @param molname the names of the molecules used for the RMSD
488 @dynamic0 if True stores the decorators XYZ and coordinates of hier0 can be update. If false coordinates are static (stored in Vector3Ds)
489 and will never be updated
490 @dynamic1 same as above
491 metric what metric should be used
495 self.dynamic0=dynamic0
496 self.dynamic1=dynamic1
501 """return data structure for the RMSD calculation"""
507 if name
not in molnames:
514 sel=
IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515 parts=sel.get_selected_particles()
517 mol_coords[mol].append(
IMP.core.XYZ(parts[0]).get_coordinates())
521 moldict[name].append(mol)
524 return moldict, mol_coords, mol_XYZs
526 def get_rmsd_and_assigments(self):
532 for molname, ref_mols
in self.moldict1.items():
534 for ref_mol
in ref_mols:
536 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
538 coord1=self.molcoords1[ref_mol]
543 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
545 for rmf_mol
in rmf_mols:
547 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
549 coord0=self.molcoords0[rmf_mol]
553 rmf_mols_list.append(rmf_mols)
555 rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
558 for n, (ref_mol,rmf_mol)
in enumerate(zip(ref_mols,rmf_mols_best_order)):
559 best_assignments.append((rmf_mol,ref_mol))
561 coord0=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs0[rmf_mol]]
563 coord0=self.molcoords0[rmf_mol]
566 coord1=[XYZ.get_coordinates()
for XYZ
in self.mol_XYZs1[ref_mol]]
568 coord1=self.molcoords1[ref_mol]
569 rmsd_pair=self.metric(coord1, coord0)
570 N=len(self.molcoords1[ref_mol])
572 total_rmsd+=rmsd_pair*rmsd_pair*N
573 rmsd_dict[ref_mol]=rmsd_pair
574 total_rmsd = sqrt(total_rmsd/total_N)
575 return total_rmsd,best_assignments
578 """Returns output for IMP.pmi.output.Output object"""
579 total_rmsd,best_assignments=self.get_rmsd_and_assigments()
582 for rmf_mol,ref_mol
in best_assignments:
583 ref_name=ref_mol.get_name()
585 rmf_name=rmf_mol.get_name()
587 assignments_out.append(rmf_name+
"."+str(rmf_copy)+
"->"+ref_name+
"."+str(ref_copy))
588 return {
"RMSD_"+self.label:str(total_rmsd),
"RMSD_assignments_"+self.label:str(assignments_out)}
594 """A class to evaluate the precision of an ensemble.
596 Also can evaluate the cross-precision of multiple ensembles.
597 Supports MPI for coordinate reading.
598 Recommended procedure:
599 -# initialize object and pass the selection for evaluating precision
600 -# call add_structures() to read in the data (specify group name)
601 -# call get_precision() to evaluate inter/intra precision
602 -# call get_rmsf() to evaluate within-group fluctuations
606 selection_dictionary={}):
608 @param model The IMP Model
609 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610 part of the hier name)
611 @param selection_dictionary Dictionary where keys are names for
612 selections and values are selection tuples for scoring
613 precision. "All" is automatically made as well
616 from mpi4py
import MPI
617 self.comm = MPI.COMM_WORLD
618 self.rank = self.comm.Get_rank()
619 self.number_of_processes = self.comm.size
621 self.number_of_processes = 1
624 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
625 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
626 self.style =
'pairwise_drmsd_k'
627 self.structures_dictionary = {}
628 self.reference_structures_dictionary = {}
630 self.protein_names =
None
631 self.len_particles_resolution_one =
None
633 self.rmf_names_frames = {}
634 self.reference_rmf_names_frames =
None
635 self.reference_structure =
None
636 self.reference_prot =
None
637 self.selection_dictionary = selection_dictionary
638 self.threshold = 40.0
639 self.residue_particle_index_map =
None
641 if resolution
in [1,10]:
642 self.resolution = resolution
644 raise KeyError(
"Currently only allow resolution 1 or 10")
646 def _get_structure(self,rmf_frame_index,rmf_name):
647 """Read an RMF file and return the particles"""
648 rh = RMF.open_rmf_file_read_only(rmf_name)
650 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
654 print(
"linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
659 if self.resolution==1:
661 elif self.resolution==10:
664 protein_names = list(particle_dict.keys())
665 particles_resolution_one = []
666 for k
in particle_dict:
667 particles_resolution_one += (particle_dict[k])
669 if self.protein_names==
None:
670 self.protein_names = protein_names
672 if self.protein_names!=protein_names:
673 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
675 if self.len_particles_resolution_one==
None:
676 self.len_particles_resolution_one = len(particles_resolution_one)
678 if self.len_particles_resolution_one!=len(particles_resolution_one):
679 raise ValueError(
"the new coordinate set is not compatible with the previous one")
681 return particles_resolution_one
687 setup_index_map=
False):
688 """ Read a structure into the ensemble and store (as coordinates).
689 @param rmf_name The name of the RMF file
690 @param rmf_frame_index The frame to read
691 @param structure_set_name Name for the set that includes this structure
693 @param setup_index_map if requested, set up a dictionary to help
698 if structure_set_name
in self.structures_dictionary:
699 cdict = self.structures_dictionary[structure_set_name]
700 rmflist = self.rmf_names_frames[structure_set_name]
702 self.structures_dictionary[structure_set_name]={}
703 self.rmf_names_frames[structure_set_name]=[]
704 cdict = self.structures_dictionary[structure_set_name]
705 rmflist = self.rmf_names_frames[structure_set_name]
709 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
711 print(
"something wrong with the rmf")
713 self.selection_dictionary.update({
"All":self.protein_names})
715 for selection_name
in self.selection_dictionary:
716 selection_tuple = self.selection_dictionary[selection_name]
717 coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718 if selection_name
not in cdict:
719 cdict[selection_name] = [coords]
721 cdict[selection_name].append(coords)
723 rmflist.append((rmf_name,rmf_frame_index))
727 self.residue_particle_index_map = {}
728 for prot_name
in self.protein_names:
729 self.residue_particle_index_map[prot_name] = \
730 self._get_residue_particle_index_map(
732 particles_resolution_one,self.prots[0])
735 rmf_name_frame_tuples,
737 """Read a list of RMFs, supports parallel
738 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739 @param structure_set_name Name this set of structures (e.g. "cluster.1")
743 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
744 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
747 rmf_frame_index=tup[1]
749 if self.residue_particle_index_map
is None:
750 setup_index_map =
True
752 setup_index_map =
False
759 if self.number_of_processes > 1:
762 self.comm.send(self.structures_dictionary, dest=0, tag=11)
764 for i
in range(1, self.number_of_processes):
765 data_tmp = self.comm.recv(source=i, tag=11)
766 for key
in self.structures_dictionary:
767 self.structures_dictionary[key].update(data_tmp[key])
768 for i
in range(1, self.number_of_processes):
769 self.comm.send(self.structures_dictionary, dest=i, tag=11)
771 self.structures_dictionary = self.comm.recv(source=0, tag=11)
773 def _get_residue_particle_index_map(self,prot_name,structure,hier):
775 residue_particle_index_map = []
781 all_selected_particles = s.get_selected_particles()
782 intersection = list(set(all_selected_particles) & set(structure))
783 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
784 for p
in sorted_intersection:
786 return residue_particle_index_map
789 def _select_coordinates(self,tuple_selections,structure,prot):
790 selected_coordinates=[]
791 for t
in tuple_selections:
792 if isinstance(t, tuple)
and len(t)==3:
798 all_selected_particles = s.get_selected_particles()
799 intersection = list(set(all_selected_particles) & set(structure))
800 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
801 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
802 selected_coordinates += cc
803 elif isinstance(t, str):
808 all_selected_particles = s.get_selected_particles()
809 intersection = list(set(all_selected_particles) & set(structure))
810 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
811 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
812 selected_coordinates += cc
814 raise ValueError(
"Selection error")
815 return selected_coordinates
817 def set_threshold(self,threshold):
818 self.threshold = threshold
820 def _get_distance(self,
826 """ Compute distance between structures with various metrics """
827 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
828 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
832 if self.style==
'pairwise_drmsd_k':
834 if self.style==
'pairwise_drms_k':
836 if self.style==
'pairwise_drmsd_Q':
839 if self.style==
'pairwise_rmsd':
843 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
844 selection_name,index1,index2):
845 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
846 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
851 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
860 selection_keywords=
None):
861 """ Evaluate the precision of two named structure groups. Supports MPI.
862 When the structure_set_name1 is different from the structure_set_name2,
863 this evaluates the cross-precision (average pairwise distances).
864 @param outfile Name of the precision output file
865 @param structure_set_name1 string name of the first structure set
866 @param structure_set_name2 string name of the second structure set
867 @param skip analyze every (skip) structure for the distance matrix calculation
868 @param selection_keywords Specify the selection name you want to calculate on.
869 By default this is computed for everything you provided in the constructor,
870 plus all the subunits together.
872 if selection_keywords
is None:
873 sel_keys = list(self.selection_dictionary.keys())
875 for k
in selection_keywords:
876 if k
not in self.selection_dictionary:
877 raise KeyError(
"you are trying to find named selection " \
878 + k +
" which was not requested in the constructor")
879 sel_keys = selection_keywords
881 if outfile
is not None:
882 of = open(outfile,
"w")
884 for selection_name
in sel_keys:
885 number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
886 number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
888 structure_pointers_1 = list(range(0,number_of_structures_1,skip))
889 structure_pointers_2 = list(range(0,number_of_structures_2,skip))
890 pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
891 if len(pair_combination_list)==0:
892 raise ValueError(
"no structure selected. Check the skip parameter.")
895 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
896 pair_combination_list,self.number_of_processes)[self.rank]
897 my_length = len(my_pair_combination_list)
898 for n,pair
in enumerate(my_pair_combination_list):
899 progression = int(float(n)/my_length*100.0)
900 distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
901 selection_name,pair[0],pair[1])
902 if self.number_of_processes > 1:
907 if structure_set_name1==structure_set_name2:
908 structure_pointers = structure_pointers_1
909 number_of_structures = number_of_structures_1
914 distances_to_structure = {}
915 distances_to_structure_normalization = {}
917 for n
in structure_pointers:
918 distances_to_structure[n] = 0.0
919 distances_to_structure_normalization[n]=0
922 distance += distances[k]
923 distances_to_structure[k[0]] += distances[k]
924 distances_to_structure[k[1]] += distances[k]
925 distances_to_structure_normalization[k[0]] += 1
926 distances_to_structure_normalization[k[1]] += 1
928 for n
in structure_pointers:
929 distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
931 min_distance = min([distances_to_structure[n]
for n
in distances_to_structure])
932 centroid_index = [k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
933 centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
935 centroid_distance = 0.0
937 for n
in range(number_of_structures):
938 dist = self._get_distance(structure_set_name1,structure_set_name1,
939 selection_name,centroid_index,n)
940 centroid_distance += dist
941 distance_list.append(dist)
944 centroid_distance /= number_of_structures
946 if outfile
is not None:
947 of.write(str(selection_name)+
" "+structure_set_name1+
948 " average centroid distance "+str(centroid_distance)+
"\n")
949 of.write(str(selection_name)+
" "+structure_set_name1+
950 " centroid index "+str(centroid_index)+
"\n")
951 of.write(str(selection_name)+
" "+structure_set_name1+
952 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
953 of.write(str(selection_name)+
" "+structure_set_name1+
954 " median centroid distance "+str(np.median(distance_list))+
"\n")
956 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
957 if outfile
is not None:
958 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
959 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
960 if outfile
is not None:
962 return centroid_index
968 set_plot_yaxis_range=
None):
969 """ Calculate the residue mean square fluctuations (RMSF).
970 Automatically outputs as data file and pdf
971 @param structure_set_name Which structure set to calculate RMSF for
972 @param outdir Where to write the files
973 @param skip Skip this number of structures
974 @param set_plot_yaxis_range In case you need to change the plot
982 for sel_name
in self.protein_names:
983 self.selection_dictionary.update({sel_name:[sel_name]})
985 number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
989 rpim = self.residue_particle_index_map[sel_name]
990 outfile = outdir+
"/rmsf."+sel_name+
".dat"
991 of = open(outfile,
"w")
992 residue_distances = {}
994 for index
in range(number_of_structures):
995 distances = self._get_particle_distances(structure_set_name,
998 centroid_index,index)
999 for nblock,block
in enumerate(rpim):
1000 for residue_number
in block:
1001 residue_nblock[residue_number] = nblock
1002 if residue_number
not in residue_distances:
1003 residue_distances[residue_number] = [distances[nblock]]
1005 residue_distances[residue_number].append(distances[nblock])
1009 for rn
in residue_distances:
1011 rmsf = np.std(residue_distances[rn])
1013 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
1015 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
1016 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
1017 set_plot_yaxis_range=set_plot_yaxis_range,
1018 xlabel=
'Residue Number',ylabel=
'Standard error')
1023 """Read in a structure used for reference computation.
1024 Needed before calling get_average_distance_wrt_reference_structure()
1025 @param rmf_name The RMF file to read the reference
1026 @param rmf_frame_index The index in that file
1028 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1029 self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1031 for selection_name
in self.selection_dictionary:
1032 selection_tuple = self.selection_dictionary[selection_name]
1033 coords = self._select_coordinates(selection_tuple,
1034 particles_resolution_one,self.prots[0])
1035 self.reference_structures_dictionary[selection_name] = coords
1038 """First align then calculate RMSD
1039 @param structure_set_name: the name of the structure set
1040 @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
1041 @return: for each structure in the structure set, returns the rmsd
1044 if self.reference_structures_dictionary=={}:
1045 print(
"Cannot compute until you set a reference structure")
1048 align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1049 align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1050 transformations = []
1051 for c
in align_coordinates:
1053 transformation = Ali.align()[1]
1054 transformations.append(transformation)
1055 for selection_name
in self.selection_dictionary:
1056 reference_coordinates = self.reference_structures_dictionary[selection_name]
1059 for n,sc
in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1062 distances.append(distance)
1063 ret[selection_name] = {
'all_distances':distances,
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
1066 def _get_median(self,list_of_values):
1067 return np.median(np.array(list_of_values))
1070 """Compare the structure set to the reference structure.
1071 @param structure_set_name The structure set to compute this on
1072 @note First call set_reference_structure()
1075 if self.reference_structures_dictionary=={}:
1076 print(
"Cannot compute until you set a reference structure")
1078 for selection_name
in self.selection_dictionary:
1079 reference_coordinates = self.reference_structures_dictionary[selection_name]
1082 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
1084 if self.style==
'pairwise_drmsd_k':
1086 if self.style==
'pairwise_drms_k':
1088 if self.style==
'pairwise_drmsd_Q':
1090 if self.style==
'pairwise_rmsd':
1092 distances.append(distance)
1094 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances),
'nframes',len(distances))
1095 ret[selection_name] = {
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
1098 def get_coordinates(self):
1101 def set_precision_style(self, style):
1102 if style
in self.styles:
1105 raise ValueError(
"No such style")
1109 """Compute mean density maps from structures.
1111 Keeps a dictionary of density maps,
1112 keys are in the custom ranges. When you call add_subunits_density, it adds
1113 particle coordinates to the existing density maps.
1116 def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1118 @param custom_ranges Required. It's a dictionary, keys are the
1119 density component names, values are selection tuples
1120 e.g. {'kin28':[['kin28',1,-1]],
1121 'density_name_1' :[('ccl1')],
1122 'density_name_2' :[(1,142,'tfb3d1'),
1123 (143,700,'tfb3d2')],
1124 @param representation PMI representation, for doing selections.
1125 Not needed if you only pass hierarchies
1126 @param resolution The MRC resolution of the output map (in Angstrom unit)
1127 @param voxel The voxel size for the output map (lower is slower)
1130 self.representation = representation
1131 self.MRCresolution = resolution
1134 self.count_models = 0.0
1135 self.custom_ranges = custom_ranges
1138 """Add a frame to the densities.
1139 @param hierarchy Optionally read the hierarchy from somewhere.
1140 If not passed, will just read the representation.
1142 self.count_models += 1.0
1146 all_particles_by_resolution = []
1147 for name
in part_dict:
1148 all_particles_by_resolution += part_dict[name]
1150 for density_name
in self.custom_ranges:
1153 all_particles_by_segments = []
1155 for seg
in self.custom_ranges[density_name]:
1158 parts += IMP.tools.select_by_tuple(self.representation,
1159 seg, resolution=1, name_is_ambiguous=
False)
1163 for h
in hierarchy.get_children():
1167 if isinstance(seg, str):
1169 elif isinstance(seg, tuple)
and len(seg) == 2:
1171 hierarchy, molecule=seg[0],copy_index=seg[1])
1172 elif isinstance(seg, tuple)
and len(seg) == 3:
1174 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1175 elif isinstance(seg, tuple)
and len(seg) == 4:
1177 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1179 raise Exception(
'could not understand selection tuple '+str(seg))
1181 all_particles_by_segments += s.get_selected_particles()
1184 parts = all_particles_by_segments
1187 set(all_particles_by_segments) & set(all_particles_by_resolution))
1188 self._create_density_from_particles(parts, density_name)
1190 def normalize_density(self):
1193 def _create_density_from_particles(self, ps, name,
1194 kernel_type=
'GAUSSIAN'):
1195 '''Internal function for adding to densities.
1196 pass XYZR particles with mass and create a density from them.
1197 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1199 'GAUSSIAN': IMP.em.GAUSSIAN,
1200 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1201 'SPHERE': IMP.em.SPHERE}
1205 dmap.set_was_used(
True)
1206 if name
not in self.densities:
1207 self.densities[name] = dmap
1213 dmap3.set_was_used(
True)
1215 dmap3.add(self.densities[name])
1216 self.densities[name] = dmap3
1218 def get_density_keys(self):
1219 return list(self.densities.keys())
1222 """Get the current density for some component name"""
1223 if name
not in self.densities:
1226 return self.densities[name]
1228 def write_mrc(self, path="./",suffix=None):
1230 for density_name
in self.densities:
1231 self.densities[density_name].
multiply(1. / self.count_models)
1233 name=path +
"/" + density_name +
".mrc"
1235 name=path +
"/" + density_name +
"." + suffix +
".mrc"
1236 path, file = os.path.split(name)
1237 if not os.path.exists(path):
1240 except OSError
as e:
1241 if e.errno != errno.EEXIST:
1244 self.densities[density_name],name,
1248 class GetContactMap(object):
1251 self.distance = distance
1252 self.contactmap =
''
1259 def set_prot(self, prot):
1260 from scipy.spatial.distance
import cdist
1269 for name
in particles_dictionary:
1270 residue_indexes = []
1271 for p
in particles_dictionary[name]:
1275 if len(residue_indexes) != 0:
1276 self.protnames.append(name)
1278 def get_subunit_coords(self, frame, align=0):
1279 from scipy.spatial.distance
import cdist
1283 test, testr = [], []
1284 for part
in self.prot.get_children():
1287 for chl
in part.get_children():
1293 SortedSegments.append((chl, startres))
1294 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1296 for sgmnt
in SortedSegments:
1299 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1302 radii.append(p.get_radius())
1304 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1307 .get_residue_indexes()[0])
1308 namelist.append(new_name)
1309 self.expanded[new_name] = len(
1311 if part.get_name()
not in self.resmap:
1312 self.resmap[part.get_name()] = {}
1314 self.resmap[part.get_name()][res] = new_name
1316 coords = np.array(coords)
1317 radii = np.array(radii)
1318 if len(self.namelist) == 0:
1319 self.namelist = namelist
1320 self.contactmap = np.zeros((len(coords), len(coords)))
1321 distances = cdist(coords, coords)
1322 distances = (distances - radii).T - radii
1323 distances = distances <= self.distance
1324 self.contactmap += distances
1329 identification_string=
'ISDCrossLinkMS_Distance_'):
1332 data = open(filname)
1333 D = data.readlines()
1337 if identification_string
in d:
1344 t1, t2 = (d[0], d[1]), (d[1], d[0])
1345 if t1
not in self.XL:
1346 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1347 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1349 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1350 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1352 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1356 L = sum(self.expanded.values())
1357 proteins = self.protnames
1362 proteins = [p.get_name()
for p
in self.prot.get_children()]
1364 for p1
in range(len(proteins)):
1365 for p2
in range(p1, len(proteins)):
1367 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1368 pn1, pn2 = proteins[p1], proteins[p2]
1369 mtr = np.zeros((pl1 + 1, pl2 + 1))
1370 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1371 for i1
in range(1, pl1 + 1):
1372 for i2
in range(1, pl2 + 1):
1374 r1 = K.index(self.resmap[pn1][i1])
1375 r2 = K.index(self.resmap[pn2][i2])
1377 mtr[i1 - 1, i2 - 1] = r
1379 missing.append((pn1, pn2, i1, i2))
1380 Matrices[(pn1, pn2)] = mtr
1385 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1387 for p1
in range(len(proteins)):
1388 for p2
in range(p1, len(proteins)):
1390 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1391 pn1, pn2 = proteins[p1], proteins[p2]
1392 mtr = np.zeros((pl1 + 1, pl2 + 1))
1395 xls = self.XL[(pn1, pn2)]
1398 xls = self.XL[(pn2, pn1)]
1403 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1404 for xl1, xl2
in xls:
1406 print(
'X' * 10, xl1, xl2)
1409 print(
'X' * 10, xl1, xl2)
1411 mtr[xl1 - 1, xl2 - 1] = 100
1413 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1414 for xl1, xl2
in xls:
1416 print(
'X' * 10, xl1, xl2)
1419 print(
'X' * 10, xl1, xl2)
1421 mtr[xl2 - 1, xl1 - 1] = 100
1423 print(
'No cross links between: ', pn1, pn2)
1424 Matrices_xl[(pn1, pn2)] = mtr
1440 for i, c
in enumerate(C):
1441 cl = max(self.resmap[c].keys())
1446 R.append(R[-1] + cl)
1451 import matplotlib
as mpl
1453 import matplotlib.pyplot
as plt
1454 import matplotlib.gridspec
as gridspec
1455 import scipy.sparse
as sparse
1458 gs = gridspec.GridSpec(len(W), len(W),
1463 for x1, r1
in enumerate(R):
1468 for x2, r2
in enumerate(R):
1474 ax = plt.subplot(gs[cnt])
1477 mtr = Matrices[(C[x1], C[x2])]
1479 mtr = Matrices[(C[x2], C[x1])].T
1483 interpolation=
'nearest',
1485 vmax=log(mtr.max()))
1490 mtr = Matrices_xl[(C[x1], C[x2])]
1492 mtr = Matrices_xl[(C[x2], C[x1])].T
1494 sparse.csr_matrix(mtr),
1504 ax.set_ylabel(C[x1], rotation=90)
1506 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1514 def get_hiers_from_rmf(model, frame_number, rmf_file):
1516 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1519 rh = RMF.open_rmf_file_read_only(rmf_file)
1524 print(
"Unable to open rmf file %s" % (rmf_file))
1531 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1538 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1539 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1540 rh = RMF.open_rmf_file_read_only(rmf_file)
1545 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1552 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1554 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1557 rh = RMF.open_rmf_file_read_only(rmf_file)
1563 print(
"Unable to open rmf file %s" % (rmf_file))
1568 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1575 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1576 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1577 rh = RMF.open_rmf_file_read_only(rmf_file)
1583 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1591 """Get particles at res 1, or any beads, based on the name.
1592 No Representation is needed. This is mainly used when the hierarchy
1593 is read from an RMF file.
1594 @return a dictionary of component names and their particles
1595 @note If the root node is named "System" or is a "State", do proper selection.
1601 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1603 particle_dict[mol.get_name()] = sel.get_selected_particles()
1606 for c
in prot.get_children():
1609 for s
in c.get_children():
1610 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1612 if "Beads" in s.get_name():
1616 for name
in particle_dict:
1617 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1618 list(set(particle_dict[name]) & set(allparticles)))
1619 return particle_dict
1622 """Get particles at res 10, or any beads, based on the name.
1623 No Representation is needed.
1624 This is mainly used when the hierarchy is read from an RMF file.
1625 @return a dictionary of component names and their particles
1626 @note If the root node is named "System" or is a "State", do proper selection.
1631 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1633 particle_dict[mol.get_name()] = sel.get_selected_particles()
1636 for c
in prot.get_children():
1639 for s
in c.get_children():
1640 if "_Res:10" in s.get_name():
1642 if "Beads" in s.get_name():
1645 for name
in particle_dict:
1646 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1647 list(set(particle_dict[name]) & set(allparticles)))
1648 return particle_dict
1650 def select_by_tuple(first_res_last_res_name_tuple):
1651 first_res = first_res_last_res_hier_tuple[0]
1652 last_res = first_res_last_res_hier_tuple[1]
1653 name = first_res_last_res_hier_tuple[2]
1656 """Visualization of crosslinks"""
1658 self.crosslinks = []
1659 self.external_csv_data =
None
1660 self.crosslinkedprots = set()
1661 self.mindist = +10000000.0
1662 self.maxdist = -10000000.0
1663 self.contactmap =
None
1665 def set_hierarchy(self, prot):
1666 self.prot_length_dict = {}
1667 self.model=prot.get_model()
1669 for i
in prot.get_children():
1671 residue_indexes = []
1675 if len(residue_indexes) != 0:
1676 self.prot_length_dict[name] = max(residue_indexes)
1678 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1679 from scipy.spatial.distance
import cdist
1681 rh= RMF.open_rmf_file_read_only(rmf_name)
1684 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1695 self.index_dictionary = {}
1697 for name
in particles_dictionary:
1698 residue_indexes = []
1699 for p
in particles_dictionary[name]:
1704 if len(residue_indexes) != 0:
1706 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1709 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1711 radii.append(d.get_radius())
1712 if name
not in self.index_dictionary:
1713 self.index_dictionary[name] = [resindex]
1715 self.index_dictionary[name].append(resindex)
1718 coords = np.array(coords)
1719 radii = np.array(radii)
1721 distances = cdist(coords, coords)
1722 distances = (distances - radii).T - radii
1724 distances = np.where(distances <= 20.0, 1.0, 0)
1725 if self.contactmap
is None:
1726 self.contactmap = np.zeros((len(coords), len(coords)))
1727 self.contactmap += distances
1729 for prot
in prots: IMP.atom.destroy(prot)
1732 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1735 filter_rmf_file_names=
None,
1736 external_csv_data_file=
None,
1737 external_csv_data_file_unique_id_key=
"Unique ID"):
1746 keys = po.get_keys()
1748 xl_keys = [k
for k
in keys
if search_label
in k]
1750 if filter_rmf_file_names
is not None:
1751 rmf_file_key=
"local_rmf_file_name"
1752 fs = po.get_fields(xl_keys+[rmf_file_key])
1754 fs = po.get_fields(xl_keys)
1757 self.cross_link_frequency = {}
1761 self.cross_link_distances = {}
1765 self.cross_link_distances_unique = {}
1767 if not external_csv_data_file
is None:
1770 self.external_csv_data = {}
1771 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1774 self.external_csv_data[
1775 xl[external_csv_data_file_unique_id_key]] = xl
1780 cross_link_frequency_list = []
1782 self.unique_cross_link_list = []
1786 keysplit = key.replace(
1795 if filter_label!=
None:
1796 if filter_label
not in keysplit:
continue
1799 r1 = int(keysplit[5])
1801 r2 = int(keysplit[7])
1804 confidence = keysplit[12]
1808 unique_identifier = keysplit[3]
1810 unique_identifier =
'0'
1812 r1 = int(keysplit[mapping[
"Residue1"]])
1813 c1 = keysplit[mapping[
"Protein1"]]
1814 r2 = int(keysplit[mapping[
"Residue2"]])
1815 c2 = keysplit[mapping[
"Protein2"]]
1817 confidence = keysplit[mapping[
"Confidence"]]
1821 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1823 unique_identifier =
'0'
1825 self.crosslinkedprots.add(c1)
1826 self.crosslinkedprots.add(c2)
1832 if filter_rmf_file_names
is not None:
1833 for n,d
in enumerate(fs[key]):
1834 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1835 dists.append(float(d))
1837 dists=[float(f)
for f
in fs[key]]
1842 mdist = self.median(dists)
1844 stdv = np.std(np.array(dists))
1845 if self.mindist > mdist:
1846 self.mindist = mdist
1847 if self.maxdist < mdist:
1848 self.maxdist = mdist
1852 if not self.external_csv_data
is None:
1853 sample = self.external_csv_data[unique_identifier][
"Sample"]
1857 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1858 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1859 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1860 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1862 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1863 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1864 cross_link_frequency_list.append((r1, c1, r2, c2))
1865 cross_link_frequency_list.append((r2, c2, r1, c1))
1866 self.unique_cross_link_list.append(
1867 (r1, c1, r2, c2,mdist))
1869 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1870 self.cross_link_distances[(
1876 confidence)] = dists
1877 self.cross_link_distances[(
1883 confidence)] = dists
1884 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1886 self.cross_link_distances[(
1892 confidence)] += dists
1893 self.cross_link_distances[(
1899 confidence)] += dists
1901 self.crosslinks.append(
1911 self.crosslinks.append(
1922 self.cross_link_frequency_inverted = {}
1923 for xl
in self.unique_cross_link_list:
1924 (r1, c1, r2, c2, mdist) = xl
1925 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1926 if frequency
not in self.cross_link_frequency_inverted:
1927 self.cross_link_frequency_inverted[
1928 frequency] = [(r1, c1, r2, c2)]
1930 self.cross_link_frequency_inverted[
1931 frequency].append((r1, c1, r2, c2))
1935 def median(self, mylist):
1936 sorts = sorted(mylist)
1942 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1943 return sorts[length / 2]
1945 def set_threshold(self,threshold):
1946 self.threshold=threshold
1948 def set_tolerance(self,tolerance):
1949 self.tolerance=tolerance
1951 def colormap(self, dist):
1952 if dist < self.threshold - self.tolerance:
1954 elif dist >= self.threshold + self.tolerance:
1959 def write_cross_link_database(self, filename, format='csv'):
1963 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1964 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1966 if not self.external_csv_data
is None:
1967 keys = list(self.external_csv_data.keys())
1968 innerkeys = list(self.external_csv_data[keys[0]].keys())
1970 fieldnames += innerkeys
1972 dw = csv.DictWriter(
1976 fieldnames=fieldnames)
1978 for xl
in self.crosslinks:
1979 (r1, c1, r2, c2, mdist, stdv, confidence,
1980 unique_identifier, descriptor) = xl
1981 if descriptor ==
'original':
1983 outdict[
"Unique ID"] = unique_identifier
1984 outdict[
"Protein1"] = c1
1985 outdict[
"Protein2"] = c2
1986 outdict[
"Residue1"] = r1
1987 outdict[
"Residue2"] = r2
1988 outdict[
"Median Distance"] = mdist
1989 outdict[
"Standard Deviation"] = stdv
1990 outdict[
"Confidence"] = confidence
1991 outdict[
"Frequency"] = self.cross_link_frequency[
1994 arrangement =
"Intra"
1996 arrangement =
"Inter"
1997 outdict[
"Arrangement"] = arrangement
1998 if not self.external_csv_data
is None:
1999 outdict.update(self.external_csv_data[unique_identifier])
2001 dw.writerow(outdict)
2003 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
2004 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
2005 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
2006 gap_between_components=0,
2007 rename_protein_map=
None):
2021 import matplotlib
as mpl
2023 import matplotlib.pyplot
as plt
2024 import matplotlib.cm
as cm
2026 fig = plt.figure(figsize=(10, 10))
2027 ax = fig.add_subplot(111)
2033 if prot_listx
is None:
2035 prot_listx = list(self.crosslinkedprots)
2037 prot_listx = list(self.prot_length_dict.keys())
2040 nresx = gap_between_components + \
2041 sum([self.prot_length_dict[name]
2042 + gap_between_components
for name
in prot_listx])
2046 if prot_listy
is None:
2048 prot_listy = list(self.crosslinkedprots)
2050 prot_listy = list(self.prot_length_dict.keys())
2053 nresy = gap_between_components + \
2054 sum([self.prot_length_dict[name]
2055 + gap_between_components
for name
in prot_listy])
2060 res = gap_between_components
2061 for prot
in prot_listx:
2062 resoffsetx[prot] = res
2063 res += self.prot_length_dict[prot]
2065 res += gap_between_components
2069 res = gap_between_components
2070 for prot
in prot_listy:
2071 resoffsety[prot] = res
2072 res += self.prot_length_dict[prot]
2074 res += gap_between_components
2076 resoffsetdiagonal = {}
2077 res = gap_between_components
2078 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2079 resoffsetdiagonal[prot] = res
2080 res += self.prot_length_dict[prot]
2081 res += gap_between_components
2087 for n, prot
in enumerate(prot_listx):
2088 res = resoffsetx[prot]
2090 for proty
in prot_listy:
2091 resy = resoffsety[proty]
2092 endy = resendy[proty]
2093 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2094 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2095 xticks.append((float(res) + float(end)) / 2)
2096 if rename_protein_map
is not None:
2097 if prot
in rename_protein_map:
2098 prot=rename_protein_map[prot]
2099 xlabels.append(prot)
2103 for n, prot
in enumerate(prot_listy):
2104 res = resoffsety[prot]
2106 for protx
in prot_listx:
2107 resx = resoffsetx[protx]
2108 endx = resendx[protx]
2109 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2110 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2111 yticks.append((float(res) + float(end)) / 2)
2112 if rename_protein_map
is not None:
2113 if prot
in rename_protein_map:
2114 prot=rename_protein_map[prot]
2115 ylabels.append(prot)
2118 print(prot_listx, prot_listy)
2120 if not self.contactmap
is None:
2121 import matplotlib.cm
as cm
2122 tmp_array = np.zeros((nresx, nresy))
2124 for px
in prot_listx:
2126 for py
in prot_listy:
2128 resx = resoffsety[px]
2129 lengx = resendx[px] - 1
2130 resy = resoffsety[py]
2131 lengy = resendy[py] - 1
2132 indexes_x = self.index_dictionary[px]
2133 minx = min(indexes_x)
2134 maxx = max(indexes_x)
2135 indexes_y = self.index_dictionary[py]
2136 miny = min(indexes_y)
2137 maxy = max(indexes_y)
2139 print(px, py, minx, maxx, miny, maxy)
2144 resy:lengy] = self.contactmap[
2151 ax.imshow(tmp_array,
2154 interpolation=
'nearest')
2156 ax.set_xticks(xticks)
2157 ax.set_xticklabels(xlabels, rotation=90)
2158 ax.set_yticks(yticks)
2159 ax.set_yticklabels(ylabels)
2160 ax.set_xlim(0,nresx)
2161 ax.set_ylim(0,nresy)
2166 already_added_xls = []
2168 for xl
in self.crosslinks:
2170 (r1, c1, r2, c2, mdist, stdv, confidence,
2171 unique_identifier, descriptor) = xl
2173 if confidence_classes
is not None:
2174 if confidence
not in confidence_classes:
2178 pos1 = r1 + resoffsetx[c1]
2182 pos2 = r2 + resoffsety[c2]
2186 if not filter
is None:
2187 xldb = self.external_csv_data[unique_identifier]
2188 xldb.update({
"Distance": mdist})
2189 xldb.update({
"Distance_stdv": stdv})
2191 if filter[1] ==
">":
2192 if float(xldb[filter[0]]) <= float(filter[2]):
2195 if filter[1] ==
"<":
2196 if float(xldb[filter[0]]) >= float(filter[2]):
2199 if filter[1] ==
"==":
2200 if float(xldb[filter[0]]) != float(filter[2]):
2206 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2207 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2209 if layout ==
'lowerdiagonal':
2210 if pos_for_diagonal1 <= pos_for_diagonal2:
2212 if layout ==
'upperdiagonal':
2213 if pos_for_diagonal1 >= pos_for_diagonal2:
2216 already_added_xls.append((r1, c1, r2, c2))
2218 if not no_confidence_info:
2219 if confidence ==
'0.01':
2220 markersize = 14 * scale_symbol_size
2221 elif confidence ==
'0.05':
2222 markersize = 9 * scale_symbol_size
2223 elif confidence ==
'0.1':
2224 markersize = 6 * scale_symbol_size
2226 markersize = 15 * scale_symbol_size
2228 markersize = 5 * scale_symbol_size
2230 if not no_dist_info:
2231 color = self.colormap(mdist)
2241 markersize=markersize)
2245 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2247 for i
in ax.spines.values():
2248 i.set_linewidth(2.0)
2253 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2257 def get_frequency_statistics(self, prot_list,
2260 violated_histogram = {}
2261 satisfied_histogram = {}
2262 unique_cross_links = []
2264 for xl
in self.unique_cross_link_list:
2265 (r1, c1, r2, c2, mdist) = xl
2268 if prot_list2
is None:
2269 if not c1
in prot_list:
2271 if not c2
in prot_list:
2274 if c1
in prot_list
and c2
in prot_list2:
2276 elif c1
in prot_list2
and c2
in prot_list:
2281 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2283 if (r1, c1, r2, c2)
not in unique_cross_links:
2285 if frequency
not in violated_histogram:
2286 violated_histogram[frequency] = 1
2288 violated_histogram[frequency] += 1
2290 if frequency
not in satisfied_histogram:
2291 satisfied_histogram[frequency] = 1
2293 satisfied_histogram[frequency] += 1
2294 unique_cross_links.append((r1, c1, r2, c2))
2295 unique_cross_links.append((r2, c2, r1, c1))
2297 print(
"# satisfied")
2299 total_number_of_crosslinks=0
2301 for i
in satisfied_histogram:
2305 if i
in violated_histogram:
2306 print(i, violated_histogram[i]+satisfied_histogram[i])
2308 print(i, satisfied_histogram[i])
2309 total_number_of_crosslinks+=i*satisfied_histogram[i]
2313 for i
in violated_histogram:
2314 print(i, violated_histogram[i])
2315 total_number_of_crosslinks+=i*violated_histogram[i]
2317 print(total_number_of_crosslinks)
2321 def print_cross_link_binary_symbols(self, prot_list,
2324 confidence_list = []
2325 for xl
in self.crosslinks:
2326 (r1, c1, r2, c2, mdist, stdv, confidence,
2327 unique_identifier, descriptor) = xl
2329 if prot_list2
is None:
2330 if not c1
in prot_list:
2332 if not c2
in prot_list:
2335 if c1
in prot_list
and c2
in prot_list2:
2337 elif c1
in prot_list2
and c2
in prot_list:
2342 if descriptor !=
"original":
2345 confidence_list.append(confidence)
2347 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2348 tmp_dist_binary = []
2351 tmp_dist_binary.append(1)
2353 tmp_dist_binary.append(0)
2354 tmp_matrix.append(tmp_dist_binary)
2356 matrix = list(zip(*tmp_matrix))
2358 satisfied_high_sum = 0
2359 satisfied_mid_sum = 0
2360 satisfied_low_sum = 0
2361 total_satisfied_sum = 0
2362 for k, m
in enumerate(matrix):
2371 for n, b
in enumerate(m):
2372 if confidence_list[n] ==
"0.01":
2376 satisfied_high_sum += 1
2377 elif confidence_list[n] ==
"0.05":
2381 satisfied_mid_sum += 1
2382 elif confidence_list[n] ==
"0.1":
2386 satisfied_low_sum += 1
2388 total_satisfied += 1
2389 total_satisfied_sum += 1
2391 print(k, satisfied_high, total_high)
2392 print(k, satisfied_mid, total_mid)
2393 print(k, satisfied_low, total_low)
2394 print(k, total_satisfied, total)
2395 print(float(satisfied_high_sum) / len(matrix))
2396 print(float(satisfied_mid_sum) / len(matrix))
2397 print(float(satisfied_low_sum) / len(matrix))
2400 def get_unique_crosslinks_statistics(self, prot_list,
2413 satisfied_string = []
2414 for xl
in self.crosslinks:
2415 (r1, c1, r2, c2, mdist, stdv, confidence,
2416 unique_identifier, descriptor) = xl
2418 if prot_list2
is None:
2419 if not c1
in prot_list:
2421 if not c2
in prot_list:
2424 if c1
in prot_list
and c2
in prot_list2:
2426 elif c1
in prot_list2
and c2
in prot_list:
2431 if descriptor !=
"original":
2435 if confidence ==
"0.01":
2439 if confidence ==
"0.05":
2443 if confidence ==
"0.1":
2448 satisfied_string.append(1)
2450 satisfied_string.append(0)
2452 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2453 tmp_dist_binary = []
2456 tmp_dist_binary.append(1)
2458 tmp_dist_binary.append(0)
2459 tmp_matrix.append(tmp_dist_binary)
2461 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2462 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2463 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2464 print(
"total", total)
2466 matrix = list(zip(*tmp_matrix))
2467 satisfied_models = 0
2469 for b
in satisfied_string:
2476 all_satisfied =
True
2478 for n, b
in enumerate(m):
2483 if b == 1
and satisfied_string[n] == 1:
2485 elif b == 1
and satisfied_string[n] == 0:
2487 elif b == 0
and satisfied_string[n] == 0:
2489 elif b == 0
and satisfied_string[n] == 1:
2490 all_satisfied =
False
2492 satisfied_models += 1
2494 print(satstr, all_satisfied)
2495 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2497 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2500 import matplotlib
as mpl
2502 import matplotlib.pylab
as pl
2505 for kw
in self.cross_link_distances_unique:
2506 (r1, c1, r2, c2) = kw
2507 dists = self.cross_link_distances_unique[kw]
2509 if prot_list2
is None:
2510 if not c1
in prot_list:
2512 if not c2
in prot_list:
2515 if c1
in prot_list
and c2
in prot_list2:
2517 elif c1
in prot_list2
and c2
in prot_list:
2522 dists.append(sum(dists))
2523 tmp_matrix.append(dists)
2525 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2528 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2530 for i
in range(len(tmp_matrix)):
2531 for k
in range(len(tmp_matrix[i]) - 1):
2532 matrix[i][k] = tmp_matrix[i][k]
2537 ax = fig.add_subplot(211)
2539 cax = ax.imshow(matrix, interpolation=
'nearest')
2543 pl.savefig(figurename, dpi=300)
2552 arrangement=
"inter",
2553 confidence_input=
"None"):
2556 for xl
in self.cross_link_distances:
2557 (r1, c1, r2, c2, mdist, confidence) = xl
2558 if c1
in prots1
and c2
in prots2:
2559 if arrangement ==
"inter" and c1 == c2:
2561 if arrangement ==
"intra" and c1 != c2:
2563 if confidence_input == confidence:
2564 label = str(c1) +
":" + str(r1) + \
2565 "-" + str(c2) +
":" + str(r2)
2566 values = self.cross_link_distances[xl]
2567 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2568 data.append((label, values, mdist, frequency))
2570 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2571 sort_by_dist = list(zip(*sort_by_dist))
2572 values = sort_by_dist[1]
2573 positions = list(range(len(values)))
2574 labels = sort_by_dist[0]
2575 frequencies = list(map(float, sort_by_dist[3]))
2576 frequencies = [f * 10.0
for f
in frequencies]
2578 nchunks = int(float(len(values)) / nxl_per_row)
2579 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2580 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2583 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2586 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2588 for n, v
in enumerate(values_chunks):
2589 p = positions_chunks[n]
2590 f = frequencies_chunks[n]
2591 l = labels_chunks[n]
2593 filename +
"." + str(n), v, p, f,
2594 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2596 def crosslink_distance_histogram(self, filename,
2599 confidence_classes=
None,
2605 if prot_list
is None:
2606 prot_list = list(self.prot_length_dict.keys())
2609 for xl
in self.crosslinks:
2610 (r1, c1, r2, c2, mdist, stdv, confidence,
2611 unique_identifier, descriptor) = xl
2613 if not confidence_classes
is None:
2614 if confidence
not in confidence_classes:
2617 if prot_list2
is None:
2618 if not c1
in prot_list:
2620 if not c2
in prot_list:
2623 if c1
in prot_list
and c2
in prot_list2:
2625 elif c1
in prot_list2
and c2
in prot_list:
2630 distances.append(mdist)
2633 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2634 bins=bins, color=color,
2636 reference_xline=35.0,
2637 yplotrange=yplotrange, normalized=normalized)
2639 def scatter_plot_xl_features(self, filename,
2645 reference_ylines=
None,
2646 distance_color=
True,
2648 import matplotlib
as mpl
2650 import matplotlib.pyplot
as plt
2651 import matplotlib.cm
as cm
2653 fig = plt.figure(figsize=(10, 10))
2654 ax = fig.add_subplot(111)
2656 for xl
in self.crosslinks:
2657 (r1, c1, r2, c2, mdist, stdv, confidence,
2658 unique_identifier, arrangement) = xl
2660 if prot_list2
is None:
2661 if not c1
in prot_list:
2663 if not c2
in prot_list:
2666 if c1
in prot_list
and c2
in prot_list2:
2668 elif c1
in prot_list2
and c2
in prot_list:
2673 xldb = self.external_csv_data[unique_identifier]
2674 xldb.update({
"Distance": mdist})
2675 xldb.update({
"Distance_stdv": stdv})
2677 xvalue = float(xldb[feature1])
2678 yvalue = float(xldb[feature2])
2681 color = self.colormap(mdist)
2685 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2687 if not yplotrange
is None:
2688 ax.set_ylim(yplotrange)
2689 if not reference_ylines
is None:
2690 for rl
in reference_ylines:
2691 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2694 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of crosslinks.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
static Molecule setup_particle(Model *m, ParticleIndex pi)
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.