3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from math
import log, sqrt
23 """Performs alignment and RMSD calculation for two sets of coordinates
25 The class also takes into account non-equal stoichiometry of the proteins.
26 If this is the case, the protein names of proteins in multiple copies
27 should be specified in the following form:
28 nameA..1, nameA..2 (note two dots).
31 def __init__(self, template, query, weights=None):
33 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
34 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param weights optional weights for each set of coordinates
38 self.template = template
39 self.weights = weights
41 if len(self.query.keys()) != len(self.template.keys()):
42 raise ValueError(
'''the number of proteins
43 in template and query does not match!''')
49 self.proteins = sorted(self.query.keys())
50 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
58 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
59 prmts = list(itertools.permutations(copies, len(copies)))
62 self.Product = list(itertools.product(*P.values()))
70 torder = sum([list(i)
for i
in self.Product[0]], [])
73 if self.weights
is not None:
74 weights += [i
for i
in self.weights[t]]
76 self.rmsd = 10000000000.
77 for comb
in self.Product:
79 order = sum([list(i)
for i
in comb], [])
84 if self.weights
is not None:
86 template_xyz, query_xyz, weights)
94 from scipy.spatial.distance
import cdist
102 torder = sum([list(i)
for i
in self.Product[0]], [])
106 self.rmsd, Transformation = 10000000000.,
''
109 for comb
in self.Product:
110 order = sum([list(i)
for i
in comb], [])
115 if len(template_xyz) != len(query_xyz):
116 raise ValueError(
'''the number of coordinates
117 in template and query does not match!''')
121 query_xyz, template_xyz)
122 query_xyz_tr = [transformation.get_transformed(n)
126 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2))
130 Transformation = transformation
133 return (self.rmsd, Transformation)
138 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
139 'a..2':np.array([np.array([1.,1.,])]),
140 'a..3':np.array([np.array([-2.,1.])]),
141 'b':np.array([np.array([0.,-1.])]),
142 'c..1':np.array([np.array([-1.,-1.])]),
143 'c..2':np.array([np.array([1.,-1.])]),
144 'd':np.array([np.array([0.,0.])]),
145 'e':np.array([np.array([0.,1.])])}
147 Ali = Alignment(Proteins, Proteins)
149 if Ali.get_rmsd() == 0.0: print 'successful test!'
150 else: print 'ERROR!'; exit()
155 class Violations(object):
159 self.violation_thresholds = {}
160 self.violation_counts = {}
162 data = open(filename)
167 d = d.strip().split()
168 self.violation_thresholds[d[0]] = float(d[1])
170 def get_number_violated_restraints(self, rsts_dict):
172 for rst
in self.violation_thresholds:
173 if rst
not in rsts_dict:
175 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
177 if rst
not in self.violation_counts:
178 self.violation_counts[rst] = 1
180 self.violation_counts[rst] += 1
185 """A class to cluster structures.
186 Uses scipy's cdist function to compute distance matrices
187 and sklearn's kmeans clustering module.
191 @param rmsd_weights Flat list of weights for each particle
195 from mpi4py
import MPI
196 self.comm = MPI.COMM_WORLD
197 self.rank = self.comm.Get_rank()
198 self.number_of_processes = self.comm.size
200 self.number_of_processes = 1
203 self.structure_cluster_ids =
None
204 self.tmpl_coords =
None
205 self.rmsd_weights = rmsd_weights
207 def set_template(self, part_coords):
208 self.tmpl_coords = part_coords
211 """Add coordinates for a single model."""
212 self.all_coords[frame] = Coords
214 def dist_matrix(self):
215 self.model_list_names = list(self.all_coords.keys())
216 self.model_indexes = list(range(len(self.model_list_names)))
217 self.model_indexes_dict = dict(
218 list(zip(self.model_list_names, self.model_indexes)))
219 model_indexes_unique_pairs = \
220 list(itertools.combinations(self.model_indexes, 2))
222 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
223 model_indexes_unique_pairs,
224 self.number_of_processes)[self.rank]
226 print(
"process %s assigned with %s pairs"
227 % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
229 (raw_distance_dict, self.transformation_distance_dict) = \
230 self.matrix_calculation(self.all_coords, self.tmpl_coords,
231 my_model_indexes_unique_pairs)
233 if self.number_of_processes > 1:
236 pickable_transformations = \
237 self.get_pickable_transformation_distance_dict()
239 pickable_transformations)
240 self.set_transformation_distance_dict_from_pickable(
241 pickable_transformations)
243 self.raw_distance_matrix = np.zeros(
244 (len(self.model_list_names), len(self.model_list_names)))
245 for item
in raw_distance_dict:
247 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
248 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
250 def get_dist_matrix(self):
251 return self.raw_distance_matrix
254 """Run K-means clustering
255 @param number_of_clusters Num means
256 @param seed the random seed
258 from sklearn.cluster
import KMeans
263 kmeans = KMeans(n_clusters=number_of_clusters)
266 kmeans = KMeans(k=number_of_clusters)
267 kmeans.fit_predict(self.raw_distance_matrix)
269 self.structure_cluster_ids = kmeans.labels_
271 def get_pickable_transformation_distance_dict(self):
272 pickable_transformations = {}
273 for label
in self.transformation_distance_dict:
274 tr = self.transformation_distance_dict[label]
275 trans = tuple(tr.get_translation())
276 rot = tuple(tr.get_rotation().get_quaternion())
277 pickable_transformations[label] = (rot, trans)
278 return pickable_transformations
280 def set_transformation_distance_dict_from_pickable(
282 pickable_transformations):
283 self.transformation_distance_dict = {}
284 for label
in pickable_transformations:
285 tr = pickable_transformations[label]
288 self.transformation_distance_dict[
291 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
293 outf = open(file_name +
".data",
'wb')
299 pickable_transformations = \
300 self.get_pickable_transformation_distance_dict()
302 (self.structure_cluster_ids,
303 self.model_list_names,
304 pickable_transformations),
307 np.save(file_name +
".npy", self.raw_distance_matrix)
309 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
312 inputf = open(file_name +
".data",
'rb')
313 (self.structure_cluster_ids, self.model_list_names,
314 pickable_transformations) = pickle.load(inputf)
317 self.raw_distance_matrix = np.load(file_name +
".npy")
319 self.set_transformation_distance_dict_from_pickable(
320 pickable_transformations)
321 self.model_indexes = list(range(len(self.model_list_names)))
322 self.model_indexes_dict = dict(
323 list(zip(self.model_list_names, self.model_indexes)))
325 def plot_matrix(self, figurename="clustermatrix.pdf"):
326 import matplotlib
as mpl
328 import matplotlib.pylab
as pl
329 from scipy.cluster
import hierarchy
as hrc
331 fig = pl.figure(figsize=(10, 8))
332 ax = fig.add_subplot(212)
333 dendrogram = hrc.dendrogram(
334 hrc.linkage(self.raw_distance_matrix),
337 leaves_order = dendrogram[
'leaves']
338 ax.set_xlabel(
'Model')
339 ax.set_ylabel(
'RMSD [Angstroms]')
341 ax2 = fig.add_subplot(221)
343 self.raw_distance_matrix[leaves_order,
346 interpolation=
'nearest')
347 cb = fig.colorbar(cax)
348 cb.set_label(
'RMSD [Angstroms]')
349 ax2.set_xlabel(
'Model')
350 ax2.set_ylabel(
'Model')
352 pl.savefig(figurename, dpi=300)
355 def get_model_index_from_name(self, name):
356 return self.model_indexes_dict[name]
358 def get_cluster_labels(self):
360 return list(set(self.structure_cluster_ids))
362 def get_number_of_clusters(self):
363 return len(self.get_cluster_labels())
365 def get_cluster_label_indexes(self, label):
367 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
370 def get_cluster_label_names(self, label):
372 [self.model_list_names[i]
373 for i
in self.get_cluster_label_indexes(label)]
376 def get_cluster_label_average_rmsd(self, label):
378 indexes = self.get_cluster_label_indexes(label)
381 sub_distance_matrix = self.raw_distance_matrix[
382 indexes, :][:, indexes]
383 average_rmsd = np.sum(sub_distance_matrix) / \
384 (len(sub_distance_matrix)
385 ** 2 - len(sub_distance_matrix))
390 def get_cluster_label_size(self, label):
391 return len(self.get_cluster_label_indexes(label))
393 def get_transformation_to_first_member(
397 reference = self.get_cluster_label_indexes(cluster_label)[0]
398 return self.transformation_distance_dict[(reference, structure_index)]
400 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
402 model_list_names = list(all_coords.keys())
403 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
404 raw_distance_dict = {}
405 transformation_distance_dict = {}
406 if template_coords
is None:
410 alignment_template_protein_names = list(template_coords.keys())
412 for (f1, f2)
in list_of_pairs:
420 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
421 for pr
in rmsd_protein_names])
423 for pr
in rmsd_protein_names:
424 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
426 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
427 rmsd = Ali.get_rmsd()
433 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
434 for pr
in alignment_template_protein_names])
435 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
436 for pr
in alignment_template_protein_names])
439 template_rmsd, transformation = Ali.align()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in rmsd_protein_names])
447 for pr
in rmsd_protein_names:
448 coords_f2[pr] = [transformation.get_transformed(
449 i)
for i
in all_coords[model_list_names[f2]][pr]]
451 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
452 rmsd = Ali.get_rmsd()
454 raw_distance_dict[(f1, f2)] = rmsd
455 raw_distance_dict[(f2, f1)] = rmsd
456 transformation_distance_dict[(f1, f2)] = transformation
457 transformation_distance_dict[(f2, f1)] = transformation
459 return raw_distance_dict, transformation_distance_dict
463 """Compute the RMSD (without alignment) taking into account the copy
465 To be used with pmi2 hierarchies. Can be used for instance as follows:
467 rmsd = IMP.pmi.analysis.RMSD(
468 hier, hier, [mol.get_name() for mol in mols],
469 dynamic0=True, dynamic1=False)
470 output_objects.append(rmsd)
472 before shuffling the coordinates
475 def __init__(self, hier0, hier1, molnames, label="None", dynamic0=True,
476 dynamic1=
True, metric=IMP.algebra.get_rmsd):
478 @param hier0 first input hierarchy
479 @param hier1 second input hierarchy
480 @param molname the names of the molecules used for the RMSD
481 @dynamic0 if True stores the decorators XYZ and coordinates of
482 hier0 can be update. If false coordinates are static
483 (stored in Vector3Ds) and will never be updated
484 @dynamic1 same as above
485 metric what metric should be used
487 self.moldict0, self.molcoords0, self.mol_XYZs0 = \
489 self.moldict1, self.molcoords1, self.mol_XYZs1 = \
491 self.dynamic0 = dynamic0
492 self.dynamic1 = dynamic1
497 """return data structure for the RMSD calculation"""
502 name = mol.get_name()
503 if name
not in molnames:
511 mol, residue_index=i,
512 representation_type=IMP.atom.BALLS,
514 parts = sel.get_selected_particles()
516 mol_coords[mol].append(
521 moldict[name].append(mol)
523 moldict[name] = [mol]
524 return moldict, mol_coords, mol_XYZs
526 def get_rmsd_and_assigments(self):
529 best_assignments = []
531 for molname, ref_mols
in self.moldict1.items():
533 for ref_mol
in ref_mols:
535 coord1 = [XYZ.get_coordinates()
536 for XYZ
in self.mol_XYZs1[ref_mol]]
538 coord1 = self.molcoords1[ref_mol]
543 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
545 for rmf_mol
in rmf_mols:
547 coord0 = [XYZ.get_coordinates()
548 for XYZ
in self.mol_XYZs0[rmf_mol]]
550 coord0 = self.molcoords0[rmf_mol]
554 rmf_mols_list.append(rmf_mols)
556 rmf_mols_best_order = rmf_mols_list[rmsd.index(m)]
558 for n, (ref_mol, rmf_mol)
in enumerate(
559 zip(ref_mols, rmf_mols_best_order)):
560 best_assignments.append((rmf_mol, ref_mol))
562 coord0 = [XYZ.get_coordinates()
563 for XYZ
in self.mol_XYZs0[rmf_mol]]
565 coord0 = self.molcoords0[rmf_mol]
568 coord1 = [XYZ.get_coordinates()
569 for XYZ
in self.mol_XYZs1[ref_mol]]
571 coord1 = self.molcoords1[ref_mol]
572 rmsd_pair = self.metric(coord1, coord0)
573 N = len(self.molcoords1[ref_mol])
575 total_rmsd += rmsd_pair*rmsd_pair*N
576 rmsd_dict[ref_mol] = rmsd_pair
577 total_rmsd = sqrt(total_rmsd/total_N)
578 return total_rmsd, best_assignments
581 """Returns output for IMP.pmi.output.Output object"""
582 total_rmsd, best_assignments = self.get_rmsd_and_assigments()
585 for rmf_mol, ref_mol
in best_assignments:
586 ref_name = ref_mol.get_name()
588 rmf_name = rmf_mol.get_name()
590 assignments_out.append(rmf_name +
"." + str(rmf_copy) +
"->" +
591 ref_name +
"." + str(ref_copy))
592 return {
"RMSD_" + self.label: str(total_rmsd),
593 "RMSD_assignments_" + self.label: str(assignments_out)}
597 """A class to evaluate the precision of an ensemble.
599 Also can evaluate the cross-precision of multiple ensembles.
600 Supports MPI for coordinate reading.
601 Recommended procedure:
602 -# initialize object and pass the selection for evaluating precision
603 -# call add_structures() to read in the data (specify group name)
604 -# call get_precision() to evaluate inter/intra precision
605 -# call get_rmsf() to evaluate within-group fluctuations
607 def __init__(self, model, resolution=1, selection_dictionary={}):
609 @param model The IMP Model
610 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
611 part of the hier name)
612 @param selection_dictionary Dictionary where keys are names for
613 selections and values are selection tuples for scoring
614 precision. "All" is automatically made as well
617 from mpi4py
import MPI
618 self.comm = MPI.COMM_WORLD
619 self.rank = self.comm.Get_rank()
620 self.number_of_processes = self.comm.size
622 self.number_of_processes = 1
625 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
626 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
627 self.style =
'pairwise_drmsd_k'
628 self.structures_dictionary = {}
629 self.reference_structures_dictionary = {}
631 self.protein_names =
None
632 self.len_particles_resolution_one =
None
634 self.rmf_names_frames = {}
635 self.reference_rmf_names_frames =
None
636 self.reference_structure =
None
637 self.reference_prot =
None
638 self.selection_dictionary = selection_dictionary
639 self.threshold = 40.0
640 self.residue_particle_index_map =
None
642 if resolution
in [1, 10]:
643 self.resolution = resolution
645 raise KeyError(
"Currently only allow resolution 1 or 10")
647 def _get_structure(self, rmf_frame_index, rmf_name):
648 """Read an RMF file and return the particles"""
649 rh = RMF.open_rmf_file_read_only(rmf_name)
651 print(
"getting coordinates for frame %i rmf file %s"
652 % (rmf_frame_index, rmf_name))
656 print(
"linking coordinates for frame %i rmf file %s"
657 % (rmf_frame_index, rmf_name))
662 if self.resolution == 1:
664 elif self.resolution == 10:
667 protein_names = list(particle_dict.keys())
668 particles_resolution_one = []
669 for k
in particle_dict:
670 particles_resolution_one += (particle_dict[k])
672 if self.protein_names
is None:
673 self.protein_names = protein_names
675 if self.protein_names != protein_names:
676 print(
"Error: the protein names of the new coordinate set "
677 "is not compatible with the previous one")
679 if self.len_particles_resolution_one
is None:
680 self.len_particles_resolution_one = len(particles_resolution_one)
682 if self.len_particles_resolution_one != \
683 len(particles_resolution_one):
684 raise ValueError(
"the new coordinate set is not compatible "
685 "with the previous one")
687 return particles_resolution_one
693 setup_index_map=
False):
694 """ Read a structure into the ensemble and store (as coordinates).
695 @param rmf_name The name of the RMF file
696 @param rmf_frame_index The frame to read
697 @param structure_set_name Name for the set that includes this structure
699 @param setup_index_map if requested, set up a dictionary to help
704 if structure_set_name
in self.structures_dictionary:
705 cdict = self.structures_dictionary[structure_set_name]
706 rmflist = self.rmf_names_frames[structure_set_name]
708 self.structures_dictionary[structure_set_name] = {}
709 self.rmf_names_frames[structure_set_name] = []
710 cdict = self.structures_dictionary[structure_set_name]
711 rmflist = self.rmf_names_frames[structure_set_name]
715 particles_resolution_one = self._get_structure(
716 rmf_frame_index, rmf_name)
718 print(
"something wrong with the rmf")
720 self.selection_dictionary.update({
"All": self.protein_names})
722 for selection_name
in self.selection_dictionary:
723 selection_tuple = self.selection_dictionary[selection_name]
724 coords = self._select_coordinates(
725 selection_tuple, particles_resolution_one, self.prots[0])
726 if selection_name
not in cdict:
727 cdict[selection_name] = [coords]
729 cdict[selection_name].append(coords)
731 rmflist.append((rmf_name, rmf_frame_index))
735 self.residue_particle_index_map = {}
736 for prot_name
in self.protein_names:
737 self.residue_particle_index_map[prot_name] = \
738 self._get_residue_particle_index_map(
739 prot_name, particles_resolution_one, self.prots[0])
742 rmf_name_frame_tuples,
744 """Read a list of RMFs, supports parallel
745 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
746 @param structure_set_name Name this set of structures
751 my_rmf_name_frame_tuples = IMP.pmi.tools.chunk_list_into_segments(
752 rmf_name_frame_tuples, self.number_of_processes)[self.rank]
753 for nfr, tup
in enumerate(my_rmf_name_frame_tuples):
755 rmf_frame_index = tup[1]
757 if self.residue_particle_index_map
is None:
758 setup_index_map =
True
760 setup_index_map =
False
767 if self.number_of_processes > 1:
769 self.rmf_names_frames)
771 self.comm.send(self.structures_dictionary, dest=0, tag=11)
773 for i
in range(1, self.number_of_processes):
774 data_tmp = self.comm.recv(source=i, tag=11)
775 for key
in self.structures_dictionary:
776 self.structures_dictionary[key].update(data_tmp[key])
777 for i
in range(1, self.number_of_processes):
778 self.comm.send(self.structures_dictionary, dest=i, tag=11)
780 self.structures_dictionary = self.comm.recv(source=0, tag=11)
782 def _get_residue_particle_index_map(self, prot_name, structure, hier):
784 residue_particle_index_map = []
790 all_selected_particles = s.get_selected_particles()
791 intersection = list(set(all_selected_particles) & set(structure))
792 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
793 for p
in sorted_intersection:
794 residue_particle_index_map.append(
796 return residue_particle_index_map
798 def _select_coordinates(self, tuple_selections, structure, prot):
799 selected_coordinates = []
800 for t
in tuple_selections:
801 if isinstance(t, tuple)
and len(t) == 3:
804 prot, molecules=[t[2]],
805 residue_indexes=range(t[0], t[1]+1), resolution=1)
808 prot, molecules=[t[2]],
809 residue_indexes=range(t[0], t[1]+1))
810 all_selected_particles = s.get_selected_particles()
811 intersection = list(set(all_selected_particles)
813 sorted_intersection = \
814 IMP.pmi.tools.sort_by_residues(intersection)
816 for p
in sorted_intersection]
817 selected_coordinates += cc
818 elif isinstance(t, str):
823 all_selected_particles = s.get_selected_particles()
824 intersection = list(set(all_selected_particles)
826 sorted_intersection = \
827 IMP.pmi.tools.sort_by_residues(intersection)
829 for p
in sorted_intersection]
830 selected_coordinates += cc
832 raise ValueError(
"Selection error")
833 return selected_coordinates
835 def set_threshold(self, threshold):
836 self.threshold = threshold
838 def _get_distance(self, structure_set_name1, structure_set_name2,
839 selection_name, index1, index2):
840 """Compute distance between structures with various metrics"""
841 c1 = self.structures_dictionary[
842 structure_set_name1][selection_name][index1]
843 c2 = self.structures_dictionary[
844 structure_set_name2][selection_name][index2]
848 if self.style ==
'pairwise_drmsd_k':
850 if self.style ==
'pairwise_drms_k':
852 if self.style ==
'pairwise_drmsd_Q':
856 if self.style ==
'pairwise_rmsd':
860 def _get_particle_distances(self, structure_set_name1, structure_set_name2,
861 selection_name, index1, index2):
862 c1 = self.structures_dictionary[
863 structure_set_name1][selection_name][index1]
864 c2 = self.structures_dictionary[
865 structure_set_name2][selection_name][index2]
870 distances = [np.linalg.norm(a-b)
871 for (a, b)
in zip(coordinates1, coordinates2)]
876 outfile=
None, skip=1, selection_keywords=
None):
877 """ Evaluate the precision of two named structure groups. Supports MPI.
878 When the structure_set_name1 is different from the structure_set_name2,
879 this evaluates the cross-precision (average pairwise distances).
880 @param outfile Name of the precision output file
881 @param structure_set_name1 string name of the first structure set
882 @param structure_set_name2 string name of the second structure set
883 @param skip analyze every (skip) structure for the distance
885 @param selection_keywords Specify the selection name you want to
886 calculate on. By default this is computed for everything
887 you provided in the constructor, plus all the subunits together.
889 if selection_keywords
is None:
890 sel_keys = list(self.selection_dictionary.keys())
892 for k
in selection_keywords:
893 if k
not in self.selection_dictionary:
895 "you are trying to find named selection "
896 + k +
" which was not requested in the constructor")
897 sel_keys = selection_keywords
899 if outfile
is not None:
900 of = open(outfile,
"w")
902 for selection_name
in sel_keys:
903 number_of_structures_1 = len(self.structures_dictionary[
904 structure_set_name1][selection_name])
905 number_of_structures_2 = len(self.structures_dictionary[
906 structure_set_name2][selection_name])
908 structure_pointers_1 = list(range(0, number_of_structures_1, skip))
909 structure_pointers_2 = list(range(0, number_of_structures_2, skip))
910 pair_combination_list = list(
911 itertools.product(structure_pointers_1, structure_pointers_2))
912 if len(pair_combination_list) == 0:
913 raise ValueError(
"no structure selected. Check the "
917 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
918 pair_combination_list, self.number_of_processes)[self.rank]
919 for n, pair
in enumerate(my_pair_combination_list):
920 distances[pair] = self._get_distance(
921 structure_set_name1, structure_set_name2, selection_name,
923 if self.number_of_processes > 1:
928 if structure_set_name1 == structure_set_name2:
929 structure_pointers = structure_pointers_1
930 number_of_structures = number_of_structures_1
935 distances_to_structure = {}
936 distances_to_structure_normalization = {}
938 for n
in structure_pointers:
939 distances_to_structure[n] = 0.0
940 distances_to_structure_normalization[n] = 0
943 distance += distances[k]
944 distances_to_structure[k[0]] += distances[k]
945 distances_to_structure[k[1]] += distances[k]
946 distances_to_structure_normalization[k[0]] += 1
947 distances_to_structure_normalization[k[1]] += 1
949 for n
in structure_pointers:
950 distances_to_structure[n] = \
951 distances_to_structure[n] \
952 / distances_to_structure_normalization[n]
954 min_distance = min([distances_to_structure[n]
955 for n
in distances_to_structure])
956 centroid_index = [k
for k, v
in
957 distances_to_structure.items()
958 if v == min_distance][0]
959 centroid_rmf_name = \
960 self.rmf_names_frames[
961 structure_set_name1][centroid_index]
963 centroid_distance = 0.0
965 for n
in range(number_of_structures):
966 dist = self._get_distance(
967 structure_set_name1, structure_set_name1,
968 selection_name, centroid_index, n)
969 centroid_distance += dist
970 distance_list.append(dist)
972 centroid_distance /= number_of_structures
973 if outfile
is not None:
974 of.write(str(selection_name) +
" " +
975 structure_set_name1 +
976 " average centroid distance " +
977 str(centroid_distance) +
"\n")
978 of.write(str(selection_name) +
" " +
979 structure_set_name1 +
" centroid index " +
980 str(centroid_index) +
"\n")
981 of.write(str(selection_name) +
" " +
982 structure_set_name1 +
" centroid rmf name " +
983 str(centroid_rmf_name) +
"\n")
984 of.write(str(selection_name) +
" " +
985 structure_set_name1 +
986 " median centroid distance " +
987 str(np.median(distance_list)) +
"\n")
989 average_pairwise_distances = sum(
990 distances.values())/len(list(distances.values()))
991 if outfile
is not None:
992 of.write(str(selection_name) +
" " + structure_set_name1 +
993 " " + structure_set_name2 +
994 " average pairwise distance " +
995 str(average_pairwise_distances) +
"\n")
996 if outfile
is not None:
998 return centroid_index
1000 def get_rmsf(self, structure_set_name, outdir="./", skip=1,
1001 set_plot_yaxis_range=
None):
1002 """ Calculate the residue mean square fluctuations (RMSF).
1003 Automatically outputs as data file and pdf
1004 @param structure_set_name Which structure set to calculate RMSF for
1005 @param outdir Where to write the files
1006 @param skip Skip this number of structures
1007 @param set_plot_yaxis_range In case you need to change the plot
1015 for sel_name
in self.protein_names:
1016 self.selection_dictionary.update({sel_name: [sel_name]})
1018 number_of_structures = \
1019 len(self.structures_dictionary[
1020 structure_set_name][sel_name])
1024 rpim = self.residue_particle_index_map[sel_name]
1025 outfile = outdir+
"/rmsf."+sel_name+
".dat"
1026 of = open(outfile,
"w")
1027 residue_distances = {}
1029 for index
in range(number_of_structures):
1030 distances = self._get_particle_distances(
1031 structure_set_name, structure_set_name, sel_name,
1032 centroid_index, index)
1033 for nblock, block
in enumerate(rpim):
1034 for residue_number
in block:
1035 residue_nblock[residue_number] = nblock
1036 if residue_number
not in residue_distances:
1037 residue_distances[residue_number] = \
1041 residue_number].append(distances[nblock])
1045 for rn
in residue_distances:
1047 rmsf = np.std(residue_distances[rn])
1049 of.write(str(rn) +
" " + str(residue_nblock[rn])
1050 +
" " + str(rmsf) +
"\n")
1052 IMP.pmi.output.plot_xy_data(
1053 residues, rmsfs, title=sel_name,
1054 out_fn=outdir+
"/rmsf."+sel_name, display=
False,
1055 set_plot_yaxis_range=set_plot_yaxis_range,
1056 xlabel=
'Residue Number', ylabel=
'Standard error')
1060 """Read in a structure used for reference computation.
1061 Needed before calling get_average_distance_wrt_reference_structure()
1062 @param rmf_name The RMF file to read the reference
1063 @param rmf_frame_index The index in that file
1065 particles_resolution_one = self._get_structure(rmf_frame_index,
1067 self.reference_rmf_names_frames = (rmf_name, rmf_frame_index)
1069 for selection_name
in self.selection_dictionary:
1070 selection_tuple = self.selection_dictionary[selection_name]
1071 coords = self._select_coordinates(
1072 selection_tuple, particles_resolution_one, self.prots[0])
1073 self.reference_structures_dictionary[selection_name] = coords
1076 self, structure_set_name, alignment_selection_key):
1077 """First align then calculate RMSD
1078 @param structure_set_name: the name of the structure set
1079 @param alignment_selection: the key containing the selection tuples
1080 needed to make the alignment stored in self.selection_dictionary
1081 @return: for each structure in the structure set, returns the rmsd
1084 if self.reference_structures_dictionary == {}:
1085 print(
"Cannot compute until you set a reference structure")
1088 align_reference_coordinates = \
1089 self.reference_structures_dictionary[alignment_selection_key]
1090 align_coordinates = self.structures_dictionary[
1091 structure_set_name][alignment_selection_key]
1092 transformations = []
1093 for c
in align_coordinates:
1095 {
"All": align_reference_coordinates}, {
"All": c})
1096 transformation = Ali.align()[1]
1097 transformations.append(transformation)
1098 for selection_name
in self.selection_dictionary:
1099 reference_coordinates = \
1100 self.reference_structures_dictionary[selection_name]
1102 for c
in reference_coordinates]
1104 for n, sc
in enumerate(self.structures_dictionary[
1105 structure_set_name][selection_name]):
1110 distances.append(distance)
1111 ret[selection_name] = {
1112 'all_distances': distances,
1113 'average_distance': sum(distances)/len(distances),
1114 'minimum_distance': min(distances)}
1117 def _get_median(self, list_of_values):
1118 return np.median(np.array(list_of_values))
1121 """Compare the structure set to the reference structure.
1122 @param structure_set_name The structure set to compute this on
1123 @note First call set_reference_structure()
1126 if self.reference_structures_dictionary == {}:
1127 print(
"Cannot compute until you set a reference structure")
1129 for selection_name
in self.selection_dictionary:
1130 reference_coordinates = self.reference_structures_dictionary[
1133 for c
in reference_coordinates]
1135 for sc
in self.structures_dictionary[
1136 structure_set_name][selection_name]:
1138 if self.style ==
'pairwise_drmsd_k':
1140 if self.style ==
'pairwise_drms_k':
1142 if self.style ==
'pairwise_drmsd_Q':
1144 coordinates1, coordinates2, self.threshold)
1145 if self.style ==
'pairwise_rmsd':
1147 distances.append(distance)
1149 print(selection_name,
"average distance",
1150 sum(distances)/len(distances),
"minimum distance",
1151 min(distances),
'nframes', len(distances))
1152 ret[selection_name] = {
1153 'average_distance': sum(distances)/len(distances),
1154 'minimum_distance': min(distances)}
1157 def get_coordinates(self):
1160 def set_precision_style(self, style):
1161 if style
in self.styles:
1164 raise ValueError(
"No such style")
1168 """Compute mean density maps from structures.
1170 Keeps a dictionary of density maps,
1171 keys are in the custom ranges. When you call add_subunits_density, it adds
1172 particle coordinates to the existing density maps.
1175 def __init__(self, custom_ranges, representation=None, resolution=20.0,
1178 @param custom_ranges Required. It's a dictionary, keys are the
1179 density component names, values are selection tuples
1180 e.g. {'kin28':[['kin28',1,-1]],
1181 'density_name_1' :[('ccl1')],
1182 'density_name_2' :[(1,142,'tfb3d1'),
1183 (143,700,'tfb3d2')],
1184 @param representation PMI representation, for doing selections.
1185 Not needed if you only pass hierarchies
1186 @param resolution The MRC resolution of the output map (in
1188 @param voxel The voxel size for the output map (lower is slower)
1191 self.representation = representation
1192 self.MRCresolution = resolution
1195 self.count_models = 0.0
1196 self.custom_ranges = custom_ranges
1199 """Add a frame to the densities.
1200 @param hierarchy Optionally read the hierarchy from somewhere.
1201 If not passed, will just read the representation.
1203 self.count_models += 1.0
1207 all_particles_by_resolution = []
1208 for name
in part_dict:
1209 all_particles_by_resolution += part_dict[name]
1211 for density_name
in self.custom_ranges:
1214 all_particles_by_segments = []
1216 for seg
in self.custom_ranges[density_name]:
1220 parts += IMP.tools.select_by_tuple(
1221 self.representation, seg, resolution=1,
1222 name_is_ambiguous=
False)
1226 for h
in hierarchy.get_children():
1231 if isinstance(seg, str):
1233 elif isinstance(seg, tuple)
and len(seg) == 2:
1235 hierarchy, molecule=seg[0], copy_index=seg[1])
1236 elif isinstance(seg, tuple)
and len(seg) == 3:
1238 hierarchy, molecule=seg[2],
1239 residue_indexes=range(seg[0], seg[1] + 1))
1240 elif isinstance(seg, tuple)
and len(seg) == 4:
1242 hierarchy, molecule=seg[2],
1243 residue_indexes=range(seg[0], seg[1] + 1),
1246 raise Exception(
'could not understand selection tuple '
1249 all_particles_by_segments += s.get_selected_particles()
1252 parts = all_particles_by_segments
1254 parts = list(set(all_particles_by_segments)
1255 & set(all_particles_by_resolution))
1256 self._create_density_from_particles(parts, density_name)
1258 def normalize_density(self):
1261 def _create_density_from_particles(self, ps, name,
1262 kernel_type=
'GAUSSIAN'):
1263 '''Internal function for adding to densities.
1264 pass XYZR particles with mass and create a density from them.
1265 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1268 dmap.set_was_used(
True)
1269 if name
not in self.densities:
1270 self.densities[name] = dmap
1276 dmap3.set_was_used(
True)
1278 dmap3.add(self.densities[name])
1279 self.densities[name] = dmap3
1281 def get_density_keys(self):
1282 return list(self.densities.keys())
1285 """Get the current density for some component name"""
1286 if name
not in self.densities:
1289 return self.densities[name]
1291 def write_mrc(self, path="./", suffix=None):
1294 for density_name
in self.densities:
1295 self.densities[density_name].
multiply(1. / self.count_models)
1297 name = path +
"/" + density_name +
".mrc"
1299 name = path +
"/" + density_name +
"." + suffix +
".mrc"
1300 path, file = os.path.split(name)
1301 if not os.path.exists(path):
1304 except OSError
as e:
1305 if e.errno != errno.EEXIST:
1307 IMP.em.write_map(self.densities[density_name], name,
1311 class GetContactMap(object):
1314 self.distance = distance
1315 self.contactmap =
''
1322 def set_prot(self, prot):
1328 for name
in particles_dictionary:
1329 residue_indexes = []
1330 for p
in particles_dictionary[name]:
1334 if len(residue_indexes) != 0:
1335 self.protnames.append(name)
1337 def get_subunit_coords(self, frame, align=0):
1338 from scipy.spatial.distance
import cdist
1342 for part
in self.prot.get_children():
1345 for chl
in part.get_children():
1349 SortedSegments.append((chl, startres))
1350 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1352 for sgmnt
in SortedSegments:
1355 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1358 radii.append(p.get_radius())
1360 new_name = part.get_name() +
'_' + sgmnt[0].get_name() + \
1363 .get_residue_indexes()[0])
1364 namelist.append(new_name)
1365 self.expanded[new_name] = len(
1367 if part.get_name()
not in self.resmap:
1368 self.resmap[part.get_name()] = {}
1370 self.resmap[part.get_name()][res] = new_name
1372 coords = np.array(coords)
1373 radii = np.array(radii)
1374 if len(self.namelist) == 0:
1375 self.namelist = namelist
1376 self.contactmap = np.zeros((len(coords), len(coords)))
1377 distances = cdist(coords, coords)
1378 distances = (distances - radii).T - radii
1379 distances = distances <= self.distance
1380 self.contactmap += distances
1382 def add_xlinks(self, filname,
1383 identification_string=
'ISDCrossLinkMS_Distance_'):
1385 with open(filname)
as data:
1386 D = data.readlines()
1389 if identification_string
in d:
1390 d = d.replace(
"_",
" ").replace(
"-",
" ").replace(
1393 t1, t2 = (d[0], d[1]), (d[1], d[0])
1394 if t1
not in self.XL:
1395 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1396 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1398 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1399 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1401 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1405 proteins = self.protnames
1410 proteins = [p.get_name()
for p
in self.prot.get_children()]
1412 for p1
in range(len(proteins)):
1413 for p2
in range(p1, len(proteins)):
1414 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1415 max(self.resmap[proteins[p2]].keys()))
1416 pn1, pn2 = proteins[p1], proteins[p2]
1417 mtr = np.zeros((pl1 + 1, pl2 + 1))
1418 print(
'Creating matrix for: ',
1419 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1420 for i1
in range(1, pl1 + 1):
1421 for i2
in range(1, pl2 + 1):
1423 r1 = K.index(self.resmap[pn1][i1])
1424 r2 = K.index(self.resmap[pn2][i2])
1426 mtr[i1 - 1, i2 - 1] = r
1428 missing.append((pn1, pn2, i1, i2))
1429 Matrices[(pn1, pn2)] = mtr
1434 raise ValueError(
"cross-links were not provided, use "
1435 "add_xlinks function!")
1437 for p1
in range(len(proteins)):
1438 for p2
in range(p1, len(proteins)):
1439 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1440 max(self.resmap[proteins[p2]].keys()))
1441 pn1, pn2 = proteins[p1], proteins[p2]
1442 mtr = np.zeros((pl1 + 1, pl2 + 1))
1445 xls = self.XL[(pn1, pn2)]
1448 xls = self.XL[(pn2, pn1)]
1453 print(
'Creating matrix for: ',
1454 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1455 for xl1, xl2
in xls:
1457 print(
'X' * 10, xl1, xl2)
1460 print(
'X' * 10, xl1, xl2)
1462 mtr[xl1 - 1, xl2 - 1] = 100
1464 print(
'Creating matrix for: ',
1465 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1466 for xl1, xl2
in xls:
1468 print(
'X' * 10, xl1, xl2)
1471 print(
'X' * 10, xl1, xl2)
1473 mtr[xl2 - 1, xl1 - 1] = 100
1475 print(
'No cross-links between: ', pn1, pn2)
1476 Matrices_xl[(pn1, pn2)] = mtr
1483 for i, c
in enumerate(C):
1484 cl = max(self.resmap[c].keys())
1489 R.append(R[-1] + cl)
1494 import matplotlib
as mpl
1496 import matplotlib.pyplot
as plt
1497 import matplotlib.gridspec
as gridspec
1498 import scipy.sparse
as sparse
1501 gs = gridspec.GridSpec(len(W), len(W),
1506 for x1, r1
in enumerate(R):
1507 for x2, r2
in enumerate(R):
1508 ax = plt.subplot(gs[cnt])
1511 mtr = Matrices[(C[x1], C[x2])]
1513 mtr = Matrices[(C[x2], C[x1])].T
1516 interpolation=
'nearest',
1518 vmax=log(mtr.max()))
1523 mtr = Matrices_xl[(C[x1], C[x2])]
1525 mtr = Matrices_xl[(C[x2], C[x1])].T
1527 sparse.csr_matrix(mtr),
1537 ax.set_ylabel(C[x1], rotation=90)
1539 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1547 def get_hiers_from_rmf(model, frame_number, rmf_file):
1549 print(
"getting coordinates for frame %i rmf file %s"
1550 % (frame_number, rmf_file))
1553 rh = RMF.open_rmf_file_read_only(rmf_file)
1558 print(
"Unable to open rmf file %s" % (rmf_file))
1564 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1572 def link_hiers_to_rmf(model, hiers, frame_number, rmf_file):
1573 print(
"linking hierarchies for frame %i rmf file %s"
1574 % (frame_number, rmf_file))
1575 rh = RMF.open_rmf_file_read_only(rmf_file)
1580 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1587 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1589 print(
"getting coordinates for frame %i rmf file %s"
1590 % (frame_number, rmf_file))
1593 rh = RMF.open_rmf_file_read_only(rmf_file)
1599 print(
"Unable to open rmf file %s" % (rmf_file))
1604 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1612 def link_hiers_and_restraints_to_rmf(model, hiers, rs, frame_number, rmf_file):
1613 print(
"linking hierarchies for frame %i rmf file %s"
1614 % (frame_number, rmf_file))
1615 rh = RMF.open_rmf_file_read_only(rmf_file)
1621 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1629 """Get particles at res 1, or any beads, based on the name.
1630 No Representation is needed. This is mainly used when the hierarchy
1631 is read from an RMF file.
1633 @return a dictionary of component names and their particles
1634 @note If the root node is named "System" or is a "State", do
1641 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1643 particle_dict[mol.get_name()] = sel.get_selected_particles()
1646 for c
in prot.get_children():
1649 for s
in c.get_children():
1650 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1652 if "Beads" in s.get_name():
1655 for name
in particle_dict:
1656 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1657 list(set(particle_dict[name]) & set(allparticles)))
1658 return particle_dict
1662 """Get particles at res 10, or any beads, based on the name.
1663 No Representation is needed.
1664 This is mainly used when the hierarchy is read from an RMF file.
1666 @return a dictionary of component names and their particles
1667 @note If the root node is named "System" or is a "State", do proper
1673 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1675 particle_dict[mol.get_name()] = sel.get_selected_particles()
1678 for c
in prot.get_children():
1681 for s
in c.get_children():
1682 if "_Res:10" in s.get_name():
1684 if "Beads" in s.get_name():
1686 for name
in particle_dict:
1687 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1688 list(set(particle_dict[name]) & set(allparticles)))
1689 return particle_dict
1693 """Visualization of cross-links"""
1695 self.crosslinks = []
1696 self.external_csv_data =
None
1697 self.crosslinkedprots = set()
1698 self.mindist = +10000000.0
1699 self.maxdist = -10000000.0
1700 self.contactmap =
None
1702 def set_hierarchy(self, prot):
1703 self.prot_length_dict = {}
1704 self.model = prot.get_model()
1706 for i
in prot.get_children():
1708 residue_indexes = []
1712 if len(residue_indexes) != 0:
1713 self.prot_length_dict[name] = max(residue_indexes)
1715 def set_coordinates_for_contact_map(self, rmf_name, rmf_frame_index):
1716 from scipy.spatial.distance
import cdist
1718 rh = RMF.open_rmf_file_read_only(rmf_name)
1721 print(
"getting coordinates for frame %i rmf file %s"
1722 % (rmf_frame_index, rmf_name))
1731 self.index_dictionary = {}
1733 for name
in particles_dictionary:
1734 residue_indexes = []
1735 for p
in particles_dictionary[name]:
1739 if len(residue_indexes) != 0:
1741 for res
in range(min(residue_indexes),
1742 max(residue_indexes) + 1):
1745 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1747 radii.append(d.get_radius())
1748 if name
not in self.index_dictionary:
1749 self.index_dictionary[name] = [resindex]
1751 self.index_dictionary[name].append(resindex)
1754 coords = np.array(coords)
1755 radii = np.array(radii)
1757 distances = cdist(coords, coords)
1758 distances = (distances - radii).T - radii
1760 distances = np.where(distances <= 20.0, 1.0, 0)
1761 if self.contactmap
is None:
1762 self.contactmap = np.zeros((len(coords), len(coords)))
1763 self.contactmap += distances
1766 IMP.atom.destroy(prot)
1769 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1770 mapping=
None, filter_label=
None,
1772 filter_rmf_file_names=
None, external_csv_data_file=
None,
1773 external_csv_data_file_unique_id_key=
"Unique ID"):
1784 keys = po.get_keys()
1786 xl_keys = [k
for k
in keys
if search_label
in k]
1788 if filter_rmf_file_names
is not None:
1789 rmf_file_key =
"local_rmf_file_name"
1790 fs = po.get_fields(xl_keys+[rmf_file_key])
1792 fs = po.get_fields(xl_keys)
1795 self.cross_link_frequency = {}
1799 self.cross_link_distances = {}
1803 self.cross_link_distances_unique = {}
1805 if external_csv_data_file
is not None:
1808 self.external_csv_data = {}
1809 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1812 self.external_csv_data[
1813 xl[external_csv_data_file_unique_id_key]] = xl
1818 cross_link_frequency_list = []
1820 self.unique_cross_link_list = []
1824 keysplit = key.replace(
"_",
" ").replace(
"-",
" ").replace(
1827 if filter_label
is not None:
1828 if filter_label
not in keysplit:
1832 r1 = int(keysplit[5])
1834 r2 = int(keysplit[7])
1837 confidence = keysplit[12]
1841 unique_identifier = keysplit[3]
1843 unique_identifier =
'0'
1845 r1 = int(keysplit[mapping[
"Residue1"]])
1846 c1 = keysplit[mapping[
"Protein1"]]
1847 r2 = int(keysplit[mapping[
"Residue2"]])
1848 c2 = keysplit[mapping[
"Protein2"]]
1850 confidence = keysplit[mapping[
"Confidence"]]
1854 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1856 unique_identifier =
'0'
1858 self.crosslinkedprots.add(c1)
1859 self.crosslinkedprots.add(c2)
1865 if filter_rmf_file_names
is not None:
1866 for n, d
in enumerate(fs[key]):
1867 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1868 dists.append(float(d))
1870 dists = [float(f)
for f
in fs[key]]
1875 mdist = self.median(dists)
1877 stdv = np.std(np.array(dists))
1878 if self.mindist > mdist:
1879 self.mindist = mdist
1880 if self.maxdist < mdist:
1881 self.maxdist = mdist
1885 if (r1, c1, r2, c2, mdist)
not in cross_link_frequency_list:
1886 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1887 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1888 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1890 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1891 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1892 cross_link_frequency_list.append((r1, c1, r2, c2))
1893 cross_link_frequency_list.append((r2, c2, r1, c1))
1894 self.unique_cross_link_list.append(
1895 (r1, c1, r2, c2, mdist))
1897 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1898 self.cross_link_distances[(
1899 r1, c1, r2, c2, mdist, confidence)] = dists
1900 self.cross_link_distances[(
1901 r2, c2, r1, c1, mdist, confidence)] = dists
1902 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1904 self.cross_link_distances[(
1905 r2, c2, r1, c1, mdist, confidence)] += dists
1906 self.cross_link_distances[(
1907 r1, c1, r2, c2, mdist, confidence)] += dists
1909 self.crosslinks.append(
1910 (r1, c1, r2, c2, mdist, stdv, confidence, unique_identifier,
1912 self.crosslinks.append(
1913 (r2, c2, r1, c1, mdist, stdv, confidence, unique_identifier,
1916 self.cross_link_frequency_inverted = {}
1917 for xl
in self.unique_cross_link_list:
1918 (r1, c1, r2, c2, mdist) = xl
1919 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1920 if frequency
not in self.cross_link_frequency_inverted:
1921 self.cross_link_frequency_inverted[
1922 frequency] = [(r1, c1, r2, c2)]
1924 self.cross_link_frequency_inverted[
1925 frequency].append((r1, c1, r2, c2))
1929 def median(self, mylist):
1930 sorts = sorted(mylist)
1936 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1937 return sorts[length / 2]
1939 def set_threshold(self, threshold):
1940 self.threshold = threshold
1942 def set_tolerance(self, tolerance):
1943 self.tolerance = tolerance
1945 def colormap(self, dist):
1946 if dist < self.threshold - self.tolerance:
1948 elif dist >= self.threshold + self.tolerance:
1953 def write_cross_link_database(self, filename, format='csv'):
1957 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1958 "Median Distance",
"Standard Deviation",
"Confidence",
1959 "Frequency",
"Arrangement"]
1961 if self.external_csv_data
is not None:
1962 keys = list(self.external_csv_data.keys())
1963 innerkeys = list(self.external_csv_data[keys[0]].keys())
1965 fieldnames += innerkeys
1967 dw = csv.DictWriter(
1968 open(filename,
"w"),
1970 fieldnames=fieldnames)
1972 for xl
in self.crosslinks:
1973 (r1, c1, r2, c2, mdist, stdv, confidence,
1974 unique_identifier, descriptor) = xl
1975 if descriptor ==
'original':
1977 outdict[
"Unique ID"] = unique_identifier
1978 outdict[
"Protein1"] = c1
1979 outdict[
"Protein2"] = c2
1980 outdict[
"Residue1"] = r1
1981 outdict[
"Residue2"] = r2
1982 outdict[
"Median Distance"] = mdist
1983 outdict[
"Standard Deviation"] = stdv
1984 outdict[
"Confidence"] = confidence
1985 outdict[
"Frequency"] = self.cross_link_frequency[
1988 arrangement =
"Intra"
1990 arrangement =
"Inter"
1991 outdict[
"Arrangement"] = arrangement
1992 if self.external_csv_data
is not None:
1993 outdict.update(self.external_csv_data[unique_identifier])
1995 dw.writerow(outdict)
1997 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1998 no_confidence_info=
False, filter=
None, layout=
"whole",
1999 crosslinkedonly=
False, filename=
None, confidence_classes=
None,
2000 alphablend=0.1, scale_symbol_size=1.0, gap_between_components=0,
2001 rename_protein_map=
None):
2016 import matplotlib
as mpl
2018 import matplotlib.pyplot
as plt
2019 import matplotlib.cm
as cm
2021 fig = plt.figure(figsize=(10, 10))
2022 ax = fig.add_subplot(111)
2028 if prot_listx
is None:
2030 prot_listx = list(self.crosslinkedprots)
2032 prot_listx = list(self.prot_length_dict.keys())
2035 nresx = gap_between_components + \
2036 sum([self.prot_length_dict[name]
2037 + gap_between_components
for name
in prot_listx])
2041 if prot_listy
is None:
2043 prot_listy = list(self.crosslinkedprots)
2045 prot_listy = list(self.prot_length_dict.keys())
2048 nresy = gap_between_components + \
2049 sum([self.prot_length_dict[name]
2050 + gap_between_components
for name
in prot_listy])
2055 res = gap_between_components
2056 for prot
in prot_listx:
2057 resoffsetx[prot] = res
2058 res += self.prot_length_dict[prot]
2060 res += gap_between_components
2064 res = gap_between_components
2065 for prot
in prot_listy:
2066 resoffsety[prot] = res
2067 res += self.prot_length_dict[prot]
2069 res += gap_between_components
2071 resoffsetdiagonal = {}
2072 res = gap_between_components
2073 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2074 resoffsetdiagonal[prot] = res
2075 res += self.prot_length_dict[prot]
2076 res += gap_between_components
2082 for n, prot
in enumerate(prot_listx):
2083 res = resoffsetx[prot]
2085 for proty
in prot_listy:
2086 resy = resoffsety[proty]
2087 endy = resendy[proty]
2088 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2089 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2090 xticks.append((float(res) + float(end)) / 2)
2091 if rename_protein_map
is not None:
2092 if prot
in rename_protein_map:
2093 prot = rename_protein_map[prot]
2094 xlabels.append(prot)
2098 for n, prot
in enumerate(prot_listy):
2099 res = resoffsety[prot]
2101 for protx
in prot_listx:
2102 resx = resoffsetx[protx]
2103 endx = resendx[protx]
2104 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2105 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2106 yticks.append((float(res) + float(end)) / 2)
2107 if rename_protein_map
is not None:
2108 if prot
in rename_protein_map:
2109 prot = rename_protein_map[prot]
2110 ylabels.append(prot)
2113 print(prot_listx, prot_listy)
2115 if self.contactmap
is not None:
2116 tmp_array = np.zeros((nresx, nresy))
2118 for px
in prot_listx:
2120 for py
in prot_listy:
2122 resx = resoffsety[px]
2123 lengx = resendx[px] - 1
2124 resy = resoffsety[py]
2125 lengy = resendy[py] - 1
2126 indexes_x = self.index_dictionary[px]
2127 minx = min(indexes_x)
2128 maxx = max(indexes_x)
2129 indexes_y = self.index_dictionary[py]
2130 miny = min(indexes_y)
2131 maxy = max(indexes_y)
2133 print(px, py, minx, maxx, miny, maxy)
2138 resy:lengy] = self.contactmap[
2144 ax.imshow(tmp_array,
2147 interpolation=
'nearest')
2149 ax.set_xticks(xticks)
2150 ax.set_xticklabels(xlabels, rotation=90)
2151 ax.set_yticks(yticks)
2152 ax.set_yticklabels(ylabels)
2153 ax.set_xlim(0, nresx)
2154 ax.set_ylim(0, nresy)
2158 already_added_xls = []
2160 for xl
in self.crosslinks:
2162 (r1, c1, r2, c2, mdist, stdv, confidence,
2163 unique_identifier, descriptor) = xl
2165 if confidence_classes
is not None:
2166 if confidence
not in confidence_classes:
2170 pos1 = r1 + resoffsetx[c1]
2174 pos2 = r2 + resoffsety[c2]
2178 if filter
is not None:
2179 xldb = self.external_csv_data[unique_identifier]
2180 xldb.update({
"Distance": mdist})
2181 xldb.update({
"Distance_stdv": stdv})
2183 if filter[1] ==
">":
2184 if float(xldb[filter[0]]) <= float(filter[2]):
2187 if filter[1] ==
"<":
2188 if float(xldb[filter[0]]) >= float(filter[2]):
2191 if filter[1] ==
"==":
2192 if float(xldb[filter[0]]) != float(filter[2]):
2198 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2199 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2201 if layout ==
'lowerdiagonal':
2202 if pos_for_diagonal1 <= pos_for_diagonal2:
2204 if layout ==
'upperdiagonal':
2205 if pos_for_diagonal1 >= pos_for_diagonal2:
2208 already_added_xls.append((r1, c1, r2, c2))
2210 if not no_confidence_info:
2211 if confidence ==
'0.01':
2212 markersize = 14 * scale_symbol_size
2213 elif confidence ==
'0.05':
2214 markersize = 9 * scale_symbol_size
2215 elif confidence ==
'0.1':
2216 markersize = 6 * scale_symbol_size
2218 markersize = 15 * scale_symbol_size
2220 markersize = 5 * scale_symbol_size
2222 if not no_dist_info:
2223 color = self.colormap(mdist)
2233 markersize=markersize)
2235 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2237 for i
in ax.spines.values():
2238 i.set_linewidth(2.0)
2241 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2245 def get_frequency_statistics(self, prot_list,
2248 violated_histogram = {}
2249 satisfied_histogram = {}
2250 unique_cross_links = []
2252 for xl
in self.unique_cross_link_list:
2253 (r1, c1, r2, c2, mdist) = xl
2256 if prot_list2
is None:
2257 if c1
not in prot_list:
2259 if c2
not in prot_list:
2262 if c1
in prot_list
and c2
in prot_list2:
2264 elif c1
in prot_list2
and c2
in prot_list:
2269 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2271 if (r1, c1, r2, c2)
not in unique_cross_links:
2273 if frequency
not in violated_histogram:
2274 violated_histogram[frequency] = 1
2276 violated_histogram[frequency] += 1
2278 if frequency
not in satisfied_histogram:
2279 satisfied_histogram[frequency] = 1
2281 satisfied_histogram[frequency] += 1
2282 unique_cross_links.append((r1, c1, r2, c2))
2283 unique_cross_links.append((r2, c2, r1, c1))
2285 print(
"# satisfied")
2287 total_number_of_crosslinks = 0
2289 for i
in satisfied_histogram:
2293 if i
in violated_histogram:
2294 print(i, violated_histogram[i]+satisfied_histogram[i])
2296 print(i, satisfied_histogram[i])
2297 total_number_of_crosslinks += i*satisfied_histogram[i]
2301 for i
in violated_histogram:
2302 print(i, violated_histogram[i])
2303 total_number_of_crosslinks += i*violated_histogram[i]
2305 print(total_number_of_crosslinks)
2307 def print_cross_link_binary_symbols(self, prot_list,
2310 confidence_list = []
2311 for xl
in self.crosslinks:
2312 (r1, c1, r2, c2, mdist, stdv, confidence,
2313 unique_identifier, descriptor) = xl
2315 if prot_list2
is None:
2316 if c1
not in prot_list:
2318 if c2
not in prot_list:
2321 if c1
in prot_list
and c2
in prot_list2:
2323 elif c1
in prot_list2
and c2
in prot_list:
2328 if descriptor !=
"original":
2331 confidence_list.append(confidence)
2333 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2334 tmp_dist_binary = []
2337 tmp_dist_binary.append(1)
2339 tmp_dist_binary.append(0)
2340 tmp_matrix.append(tmp_dist_binary)
2342 matrix = list(zip(*tmp_matrix))
2344 satisfied_high_sum = 0
2345 satisfied_mid_sum = 0
2346 satisfied_low_sum = 0
2347 total_satisfied_sum = 0
2348 for k, m
in enumerate(matrix):
2357 for n, b
in enumerate(m):
2358 if confidence_list[n] ==
"0.01":
2362 satisfied_high_sum += 1
2363 elif confidence_list[n] ==
"0.05":
2367 satisfied_mid_sum += 1
2368 elif confidence_list[n] ==
"0.1":
2372 satisfied_low_sum += 1
2374 total_satisfied += 1
2375 total_satisfied_sum += 1
2377 print(k, satisfied_high, total_high)
2378 print(k, satisfied_mid, total_mid)
2379 print(k, satisfied_low, total_low)
2380 print(k, total_satisfied, total)
2381 print(float(satisfied_high_sum) / len(matrix))
2382 print(float(satisfied_mid_sum) / len(matrix))
2383 print(float(satisfied_low_sum) / len(matrix))
2386 def get_unique_crosslinks_statistics(self, prot_list,
2399 satisfied_string = []
2400 for xl
in self.crosslinks:
2401 (r1, c1, r2, c2, mdist, stdv, confidence,
2402 unique_identifier, descriptor) = xl
2404 if prot_list2
is None:
2405 if c1
not in prot_list:
2407 if c2
not in prot_list:
2410 if c1
in prot_list
and c2
in prot_list2:
2412 elif c1
in prot_list2
and c2
in prot_list:
2417 if descriptor !=
"original":
2421 if confidence ==
"0.01":
2425 if confidence ==
"0.05":
2429 if confidence ==
"0.1":
2434 satisfied_string.append(1)
2436 satisfied_string.append(0)
2438 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2439 tmp_dist_binary = []
2442 tmp_dist_binary.append(1)
2444 tmp_dist_binary.append(0)
2445 tmp_matrix.append(tmp_dist_binary)
2447 print(
"unique satisfied_high/total_high", satisfied_high,
"/",
2449 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2450 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2451 print(
"total", total)
2453 matrix = list(zip(*tmp_matrix))
2454 satisfied_models = 0
2456 for b
in satisfied_string:
2463 all_satisfied =
True
2465 for n, b
in enumerate(m):
2470 if b == 1
and satisfied_string[n] == 1:
2472 elif b == 1
and satisfied_string[n] == 0:
2474 elif b == 0
and satisfied_string[n] == 0:
2476 elif b == 0
and satisfied_string[n] == 1:
2477 all_satisfied =
False
2479 satisfied_models += 1
2481 print(satstr, all_satisfied)
2482 print(
"models that satisfies the median satisfied cross-links/total "
2483 "models", satisfied_models, len(matrix))
2485 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2488 import matplotlib
as mpl
2490 import matplotlib.pylab
as pl
2493 for kw
in self.cross_link_distances_unique:
2494 (r1, c1, r2, c2) = kw
2495 dists = self.cross_link_distances_unique[kw]
2497 if prot_list2
is None:
2498 if c1
not in prot_list:
2500 if c2
not in prot_list:
2503 if c1
in prot_list
and c2
in prot_list2:
2505 elif c1
in prot_list2
and c2
in prot_list:
2510 dists.append(sum(dists))
2511 tmp_matrix.append(dists)
2513 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2516 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2518 for i
in range(len(tmp_matrix)):
2519 for k
in range(len(tmp_matrix[i]) - 1):
2520 matrix[i][k] = tmp_matrix[i][k]
2525 ax = fig.add_subplot(211)
2527 cax = ax.imshow(matrix, interpolation=
'nearest')
2529 pl.savefig(figurename, dpi=300)
2538 arrangement=
"inter",
2539 confidence_input=
"None"):
2542 for xl
in self.cross_link_distances:
2543 (r1, c1, r2, c2, mdist, confidence) = xl
2544 if c1
in prots1
and c2
in prots2:
2545 if arrangement ==
"inter" and c1 == c2:
2547 if arrangement ==
"intra" and c1 != c2:
2549 if confidence_input == confidence:
2550 label = str(c1) +
":" + str(r1) + \
2551 "-" + str(c2) +
":" + str(r2)
2552 values = self.cross_link_distances[xl]
2553 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2554 data.append((label, values, mdist, frequency))
2556 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2557 sort_by_dist = list(zip(*sort_by_dist))
2558 values = sort_by_dist[1]
2559 positions = list(range(len(values)))
2560 labels = sort_by_dist[0]
2561 frequencies = list(map(float, sort_by_dist[3]))
2562 frequencies = [f * 10.0
for f
in frequencies]
2564 nchunks = int(float(len(values)) / nxl_per_row)
2565 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2566 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2569 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2572 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2574 for n, v
in enumerate(values_chunks):
2575 p = positions_chunks[n]
2576 f = frequencies_chunks[n]
2578 filename +
"." + str(n), v, p, f,
2579 valuename=
"Distance (Ang)",
2580 positionname=
"Unique " + arrangement +
" Crosslinks",
2581 xlabels=labels_chunks[n])
2583 def crosslink_distance_histogram(self, filename,
2586 confidence_classes=
None,
2592 if prot_list
is None:
2593 prot_list = list(self.prot_length_dict.keys())
2596 for xl
in self.crosslinks:
2597 (r1, c1, r2, c2, mdist, stdv, confidence,
2598 unique_identifier, descriptor) = xl
2600 if confidence_classes
is not None:
2601 if confidence
not in confidence_classes:
2604 if prot_list2
is None:
2605 if c1
not in prot_list:
2607 if c2
not in prot_list:
2610 if c1
in prot_list
and c2
in prot_list2:
2612 elif c1
in prot_list2
and c2
in prot_list:
2617 distances.append(mdist)
2620 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2621 bins=bins, color=color,
2623 reference_xline=35.0,
2624 yplotrange=yplotrange, normalized=normalized)
2626 def scatter_plot_xl_features(self, filename,
2632 reference_ylines=
None,
2633 distance_color=
True,
2635 import matplotlib
as mpl
2637 import matplotlib.pyplot
as plt
2639 fig = plt.figure(figsize=(10, 10))
2640 ax = fig.add_subplot(111)
2642 for xl
in self.crosslinks:
2643 (r1, c1, r2, c2, mdist, stdv, confidence,
2644 unique_identifier, arrangement) = xl
2646 if prot_list2
is None:
2647 if c1
not in prot_list:
2649 if c2
not in prot_list:
2652 if c1
in prot_list
and c2
in prot_list2:
2654 elif c1
in prot_list2
and c2
in prot_list:
2659 xldb = self.external_csv_data[unique_identifier]
2660 xldb.update({
"Distance": mdist})
2661 xldb.update({
"Distance_stdv": stdv})
2663 xvalue = float(xldb[feature1])
2664 yvalue = float(xldb[feature2])
2667 color = self.colormap(mdist)
2671 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2673 if yplotrange
is not None:
2674 ax.set_ylim(yplotrange)
2675 if reference_ylines
is not None:
2676 for rl
in reference_ylines:
2677 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2680 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of cross-links.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
static Molecule setup_particle(Model *m, ParticleIndex pi)
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.