3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from math
import log, sqrt
23 """Performs alignment and RMSD calculation for two sets of coordinates
25 The class also takes into account non-equal stoichiometry of the proteins.
26 If this is the case, the protein names of proteins in multiple copies
27 should be specified in the following form:
28 nameA..1, nameA..2 (note two dots).
31 def __init__(self, template, query, weights=None):
33 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
34 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param weights optional weights for each set of coordinates
38 self.template = template
39 self.weights = weights
41 if len(self.query.keys()) != len(self.template.keys()):
42 raise ValueError(
'''the number of proteins
43 in template and query does not match!''')
49 self.proteins = sorted(self.query.keys())
50 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
58 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
59 prmts = list(itertools.permutations(copies, len(copies)))
62 self.Product = list(itertools.product(*P.values()))
70 torder = sum([list(i)
for i
in self.Product[0]], [])
73 if self.weights
is not None:
74 weights += [i
for i
in self.weights[t]]
76 self.rmsd = 10000000000.
77 for comb
in self.Product:
79 order = sum([list(i)
for i
in comb], [])
84 if self.weights
is not None:
86 template_xyz, query_xyz, weights)
94 from scipy.spatial.distance
import cdist
102 torder = sum([list(i)
for i
in self.Product[0]], [])
106 self.rmsd, Transformation = 10000000000.,
''
109 for comb
in self.Product:
110 order = sum([list(i)
for i
in comb], [])
115 if len(template_xyz) != len(query_xyz):
116 raise ValueError(
'''the number of coordinates
117 in template and query does not match!''')
121 query_xyz, template_xyz)
122 query_xyz_tr = [transformation.get_transformed(n)
126 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2))
130 Transformation = transformation
133 return (self.rmsd, Transformation)
138 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
139 'a..2':np.array([np.array([1.,1.,])]),
140 'a..3':np.array([np.array([-2.,1.])]),
141 'b':np.array([np.array([0.,-1.])]),
142 'c..1':np.array([np.array([-1.,-1.])]),
143 'c..2':np.array([np.array([1.,-1.])]),
144 'd':np.array([np.array([0.,0.])]),
145 'e':np.array([np.array([0.,1.])])}
147 Ali = Alignment(Proteins, Proteins)
149 if Ali.get_rmsd() == 0.0: print 'successful test!'
150 else: print 'ERROR!'; exit()
155 class Violations(object):
159 self.violation_thresholds = {}
160 self.violation_counts = {}
162 data = open(filename)
167 d = d.strip().split()
168 self.violation_thresholds[d[0]] = float(d[1])
170 def get_number_violated_restraints(self, rsts_dict):
172 for rst
in self.violation_thresholds:
173 if rst
not in rsts_dict:
175 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
177 if rst
not in self.violation_counts:
178 self.violation_counts[rst] = 1
180 self.violation_counts[rst] += 1
185 """A class to cluster structures.
186 Uses scipy's cdist function to compute distance matrices
187 and sklearn's kmeans clustering module.
191 @param rmsd_weights Flat list of weights for each particle
195 from mpi4py
import MPI
196 self.comm = MPI.COMM_WORLD
197 self.rank = self.comm.Get_rank()
198 self.number_of_processes = self.comm.size
200 self.number_of_processes = 1
203 self.structure_cluster_ids =
None
204 self.tmpl_coords =
None
205 self.rmsd_weights = rmsd_weights
207 def set_template(self, part_coords):
208 self.tmpl_coords = part_coords
211 """Add coordinates for a single model."""
212 self.all_coords[frame] = Coords
214 def dist_matrix(self):
215 self.model_list_names = list(self.all_coords.keys())
216 self.model_indexes = list(range(len(self.model_list_names)))
217 self.model_indexes_dict = dict(
218 list(zip(self.model_list_names, self.model_indexes)))
219 model_indexes_unique_pairs = \
220 list(itertools.combinations(self.model_indexes, 2))
222 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
223 model_indexes_unique_pairs,
224 self.number_of_processes)[self.rank]
226 print(
"process %s assigned with %s pairs"
227 % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
229 (raw_distance_dict, self.transformation_distance_dict) = \
230 self.matrix_calculation(self.all_coords, self.tmpl_coords,
231 my_model_indexes_unique_pairs)
233 if self.number_of_processes > 1:
236 pickable_transformations = \
237 self.get_pickable_transformation_distance_dict()
239 pickable_transformations)
240 self.set_transformation_distance_dict_from_pickable(
241 pickable_transformations)
243 self.raw_distance_matrix = np.zeros(
244 (len(self.model_list_names), len(self.model_list_names)))
245 for item
in raw_distance_dict:
247 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
248 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
250 def get_dist_matrix(self):
251 return self.raw_distance_matrix
254 """Run K-means clustering
255 @param number_of_clusters Num means
256 @param seed the random seed
258 from sklearn.cluster
import KMeans
263 kmeans = KMeans(n_clusters=number_of_clusters)
266 kmeans = KMeans(k=number_of_clusters)
267 kmeans.fit_predict(self.raw_distance_matrix)
269 self.structure_cluster_ids = kmeans.labels_
271 def get_pickable_transformation_distance_dict(self):
272 pickable_transformations = {}
273 for label
in self.transformation_distance_dict:
274 tr = self.transformation_distance_dict[label]
275 trans = tuple(tr.get_translation())
276 rot = tuple(tr.get_rotation().get_quaternion())
277 pickable_transformations[label] = (rot, trans)
278 return pickable_transformations
280 def set_transformation_distance_dict_from_pickable(
282 pickable_transformations):
283 self.transformation_distance_dict = {}
284 for label
in pickable_transformations:
285 tr = pickable_transformations[label]
288 self.transformation_distance_dict[
291 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
293 outf = open(file_name +
".data",
'wb')
299 pickable_transformations = \
300 self.get_pickable_transformation_distance_dict()
302 (self.structure_cluster_ids,
303 self.model_list_names,
304 pickable_transformations),
307 np.save(file_name +
".npy", self.raw_distance_matrix)
309 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
312 inputf = open(file_name +
".data",
'rb')
313 (self.structure_cluster_ids, self.model_list_names,
314 pickable_transformations) = pickle.load(inputf)
317 self.raw_distance_matrix = np.load(file_name +
".npy")
319 self.set_transformation_distance_dict_from_pickable(
320 pickable_transformations)
321 self.model_indexes = list(range(len(self.model_list_names)))
322 self.model_indexes_dict = dict(
323 list(zip(self.model_list_names, self.model_indexes)))
325 def plot_matrix(self, figurename="clustermatrix.pdf"):
326 import matplotlib
as mpl
328 import matplotlib.pylab
as pl
329 from scipy.cluster
import hierarchy
as hrc
331 fig = pl.figure(figsize=(10, 8))
332 ax = fig.add_subplot(212)
333 dendrogram = hrc.dendrogram(
334 hrc.linkage(self.raw_distance_matrix),
337 leaves_order = dendrogram[
'leaves']
338 ax.set_xlabel(
'Model')
339 ax.set_ylabel(
'RMSD [Angstroms]')
341 ax2 = fig.add_subplot(221)
343 self.raw_distance_matrix[leaves_order,
346 interpolation=
'nearest')
347 cb = fig.colorbar(cax)
348 cb.set_label(
'RMSD [Angstroms]')
349 ax2.set_xlabel(
'Model')
350 ax2.set_ylabel(
'Model')
352 pl.savefig(figurename, dpi=300)
355 def get_model_index_from_name(self, name):
356 return self.model_indexes_dict[name]
358 def get_cluster_labels(self):
360 return list(set(self.structure_cluster_ids))
362 def get_number_of_clusters(self):
363 return len(self.get_cluster_labels())
365 def get_cluster_label_indexes(self, label):
367 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
370 def get_cluster_label_names(self, label):
372 [self.model_list_names[i]
373 for i
in self.get_cluster_label_indexes(label)]
376 def get_cluster_label_average_rmsd(self, label):
378 indexes = self.get_cluster_label_indexes(label)
381 sub_distance_matrix = self.raw_distance_matrix[
382 indexes, :][:, indexes]
383 average_rmsd = np.sum(sub_distance_matrix) / \
384 (len(sub_distance_matrix)
385 ** 2 - len(sub_distance_matrix))
390 def get_cluster_label_size(self, label):
391 return len(self.get_cluster_label_indexes(label))
393 def get_transformation_to_first_member(
397 reference = self.get_cluster_label_indexes(cluster_label)[0]
398 return self.transformation_distance_dict[(reference, structure_index)]
400 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
402 model_list_names = list(all_coords.keys())
403 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
404 raw_distance_dict = {}
405 transformation_distance_dict = {}
406 if template_coords
is None:
410 alignment_template_protein_names = list(template_coords.keys())
412 for (f1, f2)
in list_of_pairs:
420 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
421 for pr
in rmsd_protein_names])
423 for pr
in rmsd_protein_names:
424 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
426 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
427 rmsd = Ali.get_rmsd()
433 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
434 for pr
in alignment_template_protein_names])
435 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
436 for pr
in alignment_template_protein_names])
439 template_rmsd, transformation = Ali.align()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in rmsd_protein_names])
447 for pr
in rmsd_protein_names:
448 coords_f2[pr] = [transformation.get_transformed(
449 i)
for i
in all_coords[model_list_names[f2]][pr]]
451 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
452 rmsd = Ali.get_rmsd()
454 raw_distance_dict[(f1, f2)] = rmsd
455 raw_distance_dict[(f2, f1)] = rmsd
456 transformation_distance_dict[(f1, f2)] = transformation
457 transformation_distance_dict[(f2, f1)] = transformation
459 return raw_distance_dict, transformation_distance_dict
463 """Compute the RMSD (without alignment) taking into account the copy
465 To be used with pmi2 hierarchies. Can be used for instance as follows:
467 rmsd = IMP.pmi.analysis.RMSD(
468 hier, hier, [mol.get_name() for mol in mols],
469 dynamic0=True, dynamic1=False)
470 output_objects.append(rmsd)
472 before shuffling the coordinates
475 def __init__(self, hier0, hier1, molnames, label="None", dynamic0=True,
476 dynamic1=
True, metric=IMP.algebra.get_rmsd):
478 @param hier0 first input hierarchy
479 @param hier1 second input hierarchy
480 @param molnames the names of the molecules used for the RMSD
481 @dynamic0 if True stores the decorators XYZ and coordinates of
482 hier0 can be update. If false coordinates are static
483 (stored in Vector3Ds) and will never be updated
484 @dynamic1 same as above
485 metric what metric should be used
487 self.moldict0, self.molcoords0, self.mol_XYZs0 = \
489 self.moldict1, self.molcoords1, self.mol_XYZs1 = \
491 self.dynamic0 = dynamic0
492 self.dynamic1 = dynamic1
497 """return data structure for the RMSD calculation"""
502 name = mol.get_name()
503 if name
not in molnames:
511 mol, residue_index=i,
512 representation_type=IMP.atom.BALLS,
514 parts = sel.get_selected_particles()
516 mol_coords[mol].append(
521 moldict[name].append(mol)
523 moldict[name] = [mol]
524 return moldict, mol_coords, mol_XYZs
526 def get_rmsd_and_assigments(self):
529 best_assignments = []
531 for molname, ref_mols
in self.moldict1.items():
533 for ref_mol
in ref_mols:
535 coord1 = [XYZ.get_coordinates()
536 for XYZ
in self.mol_XYZs1[ref_mol]]
538 coord1 = self.molcoords1[ref_mol]
543 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
545 for rmf_mol
in rmf_mols:
547 coord0 = [XYZ.get_coordinates()
548 for XYZ
in self.mol_XYZs0[rmf_mol]]
550 coord0 = self.molcoords0[rmf_mol]
554 rmf_mols_list.append(rmf_mols)
556 rmf_mols_best_order = rmf_mols_list[rmsd.index(m)]
558 for n, (ref_mol, rmf_mol)
in enumerate(
559 zip(ref_mols, rmf_mols_best_order)):
560 best_assignments.append((rmf_mol, ref_mol))
562 coord0 = [XYZ.get_coordinates()
563 for XYZ
in self.mol_XYZs0[rmf_mol]]
565 coord0 = self.molcoords0[rmf_mol]
568 coord1 = [XYZ.get_coordinates()
569 for XYZ
in self.mol_XYZs1[ref_mol]]
571 coord1 = self.molcoords1[ref_mol]
572 rmsd_pair = self.metric(coord1, coord0)
573 N = len(self.molcoords1[ref_mol])
575 total_rmsd += rmsd_pair*rmsd_pair*N
576 rmsd_dict[ref_mol] = rmsd_pair
577 total_rmsd = sqrt(total_rmsd/total_N)
578 return total_rmsd, best_assignments
581 """Returns output for IMP.pmi.output.Output object"""
582 total_rmsd, best_assignments = self.get_rmsd_and_assigments()
585 for rmf_mol, ref_mol
in best_assignments:
586 ref_name = ref_mol.get_name()
588 rmf_name = rmf_mol.get_name()
590 assignments_out.append(rmf_name +
"." + str(rmf_copy) +
"->" +
591 ref_name +
"." + str(ref_copy))
592 return {
"RMSD_" + self.label: str(total_rmsd),
593 "RMSD_assignments_" + self.label: str(assignments_out)}
597 """A class to evaluate the precision of an ensemble.
599 Also can evaluate the cross-precision of multiple ensembles.
600 Supports MPI for coordinate reading.
601 Recommended procedure:
602 -# initialize object and pass the selection for evaluating precision
603 -# call add_structures() to read in the data (specify group name)
604 -# call get_precision() to evaluate inter/intra precision
605 -# call get_rmsf() to evaluate within-group fluctuations
607 def __init__(self, model, resolution=1, selection_dictionary={}):
609 @param model The IMP Model
610 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
611 part of the hier name)
612 @param selection_dictionary Dictionary where keys are names for
613 selections and values are selection tuples for scoring
614 precision. "All" is automatically made as well
617 from mpi4py
import MPI
618 self.comm = MPI.COMM_WORLD
619 self.rank = self.comm.Get_rank()
620 self.number_of_processes = self.comm.size
622 self.number_of_processes = 1
625 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
626 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
627 self.style =
'pairwise_drmsd_k'
628 self.structures_dictionary = {}
629 self.reference_structures_dictionary = {}
631 self.protein_names =
None
632 self.len_particles_resolution_one =
None
634 self.rmf_names_frames = {}
635 self.reference_rmf_names_frames =
None
636 self.reference_structure =
None
637 self.reference_prot =
None
638 self.selection_dictionary = selection_dictionary
639 self.threshold = 40.0
640 self.residue_particle_index_map =
None
642 if resolution
in [1, 10]:
643 self.resolution = resolution
645 raise KeyError(
"Currently only allow resolution 1 or 10")
647 def _get_structure(self, rmf_frame_index, rmf_name):
648 """Read an RMF file and return the particles"""
649 rh = RMF.open_rmf_file_read_only(rmf_name)
651 print(
"getting coordinates for frame %i rmf file %s"
652 % (rmf_frame_index, rmf_name))
656 print(
"linking coordinates for frame %i rmf file %s"
657 % (rmf_frame_index, rmf_name))
662 if self.resolution == 1:
664 elif self.resolution == 10:
667 protein_names = list(particle_dict.keys())
668 particles_resolution_one = []
669 for k
in particle_dict:
670 particles_resolution_one += (particle_dict[k])
672 if self.protein_names
is None:
673 self.protein_names = protein_names
675 if self.protein_names != protein_names:
676 print(
"Error: the protein names of the new coordinate set "
677 "is not compatible with the previous one")
679 if self.len_particles_resolution_one
is None:
680 self.len_particles_resolution_one = len(particles_resolution_one)
682 if self.len_particles_resolution_one != \
683 len(particles_resolution_one):
684 raise ValueError(
"the new coordinate set is not compatible "
685 "with the previous one")
687 return particles_resolution_one
693 setup_index_map=
False):
694 """ Read a structure into the ensemble and store (as coordinates).
695 @param rmf_name The name of the RMF file
696 @param rmf_frame_index The frame to read
697 @param structure_set_name Name for the set that includes this structure
699 @param setup_index_map if requested, set up a dictionary to help
704 if structure_set_name
in self.structures_dictionary:
705 cdict = self.structures_dictionary[structure_set_name]
706 rmflist = self.rmf_names_frames[structure_set_name]
708 self.structures_dictionary[structure_set_name] = {}
709 self.rmf_names_frames[structure_set_name] = []
710 cdict = self.structures_dictionary[structure_set_name]
711 rmflist = self.rmf_names_frames[structure_set_name]
715 particles_resolution_one = self._get_structure(
716 rmf_frame_index, rmf_name)
718 print(
"something wrong with the rmf")
720 self.selection_dictionary.update({
"All": self.protein_names})
722 for selection_name
in self.selection_dictionary:
723 selection_tuple = self.selection_dictionary[selection_name]
724 coords = self._select_coordinates(
725 selection_tuple, particles_resolution_one, self.prots[0])
726 if selection_name
not in cdict:
727 cdict[selection_name] = [coords]
729 cdict[selection_name].append(coords)
731 rmflist.append((rmf_name, rmf_frame_index))
735 self.residue_particle_index_map = {}
736 for prot_name
in self.protein_names:
737 self.residue_particle_index_map[prot_name] = \
738 self._get_residue_particle_index_map(
739 prot_name, particles_resolution_one, self.prots[0])
742 rmf_name_frame_tuples,
744 """Read a list of RMFs, supports parallel
745 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
746 @param structure_set_name Name this set of structures
751 my_rmf_name_frame_tuples = IMP.pmi.tools.chunk_list_into_segments(
752 rmf_name_frame_tuples, self.number_of_processes)[self.rank]
753 for nfr, tup
in enumerate(my_rmf_name_frame_tuples):
755 rmf_frame_index = tup[1]
757 if self.residue_particle_index_map
is None:
758 setup_index_map =
True
760 setup_index_map =
False
767 if self.number_of_processes > 1:
769 self.rmf_names_frames)
771 self.comm.send(self.structures_dictionary, dest=0, tag=11)
773 for i
in range(1, self.number_of_processes):
774 data_tmp = self.comm.recv(source=i, tag=11)
775 for key
in self.structures_dictionary:
776 self.structures_dictionary[key].update(data_tmp[key])
777 for i
in range(1, self.number_of_processes):
778 self.comm.send(self.structures_dictionary, dest=i, tag=11)
780 self.structures_dictionary = self.comm.recv(source=0, tag=11)
782 def _get_residue_particle_index_map(self, prot_name, structure, hier):
784 residue_particle_index_map = []
786 all_selected_particles = s.get_selected_particles()
787 intersection = list(set(all_selected_particles) & set(structure))
788 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
789 for p
in sorted_intersection:
790 residue_particle_index_map.append(
792 return residue_particle_index_map
794 def _select_coordinates(self, tuple_selections, structure, prot):
795 selected_coordinates = []
796 for t
in tuple_selections:
797 if isinstance(t, tuple)
and len(t) == 3:
799 prot, molecules=[t[2]],
800 residue_indexes=range(t[0], t[1]+1), resolution=1)
801 all_selected_particles = s.get_selected_particles()
802 intersection = list(set(all_selected_particles)
804 sorted_intersection = \
805 IMP.pmi.tools.sort_by_residues(intersection)
807 for p
in sorted_intersection]
808 selected_coordinates += cc
809 elif isinstance(t, str):
811 all_selected_particles = s.get_selected_particles()
812 intersection = list(set(all_selected_particles)
814 sorted_intersection = \
815 IMP.pmi.tools.sort_by_residues(intersection)
817 for p
in sorted_intersection]
818 selected_coordinates += cc
820 raise ValueError(
"Selection error")
821 return selected_coordinates
823 def set_threshold(self, threshold):
824 self.threshold = threshold
826 def _get_distance(self, structure_set_name1, structure_set_name2,
827 selection_name, index1, index2):
828 """Compute distance between structures with various metrics"""
829 c1 = self.structures_dictionary[
830 structure_set_name1][selection_name][index1]
831 c2 = self.structures_dictionary[
832 structure_set_name2][selection_name][index2]
836 if self.style ==
'pairwise_drmsd_k':
838 if self.style ==
'pairwise_drms_k':
840 if self.style ==
'pairwise_drmsd_Q':
844 if self.style ==
'pairwise_rmsd':
848 def _get_particle_distances(self, structure_set_name1, structure_set_name2,
849 selection_name, index1, index2):
850 c1 = self.structures_dictionary[
851 structure_set_name1][selection_name][index1]
852 c2 = self.structures_dictionary[
853 structure_set_name2][selection_name][index2]
858 distances = [np.linalg.norm(a-b)
859 for (a, b)
in zip(coordinates1, coordinates2)]
864 outfile=
None, skip=1, selection_keywords=
None):
865 """ Evaluate the precision of two named structure groups. Supports MPI.
866 When the structure_set_name1 is different from the structure_set_name2,
867 this evaluates the cross-precision (average pairwise distances).
868 @param outfile Name of the precision output file
869 @param structure_set_name1 string name of the first structure set
870 @param structure_set_name2 string name of the second structure set
871 @param skip analyze every (skip) structure for the distance
873 @param selection_keywords Specify the selection name you want to
874 calculate on. By default this is computed for everything
875 you provided in the constructor, plus all the subunits together.
877 if selection_keywords
is None:
878 sel_keys = list(self.selection_dictionary.keys())
880 for k
in selection_keywords:
881 if k
not in self.selection_dictionary:
883 "you are trying to find named selection "
884 + k +
" which was not requested in the constructor")
885 sel_keys = selection_keywords
887 if outfile
is not None:
888 of = open(outfile,
"w")
890 for selection_name
in sel_keys:
891 number_of_structures_1 = len(self.structures_dictionary[
892 structure_set_name1][selection_name])
893 number_of_structures_2 = len(self.structures_dictionary[
894 structure_set_name2][selection_name])
896 structure_pointers_1 = list(range(0, number_of_structures_1, skip))
897 structure_pointers_2 = list(range(0, number_of_structures_2, skip))
898 pair_combination_list = list(
899 itertools.product(structure_pointers_1, structure_pointers_2))
900 if len(pair_combination_list) == 0:
901 raise ValueError(
"no structure selected. Check the "
905 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
906 pair_combination_list, self.number_of_processes)[self.rank]
907 for n, pair
in enumerate(my_pair_combination_list):
908 distances[pair] = self._get_distance(
909 structure_set_name1, structure_set_name2, selection_name,
911 if self.number_of_processes > 1:
916 if structure_set_name1 == structure_set_name2:
917 structure_pointers = structure_pointers_1
918 number_of_structures = number_of_structures_1
923 distances_to_structure = {}
924 distances_to_structure_normalization = {}
926 for n
in structure_pointers:
927 distances_to_structure[n] = 0.0
928 distances_to_structure_normalization[n] = 0
931 distance += distances[k]
932 distances_to_structure[k[0]] += distances[k]
933 distances_to_structure[k[1]] += distances[k]
934 distances_to_structure_normalization[k[0]] += 1
935 distances_to_structure_normalization[k[1]] += 1
937 for n
in structure_pointers:
938 distances_to_structure[n] = \
939 distances_to_structure[n] \
940 / distances_to_structure_normalization[n]
942 min_distance = min([distances_to_structure[n]
943 for n
in distances_to_structure])
944 centroid_index = [k
for k, v
in
945 distances_to_structure.items()
946 if v == min_distance][0]
947 centroid_rmf_name = \
948 self.rmf_names_frames[
949 structure_set_name1][centroid_index]
951 centroid_distance = 0.0
953 for n
in range(number_of_structures):
954 dist = self._get_distance(
955 structure_set_name1, structure_set_name1,
956 selection_name, centroid_index, n)
957 centroid_distance += dist
958 distance_list.append(dist)
960 centroid_distance /= number_of_structures
961 if outfile
is not None:
962 of.write(str(selection_name) +
" " +
963 structure_set_name1 +
964 " average centroid distance " +
965 str(centroid_distance) +
"\n")
966 of.write(str(selection_name) +
" " +
967 structure_set_name1 +
" centroid index " +
968 str(centroid_index) +
"\n")
969 of.write(str(selection_name) +
" " +
970 structure_set_name1 +
" centroid rmf name " +
971 str(centroid_rmf_name) +
"\n")
972 of.write(str(selection_name) +
" " +
973 structure_set_name1 +
974 " median centroid distance " +
975 str(np.median(distance_list)) +
"\n")
977 average_pairwise_distances = sum(
978 distances.values())/len(list(distances.values()))
979 if outfile
is not None:
980 of.write(str(selection_name) +
" " + structure_set_name1 +
981 " " + structure_set_name2 +
982 " average pairwise distance " +
983 str(average_pairwise_distances) +
"\n")
984 if outfile
is not None:
986 return centroid_index
988 def get_rmsf(self, structure_set_name, outdir="./", skip=1,
989 set_plot_yaxis_range=
None):
990 """ Calculate the residue mean square fluctuations (RMSF).
991 Automatically outputs as data file and pdf
992 @param structure_set_name Which structure set to calculate RMSF for
993 @param outdir Where to write the files
994 @param skip Skip this number of structures
995 @param set_plot_yaxis_range In case you need to change the plot
1003 for sel_name
in self.protein_names:
1004 self.selection_dictionary.update({sel_name: [sel_name]})
1006 number_of_structures = \
1007 len(self.structures_dictionary[
1008 structure_set_name][sel_name])
1012 rpim = self.residue_particle_index_map[sel_name]
1013 outfile = outdir+
"/rmsf."+sel_name+
".dat"
1014 of = open(outfile,
"w")
1015 residue_distances = {}
1017 for index
in range(number_of_structures):
1018 distances = self._get_particle_distances(
1019 structure_set_name, structure_set_name, sel_name,
1020 centroid_index, index)
1021 for nblock, block
in enumerate(rpim):
1022 for residue_number
in block:
1023 residue_nblock[residue_number] = nblock
1024 if residue_number
not in residue_distances:
1025 residue_distances[residue_number] = \
1029 residue_number].append(distances[nblock])
1033 for rn
in residue_distances:
1035 rmsf = np.std(residue_distances[rn])
1037 of.write(str(rn) +
" " + str(residue_nblock[rn])
1038 +
" " + str(rmsf) +
"\n")
1040 IMP.pmi.output.plot_xy_data(
1041 residues, rmsfs, title=sel_name,
1042 out_fn=outdir+
"/rmsf."+sel_name, display=
False,
1043 set_plot_yaxis_range=set_plot_yaxis_range,
1044 xlabel=
'Residue Number', ylabel=
'Standard error')
1048 """Read in a structure used for reference computation.
1049 Needed before calling get_average_distance_wrt_reference_structure()
1050 @param rmf_name The RMF file to read the reference
1051 @param rmf_frame_index The index in that file
1053 particles_resolution_one = self._get_structure(rmf_frame_index,
1055 self.reference_rmf_names_frames = (rmf_name, rmf_frame_index)
1057 for selection_name
in self.selection_dictionary:
1058 selection_tuple = self.selection_dictionary[selection_name]
1059 coords = self._select_coordinates(
1060 selection_tuple, particles_resolution_one, self.prots[0])
1061 self.reference_structures_dictionary[selection_name] = coords
1064 self, structure_set_name, alignment_selection_key):
1065 """First align then calculate RMSD
1066 @param structure_set_name: the name of the structure set
1067 @param alignment_selection_key: the key containing the selection tuples
1068 needed to make the alignment stored in self.selection_dictionary
1069 @return: for each structure in the structure set, returns the rmsd
1072 if self.reference_structures_dictionary == {}:
1073 print(
"Cannot compute until you set a reference structure")
1076 align_reference_coordinates = \
1077 self.reference_structures_dictionary[alignment_selection_key]
1078 align_coordinates = self.structures_dictionary[
1079 structure_set_name][alignment_selection_key]
1080 transformations = []
1081 for c
in align_coordinates:
1083 {
"All": align_reference_coordinates}, {
"All": c})
1084 transformation = Ali.align()[1]
1085 transformations.append(transformation)
1086 for selection_name
in self.selection_dictionary:
1087 reference_coordinates = \
1088 self.reference_structures_dictionary[selection_name]
1090 for c
in reference_coordinates]
1092 for n, sc
in enumerate(self.structures_dictionary[
1093 structure_set_name][selection_name]):
1098 distances.append(distance)
1099 ret[selection_name] = {
1100 'all_distances': distances,
1101 'average_distance': sum(distances)/len(distances),
1102 'minimum_distance': min(distances)}
1105 def _get_median(self, list_of_values):
1106 return np.median(np.array(list_of_values))
1109 """Compare the structure set to the reference structure.
1110 @param structure_set_name The structure set to compute this on
1111 @note First call set_reference_structure()
1114 if self.reference_structures_dictionary == {}:
1115 print(
"Cannot compute until you set a reference structure")
1117 for selection_name
in self.selection_dictionary:
1118 reference_coordinates = self.reference_structures_dictionary[
1121 for c
in reference_coordinates]
1123 for sc
in self.structures_dictionary[
1124 structure_set_name][selection_name]:
1126 if self.style ==
'pairwise_drmsd_k':
1128 if self.style ==
'pairwise_drms_k':
1130 if self.style ==
'pairwise_drmsd_Q':
1132 coordinates1, coordinates2, self.threshold)
1133 if self.style ==
'pairwise_rmsd':
1135 distances.append(distance)
1137 print(selection_name,
"average distance",
1138 sum(distances)/len(distances),
"minimum distance",
1139 min(distances),
'nframes', len(distances))
1140 ret[selection_name] = {
1141 'average_distance': sum(distances)/len(distances),
1142 'minimum_distance': min(distances)}
1145 def get_coordinates(self):
1148 def set_precision_style(self, style):
1149 if style
in self.styles:
1152 raise ValueError(
"No such style")
1156 """Compute mean density maps from structures.
1158 Keeps a dictionary of density maps,
1159 keys are in the custom ranges. When you call add_subunits_density, it adds
1160 particle coordinates to the existing density maps.
1163 def __init__(self, custom_ranges, representation=None, resolution=20.0,
1166 @param custom_ranges Required. It's a dictionary, keys are the
1167 density component names, values are selection tuples
1168 e.g. {'kin28':[['kin28',1,-1]],
1169 'density_name_1' :[('ccl1')],
1170 'density_name_2' :[(1,142,'tfb3d1'),
1171 (143,700,'tfb3d2')],
1172 @param resolution The MRC resolution of the output map (in
1174 @param voxel The voxel size for the output map (lower is slower)
1179 "The 'representation' argument is deprecated "
1180 "(and is ignored). Pass hierarchies to "
1181 "add_subunits_density() instead.")
1182 self.MRCresolution = resolution
1185 self.count_models = 0.0
1186 self.custom_ranges = custom_ranges
1189 """Add a frame to the densities.
1190 @param hierarchy The hierarchy to add.
1192 self.count_models += 1.0
1195 all_particles_by_resolution = []
1196 for name
in part_dict:
1197 all_particles_by_resolution += part_dict[name]
1199 for density_name
in self.custom_ranges:
1201 all_particles_by_segments = []
1203 for seg
in self.custom_ranges[density_name]:
1204 if isinstance(seg, str):
1206 elif isinstance(seg, tuple)
and len(seg) == 2:
1208 hierarchy, molecule=seg[0], copy_index=seg[1])
1209 elif isinstance(seg, tuple)
and len(seg) == 3:
1211 hierarchy, molecule=seg[2],
1212 residue_indexes=range(seg[0], seg[1] + 1))
1213 elif isinstance(seg, tuple)
and len(seg) == 4:
1215 hierarchy, molecule=seg[2],
1216 residue_indexes=range(seg[0], seg[1] + 1),
1219 raise Exception(
'could not understand selection tuple '
1222 all_particles_by_segments += s.get_selected_particles()
1223 parts = all_particles_by_segments
1224 self._create_density_from_particles(parts, density_name)
1226 def normalize_density(self):
1229 def _create_density_from_particles(self, ps, name,
1230 kernel_type=
'GAUSSIAN'):
1231 '''Internal function for adding to densities.
1232 pass XYZR particles with mass and create a density from them.
1233 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1236 dmap.set_was_used(
True)
1237 if name
not in self.densities:
1238 self.densities[name] = dmap
1244 dmap3.set_was_used(
True)
1246 dmap3.add(self.densities[name])
1247 self.densities[name] = dmap3
1249 def get_density_keys(self):
1250 return list(self.densities.keys())
1253 """Get the current density for some component name"""
1254 if name
not in self.densities:
1257 return self.densities[name]
1259 def write_mrc(self, path="./", suffix=None):
1262 for density_name
in self.densities:
1263 self.densities[density_name].
multiply(1. / self.count_models)
1265 name = path +
"/" + density_name +
".mrc"
1267 name = path +
"/" + density_name +
"." + suffix +
".mrc"
1268 path, file = os.path.split(name)
1269 if not os.path.exists(path):
1272 except OSError
as e:
1273 if e.errno != errno.EEXIST:
1275 IMP.em.write_map(self.densities[density_name], name,
1279 class GetContactMap(object):
1282 self.distance = distance
1283 self.contactmap =
''
1290 def set_prot(self, prot):
1296 for name
in particles_dictionary:
1297 residue_indexes = []
1298 for p
in particles_dictionary[name]:
1302 if len(residue_indexes) != 0:
1303 self.protnames.append(name)
1305 def get_subunit_coords(self, frame, align=0):
1306 from scipy.spatial.distance
import cdist
1310 for part
in self.prot.get_children():
1313 for chl
in part.get_children():
1317 SortedSegments.append((chl, startres))
1318 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1320 for sgmnt
in SortedSegments:
1323 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1326 radii.append(p.get_radius())
1328 new_name = part.get_name() +
'_' + sgmnt[0].get_name() + \
1331 .get_residue_indexes()[0])
1332 namelist.append(new_name)
1333 self.expanded[new_name] = len(
1335 if part.get_name()
not in self.resmap:
1336 self.resmap[part.get_name()] = {}
1338 self.resmap[part.get_name()][res] = new_name
1340 coords = np.array(coords)
1341 radii = np.array(radii)
1342 if len(self.namelist) == 0:
1343 self.namelist = namelist
1344 self.contactmap = np.zeros((len(coords), len(coords)))
1345 distances = cdist(coords, coords)
1346 distances = (distances - radii).T - radii
1347 distances = distances <= self.distance
1348 self.contactmap += distances
1350 def add_xlinks(self, filename,
1351 identification_string=
'ISDCrossLinkMS_Distance_'):
1353 with open(filename)
as data:
1354 D = data.readlines()
1357 if identification_string
in d:
1358 d = d.replace(
"_",
" ").replace(
"-",
" ").replace(
1361 t1, t2 = (d[0], d[1]), (d[1], d[0])
1362 if t1
not in self.XL:
1363 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1364 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1366 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1367 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1369 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1373 proteins = self.protnames
1378 proteins = [p.get_name()
for p
in self.prot.get_children()]
1380 for p1
in range(len(proteins)):
1381 for p2
in range(p1, len(proteins)):
1382 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1383 max(self.resmap[proteins[p2]].keys()))
1384 pn1, pn2 = proteins[p1], proteins[p2]
1385 mtr = np.zeros((pl1 + 1, pl2 + 1))
1386 print(
'Creating matrix for: ',
1387 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1388 for i1
in range(1, pl1 + 1):
1389 for i2
in range(1, pl2 + 1):
1391 r1 = K.index(self.resmap[pn1][i1])
1392 r2 = K.index(self.resmap[pn2][i2])
1394 mtr[i1 - 1, i2 - 1] = r
1396 missing.append((pn1, pn2, i1, i2))
1397 Matrices[(pn1, pn2)] = mtr
1402 raise ValueError(
"cross-links were not provided, use "
1403 "add_xlinks function!")
1405 for p1
in range(len(proteins)):
1406 for p2
in range(p1, len(proteins)):
1407 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1408 max(self.resmap[proteins[p2]].keys()))
1409 pn1, pn2 = proteins[p1], proteins[p2]
1410 mtr = np.zeros((pl1 + 1, pl2 + 1))
1413 xls = self.XL[(pn1, pn2)]
1416 xls = self.XL[(pn2, pn1)]
1421 print(
'Creating matrix for: ',
1422 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1423 for xl1, xl2
in xls:
1425 print(
'X' * 10, xl1, xl2)
1428 print(
'X' * 10, xl1, xl2)
1430 mtr[xl1 - 1, xl2 - 1] = 100
1432 print(
'Creating matrix for: ',
1433 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1434 for xl1, xl2
in xls:
1436 print(
'X' * 10, xl1, xl2)
1439 print(
'X' * 10, xl1, xl2)
1441 mtr[xl2 - 1, xl1 - 1] = 100
1443 print(
'No cross-links between: ', pn1, pn2)
1444 Matrices_xl[(pn1, pn2)] = mtr
1451 for i, c
in enumerate(C):
1452 cl = max(self.resmap[c].keys())
1457 R.append(R[-1] + cl)
1462 import matplotlib
as mpl
1464 import matplotlib.pyplot
as plt
1465 import matplotlib.gridspec
as gridspec
1466 import scipy.sparse
as sparse
1469 gs = gridspec.GridSpec(len(W), len(W),
1474 for x1, r1
in enumerate(R):
1475 for x2, r2
in enumerate(R):
1476 ax = plt.subplot(gs[cnt])
1479 mtr = Matrices[(C[x1], C[x2])]
1481 mtr = Matrices[(C[x2], C[x1])].T
1484 interpolation=
'nearest',
1486 vmax=log(mtr.max())
if mtr.max() > 0.
else 1.)
1491 mtr = Matrices_xl[(C[x1], C[x2])]
1493 mtr = Matrices_xl[(C[x2], C[x1])].T
1495 sparse.csr_matrix(mtr),
1505 ax.set_ylabel(C[x1], rotation=90)
1507 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1515 def get_hiers_from_rmf(model, frame_number, rmf_file):
1517 print(
"getting coordinates for frame %i rmf file %s"
1518 % (frame_number, rmf_file))
1521 rh = RMF.open_rmf_file_read_only(rmf_file)
1526 print(
"Unable to open rmf file %s" % (rmf_file))
1532 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1540 def link_hiers_to_rmf(model, hiers, frame_number, rmf_file):
1541 print(
"linking hierarchies for frame %i rmf file %s"
1542 % (frame_number, rmf_file))
1543 rh = RMF.open_rmf_file_read_only(rmf_file)
1548 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1555 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1557 print(
"getting coordinates for frame %i rmf file %s"
1558 % (frame_number, rmf_file))
1561 rh = RMF.open_rmf_file_read_only(rmf_file)
1567 print(
"Unable to open rmf file %s" % (rmf_file))
1572 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1580 def link_hiers_and_restraints_to_rmf(model, hiers, rs, frame_number, rmf_file):
1581 print(
"linking hierarchies for frame %i rmf file %s"
1582 % (frame_number, rmf_file))
1583 rh = RMF.open_rmf_file_read_only(rmf_file)
1589 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1597 """Get particles at res 1, or any beads, based on the name.
1598 No Representation is needed. This is mainly used when the hierarchy
1599 is read from an RMF file.
1601 @return a dictionary of component names and their particles
1602 @note If the root node is named "System" or is a "State", do
1608 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1610 particle_dict[mol.get_name()] = sel.get_selected_particles()
1611 return particle_dict
1615 """Get particles at res 10, or any beads, based on the name.
1616 No Representation is needed.
1617 This is mainly used when the hierarchy is read from an RMF file.
1619 @return a dictionary of component names and their particles
1620 @note If the root node is named "System" or is a "State", do proper
1625 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1627 particle_dict[mol.get_name()] = sel.get_selected_particles()
1628 return particle_dict
1632 """Visualization of cross-links"""
1634 self.crosslinks = []
1635 self.external_csv_data =
None
1636 self.crosslinkedprots = set()
1637 self.mindist = +10000000.0
1638 self.maxdist = -10000000.0
1639 self.contactmap =
None
1641 def set_hierarchy(self, prot):
1642 self.prot_length_dict = {}
1643 self.model = prot.get_model()
1645 for i
in prot.get_children():
1647 residue_indexes = []
1651 if len(residue_indexes) != 0:
1652 self.prot_length_dict[name] = max(residue_indexes)
1654 def set_coordinates_for_contact_map(self, rmf_name, rmf_frame_index):
1655 from scipy.spatial.distance
import cdist
1657 rh = RMF.open_rmf_file_read_only(rmf_name)
1660 print(
"getting coordinates for frame %i rmf file %s"
1661 % (rmf_frame_index, rmf_name))
1670 self.index_dictionary = {}
1672 for name
in particles_dictionary:
1673 residue_indexes = []
1674 for p
in particles_dictionary[name]:
1678 if len(residue_indexes) != 0:
1680 for res
in range(min(residue_indexes),
1681 max(residue_indexes) + 1):
1684 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1686 radii.append(d.get_radius())
1687 if name
not in self.index_dictionary:
1688 self.index_dictionary[name] = [resindex]
1690 self.index_dictionary[name].append(resindex)
1693 coords = np.array(coords)
1694 radii = np.array(radii)
1696 distances = cdist(coords, coords)
1697 distances = (distances - radii).T - radii
1699 distances = np.where(distances <= 20.0, 1.0, 0)
1700 if self.contactmap
is None:
1701 self.contactmap = np.zeros((len(coords), len(coords)))
1702 self.contactmap += distances
1705 IMP.atom.destroy(prot)
1708 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1709 mapping=
None, filter_label=
None,
1711 filter_rmf_file_names=
None, external_csv_data_file=
None,
1712 external_csv_data_file_unique_id_key=
"Unique ID"):
1723 keys = po.get_keys()
1725 xl_keys = [k
for k
in keys
if search_label
in k]
1727 if filter_rmf_file_names
is not None:
1728 rmf_file_key =
"local_rmf_file_name"
1729 fs = po.get_fields(xl_keys+[rmf_file_key])
1731 fs = po.get_fields(xl_keys)
1734 self.cross_link_frequency = {}
1738 self.cross_link_distances = {}
1742 self.cross_link_distances_unique = {}
1744 if external_csv_data_file
is not None:
1747 self.external_csv_data = {}
1748 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1751 self.external_csv_data[
1752 xl[external_csv_data_file_unique_id_key]] = xl
1757 cross_link_frequency_list = []
1759 self.unique_cross_link_list = []
1763 keysplit = key.replace(
"_",
" ").replace(
"-",
" ").replace(
1766 if filter_label
is not None:
1767 if filter_label
not in keysplit:
1771 r1 = int(keysplit[5])
1773 r2 = int(keysplit[7])
1776 confidence = keysplit[12]
1780 unique_identifier = keysplit[3]
1782 unique_identifier =
'0'
1784 r1 = int(keysplit[mapping[
"Residue1"]])
1785 c1 = keysplit[mapping[
"Protein1"]]
1786 r2 = int(keysplit[mapping[
"Residue2"]])
1787 c2 = keysplit[mapping[
"Protein2"]]
1789 confidence = keysplit[mapping[
"Confidence"]]
1793 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1795 unique_identifier =
'0'
1797 self.crosslinkedprots.add(c1)
1798 self.crosslinkedprots.add(c2)
1804 if filter_rmf_file_names
is not None:
1805 for n, d
in enumerate(fs[key]):
1806 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1807 dists.append(float(d))
1809 dists = [float(f)
for f
in fs[key]]
1814 mdist = self.median(dists)
1816 stdv = np.std(np.array(dists))
1817 if self.mindist > mdist:
1818 self.mindist = mdist
1819 if self.maxdist < mdist:
1820 self.maxdist = mdist
1824 if (r1, c1, r2, c2, mdist)
not in cross_link_frequency_list:
1825 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1826 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1827 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1829 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1830 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1831 cross_link_frequency_list.append((r1, c1, r2, c2))
1832 cross_link_frequency_list.append((r2, c2, r1, c1))
1833 self.unique_cross_link_list.append(
1834 (r1, c1, r2, c2, mdist))
1836 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1837 self.cross_link_distances[(
1838 r1, c1, r2, c2, mdist, confidence)] = dists
1839 self.cross_link_distances[(
1840 r2, c2, r1, c1, mdist, confidence)] = dists
1841 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1843 self.cross_link_distances[(
1844 r2, c2, r1, c1, mdist, confidence)] += dists
1845 self.cross_link_distances[(
1846 r1, c1, r2, c2, mdist, confidence)] += dists
1848 self.crosslinks.append(
1849 (r1, c1, r2, c2, mdist, stdv, confidence, unique_identifier,
1851 self.crosslinks.append(
1852 (r2, c2, r1, c1, mdist, stdv, confidence, unique_identifier,
1855 self.cross_link_frequency_inverted = {}
1856 for xl
in self.unique_cross_link_list:
1857 (r1, c1, r2, c2, mdist) = xl
1858 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1859 if frequency
not in self.cross_link_frequency_inverted:
1860 self.cross_link_frequency_inverted[
1861 frequency] = [(r1, c1, r2, c2)]
1863 self.cross_link_frequency_inverted[
1864 frequency].append((r1, c1, r2, c2))
1868 def median(self, mylist):
1869 sorts = sorted(mylist)
1875 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1876 return sorts[length / 2]
1878 def set_threshold(self, threshold):
1879 self.threshold = threshold
1881 def set_tolerance(self, tolerance):
1882 self.tolerance = tolerance
1884 def colormap(self, dist):
1885 if dist < self.threshold - self.tolerance:
1887 elif dist >= self.threshold + self.tolerance:
1892 def write_cross_link_database(self, filename, format='csv'):
1896 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1897 "Median Distance",
"Standard Deviation",
"Confidence",
1898 "Frequency",
"Arrangement"]
1900 if self.external_csv_data
is not None:
1901 keys = list(self.external_csv_data.keys())
1902 innerkeys = list(self.external_csv_data[keys[0]].keys())
1904 fieldnames += innerkeys
1906 dw = csv.DictWriter(
1907 open(filename,
"w"),
1909 fieldnames=fieldnames)
1911 for xl
in self.crosslinks:
1912 (r1, c1, r2, c2, mdist, stdv, confidence,
1913 unique_identifier, descriptor) = xl
1914 if descriptor ==
'original':
1916 outdict[
"Unique ID"] = unique_identifier
1917 outdict[
"Protein1"] = c1
1918 outdict[
"Protein2"] = c2
1919 outdict[
"Residue1"] = r1
1920 outdict[
"Residue2"] = r2
1921 outdict[
"Median Distance"] = mdist
1922 outdict[
"Standard Deviation"] = stdv
1923 outdict[
"Confidence"] = confidence
1924 outdict[
"Frequency"] = self.cross_link_frequency[
1927 arrangement =
"Intra"
1929 arrangement =
"Inter"
1930 outdict[
"Arrangement"] = arrangement
1931 if self.external_csv_data
is not None:
1932 outdict.update(self.external_csv_data[unique_identifier])
1934 dw.writerow(outdict)
1936 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1937 no_confidence_info=
False, filter=
None, layout=
"whole",
1938 crosslinkedonly=
False, filename=
None, confidence_classes=
None,
1939 alphablend=0.1, scale_symbol_size=1.0, gap_between_components=0,
1940 rename_protein_map=
None):
1955 import matplotlib
as mpl
1957 import matplotlib.pyplot
as plt
1958 import matplotlib.cm
as cm
1960 fig = plt.figure(figsize=(10, 10))
1961 ax = fig.add_subplot(111)
1967 if prot_listx
is None:
1969 prot_listx = list(self.crosslinkedprots)
1971 prot_listx = list(self.prot_length_dict.keys())
1974 nresx = gap_between_components + \
1975 sum([self.prot_length_dict[name]
1976 + gap_between_components
for name
in prot_listx])
1980 if prot_listy
is None:
1982 prot_listy = list(self.crosslinkedprots)
1984 prot_listy = list(self.prot_length_dict.keys())
1987 nresy = gap_between_components + \
1988 sum([self.prot_length_dict[name]
1989 + gap_between_components
for name
in prot_listy])
1994 res = gap_between_components
1995 for prot
in prot_listx:
1996 resoffsetx[prot] = res
1997 res += self.prot_length_dict[prot]
1999 res += gap_between_components
2003 res = gap_between_components
2004 for prot
in prot_listy:
2005 resoffsety[prot] = res
2006 res += self.prot_length_dict[prot]
2008 res += gap_between_components
2010 resoffsetdiagonal = {}
2011 res = gap_between_components
2012 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2013 resoffsetdiagonal[prot] = res
2014 res += self.prot_length_dict[prot]
2015 res += gap_between_components
2021 for n, prot
in enumerate(prot_listx):
2022 res = resoffsetx[prot]
2024 for proty
in prot_listy:
2025 resy = resoffsety[proty]
2026 endy = resendy[proty]
2027 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2028 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2029 xticks.append((float(res) + float(end)) / 2)
2030 if rename_protein_map
is not None:
2031 if prot
in rename_protein_map:
2032 prot = rename_protein_map[prot]
2033 xlabels.append(prot)
2037 for n, prot
in enumerate(prot_listy):
2038 res = resoffsety[prot]
2040 for protx
in prot_listx:
2041 resx = resoffsetx[protx]
2042 endx = resendx[protx]
2043 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2044 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2045 yticks.append((float(res) + float(end)) / 2)
2046 if rename_protein_map
is not None:
2047 if prot
in rename_protein_map:
2048 prot = rename_protein_map[prot]
2049 ylabels.append(prot)
2052 print(prot_listx, prot_listy)
2054 if self.contactmap
is not None:
2055 tmp_array = np.zeros((nresx, nresy))
2057 for px
in prot_listx:
2059 for py
in prot_listy:
2061 resx = resoffsety[px]
2062 lengx = resendx[px] - 1
2063 resy = resoffsety[py]
2064 lengy = resendy[py] - 1
2065 indexes_x = self.index_dictionary[px]
2066 minx = min(indexes_x)
2067 maxx = max(indexes_x)
2068 indexes_y = self.index_dictionary[py]
2069 miny = min(indexes_y)
2070 maxy = max(indexes_y)
2072 print(px, py, minx, maxx, miny, maxy)
2077 resy:lengy] = self.contactmap[
2083 ax.imshow(tmp_array,
2086 interpolation=
'nearest')
2088 ax.set_xticks(xticks)
2089 ax.set_xticklabels(xlabels, rotation=90)
2090 ax.set_yticks(yticks)
2091 ax.set_yticklabels(ylabels)
2092 ax.set_xlim(0, nresx)
2093 ax.set_ylim(0, nresy)
2097 already_added_xls = []
2099 for xl
in self.crosslinks:
2101 (r1, c1, r2, c2, mdist, stdv, confidence,
2102 unique_identifier, descriptor) = xl
2104 if confidence_classes
is not None:
2105 if confidence
not in confidence_classes:
2109 pos1 = r1 + resoffsetx[c1]
2113 pos2 = r2 + resoffsety[c2]
2117 if filter
is not None:
2118 xldb = self.external_csv_data[unique_identifier]
2119 xldb.update({
"Distance": mdist})
2120 xldb.update({
"Distance_stdv": stdv})
2122 if filter[1] ==
">":
2123 if float(xldb[filter[0]]) <= float(filter[2]):
2126 if filter[1] ==
"<":
2127 if float(xldb[filter[0]]) >= float(filter[2]):
2130 if filter[1] ==
"==":
2131 if float(xldb[filter[0]]) != float(filter[2]):
2137 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2138 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2140 if layout ==
'lowerdiagonal':
2141 if pos_for_diagonal1 <= pos_for_diagonal2:
2143 if layout ==
'upperdiagonal':
2144 if pos_for_diagonal1 >= pos_for_diagonal2:
2147 already_added_xls.append((r1, c1, r2, c2))
2149 if not no_confidence_info:
2150 if confidence ==
'0.01':
2151 markersize = 14 * scale_symbol_size
2152 elif confidence ==
'0.05':
2153 markersize = 9 * scale_symbol_size
2154 elif confidence ==
'0.1':
2155 markersize = 6 * scale_symbol_size
2157 markersize = 15 * scale_symbol_size
2159 markersize = 5 * scale_symbol_size
2161 if not no_dist_info:
2162 color = self.colormap(mdist)
2172 markersize=markersize)
2174 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2176 for i
in ax.spines.values():
2177 i.set_linewidth(2.0)
2180 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2184 def get_frequency_statistics(self, prot_list,
2187 violated_histogram = {}
2188 satisfied_histogram = {}
2189 unique_cross_links = []
2191 for xl
in self.unique_cross_link_list:
2192 (r1, c1, r2, c2, mdist) = xl
2195 if prot_list2
is None:
2196 if c1
not in prot_list:
2198 if c2
not in prot_list:
2201 if c1
in prot_list
and c2
in prot_list2:
2203 elif c1
in prot_list2
and c2
in prot_list:
2208 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2210 if (r1, c1, r2, c2)
not in unique_cross_links:
2212 if frequency
not in violated_histogram:
2213 violated_histogram[frequency] = 1
2215 violated_histogram[frequency] += 1
2217 if frequency
not in satisfied_histogram:
2218 satisfied_histogram[frequency] = 1
2220 satisfied_histogram[frequency] += 1
2221 unique_cross_links.append((r1, c1, r2, c2))
2222 unique_cross_links.append((r2, c2, r1, c1))
2224 print(
"# satisfied")
2226 total_number_of_crosslinks = 0
2228 for i
in satisfied_histogram:
2232 if i
in violated_histogram:
2233 print(i, violated_histogram[i]+satisfied_histogram[i])
2235 print(i, satisfied_histogram[i])
2236 total_number_of_crosslinks += i*satisfied_histogram[i]
2240 for i
in violated_histogram:
2241 print(i, violated_histogram[i])
2242 total_number_of_crosslinks += i*violated_histogram[i]
2244 print(total_number_of_crosslinks)
2246 def print_cross_link_binary_symbols(self, prot_list,
2249 confidence_list = []
2250 for xl
in self.crosslinks:
2251 (r1, c1, r2, c2, mdist, stdv, confidence,
2252 unique_identifier, descriptor) = xl
2254 if prot_list2
is None:
2255 if c1
not in prot_list:
2257 if c2
not in prot_list:
2260 if c1
in prot_list
and c2
in prot_list2:
2262 elif c1
in prot_list2
and c2
in prot_list:
2267 if descriptor !=
"original":
2270 confidence_list.append(confidence)
2272 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2273 tmp_dist_binary = []
2276 tmp_dist_binary.append(1)
2278 tmp_dist_binary.append(0)
2279 tmp_matrix.append(tmp_dist_binary)
2281 matrix = list(zip(*tmp_matrix))
2283 satisfied_high_sum = 0
2284 satisfied_mid_sum = 0
2285 satisfied_low_sum = 0
2286 total_satisfied_sum = 0
2287 for k, m
in enumerate(matrix):
2296 for n, b
in enumerate(m):
2297 if confidence_list[n] ==
"0.01":
2301 satisfied_high_sum += 1
2302 elif confidence_list[n] ==
"0.05":
2306 satisfied_mid_sum += 1
2307 elif confidence_list[n] ==
"0.1":
2311 satisfied_low_sum += 1
2313 total_satisfied += 1
2314 total_satisfied_sum += 1
2316 print(k, satisfied_high, total_high)
2317 print(k, satisfied_mid, total_mid)
2318 print(k, satisfied_low, total_low)
2319 print(k, total_satisfied, total)
2320 print(float(satisfied_high_sum) / len(matrix))
2321 print(float(satisfied_mid_sum) / len(matrix))
2322 print(float(satisfied_low_sum) / len(matrix))
2325 def get_unique_crosslinks_statistics(self, prot_list,
2338 satisfied_string = []
2339 for xl
in self.crosslinks:
2340 (r1, c1, r2, c2, mdist, stdv, confidence,
2341 unique_identifier, descriptor) = xl
2343 if prot_list2
is None:
2344 if c1
not in prot_list:
2346 if c2
not in prot_list:
2349 if c1
in prot_list
and c2
in prot_list2:
2351 elif c1
in prot_list2
and c2
in prot_list:
2356 if descriptor !=
"original":
2360 if confidence ==
"0.01":
2364 if confidence ==
"0.05":
2368 if confidence ==
"0.1":
2373 satisfied_string.append(1)
2375 satisfied_string.append(0)
2377 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2378 tmp_dist_binary = []
2381 tmp_dist_binary.append(1)
2383 tmp_dist_binary.append(0)
2384 tmp_matrix.append(tmp_dist_binary)
2386 print(
"unique satisfied_high/total_high", satisfied_high,
"/",
2388 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2389 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2390 print(
"total", total)
2392 matrix = list(zip(*tmp_matrix))
2393 satisfied_models = 0
2395 for b
in satisfied_string:
2402 all_satisfied =
True
2404 for n, b
in enumerate(m):
2409 if b == 1
and satisfied_string[n] == 1:
2411 elif b == 1
and satisfied_string[n] == 0:
2413 elif b == 0
and satisfied_string[n] == 0:
2415 elif b == 0
and satisfied_string[n] == 1:
2416 all_satisfied =
False
2418 satisfied_models += 1
2420 print(satstr, all_satisfied)
2421 print(
"models that satisfies the median satisfied cross-links/total "
2422 "models", satisfied_models, len(matrix))
2424 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2427 import matplotlib
as mpl
2429 import matplotlib.pylab
as pl
2432 for kw
in self.cross_link_distances_unique:
2433 (r1, c1, r2, c2) = kw
2434 dists = self.cross_link_distances_unique[kw]
2436 if prot_list2
is None:
2437 if c1
not in prot_list:
2439 if c2
not in prot_list:
2442 if c1
in prot_list
and c2
in prot_list2:
2444 elif c1
in prot_list2
and c2
in prot_list:
2449 dists.append(sum(dists))
2450 tmp_matrix.append(dists)
2452 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2455 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2457 for i
in range(len(tmp_matrix)):
2458 for k
in range(len(tmp_matrix[i]) - 1):
2459 matrix[i][k] = tmp_matrix[i][k]
2464 ax = fig.add_subplot(211)
2466 cax = ax.imshow(matrix, interpolation=
'nearest')
2468 pl.savefig(figurename, dpi=300)
2477 arrangement=
"inter",
2478 confidence_input=
"None"):
2481 for xl
in self.cross_link_distances:
2482 (r1, c1, r2, c2, mdist, confidence) = xl
2483 if c1
in prots1
and c2
in prots2:
2484 if arrangement ==
"inter" and c1 == c2:
2486 if arrangement ==
"intra" and c1 != c2:
2488 if confidence_input == confidence:
2489 label = str(c1) +
":" + str(r1) + \
2490 "-" + str(c2) +
":" + str(r2)
2491 values = self.cross_link_distances[xl]
2492 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2493 data.append((label, values, mdist, frequency))
2495 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2496 sort_by_dist = list(zip(*sort_by_dist))
2497 values = sort_by_dist[1]
2498 positions = list(range(len(values)))
2499 labels = sort_by_dist[0]
2500 frequencies = list(map(float, sort_by_dist[3]))
2501 frequencies = [f * 10.0
for f
in frequencies]
2503 nchunks = int(float(len(values)) / nxl_per_row)
2504 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2505 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2508 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2511 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2513 for n, v
in enumerate(values_chunks):
2514 p = positions_chunks[n]
2515 f = frequencies_chunks[n]
2517 filename +
"." + str(n), v, p, f,
2518 valuename=
"Distance (Ang)",
2519 positionname=
"Unique " + arrangement +
" Crosslinks",
2520 xlabels=labels_chunks[n])
2522 def crosslink_distance_histogram(self, filename,
2525 confidence_classes=
None,
2531 if prot_list
is None:
2532 prot_list = list(self.prot_length_dict.keys())
2535 for xl
in self.crosslinks:
2536 (r1, c1, r2, c2, mdist, stdv, confidence,
2537 unique_identifier, descriptor) = xl
2539 if confidence_classes
is not None:
2540 if confidence
not in confidence_classes:
2543 if prot_list2
is None:
2544 if c1
not in prot_list:
2546 if c2
not in prot_list:
2549 if c1
in prot_list
and c2
in prot_list2:
2551 elif c1
in prot_list2
and c2
in prot_list:
2556 distances.append(mdist)
2559 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2560 bins=bins, color=color,
2562 reference_xline=35.0,
2563 yplotrange=yplotrange, normalized=normalized)
2565 def scatter_plot_xl_features(self, filename,
2571 reference_ylines=
None,
2572 distance_color=
True,
2574 import matplotlib
as mpl
2576 import matplotlib.pyplot
as plt
2578 fig = plt.figure(figsize=(10, 10))
2579 ax = fig.add_subplot(111)
2581 for xl
in self.crosslinks:
2582 (r1, c1, r2, c2, mdist, stdv, confidence,
2583 unique_identifier, arrangement) = xl
2585 if prot_list2
is None:
2586 if c1
not in prot_list:
2588 if c2
not in prot_list:
2591 if c1
in prot_list
and c2
in prot_list2:
2593 elif c1
in prot_list2
and c2
in prot_list:
2598 xldb = self.external_csv_data[unique_identifier]
2599 xldb.update({
"Distance": mdist})
2600 xldb.update({
"Distance_stdv": stdv})
2602 xvalue = float(xldb[feature1])
2603 yvalue = float(xldb[feature2])
2606 color = self.colormap(mdist)
2610 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2612 if yplotrange
is not None:
2613 ax.set_ylim(yplotrange)
2614 if reference_ylines
is not None:
2615 for rl
in reference_ylines:
2616 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2619 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of cross-links.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
void handle_use_deprecated(std::string message)
Break in this method in gdb to find deprecated uses at runtime.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.