3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
15 from operator
import itemgetter
16 from math
import log, sqrt
22 """Performs alignment and RMSD calculation for two sets of coordinates
24 The class also takes into account non-equal stoichiometry of the proteins.
25 If this is the case, the protein names of proteins in multiple copies
26 should be specified in the following form:
27 nameA..1, nameA..2 (note two dots).
30 def __init__(self, template, query, weights=None):
32 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
33 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
34 @param weights optional weights for each set of coordinates
37 self.template = template
38 self.weights = weights
40 if len(self.query.keys()) != len(self.template.keys()):
41 raise ValueError(
'''the number of proteins
42 in template and query does not match!''')
48 self.proteins = sorted(self.query.keys())
49 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
57 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
58 prmts = list(itertools.permutations(copies, len(copies)))
61 self.Product = list(itertools.product(*P.values()))
69 torder = sum([list(i)
for i
in self.Product[0]], [])
72 if self.weights
is not None:
73 weights += [i
for i
in self.weights[t]]
75 self.rmsd = 10000000000.
76 for comb
in self.Product:
78 order = sum([list(i)
for i
in comb], [])
83 if self.weights
is not None:
85 template_xyz, query_xyz, weights)
93 from scipy.spatial.distance
import cdist
101 torder = sum([list(i)
for i
in self.Product[0]], [])
105 self.rmsd, Transformation = 10000000000.,
''
108 for comb
in self.Product:
109 order = sum([list(i)
for i
in comb], [])
114 if len(template_xyz) != len(query_xyz):
115 raise ValueError(
'''the number of coordinates
116 in template and query does not match!''')
120 query_xyz, template_xyz)
121 query_xyz_tr = [transformation.get_transformed(n)
125 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2))
129 Transformation = transformation
132 return (self.rmsd, Transformation)
137 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
138 'a..2':np.array([np.array([1.,1.,])]),
139 'a..3':np.array([np.array([-2.,1.])]),
140 'b':np.array([np.array([0.,-1.])]),
141 'c..1':np.array([np.array([-1.,-1.])]),
142 'c..2':np.array([np.array([1.,-1.])]),
143 'd':np.array([np.array([0.,0.])]),
144 'e':np.array([np.array([0.,1.])])}
146 Ali = Alignment(Proteins, Proteins)
148 if Ali.get_rmsd() == 0.0: print 'successful test!'
149 else: print 'ERROR!'; exit()
158 self.violation_thresholds = {}
159 self.violation_counts = {}
161 with open(filename)
as data:
165 d = d.strip().split()
166 self.violation_thresholds[d[0]] = float(d[1])
168 def get_number_violated_restraints(self, rsts_dict):
170 for rst
in self.violation_thresholds:
171 if rst
not in rsts_dict:
173 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
175 if rst
not in self.violation_counts:
176 self.violation_counts[rst] = 1
178 self.violation_counts[rst] += 1
183 """A class to cluster structures.
184 Uses scipy's cdist function to compute distance matrices
185 and sklearn's kmeans clustering module.
189 @param rmsd_weights Flat list of weights for each particle
193 from mpi4py
import MPI
194 self.comm = MPI.COMM_WORLD
195 self.rank = self.comm.Get_rank()
196 self.number_of_processes = self.comm.size
198 self.number_of_processes = 1
201 self.structure_cluster_ids =
None
202 self.tmpl_coords =
None
203 self.rmsd_weights = rmsd_weights
205 def set_template(self, part_coords):
206 self.tmpl_coords = part_coords
209 """Add coordinates for a single model."""
210 self.all_coords[frame] = Coords
212 def dist_matrix(self):
213 self.model_list_names = list(self.all_coords.keys())
214 self.model_indexes = list(range(len(self.model_list_names)))
215 self.model_indexes_dict = dict(
216 list(zip(self.model_list_names, self.model_indexes)))
217 model_indexes_unique_pairs = \
218 list(itertools.combinations(self.model_indexes, 2))
220 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
221 model_indexes_unique_pairs,
222 self.number_of_processes)[self.rank]
224 print(
"process %s assigned with %s pairs"
225 % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
227 (raw_distance_dict, self.transformation_distance_dict) = \
228 self.matrix_calculation(self.all_coords, self.tmpl_coords,
229 my_model_indexes_unique_pairs)
231 if self.number_of_processes > 1:
234 pickable_transformations = \
235 self.get_pickable_transformation_distance_dict()
237 pickable_transformations)
238 self.set_transformation_distance_dict_from_pickable(
239 pickable_transformations)
241 self.raw_distance_matrix = np.zeros(
242 (len(self.model_list_names), len(self.model_list_names)))
243 for item
in raw_distance_dict:
245 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
246 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
248 def get_dist_matrix(self):
249 return self.raw_distance_matrix
252 """Run K-means clustering
253 @param number_of_clusters Num means
254 @param seed the random seed
256 from sklearn.cluster
import KMeans
261 kmeans = KMeans(n_clusters=number_of_clusters)
264 kmeans = KMeans(k=number_of_clusters)
265 kmeans.fit_predict(self.raw_distance_matrix)
267 self.structure_cluster_ids = kmeans.labels_
269 def get_pickable_transformation_distance_dict(self):
270 pickable_transformations = {}
271 for label
in self.transformation_distance_dict:
272 tr = self.transformation_distance_dict[label]
273 trans = tuple(tr.get_translation())
274 rot = tuple(tr.get_rotation().get_quaternion())
275 pickable_transformations[label] = (rot, trans)
276 return pickable_transformations
278 def set_transformation_distance_dict_from_pickable(
280 pickable_transformations):
281 self.transformation_distance_dict = {}
282 for label
in pickable_transformations:
283 tr = pickable_transformations[label]
286 self.transformation_distance_dict[
289 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
296 pickable_transformations = \
297 self.get_pickable_transformation_distance_dict()
299 with open(file_name +
".data",
'wb')
as outf:
301 (self.structure_cluster_ids,
302 self.model_list_names,
303 pickable_transformations),
306 np.save(file_name +
".npy", self.raw_distance_matrix)
308 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
311 with open(file_name +
".data",
'rb')
as inputf:
312 (self.structure_cluster_ids, self.model_list_names,
313 pickable_transformations) = pickle.load(inputf)
315 self.raw_distance_matrix = np.load(file_name +
".npy")
317 self.set_transformation_distance_dict_from_pickable(
318 pickable_transformations)
319 self.model_indexes = list(range(len(self.model_list_names)))
320 self.model_indexes_dict = dict(
321 list(zip(self.model_list_names, self.model_indexes)))
323 def plot_matrix(self, figurename="clustermatrix.pdf"):
324 import matplotlib
as mpl
326 import matplotlib.pylab
as pl
327 from scipy.cluster
import hierarchy
as hrc
329 fig = pl.figure(figsize=(10, 8))
330 ax = fig.add_subplot(212)
331 dendrogram = hrc.dendrogram(
332 hrc.linkage(self.raw_distance_matrix),
335 leaves_order = dendrogram[
'leaves']
336 ax.set_xlabel(
'Model')
337 ax.set_ylabel(
'RMSD [Angstroms]')
339 ax2 = fig.add_subplot(221)
341 self.raw_distance_matrix[leaves_order,
344 interpolation=
'nearest')
345 cb = fig.colorbar(cax)
346 cb.set_label(
'RMSD [Angstroms]')
347 ax2.set_xlabel(
'Model')
348 ax2.set_ylabel(
'Model')
350 pl.savefig(figurename, dpi=300)
353 def get_model_index_from_name(self, name):
354 return self.model_indexes_dict[name]
356 def get_cluster_labels(self):
358 return list(set(self.structure_cluster_ids))
360 def get_number_of_clusters(self):
361 return len(self.get_cluster_labels())
363 def get_cluster_label_indexes(self, label):
365 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
368 def get_cluster_label_names(self, label):
370 [self.model_list_names[i]
371 for i
in self.get_cluster_label_indexes(label)]
374 def get_cluster_label_average_rmsd(self, label):
376 indexes = self.get_cluster_label_indexes(label)
379 sub_distance_matrix = self.raw_distance_matrix[
380 indexes, :][:, indexes]
381 average_rmsd = np.sum(sub_distance_matrix) / \
382 (len(sub_distance_matrix)
383 ** 2 - len(sub_distance_matrix))
388 def get_cluster_label_size(self, label):
389 return len(self.get_cluster_label_indexes(label))
391 def get_transformation_to_first_member(
395 reference = self.get_cluster_label_indexes(cluster_label)[0]
396 return self.transformation_distance_dict[(reference, structure_index)]
398 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
400 model_list_names = list(all_coords.keys())
401 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
402 raw_distance_dict = {}
403 transformation_distance_dict = {}
404 if template_coords
is None:
408 alignment_template_protein_names = list(template_coords.keys())
410 for (f1, f2)
in list_of_pairs:
418 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
419 for pr
in rmsd_protein_names])
421 for pr
in rmsd_protein_names:
422 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
424 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
425 rmsd = Ali.get_rmsd()
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in alignment_template_protein_names])
433 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
434 for pr
in alignment_template_protein_names])
437 template_rmsd, transformation = Ali.align()
442 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
443 for pr
in rmsd_protein_names])
445 for pr
in rmsd_protein_names:
446 coords_f2[pr] = [transformation.get_transformed(
447 i)
for i
in all_coords[model_list_names[f2]][pr]]
449 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
450 rmsd = Ali.get_rmsd()
452 raw_distance_dict[(f1, f2)] = rmsd
453 raw_distance_dict[(f2, f1)] = rmsd
454 transformation_distance_dict[(f1, f2)] = transformation
455 transformation_distance_dict[(f2, f1)] = transformation
457 return raw_distance_dict, transformation_distance_dict
461 """Compute the RMSD (without alignment) taking into account the copy
463 To be used with pmi2 hierarchies. Can be used for instance as follows:
465 rmsd = IMP.pmi.analysis.RMSD(
466 hier, hier, [mol.get_name() for mol in mols],
467 dynamic0=True, dynamic1=False)
468 output_objects.append(rmsd)
470 before shuffling the coordinates
473 def __init__(self, hier0, hier1, molnames, label="None", dynamic0=True,
474 dynamic1=
True, metric=IMP.algebra.get_rmsd):
476 @param hier0 first input hierarchy
477 @param hier1 second input hierarchy
478 @param molnames the names of the molecules used for the RMSD
479 @dynamic0 if True stores the decorators XYZ and coordinates of
480 hier0 can be update. If false coordinates are static
481 (stored in Vector3Ds) and will never be updated
482 @dynamic1 same as above
483 metric what metric should be used
485 self.moldict0, self.molcoords0, self.mol_XYZs0 = \
487 self.moldict1, self.molcoords1, self.mol_XYZs1 = \
489 self.dynamic0 = dynamic0
490 self.dynamic1 = dynamic1
495 """return data structure for the RMSD calculation"""
500 name = mol.get_name()
501 if name
not in molnames:
509 mol, residue_index=i,
510 representation_type=IMP.atom.BALLS,
512 parts = sel.get_selected_particles()
514 mol_coords[mol].append(
519 moldict[name].append(mol)
521 moldict[name] = [mol]
522 return moldict, mol_coords, mol_XYZs
524 def get_rmsd_and_assigments(self):
527 best_assignments = []
529 for molname, ref_mols
in self.moldict1.items():
531 for ref_mol
in ref_mols:
533 coord1 = [XYZ.get_coordinates()
534 for XYZ
in self.mol_XYZs1[ref_mol]]
536 coord1 = self.molcoords1[ref_mol]
541 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
543 for rmf_mol
in rmf_mols:
545 coord0 = [XYZ.get_coordinates()
546 for XYZ
in self.mol_XYZs0[rmf_mol]]
548 coord0 = self.molcoords0[rmf_mol]
552 rmf_mols_list.append(rmf_mols)
554 rmf_mols_best_order = rmf_mols_list[rmsd.index(m)]
556 for n, (ref_mol, rmf_mol)
in enumerate(
557 zip(ref_mols, rmf_mols_best_order)):
558 best_assignments.append((rmf_mol, ref_mol))
560 coord0 = [XYZ.get_coordinates()
561 for XYZ
in self.mol_XYZs0[rmf_mol]]
563 coord0 = self.molcoords0[rmf_mol]
566 coord1 = [XYZ.get_coordinates()
567 for XYZ
in self.mol_XYZs1[ref_mol]]
569 coord1 = self.molcoords1[ref_mol]
570 rmsd_pair = self.metric(coord1, coord0)
571 N = len(self.molcoords1[ref_mol])
573 total_rmsd += rmsd_pair*rmsd_pair*N
574 rmsd_dict[ref_mol] = rmsd_pair
575 total_rmsd = sqrt(total_rmsd/total_N)
576 return total_rmsd, best_assignments
579 """Returns output for IMP.pmi.output.Output object"""
580 total_rmsd, best_assignments = self.get_rmsd_and_assigments()
583 for rmf_mol, ref_mol
in best_assignments:
584 ref_name = ref_mol.get_name()
586 rmf_name = rmf_mol.get_name()
588 assignments_out.append(rmf_name +
"." + str(rmf_copy) +
"->" +
589 ref_name +
"." + str(ref_copy))
590 return {
"RMSD_" + self.label: str(total_rmsd),
591 "RMSD_assignments_" + self.label: str(assignments_out)}
595 """A class to evaluate the precision of an ensemble.
597 Also can evaluate the cross-precision of multiple ensembles.
598 Supports MPI for coordinate reading.
599 Recommended procedure:
600 -# initialize object and pass the selection for evaluating precision
601 -# call add_structures() to read in the data (specify group name)
602 -# call get_precision() to evaluate inter/intra precision
603 -# call get_rmsf() to evaluate within-group fluctuations
605 def __init__(self, model, resolution=1, selection_dictionary={}):
607 @param model The IMP Model
608 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
609 part of the hier name)
610 @param selection_dictionary Dictionary where keys are names for
611 selections and values are selection tuples for scoring
612 precision. "All" is automatically made as well
615 from mpi4py
import MPI
616 self.comm = MPI.COMM_WORLD
617 self.rank = self.comm.Get_rank()
618 self.number_of_processes = self.comm.size
620 self.number_of_processes = 1
623 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
624 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
625 self.style =
'pairwise_drmsd_k'
626 self.structures_dictionary = {}
627 self.reference_structures_dictionary = {}
629 self.protein_names =
None
630 self.len_particles_resolution_one =
None
632 self.rmf_names_frames = {}
633 self.reference_rmf_names_frames =
None
634 self.reference_structure =
None
635 self.reference_prot =
None
636 self.selection_dictionary = selection_dictionary
637 self.threshold = 40.0
638 self.residue_particle_index_map =
None
640 if resolution
in [1, 10]:
641 self.resolution = resolution
643 raise KeyError(
"Currently only allow resolution 1 or 10")
645 def _get_structure(self, rmf_frame_index, rmf_name):
646 """Read an RMF file and return the particles"""
647 rh = RMF.open_rmf_file_read_only(rmf_name)
649 print(
"getting coordinates for frame %i rmf file %s"
650 % (rmf_frame_index, rmf_name))
654 print(
"linking coordinates for frame %i rmf file %s"
655 % (rmf_frame_index, rmf_name))
660 if self.resolution == 1:
662 elif self.resolution == 10:
665 protein_names = list(particle_dict.keys())
666 particles_resolution_one = []
667 for k
in particle_dict:
668 particles_resolution_one += (particle_dict[k])
670 if self.protein_names
is None:
671 self.protein_names = protein_names
673 if self.protein_names != protein_names:
674 print(
"Error: the protein names of the new coordinate set "
675 "is not compatible with the previous one")
677 if self.len_particles_resolution_one
is None:
678 self.len_particles_resolution_one = len(particles_resolution_one)
680 if self.len_particles_resolution_one != \
681 len(particles_resolution_one):
682 raise ValueError(
"the new coordinate set is not compatible "
683 "with the previous one")
685 return particles_resolution_one
691 setup_index_map=
False):
692 """ Read a structure into the ensemble and store (as coordinates).
693 @param rmf_name The name of the RMF file
694 @param rmf_frame_index The frame to read
695 @param structure_set_name Name for the set that includes this structure
697 @param setup_index_map if requested, set up a dictionary to help
702 if structure_set_name
in self.structures_dictionary:
703 cdict = self.structures_dictionary[structure_set_name]
704 rmflist = self.rmf_names_frames[structure_set_name]
706 self.structures_dictionary[structure_set_name] = {}
707 self.rmf_names_frames[structure_set_name] = []
708 cdict = self.structures_dictionary[structure_set_name]
709 rmflist = self.rmf_names_frames[structure_set_name]
713 particles_resolution_one = self._get_structure(
714 rmf_frame_index, rmf_name)
716 print(
"something wrong with the rmf")
718 self.selection_dictionary.update({
"All": self.protein_names})
720 for selection_name
in self.selection_dictionary:
721 selection_tuple = self.selection_dictionary[selection_name]
722 coords = self._select_coordinates(
723 selection_tuple, particles_resolution_one, self.prots[0])
724 if selection_name
not in cdict:
725 cdict[selection_name] = [coords]
727 cdict[selection_name].append(coords)
729 rmflist.append((rmf_name, rmf_frame_index))
733 self.residue_particle_index_map = {}
734 for prot_name
in self.protein_names:
735 self.residue_particle_index_map[prot_name] = \
736 self._get_residue_particle_index_map(
737 prot_name, particles_resolution_one, self.prots[0])
740 rmf_name_frame_tuples,
742 """Read a list of RMFs, supports parallel
743 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
744 @param structure_set_name Name this set of structures
749 my_rmf_name_frame_tuples = IMP.pmi.tools.chunk_list_into_segments(
750 rmf_name_frame_tuples, self.number_of_processes)[self.rank]
751 for nfr, tup
in enumerate(my_rmf_name_frame_tuples):
753 rmf_frame_index = tup[1]
755 if self.residue_particle_index_map
is None:
756 setup_index_map =
True
758 setup_index_map =
False
765 if self.number_of_processes > 1:
767 self.rmf_names_frames)
769 self.comm.send(self.structures_dictionary, dest=0, tag=11)
771 for i
in range(1, self.number_of_processes):
772 data_tmp = self.comm.recv(source=i, tag=11)
773 for key
in self.structures_dictionary:
774 self.structures_dictionary[key].update(data_tmp[key])
775 for i
in range(1, self.number_of_processes):
776 self.comm.send(self.structures_dictionary, dest=i, tag=11)
778 self.structures_dictionary = self.comm.recv(source=0, tag=11)
780 def _get_residue_particle_index_map(self, prot_name, structure, hier):
782 residue_particle_index_map = []
784 all_selected_particles = s.get_selected_particles()
785 intersection = list(set(all_selected_particles) & set(structure))
786 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
787 for p
in sorted_intersection:
788 residue_particle_index_map.append(
790 return residue_particle_index_map
792 def _select_coordinates(self, tuple_selections, structure, prot):
793 selected_coordinates = []
794 for t
in tuple_selections:
795 if isinstance(t, tuple)
and len(t) == 3:
797 prot, molecules=[t[2]],
798 residue_indexes=range(t[0], t[1]+1), resolution=1)
799 all_selected_particles = s.get_selected_particles()
800 intersection = list(set(all_selected_particles)
802 sorted_intersection = \
803 IMP.pmi.tools.sort_by_residues(intersection)
805 for p
in sorted_intersection]
806 selected_coordinates += cc
807 elif isinstance(t, str):
809 all_selected_particles = s.get_selected_particles()
810 intersection = list(set(all_selected_particles)
812 sorted_intersection = \
813 IMP.pmi.tools.sort_by_residues(intersection)
815 for p
in sorted_intersection]
816 selected_coordinates += cc
818 raise ValueError(
"Selection error")
819 return selected_coordinates
821 def set_threshold(self, threshold):
822 self.threshold = threshold
824 def _get_distance(self, structure_set_name1, structure_set_name2,
825 selection_name, index1, index2):
826 """Compute distance between structures with various metrics"""
827 c1 = self.structures_dictionary[
828 structure_set_name1][selection_name][index1]
829 c2 = self.structures_dictionary[
830 structure_set_name2][selection_name][index2]
834 if self.style ==
'pairwise_drmsd_k':
836 if self.style ==
'pairwise_drms_k':
838 if self.style ==
'pairwise_drmsd_Q':
842 if self.style ==
'pairwise_rmsd':
846 def _get_particle_distances(self, structure_set_name1, structure_set_name2,
847 selection_name, index1, index2):
848 c1 = self.structures_dictionary[
849 structure_set_name1][selection_name][index1]
850 c2 = self.structures_dictionary[
851 structure_set_name2][selection_name][index2]
856 distances = [np.linalg.norm(a-b)
857 for (a, b)
in zip(coordinates1, coordinates2)]
862 outfile=
None, skip=1, selection_keywords=
None):
863 """ Evaluate the precision of two named structure groups. Supports MPI.
864 When the structure_set_name1 is different from the structure_set_name2,
865 this evaluates the cross-precision (average pairwise distances).
866 @param outfile Name of the precision output file
867 @param structure_set_name1 string name of the first structure set
868 @param structure_set_name2 string name of the second structure set
869 @param skip analyze every (skip) structure for the distance
871 @param selection_keywords Specify the selection name you want to
872 calculate on. By default this is computed for everything
873 you provided in the constructor, plus all the subunits together.
875 if selection_keywords
is None:
876 sel_keys = list(self.selection_dictionary.keys())
878 for k
in selection_keywords:
879 if k
not in self.selection_dictionary:
881 "you are trying to find named selection "
882 + k +
" which was not requested in the constructor")
883 sel_keys = selection_keywords
885 if outfile
is not None:
886 of = open(outfile,
"w")
888 for selection_name
in sel_keys:
889 number_of_structures_1 = len(self.structures_dictionary[
890 structure_set_name1][selection_name])
891 number_of_structures_2 = len(self.structures_dictionary[
892 structure_set_name2][selection_name])
894 structure_pointers_1 = list(range(0, number_of_structures_1, skip))
895 structure_pointers_2 = list(range(0, number_of_structures_2, skip))
896 pair_combination_list = list(
897 itertools.product(structure_pointers_1, structure_pointers_2))
898 if len(pair_combination_list) == 0:
899 raise ValueError(
"no structure selected. Check the "
903 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
904 pair_combination_list, self.number_of_processes)[self.rank]
905 for n, pair
in enumerate(my_pair_combination_list):
906 distances[pair] = self._get_distance(
907 structure_set_name1, structure_set_name2, selection_name,
909 if self.number_of_processes > 1:
914 if structure_set_name1 == structure_set_name2:
915 structure_pointers = structure_pointers_1
916 number_of_structures = number_of_structures_1
921 distances_to_structure = {}
922 distances_to_structure_normalization = {}
924 for n
in structure_pointers:
925 distances_to_structure[n] = 0.0
926 distances_to_structure_normalization[n] = 0
929 distance += distances[k]
930 distances_to_structure[k[0]] += distances[k]
931 distances_to_structure[k[1]] += distances[k]
932 distances_to_structure_normalization[k[0]] += 1
933 distances_to_structure_normalization[k[1]] += 1
935 for n
in structure_pointers:
936 distances_to_structure[n] = \
937 distances_to_structure[n] \
938 / distances_to_structure_normalization[n]
940 min_distance = min([distances_to_structure[n]
941 for n
in distances_to_structure])
942 centroid_index = [k
for k, v
in
943 distances_to_structure.items()
944 if v == min_distance][0]
945 centroid_rmf_name = \
946 self.rmf_names_frames[
947 structure_set_name1][centroid_index]
949 centroid_distance = 0.0
951 for n
in range(number_of_structures):
952 dist = self._get_distance(
953 structure_set_name1, structure_set_name1,
954 selection_name, centroid_index, n)
955 centroid_distance += dist
956 distance_list.append(dist)
958 centroid_distance /= number_of_structures
959 if outfile
is not None:
960 of.write(str(selection_name) +
" " +
961 structure_set_name1 +
962 " average centroid distance " +
963 str(centroid_distance) +
"\n")
964 of.write(str(selection_name) +
" " +
965 structure_set_name1 +
" centroid index " +
966 str(centroid_index) +
"\n")
967 of.write(str(selection_name) +
" " +
968 structure_set_name1 +
" centroid rmf name " +
969 str(centroid_rmf_name) +
"\n")
970 of.write(str(selection_name) +
" " +
971 structure_set_name1 +
972 " median centroid distance " +
973 str(np.median(distance_list)) +
"\n")
975 average_pairwise_distances = sum(
976 distances.values())/len(list(distances.values()))
977 if outfile
is not None:
978 of.write(str(selection_name) +
" " + structure_set_name1 +
979 " " + structure_set_name2 +
980 " average pairwise distance " +
981 str(average_pairwise_distances) +
"\n")
982 if outfile
is not None:
984 return centroid_index
986 def get_rmsf(self, structure_set_name, outdir="./", skip=1,
987 set_plot_yaxis_range=
None):
988 """ Calculate the residue mean square fluctuations (RMSF).
989 Automatically outputs as data file and pdf
990 @param structure_set_name Which structure set to calculate RMSF for
991 @param outdir Where to write the files
992 @param skip Skip this number of structures
993 @param set_plot_yaxis_range In case you need to change the plot
1001 for sel_name
in self.protein_names:
1002 self.selection_dictionary.update({sel_name: [sel_name]})
1004 number_of_structures = \
1005 len(self.structures_dictionary[
1006 structure_set_name][sel_name])
1010 rpim = self.residue_particle_index_map[sel_name]
1011 outfile = outdir+
"/rmsf."+sel_name+
".dat"
1012 of = open(outfile,
"w")
1013 residue_distances = {}
1015 for index
in range(number_of_structures):
1016 distances = self._get_particle_distances(
1017 structure_set_name, structure_set_name, sel_name,
1018 centroid_index, index)
1019 for nblock, block
in enumerate(rpim):
1020 for residue_number
in block:
1021 residue_nblock[residue_number] = nblock
1022 if residue_number
not in residue_distances:
1023 residue_distances[residue_number] = \
1027 residue_number].append(distances[nblock])
1031 for rn
in residue_distances:
1033 rmsf = np.std(residue_distances[rn])
1035 of.write(str(rn) +
" " + str(residue_nblock[rn])
1036 +
" " + str(rmsf) +
"\n")
1038 IMP.pmi.output.plot_xy_data(
1039 residues, rmsfs, title=sel_name,
1040 out_fn=outdir+
"/rmsf."+sel_name, display=
False,
1041 set_plot_yaxis_range=set_plot_yaxis_range,
1042 xlabel=
'Residue Number', ylabel=
'Standard error')
1046 """Read in a structure used for reference computation.
1047 Needed before calling get_average_distance_wrt_reference_structure()
1048 @param rmf_name The RMF file to read the reference
1049 @param rmf_frame_index The index in that file
1051 particles_resolution_one = self._get_structure(rmf_frame_index,
1053 self.reference_rmf_names_frames = (rmf_name, rmf_frame_index)
1055 for selection_name
in self.selection_dictionary:
1056 selection_tuple = self.selection_dictionary[selection_name]
1057 coords = self._select_coordinates(
1058 selection_tuple, particles_resolution_one, self.prots[0])
1059 self.reference_structures_dictionary[selection_name] = coords
1062 self, structure_set_name, alignment_selection_key):
1063 """First align then calculate RMSD
1064 @param structure_set_name: the name of the structure set
1065 @param alignment_selection_key: the key containing the selection tuples
1066 needed to make the alignment stored in self.selection_dictionary
1067 @return: for each structure in the structure set, returns the rmsd
1070 if self.reference_structures_dictionary == {}:
1071 print(
"Cannot compute until you set a reference structure")
1074 align_reference_coordinates = \
1075 self.reference_structures_dictionary[alignment_selection_key]
1076 align_coordinates = self.structures_dictionary[
1077 structure_set_name][alignment_selection_key]
1078 transformations = []
1079 for c
in align_coordinates:
1081 {
"All": align_reference_coordinates}, {
"All": c})
1082 transformation = Ali.align()[1]
1083 transformations.append(transformation)
1084 for selection_name
in self.selection_dictionary:
1085 reference_coordinates = \
1086 self.reference_structures_dictionary[selection_name]
1088 for c
in reference_coordinates]
1090 for n, sc
in enumerate(self.structures_dictionary[
1091 structure_set_name][selection_name]):
1096 distances.append(distance)
1097 ret[selection_name] = {
1098 'all_distances': distances,
1099 'average_distance': sum(distances)/len(distances),
1100 'minimum_distance': min(distances)}
1103 def _get_median(self, list_of_values):
1104 return np.median(np.array(list_of_values))
1107 """Compare the structure set to the reference structure.
1108 @param structure_set_name The structure set to compute this on
1109 @note First call set_reference_structure()
1112 if self.reference_structures_dictionary == {}:
1113 print(
"Cannot compute until you set a reference structure")
1115 for selection_name
in self.selection_dictionary:
1116 reference_coordinates = self.reference_structures_dictionary[
1119 for c
in reference_coordinates]
1121 for sc
in self.structures_dictionary[
1122 structure_set_name][selection_name]:
1124 if self.style ==
'pairwise_drmsd_k':
1126 if self.style ==
'pairwise_drms_k':
1128 if self.style ==
'pairwise_drmsd_Q':
1130 coordinates1, coordinates2, self.threshold)
1131 if self.style ==
'pairwise_rmsd':
1133 distances.append(distance)
1135 print(selection_name,
"average distance",
1136 sum(distances)/len(distances),
"minimum distance",
1137 min(distances),
'nframes', len(distances))
1138 ret[selection_name] = {
1139 'average_distance': sum(distances)/len(distances),
1140 'minimum_distance': min(distances)}
1143 def get_coordinates(self):
1146 def set_precision_style(self, style):
1147 if style
in self.styles:
1150 raise ValueError(
"No such style")
1154 """Compute mean density maps from structures.
1156 Keeps a dictionary of density maps,
1157 keys are in the custom ranges. When you call add_subunits_density, it adds
1158 particle coordinates to the existing density maps.
1161 def __init__(self, custom_ranges, representation=None, resolution=20.0,
1164 @param custom_ranges Required. It's a dictionary, keys are the
1165 density component names, values are selection tuples
1166 e.g. {'kin28':[['kin28',1,-1]],
1167 'density_name_1' :[('ccl1')],
1168 'density_name_2' :[(1,142,'tfb3d1'),
1169 (143,700,'tfb3d2')],
1170 @param resolution The MRC resolution of the output map (in
1172 @param voxel The voxel size for the output map (lower is slower)
1177 "The 'representation' argument is deprecated "
1178 "(and is ignored). Pass hierarchies to "
1179 "add_subunits_density() instead.")
1180 self.MRCresolution = resolution
1183 self.count_models = 0.0
1184 self.custom_ranges = custom_ranges
1187 """Add a frame to the densities.
1188 @param hierarchy The hierarchy to add.
1190 self.count_models += 1.0
1193 all_particles_by_resolution = []
1194 for name
in part_dict:
1195 all_particles_by_resolution += part_dict[name]
1197 for density_name
in self.custom_ranges:
1199 all_particles_by_segments = []
1201 for seg
in self.custom_ranges[density_name]:
1202 if isinstance(seg, str):
1204 elif isinstance(seg, tuple)
and len(seg) == 2:
1206 hierarchy, molecule=seg[0], copy_index=seg[1])
1207 elif isinstance(seg, tuple)
and len(seg) == 3:
1209 hierarchy, molecule=seg[2],
1210 residue_indexes=range(seg[0], seg[1] + 1))
1211 elif isinstance(seg, tuple)
and len(seg) == 4:
1213 hierarchy, molecule=seg[2],
1214 residue_indexes=range(seg[0], seg[1] + 1),
1217 raise Exception(
'could not understand selection tuple '
1220 all_particles_by_segments += s.get_selected_particles()
1221 parts = all_particles_by_segments
1222 self._create_density_from_particles(parts, density_name)
1224 def normalize_density(self):
1227 def _create_density_from_particles(self, ps, name,
1228 kernel_type=
'GAUSSIAN'):
1229 '''Internal function for adding to densities.
1230 pass XYZR particles with mass and create a density from them.
1231 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1234 dmap.set_was_used(
True)
1235 if name
not in self.densities:
1236 self.densities[name] = dmap
1242 dmap3.set_was_used(
True)
1244 dmap3.add(self.densities[name])
1245 self.densities[name] = dmap3
1247 def get_density_keys(self):
1248 return list(self.densities.keys())
1251 """Get the current density for some component name"""
1252 if name
not in self.densities:
1255 return self.densities[name]
1257 def write_mrc(self, path="./", suffix=None):
1259 for density_name
in self.densities:
1260 self.densities[density_name].
multiply(1. / self.count_models)
1262 name = path +
"/" + density_name +
".mrc"
1264 name = path +
"/" + density_name +
"." + suffix +
".mrc"
1265 path, file = os.path.split(name)
1266 os.makedirs(path, exist_ok=
True)
1267 IMP.em.write_map(self.densities[density_name], name,
1271 class GetContactMap:
1274 self.distance = distance
1275 self.contactmap =
''
1282 def set_prot(self, prot):
1288 for name
in particles_dictionary:
1289 residue_indexes = []
1290 for p
in particles_dictionary[name]:
1294 if len(residue_indexes) != 0:
1295 self.protnames.append(name)
1297 def get_subunit_coords(self, frame, align=0):
1298 from scipy.spatial.distance
import cdist
1302 for part
in self.prot.get_children():
1305 for chl
in part.get_children():
1309 SortedSegments.append((chl, startres))
1310 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1312 for sgmnt
in SortedSegments:
1315 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1318 radii.append(p.get_radius())
1320 new_name = part.get_name() +
'_' + sgmnt[0].get_name() + \
1323 .get_residue_indexes()[0])
1324 namelist.append(new_name)
1325 self.expanded[new_name] = len(
1327 if part.get_name()
not in self.resmap:
1328 self.resmap[part.get_name()] = {}
1330 self.resmap[part.get_name()][res] = new_name
1332 coords = np.array(coords)
1333 radii = np.array(radii)
1334 if len(self.namelist) == 0:
1335 self.namelist = namelist
1336 self.contactmap = np.zeros((len(coords), len(coords)))
1337 distances = cdist(coords, coords)
1338 distances = (distances - radii).T - radii
1339 distances = distances <= self.distance
1340 self.contactmap += distances
1342 def add_xlinks(self, filename,
1343 identification_string=
'ISDCrossLinkMS_Distance_'):
1345 with open(filename)
as data:
1346 D = data.readlines()
1349 if identification_string
in d:
1350 d = d.replace(
"_",
" ").replace(
"-",
" ").replace(
1353 t1, t2 = (d[0], d[1]), (d[1], d[0])
1354 if t1
not in self.XL:
1355 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1356 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1358 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1359 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1361 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1365 proteins = self.protnames
1370 proteins = [p.get_name()
for p
in self.prot.get_children()]
1372 for p1
in range(len(proteins)):
1373 for p2
in range(p1, len(proteins)):
1374 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1375 max(self.resmap[proteins[p2]].keys()))
1376 pn1, pn2 = proteins[p1], proteins[p2]
1377 mtr = np.zeros((pl1 + 1, pl2 + 1))
1378 print(
'Creating matrix for: ',
1379 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1380 for i1
in range(1, pl1 + 1):
1381 for i2
in range(1, pl2 + 1):
1383 r1 = K.index(self.resmap[pn1][i1])
1384 r2 = K.index(self.resmap[pn2][i2])
1386 mtr[i1 - 1, i2 - 1] = r
1388 missing.append((pn1, pn2, i1, i2))
1389 Matrices[(pn1, pn2)] = mtr
1394 raise ValueError(
"cross-links were not provided, use "
1395 "add_xlinks function!")
1397 for p1
in range(len(proteins)):
1398 for p2
in range(p1, len(proteins)):
1399 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1400 max(self.resmap[proteins[p2]].keys()))
1401 pn1, pn2 = proteins[p1], proteins[p2]
1402 mtr = np.zeros((pl1 + 1, pl2 + 1))
1405 xls = self.XL[(pn1, pn2)]
1408 xls = self.XL[(pn2, pn1)]
1413 print(
'Creating matrix for: ',
1414 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1415 for xl1, xl2
in xls:
1417 print(
'X' * 10, xl1, xl2)
1420 print(
'X' * 10, xl1, xl2)
1422 mtr[xl1 - 1, xl2 - 1] = 100
1424 print(
'Creating matrix for: ',
1425 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1426 for xl1, xl2
in xls:
1428 print(
'X' * 10, xl1, xl2)
1431 print(
'X' * 10, xl1, xl2)
1433 mtr[xl2 - 1, xl1 - 1] = 100
1435 print(
'No cross-links between: ', pn1, pn2)
1436 Matrices_xl[(pn1, pn2)] = mtr
1443 for i, c
in enumerate(C):
1444 cl = max(self.resmap[c].keys())
1449 R.append(R[-1] + cl)
1454 import matplotlib
as mpl
1456 import matplotlib.pyplot
as plt
1457 import matplotlib.gridspec
as gridspec
1458 import scipy.sparse
as sparse
1461 gs = gridspec.GridSpec(len(W), len(W),
1466 for x1, r1
in enumerate(R):
1467 for x2, r2
in enumerate(R):
1468 ax = plt.subplot(gs[cnt])
1471 mtr = Matrices[(C[x1], C[x2])]
1473 mtr = Matrices[(C[x2], C[x1])].T
1476 interpolation=
'nearest',
1478 vmax=log(mtr.max())
if mtr.max() > 0.
else 1.)
1483 mtr = Matrices_xl[(C[x1], C[x2])]
1485 mtr = Matrices_xl[(C[x2], C[x1])].T
1487 sparse.csr_matrix(mtr),
1497 ax.set_ylabel(C[x1], rotation=90)
1499 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1507 def get_hiers_from_rmf(model, frame_number, rmf_file):
1509 print(
"getting coordinates for frame %i rmf file %s"
1510 % (frame_number, rmf_file))
1513 rh = RMF.open_rmf_file_read_only(rmf_file)
1518 print(
"Unable to open rmf file %s" % (rmf_file))
1524 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1532 def link_hiers_to_rmf(model, hiers, frame_number, rmf_file):
1533 print(
"linking hierarchies for frame %i rmf file %s"
1534 % (frame_number, rmf_file))
1535 rh = RMF.open_rmf_file_read_only(rmf_file)
1540 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1547 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1549 print(
"getting coordinates for frame %i rmf file %s"
1550 % (frame_number, rmf_file))
1553 rh = RMF.open_rmf_file_read_only(rmf_file)
1559 print(
"Unable to open rmf file %s" % (rmf_file))
1564 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1572 def link_hiers_and_restraints_to_rmf(model, hiers, rs, frame_number, rmf_file):
1573 print(
"linking hierarchies for frame %i rmf file %s"
1574 % (frame_number, rmf_file))
1575 rh = RMF.open_rmf_file_read_only(rmf_file)
1581 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1589 """Get particles at res 1, or any beads, based on the name.
1590 No Representation is needed. This is mainly used when the hierarchy
1591 is read from an RMF file.
1593 @return a dictionary of component names and their particles
1594 @note If the root node is named "System" or is a "State", do
1600 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1602 particle_dict[mol.get_name()] = sel.get_selected_particles()
1603 return particle_dict
1607 """Get particles at res 10, or any beads, based on the name.
1608 No Representation is needed.
1609 This is mainly used when the hierarchy is read from an RMF file.
1611 @return a dictionary of component names and their particles
1612 @note If the root node is named "System" or is a "State", do proper
1617 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1619 particle_dict[mol.get_name()] = sel.get_selected_particles()
1620 return particle_dict
1624 """Visualization of cross-links"""
1626 self.crosslinks = []
1627 self.external_csv_data =
None
1628 self.crosslinkedprots = set()
1629 self.mindist = +10000000.0
1630 self.maxdist = -10000000.0
1631 self.contactmap =
None
1633 def set_hierarchy(self, prot):
1634 self.prot_length_dict = {}
1635 self.model = prot.get_model()
1637 for i
in prot.get_children():
1639 residue_indexes = []
1643 if len(residue_indexes) != 0:
1644 self.prot_length_dict[name] = max(residue_indexes)
1646 def set_coordinates_for_contact_map(self, rmf_name, rmf_frame_index):
1647 from scipy.spatial.distance
import cdist
1649 rh = RMF.open_rmf_file_read_only(rmf_name)
1652 print(
"getting coordinates for frame %i rmf file %s"
1653 % (rmf_frame_index, rmf_name))
1662 self.index_dictionary = {}
1664 for name
in particles_dictionary:
1665 residue_indexes = []
1666 for p
in particles_dictionary[name]:
1670 if len(residue_indexes) != 0:
1672 for res
in range(min(residue_indexes),
1673 max(residue_indexes) + 1):
1676 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1678 radii.append(d.get_radius())
1679 if name
not in self.index_dictionary:
1680 self.index_dictionary[name] = [resindex]
1682 self.index_dictionary[name].append(resindex)
1685 coords = np.array(coords)
1686 radii = np.array(radii)
1688 distances = cdist(coords, coords)
1689 distances = (distances - radii).T - radii
1691 distances = np.where(distances <= 20.0, 1.0, 0)
1692 if self.contactmap
is None:
1693 self.contactmap = np.zeros((len(coords), len(coords)))
1694 self.contactmap += distances
1697 IMP.atom.destroy(prot)
1700 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1701 mapping=
None, filter_label=
None,
1703 filter_rmf_file_names=
None, external_csv_data_file=
None,
1704 external_csv_data_file_unique_id_key=
"Unique ID"):
1715 keys = po.get_keys()
1717 xl_keys = [k
for k
in keys
if search_label
in k]
1719 if filter_rmf_file_names
is not None:
1720 rmf_file_key =
"local_rmf_file_name"
1721 fs = po.get_fields(xl_keys+[rmf_file_key])
1723 fs = po.get_fields(xl_keys)
1726 self.cross_link_frequency = {}
1730 self.cross_link_distances = {}
1734 self.cross_link_distances_unique = {}
1736 if external_csv_data_file
is not None:
1739 self.external_csv_data = {}
1740 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1743 self.external_csv_data[
1744 xl[external_csv_data_file_unique_id_key]] = xl
1749 cross_link_frequency_list = []
1751 self.unique_cross_link_list = []
1755 keysplit = key.replace(
"_",
" ").replace(
"-",
" ").replace(
1758 if filter_label
is not None:
1759 if filter_label
not in keysplit:
1763 r1 = int(keysplit[5])
1765 r2 = int(keysplit[7])
1768 confidence = keysplit[12]
1772 unique_identifier = keysplit[3]
1774 unique_identifier =
'0'
1776 r1 = int(keysplit[mapping[
"Residue1"]])
1777 c1 = keysplit[mapping[
"Protein1"]]
1778 r2 = int(keysplit[mapping[
"Residue2"]])
1779 c2 = keysplit[mapping[
"Protein2"]]
1781 confidence = keysplit[mapping[
"Confidence"]]
1785 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1787 unique_identifier =
'0'
1789 self.crosslinkedprots.add(c1)
1790 self.crosslinkedprots.add(c2)
1796 if filter_rmf_file_names
is not None:
1797 for n, d
in enumerate(fs[key]):
1798 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1799 dists.append(float(d))
1801 dists = [float(f)
for f
in fs[key]]
1806 mdist = self.median(dists)
1808 stdv = np.std(np.array(dists))
1809 if self.mindist > mdist:
1810 self.mindist = mdist
1811 if self.maxdist < mdist:
1812 self.maxdist = mdist
1816 if (r1, c1, r2, c2, mdist)
not in cross_link_frequency_list:
1817 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1818 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1819 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1821 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1822 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1823 cross_link_frequency_list.append((r1, c1, r2, c2))
1824 cross_link_frequency_list.append((r2, c2, r1, c1))
1825 self.unique_cross_link_list.append(
1826 (r1, c1, r2, c2, mdist))
1828 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1829 self.cross_link_distances[(
1830 r1, c1, r2, c2, mdist, confidence)] = dists
1831 self.cross_link_distances[(
1832 r2, c2, r1, c1, mdist, confidence)] = dists
1833 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1835 self.cross_link_distances[(
1836 r2, c2, r1, c1, mdist, confidence)] += dists
1837 self.cross_link_distances[(
1838 r1, c1, r2, c2, mdist, confidence)] += dists
1840 self.crosslinks.append(
1841 (r1, c1, r2, c2, mdist, stdv, confidence, unique_identifier,
1843 self.crosslinks.append(
1844 (r2, c2, r1, c1, mdist, stdv, confidence, unique_identifier,
1847 self.cross_link_frequency_inverted = {}
1848 for xl
in self.unique_cross_link_list:
1849 (r1, c1, r2, c2, mdist) = xl
1850 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1851 if frequency
not in self.cross_link_frequency_inverted:
1852 self.cross_link_frequency_inverted[
1853 frequency] = [(r1, c1, r2, c2)]
1855 self.cross_link_frequency_inverted[
1856 frequency].append((r1, c1, r2, c2))
1860 def median(self, mylist):
1861 sorts = sorted(mylist)
1867 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1868 return sorts[length / 2]
1870 def set_threshold(self, threshold):
1871 self.threshold = threshold
1873 def set_tolerance(self, tolerance):
1874 self.tolerance = tolerance
1876 def colormap(self, dist):
1877 if dist < self.threshold - self.tolerance:
1879 elif dist >= self.threshold + self.tolerance:
1884 def write_cross_link_database(self, filename, format='csv'):
1888 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1889 "Median Distance",
"Standard Deviation",
"Confidence",
1890 "Frequency",
"Arrangement"]
1892 if self.external_csv_data
is not None:
1893 keys = list(self.external_csv_data.keys())
1894 innerkeys = list(self.external_csv_data[keys[0]].keys())
1896 fieldnames += innerkeys
1898 dw = csv.DictWriter(
1899 open(filename,
"w"),
1901 fieldnames=fieldnames)
1903 for xl
in self.crosslinks:
1904 (r1, c1, r2, c2, mdist, stdv, confidence,
1905 unique_identifier, descriptor) = xl
1906 if descriptor ==
'original':
1908 outdict[
"Unique ID"] = unique_identifier
1909 outdict[
"Protein1"] = c1
1910 outdict[
"Protein2"] = c2
1911 outdict[
"Residue1"] = r1
1912 outdict[
"Residue2"] = r2
1913 outdict[
"Median Distance"] = mdist
1914 outdict[
"Standard Deviation"] = stdv
1915 outdict[
"Confidence"] = confidence
1916 outdict[
"Frequency"] = self.cross_link_frequency[
1919 arrangement =
"Intra"
1921 arrangement =
"Inter"
1922 outdict[
"Arrangement"] = arrangement
1923 if self.external_csv_data
is not None:
1924 outdict.update(self.external_csv_data[unique_identifier])
1926 dw.writerow(outdict)
1928 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1929 no_confidence_info=
False, filter=
None, layout=
"whole",
1930 crosslinkedonly=
False, filename=
None, confidence_classes=
None,
1931 alphablend=0.1, scale_symbol_size=1.0, gap_between_components=0,
1932 rename_protein_map=
None):
1947 import matplotlib
as mpl
1949 import matplotlib.pyplot
as plt
1950 import matplotlib.cm
as cm
1952 fig = plt.figure(figsize=(10, 10))
1953 ax = fig.add_subplot(111)
1959 if prot_listx
is None:
1961 prot_listx = list(self.crosslinkedprots)
1963 prot_listx = list(self.prot_length_dict.keys())
1966 nresx = gap_between_components + \
1967 sum([self.prot_length_dict[name]
1968 + gap_between_components
for name
in prot_listx])
1972 if prot_listy
is None:
1974 prot_listy = list(self.crosslinkedprots)
1976 prot_listy = list(self.prot_length_dict.keys())
1979 nresy = gap_between_components + \
1980 sum([self.prot_length_dict[name]
1981 + gap_between_components
for name
in prot_listy])
1986 res = gap_between_components
1987 for prot
in prot_listx:
1988 resoffsetx[prot] = res
1989 res += self.prot_length_dict[prot]
1991 res += gap_between_components
1995 res = gap_between_components
1996 for prot
in prot_listy:
1997 resoffsety[prot] = res
1998 res += self.prot_length_dict[prot]
2000 res += gap_between_components
2002 resoffsetdiagonal = {}
2003 res = gap_between_components
2004 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2005 resoffsetdiagonal[prot] = res
2006 res += self.prot_length_dict[prot]
2007 res += gap_between_components
2013 for n, prot
in enumerate(prot_listx):
2014 res = resoffsetx[prot]
2016 for proty
in prot_listy:
2017 resy = resoffsety[proty]
2018 endy = resendy[proty]
2019 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2020 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2021 xticks.append((float(res) + float(end)) / 2)
2022 if rename_protein_map
is not None:
2023 if prot
in rename_protein_map:
2024 prot = rename_protein_map[prot]
2025 xlabels.append(prot)
2029 for n, prot
in enumerate(prot_listy):
2030 res = resoffsety[prot]
2032 for protx
in prot_listx:
2033 resx = resoffsetx[protx]
2034 endx = resendx[protx]
2035 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2036 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2037 yticks.append((float(res) + float(end)) / 2)
2038 if rename_protein_map
is not None:
2039 if prot
in rename_protein_map:
2040 prot = rename_protein_map[prot]
2041 ylabels.append(prot)
2044 print(prot_listx, prot_listy)
2046 if self.contactmap
is not None:
2047 tmp_array = np.zeros((nresx, nresy))
2049 for px
in prot_listx:
2051 for py
in prot_listy:
2053 resx = resoffsety[px]
2054 lengx = resendx[px] - 1
2055 resy = resoffsety[py]
2056 lengy = resendy[py] - 1
2057 indexes_x = self.index_dictionary[px]
2058 minx = min(indexes_x)
2059 maxx = max(indexes_x)
2060 indexes_y = self.index_dictionary[py]
2061 miny = min(indexes_y)
2062 maxy = max(indexes_y)
2064 print(px, py, minx, maxx, miny, maxy)
2069 resy:lengy] = self.contactmap[
2075 ax.imshow(tmp_array,
2078 interpolation=
'nearest')
2080 ax.set_xticks(xticks)
2081 ax.set_xticklabels(xlabels, rotation=90)
2082 ax.set_yticks(yticks)
2083 ax.set_yticklabels(ylabels)
2084 ax.set_xlim(0, nresx)
2085 ax.set_ylim(0, nresy)
2089 already_added_xls = []
2091 for xl
in self.crosslinks:
2093 (r1, c1, r2, c2, mdist, stdv, confidence,
2094 unique_identifier, descriptor) = xl
2096 if confidence_classes
is not None:
2097 if confidence
not in confidence_classes:
2101 pos1 = r1 + resoffsetx[c1]
2105 pos2 = r2 + resoffsety[c2]
2109 if filter
is not None:
2110 xldb = self.external_csv_data[unique_identifier]
2111 xldb.update({
"Distance": mdist})
2112 xldb.update({
"Distance_stdv": stdv})
2114 if filter[1] ==
">":
2115 if float(xldb[filter[0]]) <= float(filter[2]):
2118 if filter[1] ==
"<":
2119 if float(xldb[filter[0]]) >= float(filter[2]):
2122 if filter[1] ==
"==":
2123 if float(xldb[filter[0]]) != float(filter[2]):
2129 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2130 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2132 if layout ==
'lowerdiagonal':
2133 if pos_for_diagonal1 <= pos_for_diagonal2:
2135 if layout ==
'upperdiagonal':
2136 if pos_for_diagonal1 >= pos_for_diagonal2:
2139 already_added_xls.append((r1, c1, r2, c2))
2141 if not no_confidence_info:
2142 if confidence ==
'0.01':
2143 markersize = 14 * scale_symbol_size
2144 elif confidence ==
'0.05':
2145 markersize = 9 * scale_symbol_size
2146 elif confidence ==
'0.1':
2147 markersize = 6 * scale_symbol_size
2149 markersize = 15 * scale_symbol_size
2151 markersize = 5 * scale_symbol_size
2153 if not no_dist_info:
2154 color = self.colormap(mdist)
2164 markersize=markersize)
2166 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2168 for i
in ax.spines.values():
2169 i.set_linewidth(2.0)
2172 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2176 def get_frequency_statistics(self, prot_list,
2179 violated_histogram = {}
2180 satisfied_histogram = {}
2181 unique_cross_links = []
2183 for xl
in self.unique_cross_link_list:
2184 (r1, c1, r2, c2, mdist) = xl
2187 if prot_list2
is None:
2188 if c1
not in prot_list:
2190 if c2
not in prot_list:
2193 if c1
in prot_list
and c2
in prot_list2:
2195 elif c1
in prot_list2
and c2
in prot_list:
2200 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2202 if (r1, c1, r2, c2)
not in unique_cross_links:
2204 if frequency
not in violated_histogram:
2205 violated_histogram[frequency] = 1
2207 violated_histogram[frequency] += 1
2209 if frequency
not in satisfied_histogram:
2210 satisfied_histogram[frequency] = 1
2212 satisfied_histogram[frequency] += 1
2213 unique_cross_links.append((r1, c1, r2, c2))
2214 unique_cross_links.append((r2, c2, r1, c1))
2216 print(
"# satisfied")
2218 total_number_of_crosslinks = 0
2220 for i
in satisfied_histogram:
2224 if i
in violated_histogram:
2225 print(i, violated_histogram[i]+satisfied_histogram[i])
2227 print(i, satisfied_histogram[i])
2228 total_number_of_crosslinks += i*satisfied_histogram[i]
2232 for i
in violated_histogram:
2233 print(i, violated_histogram[i])
2234 total_number_of_crosslinks += i*violated_histogram[i]
2236 print(total_number_of_crosslinks)
2238 def print_cross_link_binary_symbols(self, prot_list,
2241 confidence_list = []
2242 for xl
in self.crosslinks:
2243 (r1, c1, r2, c2, mdist, stdv, confidence,
2244 unique_identifier, descriptor) = xl
2246 if prot_list2
is None:
2247 if c1
not in prot_list:
2249 if c2
not in prot_list:
2252 if c1
in prot_list
and c2
in prot_list2:
2254 elif c1
in prot_list2
and c2
in prot_list:
2259 if descriptor !=
"original":
2262 confidence_list.append(confidence)
2264 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2265 tmp_dist_binary = []
2268 tmp_dist_binary.append(1)
2270 tmp_dist_binary.append(0)
2271 tmp_matrix.append(tmp_dist_binary)
2273 matrix = list(zip(*tmp_matrix))
2275 satisfied_high_sum = 0
2276 satisfied_mid_sum = 0
2277 satisfied_low_sum = 0
2278 total_satisfied_sum = 0
2279 for k, m
in enumerate(matrix):
2288 for n, b
in enumerate(m):
2289 if confidence_list[n] ==
"0.01":
2293 satisfied_high_sum += 1
2294 elif confidence_list[n] ==
"0.05":
2298 satisfied_mid_sum += 1
2299 elif confidence_list[n] ==
"0.1":
2303 satisfied_low_sum += 1
2305 total_satisfied += 1
2306 total_satisfied_sum += 1
2308 print(k, satisfied_high, total_high)
2309 print(k, satisfied_mid, total_mid)
2310 print(k, satisfied_low, total_low)
2311 print(k, total_satisfied, total)
2312 print(float(satisfied_high_sum) / len(matrix))
2313 print(float(satisfied_mid_sum) / len(matrix))
2314 print(float(satisfied_low_sum) / len(matrix))
2317 def get_unique_crosslinks_statistics(self, prot_list,
2330 satisfied_string = []
2331 for xl
in self.crosslinks:
2332 (r1, c1, r2, c2, mdist, stdv, confidence,
2333 unique_identifier, descriptor) = xl
2335 if prot_list2
is None:
2336 if c1
not in prot_list:
2338 if c2
not in prot_list:
2341 if c1
in prot_list
and c2
in prot_list2:
2343 elif c1
in prot_list2
and c2
in prot_list:
2348 if descriptor !=
"original":
2352 if confidence ==
"0.01":
2356 if confidence ==
"0.05":
2360 if confidence ==
"0.1":
2365 satisfied_string.append(1)
2367 satisfied_string.append(0)
2369 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2370 tmp_dist_binary = []
2373 tmp_dist_binary.append(1)
2375 tmp_dist_binary.append(0)
2376 tmp_matrix.append(tmp_dist_binary)
2378 print(
"unique satisfied_high/total_high", satisfied_high,
"/",
2380 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2381 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2382 print(
"total", total)
2384 matrix = list(zip(*tmp_matrix))
2385 satisfied_models = 0
2387 for b
in satisfied_string:
2394 all_satisfied =
True
2396 for n, b
in enumerate(m):
2401 if b == 1
and satisfied_string[n] == 1:
2403 elif b == 1
and satisfied_string[n] == 0:
2405 elif b == 0
and satisfied_string[n] == 0:
2407 elif b == 0
and satisfied_string[n] == 1:
2408 all_satisfied =
False
2410 satisfied_models += 1
2412 print(satstr, all_satisfied)
2413 print(
"models that satisfies the median satisfied cross-links/total "
2414 "models", satisfied_models, len(matrix))
2416 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2419 import matplotlib
as mpl
2421 import matplotlib.pylab
as pl
2424 for kw
in self.cross_link_distances_unique:
2425 (r1, c1, r2, c2) = kw
2426 dists = self.cross_link_distances_unique[kw]
2428 if prot_list2
is None:
2429 if c1
not in prot_list:
2431 if c2
not in prot_list:
2434 if c1
in prot_list
and c2
in prot_list2:
2436 elif c1
in prot_list2
and c2
in prot_list:
2441 dists.append(sum(dists))
2442 tmp_matrix.append(dists)
2444 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2447 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2449 for i
in range(len(tmp_matrix)):
2450 for k
in range(len(tmp_matrix[i]) - 1):
2451 matrix[i][k] = tmp_matrix[i][k]
2456 ax = fig.add_subplot(211)
2458 cax = ax.imshow(matrix, interpolation=
'nearest')
2460 pl.savefig(figurename, dpi=300)
2469 arrangement=
"inter",
2470 confidence_input=
"None"):
2473 for xl
in self.cross_link_distances:
2474 (r1, c1, r2, c2, mdist, confidence) = xl
2475 if c1
in prots1
and c2
in prots2:
2476 if arrangement ==
"inter" and c1 == c2:
2478 if arrangement ==
"intra" and c1 != c2:
2480 if confidence_input == confidence:
2481 label = str(c1) +
":" + str(r1) + \
2482 "-" + str(c2) +
":" + str(r2)
2483 values = self.cross_link_distances[xl]
2484 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2485 data.append((label, values, mdist, frequency))
2487 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2488 sort_by_dist = list(zip(*sort_by_dist))
2489 values = sort_by_dist[1]
2490 positions = list(range(len(values)))
2491 labels = sort_by_dist[0]
2492 frequencies = list(map(float, sort_by_dist[3]))
2493 frequencies = [f * 10.0
for f
in frequencies]
2495 nchunks = int(float(len(values)) / nxl_per_row)
2496 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2497 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2500 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2503 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2505 for n, v
in enumerate(values_chunks):
2506 p = positions_chunks[n]
2507 f = frequencies_chunks[n]
2509 filename +
"." + str(n), v, p, f,
2510 valuename=
"Distance (Ang)",
2511 positionname=
"Unique " + arrangement +
" Crosslinks",
2512 xlabels=labels_chunks[n])
2514 def crosslink_distance_histogram(self, filename,
2517 confidence_classes=
None,
2523 if prot_list
is None:
2524 prot_list = list(self.prot_length_dict.keys())
2527 for xl
in self.crosslinks:
2528 (r1, c1, r2, c2, mdist, stdv, confidence,
2529 unique_identifier, descriptor) = xl
2531 if confidence_classes
is not None:
2532 if confidence
not in confidence_classes:
2535 if prot_list2
is None:
2536 if c1
not in prot_list:
2538 if c2
not in prot_list:
2541 if c1
in prot_list
and c2
in prot_list2:
2543 elif c1
in prot_list2
and c2
in prot_list:
2548 distances.append(mdist)
2551 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2552 bins=bins, color=color,
2554 reference_xline=35.0,
2555 yplotrange=yplotrange, normalized=normalized)
2557 def scatter_plot_xl_features(self, filename,
2563 reference_ylines=
None,
2564 distance_color=
True,
2566 import matplotlib
as mpl
2568 import matplotlib.pyplot
as plt
2570 fig = plt.figure(figsize=(10, 10))
2571 ax = fig.add_subplot(111)
2573 for xl
in self.crosslinks:
2574 (r1, c1, r2, c2, mdist, stdv, confidence,
2575 unique_identifier, arrangement) = xl
2577 if prot_list2
is None:
2578 if c1
not in prot_list:
2580 if c2
not in prot_list:
2583 if c1
in prot_list
and c2
in prot_list2:
2585 elif c1
in prot_list2
and c2
in prot_list:
2590 xldb = self.external_csv_data[unique_identifier]
2591 xldb.update({
"Distance": mdist})
2592 xldb.update({
"Distance_stdv": stdv})
2594 xvalue = float(xldb[feature1])
2595 yvalue = float(xldb[feature2])
2598 color = self.colormap(mdist)
2602 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2604 if yplotrange
is not None:
2605 ax.set_ylim(yplotrange)
2606 if reference_ylines
is not None:
2607 for rl
in reference_ylines:
2608 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2611 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of cross-links.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
void handle_use_deprecated(std::string message)
Break in this method in gdb to find deprecated uses at runtime.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.