3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
15 from operator
import itemgetter
16 from math
import log, sqrt
22 """Performs alignment and RMSD calculation for two sets of coordinates
24 The class also takes into account non-equal stoichiometry of the proteins.
25 If this is the case, the protein names of proteins in multiple copies
26 should be specified in the following form:
27 nameA..1, nameA..2 (note two dots).
30 def __init__(self, template, query, weights=None):
32 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
33 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
34 @param weights optional weights for each set of coordinates
37 self.template = template
38 self.weights = weights
40 if len(self.query.keys()) != len(self.template.keys()):
41 raise ValueError(
'''the number of proteins
42 in template and query does not match!''')
48 self.proteins = sorted(self.query.keys())
49 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
57 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
58 prmts = list(itertools.permutations(copies, len(copies)))
61 self.Product = list(itertools.product(*P.values()))
69 torder = sum([list(i)
for i
in self.Product[0]], [])
72 if self.weights
is not None:
73 weights += [i
for i
in self.weights[t]]
75 self.rmsd = 10000000000.
76 for comb
in self.Product:
78 order = sum([list(i)
for i
in comb], [])
83 if self.weights
is not None:
85 template_xyz, query_xyz, weights)
93 from scipy.spatial.distance
import cdist
101 torder = sum([list(i)
for i
in self.Product[0]], [])
105 self.rmsd, Transformation = 10000000000.,
''
108 for comb
in self.Product:
109 order = sum([list(i)
for i
in comb], [])
114 if len(template_xyz) != len(query_xyz):
115 raise ValueError(
'''the number of coordinates
116 in template and query does not match!''')
120 query_xyz, template_xyz)
121 query_xyz_tr = [transformation.get_transformed(n)
125 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2))
129 Transformation = transformation
132 return (self.rmsd, Transformation)
137 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
138 'a..2':np.array([np.array([1.,1.,])]),
139 'a..3':np.array([np.array([-2.,1.])]),
140 'b':np.array([np.array([0.,-1.])]),
141 'c..1':np.array([np.array([-1.,-1.])]),
142 'c..2':np.array([np.array([1.,-1.])]),
143 'd':np.array([np.array([0.,0.])]),
144 'e':np.array([np.array([0.,1.])])}
146 Ali = Alignment(Proteins, Proteins)
148 if Ali.get_rmsd() == 0.0: print 'successful test!'
149 else: print 'ERROR!'; exit()
158 self.violation_thresholds = {}
159 self.violation_counts = {}
161 data = open(filename)
166 d = d.strip().split()
167 self.violation_thresholds[d[0]] = float(d[1])
169 def get_number_violated_restraints(self, rsts_dict):
171 for rst
in self.violation_thresholds:
172 if rst
not in rsts_dict:
174 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
176 if rst
not in self.violation_counts:
177 self.violation_counts[rst] = 1
179 self.violation_counts[rst] += 1
184 """A class to cluster structures.
185 Uses scipy's cdist function to compute distance matrices
186 and sklearn's kmeans clustering module.
190 @param rmsd_weights Flat list of weights for each particle
194 from mpi4py
import MPI
195 self.comm = MPI.COMM_WORLD
196 self.rank = self.comm.Get_rank()
197 self.number_of_processes = self.comm.size
199 self.number_of_processes = 1
202 self.structure_cluster_ids =
None
203 self.tmpl_coords =
None
204 self.rmsd_weights = rmsd_weights
206 def set_template(self, part_coords):
207 self.tmpl_coords = part_coords
210 """Add coordinates for a single model."""
211 self.all_coords[frame] = Coords
213 def dist_matrix(self):
214 self.model_list_names = list(self.all_coords.keys())
215 self.model_indexes = list(range(len(self.model_list_names)))
216 self.model_indexes_dict = dict(
217 list(zip(self.model_list_names, self.model_indexes)))
218 model_indexes_unique_pairs = \
219 list(itertools.combinations(self.model_indexes, 2))
221 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
222 model_indexes_unique_pairs,
223 self.number_of_processes)[self.rank]
225 print(
"process %s assigned with %s pairs"
226 % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
228 (raw_distance_dict, self.transformation_distance_dict) = \
229 self.matrix_calculation(self.all_coords, self.tmpl_coords,
230 my_model_indexes_unique_pairs)
232 if self.number_of_processes > 1:
235 pickable_transformations = \
236 self.get_pickable_transformation_distance_dict()
238 pickable_transformations)
239 self.set_transformation_distance_dict_from_pickable(
240 pickable_transformations)
242 self.raw_distance_matrix = np.zeros(
243 (len(self.model_list_names), len(self.model_list_names)))
244 for item
in raw_distance_dict:
246 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
247 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
249 def get_dist_matrix(self):
250 return self.raw_distance_matrix
253 """Run K-means clustering
254 @param number_of_clusters Num means
255 @param seed the random seed
257 from sklearn.cluster
import KMeans
262 kmeans = KMeans(n_clusters=number_of_clusters)
265 kmeans = KMeans(k=number_of_clusters)
266 kmeans.fit_predict(self.raw_distance_matrix)
268 self.structure_cluster_ids = kmeans.labels_
270 def get_pickable_transformation_distance_dict(self):
271 pickable_transformations = {}
272 for label
in self.transformation_distance_dict:
273 tr = self.transformation_distance_dict[label]
274 trans = tuple(tr.get_translation())
275 rot = tuple(tr.get_rotation().get_quaternion())
276 pickable_transformations[label] = (rot, trans)
277 return pickable_transformations
279 def set_transformation_distance_dict_from_pickable(
281 pickable_transformations):
282 self.transformation_distance_dict = {}
283 for label
in pickable_transformations:
284 tr = pickable_transformations[label]
287 self.transformation_distance_dict[
290 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
292 outf = open(file_name +
".data",
'wb')
298 pickable_transformations = \
299 self.get_pickable_transformation_distance_dict()
301 (self.structure_cluster_ids,
302 self.model_list_names,
303 pickable_transformations),
306 np.save(file_name +
".npy", self.raw_distance_matrix)
308 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
311 inputf = open(file_name +
".data",
'rb')
312 (self.structure_cluster_ids, self.model_list_names,
313 pickable_transformations) = pickle.load(inputf)
316 self.raw_distance_matrix = np.load(file_name +
".npy")
318 self.set_transformation_distance_dict_from_pickable(
319 pickable_transformations)
320 self.model_indexes = list(range(len(self.model_list_names)))
321 self.model_indexes_dict = dict(
322 list(zip(self.model_list_names, self.model_indexes)))
324 def plot_matrix(self, figurename="clustermatrix.pdf"):
325 import matplotlib
as mpl
327 import matplotlib.pylab
as pl
328 from scipy.cluster
import hierarchy
as hrc
330 fig = pl.figure(figsize=(10, 8))
331 ax = fig.add_subplot(212)
332 dendrogram = hrc.dendrogram(
333 hrc.linkage(self.raw_distance_matrix),
336 leaves_order = dendrogram[
'leaves']
337 ax.set_xlabel(
'Model')
338 ax.set_ylabel(
'RMSD [Angstroms]')
340 ax2 = fig.add_subplot(221)
342 self.raw_distance_matrix[leaves_order,
345 interpolation=
'nearest')
346 cb = fig.colorbar(cax)
347 cb.set_label(
'RMSD [Angstroms]')
348 ax2.set_xlabel(
'Model')
349 ax2.set_ylabel(
'Model')
351 pl.savefig(figurename, dpi=300)
354 def get_model_index_from_name(self, name):
355 return self.model_indexes_dict[name]
357 def get_cluster_labels(self):
359 return list(set(self.structure_cluster_ids))
361 def get_number_of_clusters(self):
362 return len(self.get_cluster_labels())
364 def get_cluster_label_indexes(self, label):
366 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
369 def get_cluster_label_names(self, label):
371 [self.model_list_names[i]
372 for i
in self.get_cluster_label_indexes(label)]
375 def get_cluster_label_average_rmsd(self, label):
377 indexes = self.get_cluster_label_indexes(label)
380 sub_distance_matrix = self.raw_distance_matrix[
381 indexes, :][:, indexes]
382 average_rmsd = np.sum(sub_distance_matrix) / \
383 (len(sub_distance_matrix)
384 ** 2 - len(sub_distance_matrix))
389 def get_cluster_label_size(self, label):
390 return len(self.get_cluster_label_indexes(label))
392 def get_transformation_to_first_member(
396 reference = self.get_cluster_label_indexes(cluster_label)[0]
397 return self.transformation_distance_dict[(reference, structure_index)]
399 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
401 model_list_names = list(all_coords.keys())
402 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
403 raw_distance_dict = {}
404 transformation_distance_dict = {}
405 if template_coords
is None:
409 alignment_template_protein_names = list(template_coords.keys())
411 for (f1, f2)
in list_of_pairs:
419 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
420 for pr
in rmsd_protein_names])
422 for pr
in rmsd_protein_names:
423 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
425 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
426 rmsd = Ali.get_rmsd()
432 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
433 for pr
in alignment_template_protein_names])
434 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
435 for pr
in alignment_template_protein_names])
438 template_rmsd, transformation = Ali.align()
443 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
444 for pr
in rmsd_protein_names])
446 for pr
in rmsd_protein_names:
447 coords_f2[pr] = [transformation.get_transformed(
448 i)
for i
in all_coords[model_list_names[f2]][pr]]
450 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
451 rmsd = Ali.get_rmsd()
453 raw_distance_dict[(f1, f2)] = rmsd
454 raw_distance_dict[(f2, f1)] = rmsd
455 transformation_distance_dict[(f1, f2)] = transformation
456 transformation_distance_dict[(f2, f1)] = transformation
458 return raw_distance_dict, transformation_distance_dict
462 """Compute the RMSD (without alignment) taking into account the copy
464 To be used with pmi2 hierarchies. Can be used for instance as follows:
466 rmsd = IMP.pmi.analysis.RMSD(
467 hier, hier, [mol.get_name() for mol in mols],
468 dynamic0=True, dynamic1=False)
469 output_objects.append(rmsd)
471 before shuffling the coordinates
474 def __init__(self, hier0, hier1, molnames, label="None", dynamic0=True,
475 dynamic1=
True, metric=IMP.algebra.get_rmsd):
477 @param hier0 first input hierarchy
478 @param hier1 second input hierarchy
479 @param molnames the names of the molecules used for the RMSD
480 @dynamic0 if True stores the decorators XYZ and coordinates of
481 hier0 can be update. If false coordinates are static
482 (stored in Vector3Ds) and will never be updated
483 @dynamic1 same as above
484 metric what metric should be used
486 self.moldict0, self.molcoords0, self.mol_XYZs0 = \
488 self.moldict1, self.molcoords1, self.mol_XYZs1 = \
490 self.dynamic0 = dynamic0
491 self.dynamic1 = dynamic1
496 """return data structure for the RMSD calculation"""
501 name = mol.get_name()
502 if name
not in molnames:
510 mol, residue_index=i,
511 representation_type=IMP.atom.BALLS,
513 parts = sel.get_selected_particles()
515 mol_coords[mol].append(
520 moldict[name].append(mol)
522 moldict[name] = [mol]
523 return moldict, mol_coords, mol_XYZs
525 def get_rmsd_and_assigments(self):
528 best_assignments = []
530 for molname, ref_mols
in self.moldict1.items():
532 for ref_mol
in ref_mols:
534 coord1 = [XYZ.get_coordinates()
535 for XYZ
in self.mol_XYZs1[ref_mol]]
537 coord1 = self.molcoords1[ref_mol]
542 for rmf_mols
in itertools.permutations(self.moldict0[molname]):
544 for rmf_mol
in rmf_mols:
546 coord0 = [XYZ.get_coordinates()
547 for XYZ
in self.mol_XYZs0[rmf_mol]]
549 coord0 = self.molcoords0[rmf_mol]
553 rmf_mols_list.append(rmf_mols)
555 rmf_mols_best_order = rmf_mols_list[rmsd.index(m)]
557 for n, (ref_mol, rmf_mol)
in enumerate(
558 zip(ref_mols, rmf_mols_best_order)):
559 best_assignments.append((rmf_mol, ref_mol))
561 coord0 = [XYZ.get_coordinates()
562 for XYZ
in self.mol_XYZs0[rmf_mol]]
564 coord0 = self.molcoords0[rmf_mol]
567 coord1 = [XYZ.get_coordinates()
568 for XYZ
in self.mol_XYZs1[ref_mol]]
570 coord1 = self.molcoords1[ref_mol]
571 rmsd_pair = self.metric(coord1, coord0)
572 N = len(self.molcoords1[ref_mol])
574 total_rmsd += rmsd_pair*rmsd_pair*N
575 rmsd_dict[ref_mol] = rmsd_pair
576 total_rmsd = sqrt(total_rmsd/total_N)
577 return total_rmsd, best_assignments
580 """Returns output for IMP.pmi.output.Output object"""
581 total_rmsd, best_assignments = self.get_rmsd_and_assigments()
584 for rmf_mol, ref_mol
in best_assignments:
585 ref_name = ref_mol.get_name()
587 rmf_name = rmf_mol.get_name()
589 assignments_out.append(rmf_name +
"." + str(rmf_copy) +
"->" +
590 ref_name +
"." + str(ref_copy))
591 return {
"RMSD_" + self.label: str(total_rmsd),
592 "RMSD_assignments_" + self.label: str(assignments_out)}
596 """A class to evaluate the precision of an ensemble.
598 Also can evaluate the cross-precision of multiple ensembles.
599 Supports MPI for coordinate reading.
600 Recommended procedure:
601 -# initialize object and pass the selection for evaluating precision
602 -# call add_structures() to read in the data (specify group name)
603 -# call get_precision() to evaluate inter/intra precision
604 -# call get_rmsf() to evaluate within-group fluctuations
606 def __init__(self, model, resolution=1, selection_dictionary={}):
608 @param model The IMP Model
609 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610 part of the hier name)
611 @param selection_dictionary Dictionary where keys are names for
612 selections and values are selection tuples for scoring
613 precision. "All" is automatically made as well
616 from mpi4py
import MPI
617 self.comm = MPI.COMM_WORLD
618 self.rank = self.comm.Get_rank()
619 self.number_of_processes = self.comm.size
621 self.number_of_processes = 1
624 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
625 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
626 self.style =
'pairwise_drmsd_k'
627 self.structures_dictionary = {}
628 self.reference_structures_dictionary = {}
630 self.protein_names =
None
631 self.len_particles_resolution_one =
None
633 self.rmf_names_frames = {}
634 self.reference_rmf_names_frames =
None
635 self.reference_structure =
None
636 self.reference_prot =
None
637 self.selection_dictionary = selection_dictionary
638 self.threshold = 40.0
639 self.residue_particle_index_map =
None
641 if resolution
in [1, 10]:
642 self.resolution = resolution
644 raise KeyError(
"Currently only allow resolution 1 or 10")
646 def _get_structure(self, rmf_frame_index, rmf_name):
647 """Read an RMF file and return the particles"""
648 rh = RMF.open_rmf_file_read_only(rmf_name)
650 print(
"getting coordinates for frame %i rmf file %s"
651 % (rmf_frame_index, rmf_name))
655 print(
"linking coordinates for frame %i rmf file %s"
656 % (rmf_frame_index, rmf_name))
661 if self.resolution == 1:
663 elif self.resolution == 10:
666 protein_names = list(particle_dict.keys())
667 particles_resolution_one = []
668 for k
in particle_dict:
669 particles_resolution_one += (particle_dict[k])
671 if self.protein_names
is None:
672 self.protein_names = protein_names
674 if self.protein_names != protein_names:
675 print(
"Error: the protein names of the new coordinate set "
676 "is not compatible with the previous one")
678 if self.len_particles_resolution_one
is None:
679 self.len_particles_resolution_one = len(particles_resolution_one)
681 if self.len_particles_resolution_one != \
682 len(particles_resolution_one):
683 raise ValueError(
"the new coordinate set is not compatible "
684 "with the previous one")
686 return particles_resolution_one
692 setup_index_map=
False):
693 """ Read a structure into the ensemble and store (as coordinates).
694 @param rmf_name The name of the RMF file
695 @param rmf_frame_index The frame to read
696 @param structure_set_name Name for the set that includes this structure
698 @param setup_index_map if requested, set up a dictionary to help
703 if structure_set_name
in self.structures_dictionary:
704 cdict = self.structures_dictionary[structure_set_name]
705 rmflist = self.rmf_names_frames[structure_set_name]
707 self.structures_dictionary[structure_set_name] = {}
708 self.rmf_names_frames[structure_set_name] = []
709 cdict = self.structures_dictionary[structure_set_name]
710 rmflist = self.rmf_names_frames[structure_set_name]
714 particles_resolution_one = self._get_structure(
715 rmf_frame_index, rmf_name)
717 print(
"something wrong with the rmf")
719 self.selection_dictionary.update({
"All": self.protein_names})
721 for selection_name
in self.selection_dictionary:
722 selection_tuple = self.selection_dictionary[selection_name]
723 coords = self._select_coordinates(
724 selection_tuple, particles_resolution_one, self.prots[0])
725 if selection_name
not in cdict:
726 cdict[selection_name] = [coords]
728 cdict[selection_name].append(coords)
730 rmflist.append((rmf_name, rmf_frame_index))
734 self.residue_particle_index_map = {}
735 for prot_name
in self.protein_names:
736 self.residue_particle_index_map[prot_name] = \
737 self._get_residue_particle_index_map(
738 prot_name, particles_resolution_one, self.prots[0])
741 rmf_name_frame_tuples,
743 """Read a list of RMFs, supports parallel
744 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
745 @param structure_set_name Name this set of structures
750 my_rmf_name_frame_tuples = IMP.pmi.tools.chunk_list_into_segments(
751 rmf_name_frame_tuples, self.number_of_processes)[self.rank]
752 for nfr, tup
in enumerate(my_rmf_name_frame_tuples):
754 rmf_frame_index = tup[1]
756 if self.residue_particle_index_map
is None:
757 setup_index_map =
True
759 setup_index_map =
False
766 if self.number_of_processes > 1:
768 self.rmf_names_frames)
770 self.comm.send(self.structures_dictionary, dest=0, tag=11)
772 for i
in range(1, self.number_of_processes):
773 data_tmp = self.comm.recv(source=i, tag=11)
774 for key
in self.structures_dictionary:
775 self.structures_dictionary[key].update(data_tmp[key])
776 for i
in range(1, self.number_of_processes):
777 self.comm.send(self.structures_dictionary, dest=i, tag=11)
779 self.structures_dictionary = self.comm.recv(source=0, tag=11)
781 def _get_residue_particle_index_map(self, prot_name, structure, hier):
783 residue_particle_index_map = []
785 all_selected_particles = s.get_selected_particles()
786 intersection = list(set(all_selected_particles) & set(structure))
787 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
788 for p
in sorted_intersection:
789 residue_particle_index_map.append(
791 return residue_particle_index_map
793 def _select_coordinates(self, tuple_selections, structure, prot):
794 selected_coordinates = []
795 for t
in tuple_selections:
796 if isinstance(t, tuple)
and len(t) == 3:
798 prot, molecules=[t[2]],
799 residue_indexes=range(t[0], t[1]+1), resolution=1)
800 all_selected_particles = s.get_selected_particles()
801 intersection = list(set(all_selected_particles)
803 sorted_intersection = \
804 IMP.pmi.tools.sort_by_residues(intersection)
806 for p
in sorted_intersection]
807 selected_coordinates += cc
808 elif isinstance(t, str):
810 all_selected_particles = s.get_selected_particles()
811 intersection = list(set(all_selected_particles)
813 sorted_intersection = \
814 IMP.pmi.tools.sort_by_residues(intersection)
816 for p
in sorted_intersection]
817 selected_coordinates += cc
819 raise ValueError(
"Selection error")
820 return selected_coordinates
822 def set_threshold(self, threshold):
823 self.threshold = threshold
825 def _get_distance(self, structure_set_name1, structure_set_name2,
826 selection_name, index1, index2):
827 """Compute distance between structures with various metrics"""
828 c1 = self.structures_dictionary[
829 structure_set_name1][selection_name][index1]
830 c2 = self.structures_dictionary[
831 structure_set_name2][selection_name][index2]
835 if self.style ==
'pairwise_drmsd_k':
837 if self.style ==
'pairwise_drms_k':
839 if self.style ==
'pairwise_drmsd_Q':
843 if self.style ==
'pairwise_rmsd':
847 def _get_particle_distances(self, structure_set_name1, structure_set_name2,
848 selection_name, index1, index2):
849 c1 = self.structures_dictionary[
850 structure_set_name1][selection_name][index1]
851 c2 = self.structures_dictionary[
852 structure_set_name2][selection_name][index2]
857 distances = [np.linalg.norm(a-b)
858 for (a, b)
in zip(coordinates1, coordinates2)]
863 outfile=
None, skip=1, selection_keywords=
None):
864 """ Evaluate the precision of two named structure groups. Supports MPI.
865 When the structure_set_name1 is different from the structure_set_name2,
866 this evaluates the cross-precision (average pairwise distances).
867 @param outfile Name of the precision output file
868 @param structure_set_name1 string name of the first structure set
869 @param structure_set_name2 string name of the second structure set
870 @param skip analyze every (skip) structure for the distance
872 @param selection_keywords Specify the selection name you want to
873 calculate on. By default this is computed for everything
874 you provided in the constructor, plus all the subunits together.
876 if selection_keywords
is None:
877 sel_keys = list(self.selection_dictionary.keys())
879 for k
in selection_keywords:
880 if k
not in self.selection_dictionary:
882 "you are trying to find named selection "
883 + k +
" which was not requested in the constructor")
884 sel_keys = selection_keywords
886 if outfile
is not None:
887 of = open(outfile,
"w")
889 for selection_name
in sel_keys:
890 number_of_structures_1 = len(self.structures_dictionary[
891 structure_set_name1][selection_name])
892 number_of_structures_2 = len(self.structures_dictionary[
893 structure_set_name2][selection_name])
895 structure_pointers_1 = list(range(0, number_of_structures_1, skip))
896 structure_pointers_2 = list(range(0, number_of_structures_2, skip))
897 pair_combination_list = list(
898 itertools.product(structure_pointers_1, structure_pointers_2))
899 if len(pair_combination_list) == 0:
900 raise ValueError(
"no structure selected. Check the "
904 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
905 pair_combination_list, self.number_of_processes)[self.rank]
906 for n, pair
in enumerate(my_pair_combination_list):
907 distances[pair] = self._get_distance(
908 structure_set_name1, structure_set_name2, selection_name,
910 if self.number_of_processes > 1:
915 if structure_set_name1 == structure_set_name2:
916 structure_pointers = structure_pointers_1
917 number_of_structures = number_of_structures_1
922 distances_to_structure = {}
923 distances_to_structure_normalization = {}
925 for n
in structure_pointers:
926 distances_to_structure[n] = 0.0
927 distances_to_structure_normalization[n] = 0
930 distance += distances[k]
931 distances_to_structure[k[0]] += distances[k]
932 distances_to_structure[k[1]] += distances[k]
933 distances_to_structure_normalization[k[0]] += 1
934 distances_to_structure_normalization[k[1]] += 1
936 for n
in structure_pointers:
937 distances_to_structure[n] = \
938 distances_to_structure[n] \
939 / distances_to_structure_normalization[n]
941 min_distance = min([distances_to_structure[n]
942 for n
in distances_to_structure])
943 centroid_index = [k
for k, v
in
944 distances_to_structure.items()
945 if v == min_distance][0]
946 centroid_rmf_name = \
947 self.rmf_names_frames[
948 structure_set_name1][centroid_index]
950 centroid_distance = 0.0
952 for n
in range(number_of_structures):
953 dist = self._get_distance(
954 structure_set_name1, structure_set_name1,
955 selection_name, centroid_index, n)
956 centroid_distance += dist
957 distance_list.append(dist)
959 centroid_distance /= number_of_structures
960 if outfile
is not None:
961 of.write(str(selection_name) +
" " +
962 structure_set_name1 +
963 " average centroid distance " +
964 str(centroid_distance) +
"\n")
965 of.write(str(selection_name) +
" " +
966 structure_set_name1 +
" centroid index " +
967 str(centroid_index) +
"\n")
968 of.write(str(selection_name) +
" " +
969 structure_set_name1 +
" centroid rmf name " +
970 str(centroid_rmf_name) +
"\n")
971 of.write(str(selection_name) +
" " +
972 structure_set_name1 +
973 " median centroid distance " +
974 str(np.median(distance_list)) +
"\n")
976 average_pairwise_distances = sum(
977 distances.values())/len(list(distances.values()))
978 if outfile
is not None:
979 of.write(str(selection_name) +
" " + structure_set_name1 +
980 " " + structure_set_name2 +
981 " average pairwise distance " +
982 str(average_pairwise_distances) +
"\n")
983 if outfile
is not None:
985 return centroid_index
987 def get_rmsf(self, structure_set_name, outdir="./", skip=1,
988 set_plot_yaxis_range=
None):
989 """ Calculate the residue mean square fluctuations (RMSF).
990 Automatically outputs as data file and pdf
991 @param structure_set_name Which structure set to calculate RMSF for
992 @param outdir Where to write the files
993 @param skip Skip this number of structures
994 @param set_plot_yaxis_range In case you need to change the plot
1002 for sel_name
in self.protein_names:
1003 self.selection_dictionary.update({sel_name: [sel_name]})
1005 number_of_structures = \
1006 len(self.structures_dictionary[
1007 structure_set_name][sel_name])
1011 rpim = self.residue_particle_index_map[sel_name]
1012 outfile = outdir+
"/rmsf."+sel_name+
".dat"
1013 of = open(outfile,
"w")
1014 residue_distances = {}
1016 for index
in range(number_of_structures):
1017 distances = self._get_particle_distances(
1018 structure_set_name, structure_set_name, sel_name,
1019 centroid_index, index)
1020 for nblock, block
in enumerate(rpim):
1021 for residue_number
in block:
1022 residue_nblock[residue_number] = nblock
1023 if residue_number
not in residue_distances:
1024 residue_distances[residue_number] = \
1028 residue_number].append(distances[nblock])
1032 for rn
in residue_distances:
1034 rmsf = np.std(residue_distances[rn])
1036 of.write(str(rn) +
" " + str(residue_nblock[rn])
1037 +
" " + str(rmsf) +
"\n")
1039 IMP.pmi.output.plot_xy_data(
1040 residues, rmsfs, title=sel_name,
1041 out_fn=outdir+
"/rmsf."+sel_name, display=
False,
1042 set_plot_yaxis_range=set_plot_yaxis_range,
1043 xlabel=
'Residue Number', ylabel=
'Standard error')
1047 """Read in a structure used for reference computation.
1048 Needed before calling get_average_distance_wrt_reference_structure()
1049 @param rmf_name The RMF file to read the reference
1050 @param rmf_frame_index The index in that file
1052 particles_resolution_one = self._get_structure(rmf_frame_index,
1054 self.reference_rmf_names_frames = (rmf_name, rmf_frame_index)
1056 for selection_name
in self.selection_dictionary:
1057 selection_tuple = self.selection_dictionary[selection_name]
1058 coords = self._select_coordinates(
1059 selection_tuple, particles_resolution_one, self.prots[0])
1060 self.reference_structures_dictionary[selection_name] = coords
1063 self, structure_set_name, alignment_selection_key):
1064 """First align then calculate RMSD
1065 @param structure_set_name: the name of the structure set
1066 @param alignment_selection_key: the key containing the selection tuples
1067 needed to make the alignment stored in self.selection_dictionary
1068 @return: for each structure in the structure set, returns the rmsd
1071 if self.reference_structures_dictionary == {}:
1072 print(
"Cannot compute until you set a reference structure")
1075 align_reference_coordinates = \
1076 self.reference_structures_dictionary[alignment_selection_key]
1077 align_coordinates = self.structures_dictionary[
1078 structure_set_name][alignment_selection_key]
1079 transformations = []
1080 for c
in align_coordinates:
1082 {
"All": align_reference_coordinates}, {
"All": c})
1083 transformation = Ali.align()[1]
1084 transformations.append(transformation)
1085 for selection_name
in self.selection_dictionary:
1086 reference_coordinates = \
1087 self.reference_structures_dictionary[selection_name]
1089 for c
in reference_coordinates]
1091 for n, sc
in enumerate(self.structures_dictionary[
1092 structure_set_name][selection_name]):
1097 distances.append(distance)
1098 ret[selection_name] = {
1099 'all_distances': distances,
1100 'average_distance': sum(distances)/len(distances),
1101 'minimum_distance': min(distances)}
1104 def _get_median(self, list_of_values):
1105 return np.median(np.array(list_of_values))
1108 """Compare the structure set to the reference structure.
1109 @param structure_set_name The structure set to compute this on
1110 @note First call set_reference_structure()
1113 if self.reference_structures_dictionary == {}:
1114 print(
"Cannot compute until you set a reference structure")
1116 for selection_name
in self.selection_dictionary:
1117 reference_coordinates = self.reference_structures_dictionary[
1120 for c
in reference_coordinates]
1122 for sc
in self.structures_dictionary[
1123 structure_set_name][selection_name]:
1125 if self.style ==
'pairwise_drmsd_k':
1127 if self.style ==
'pairwise_drms_k':
1129 if self.style ==
'pairwise_drmsd_Q':
1131 coordinates1, coordinates2, self.threshold)
1132 if self.style ==
'pairwise_rmsd':
1134 distances.append(distance)
1136 print(selection_name,
"average distance",
1137 sum(distances)/len(distances),
"minimum distance",
1138 min(distances),
'nframes', len(distances))
1139 ret[selection_name] = {
1140 'average_distance': sum(distances)/len(distances),
1141 'minimum_distance': min(distances)}
1144 def get_coordinates(self):
1147 def set_precision_style(self, style):
1148 if style
in self.styles:
1151 raise ValueError(
"No such style")
1155 """Compute mean density maps from structures.
1157 Keeps a dictionary of density maps,
1158 keys are in the custom ranges. When you call add_subunits_density, it adds
1159 particle coordinates to the existing density maps.
1162 def __init__(self, custom_ranges, representation=None, resolution=20.0,
1165 @param custom_ranges Required. It's a dictionary, keys are the
1166 density component names, values are selection tuples
1167 e.g. {'kin28':[['kin28',1,-1]],
1168 'density_name_1' :[('ccl1')],
1169 'density_name_2' :[(1,142,'tfb3d1'),
1170 (143,700,'tfb3d2')],
1171 @param resolution The MRC resolution of the output map (in
1173 @param voxel The voxel size for the output map (lower is slower)
1178 "The 'representation' argument is deprecated "
1179 "(and is ignored). Pass hierarchies to "
1180 "add_subunits_density() instead.")
1181 self.MRCresolution = resolution
1184 self.count_models = 0.0
1185 self.custom_ranges = custom_ranges
1188 """Add a frame to the densities.
1189 @param hierarchy The hierarchy to add.
1191 self.count_models += 1.0
1194 all_particles_by_resolution = []
1195 for name
in part_dict:
1196 all_particles_by_resolution += part_dict[name]
1198 for density_name
in self.custom_ranges:
1200 all_particles_by_segments = []
1202 for seg
in self.custom_ranges[density_name]:
1203 if isinstance(seg, str):
1205 elif isinstance(seg, tuple)
and len(seg) == 2:
1207 hierarchy, molecule=seg[0], copy_index=seg[1])
1208 elif isinstance(seg, tuple)
and len(seg) == 3:
1210 hierarchy, molecule=seg[2],
1211 residue_indexes=range(seg[0], seg[1] + 1))
1212 elif isinstance(seg, tuple)
and len(seg) == 4:
1214 hierarchy, molecule=seg[2],
1215 residue_indexes=range(seg[0], seg[1] + 1),
1218 raise Exception(
'could not understand selection tuple '
1221 all_particles_by_segments += s.get_selected_particles()
1222 parts = all_particles_by_segments
1223 self._create_density_from_particles(parts, density_name)
1225 def normalize_density(self):
1228 def _create_density_from_particles(self, ps, name,
1229 kernel_type=
'GAUSSIAN'):
1230 '''Internal function for adding to densities.
1231 pass XYZR particles with mass and create a density from them.
1232 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1235 dmap.set_was_used(
True)
1236 if name
not in self.densities:
1237 self.densities[name] = dmap
1243 dmap3.set_was_used(
True)
1245 dmap3.add(self.densities[name])
1246 self.densities[name] = dmap3
1248 def get_density_keys(self):
1249 return list(self.densities.keys())
1252 """Get the current density for some component name"""
1253 if name
not in self.densities:
1256 return self.densities[name]
1258 def write_mrc(self, path="./", suffix=None):
1261 for density_name
in self.densities:
1262 self.densities[density_name].
multiply(1. / self.count_models)
1264 name = path +
"/" + density_name +
".mrc"
1266 name = path +
"/" + density_name +
"." + suffix +
".mrc"
1267 path, file = os.path.split(name)
1268 if not os.path.exists(path):
1271 except OSError
as e:
1272 if e.errno != errno.EEXIST:
1274 IMP.em.write_map(self.densities[density_name], name,
1278 class GetContactMap:
1281 self.distance = distance
1282 self.contactmap =
''
1289 def set_prot(self, prot):
1295 for name
in particles_dictionary:
1296 residue_indexes = []
1297 for p
in particles_dictionary[name]:
1301 if len(residue_indexes) != 0:
1302 self.protnames.append(name)
1304 def get_subunit_coords(self, frame, align=0):
1305 from scipy.spatial.distance
import cdist
1309 for part
in self.prot.get_children():
1312 for chl
in part.get_children():
1316 SortedSegments.append((chl, startres))
1317 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1319 for sgmnt
in SortedSegments:
1322 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1325 radii.append(p.get_radius())
1327 new_name = part.get_name() +
'_' + sgmnt[0].get_name() + \
1330 .get_residue_indexes()[0])
1331 namelist.append(new_name)
1332 self.expanded[new_name] = len(
1334 if part.get_name()
not in self.resmap:
1335 self.resmap[part.get_name()] = {}
1337 self.resmap[part.get_name()][res] = new_name
1339 coords = np.array(coords)
1340 radii = np.array(radii)
1341 if len(self.namelist) == 0:
1342 self.namelist = namelist
1343 self.contactmap = np.zeros((len(coords), len(coords)))
1344 distances = cdist(coords, coords)
1345 distances = (distances - radii).T - radii
1346 distances = distances <= self.distance
1347 self.contactmap += distances
1349 def add_xlinks(self, filename,
1350 identification_string=
'ISDCrossLinkMS_Distance_'):
1352 with open(filename)
as data:
1353 D = data.readlines()
1356 if identification_string
in d:
1357 d = d.replace(
"_",
" ").replace(
"-",
" ").replace(
1360 t1, t2 = (d[0], d[1]), (d[1], d[0])
1361 if t1
not in self.XL:
1362 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1363 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1365 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1366 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1368 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1372 proteins = self.protnames
1377 proteins = [p.get_name()
for p
in self.prot.get_children()]
1379 for p1
in range(len(proteins)):
1380 for p2
in range(p1, len(proteins)):
1381 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1382 max(self.resmap[proteins[p2]].keys()))
1383 pn1, pn2 = proteins[p1], proteins[p2]
1384 mtr = np.zeros((pl1 + 1, pl2 + 1))
1385 print(
'Creating matrix for: ',
1386 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1387 for i1
in range(1, pl1 + 1):
1388 for i2
in range(1, pl2 + 1):
1390 r1 = K.index(self.resmap[pn1][i1])
1391 r2 = K.index(self.resmap[pn2][i2])
1393 mtr[i1 - 1, i2 - 1] = r
1395 missing.append((pn1, pn2, i1, i2))
1396 Matrices[(pn1, pn2)] = mtr
1401 raise ValueError(
"cross-links were not provided, use "
1402 "add_xlinks function!")
1404 for p1
in range(len(proteins)):
1405 for p2
in range(p1, len(proteins)):
1406 pl1, pl2 = (max(self.resmap[proteins[p1]].keys()),
1407 max(self.resmap[proteins[p2]].keys()))
1408 pn1, pn2 = proteins[p1], proteins[p2]
1409 mtr = np.zeros((pl1 + 1, pl2 + 1))
1412 xls = self.XL[(pn1, pn2)]
1415 xls = self.XL[(pn2, pn1)]
1420 print(
'Creating matrix for: ',
1421 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1422 for xl1, xl2
in xls:
1424 print(
'X' * 10, xl1, xl2)
1427 print(
'X' * 10, xl1, xl2)
1429 mtr[xl1 - 1, xl2 - 1] = 100
1431 print(
'Creating matrix for: ',
1432 p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1433 for xl1, xl2
in xls:
1435 print(
'X' * 10, xl1, xl2)
1438 print(
'X' * 10, xl1, xl2)
1440 mtr[xl2 - 1, xl1 - 1] = 100
1442 print(
'No cross-links between: ', pn1, pn2)
1443 Matrices_xl[(pn1, pn2)] = mtr
1450 for i, c
in enumerate(C):
1451 cl = max(self.resmap[c].keys())
1456 R.append(R[-1] + cl)
1461 import matplotlib
as mpl
1463 import matplotlib.pyplot
as plt
1464 import matplotlib.gridspec
as gridspec
1465 import scipy.sparse
as sparse
1468 gs = gridspec.GridSpec(len(W), len(W),
1473 for x1, r1
in enumerate(R):
1474 for x2, r2
in enumerate(R):
1475 ax = plt.subplot(gs[cnt])
1478 mtr = Matrices[(C[x1], C[x2])]
1480 mtr = Matrices[(C[x2], C[x1])].T
1483 interpolation=
'nearest',
1485 vmax=log(mtr.max())
if mtr.max() > 0.
else 1.)
1490 mtr = Matrices_xl[(C[x1], C[x2])]
1492 mtr = Matrices_xl[(C[x2], C[x1])].T
1494 sparse.csr_matrix(mtr),
1504 ax.set_ylabel(C[x1], rotation=90)
1506 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1514 def get_hiers_from_rmf(model, frame_number, rmf_file):
1516 print(
"getting coordinates for frame %i rmf file %s"
1517 % (frame_number, rmf_file))
1520 rh = RMF.open_rmf_file_read_only(rmf_file)
1525 print(
"Unable to open rmf file %s" % (rmf_file))
1531 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1539 def link_hiers_to_rmf(model, hiers, frame_number, rmf_file):
1540 print(
"linking hierarchies for frame %i rmf file %s"
1541 % (frame_number, rmf_file))
1542 rh = RMF.open_rmf_file_read_only(rmf_file)
1547 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1554 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1556 print(
"getting coordinates for frame %i rmf file %s"
1557 % (frame_number, rmf_file))
1560 rh = RMF.open_rmf_file_read_only(rmf_file)
1566 print(
"Unable to open rmf file %s" % (rmf_file))
1571 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1579 def link_hiers_and_restraints_to_rmf(model, hiers, rs, frame_number, rmf_file):
1580 print(
"linking hierarchies for frame %i rmf file %s"
1581 % (frame_number, rmf_file))
1582 rh = RMF.open_rmf_file_read_only(rmf_file)
1588 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1596 """Get particles at res 1, or any beads, based on the name.
1597 No Representation is needed. This is mainly used when the hierarchy
1598 is read from an RMF file.
1600 @return a dictionary of component names and their particles
1601 @note If the root node is named "System" or is a "State", do
1607 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1609 particle_dict[mol.get_name()] = sel.get_selected_particles()
1610 return particle_dict
1614 """Get particles at res 10, or any beads, based on the name.
1615 No Representation is needed.
1616 This is mainly used when the hierarchy is read from an RMF file.
1618 @return a dictionary of component names and their particles
1619 @note If the root node is named "System" or is a "State", do proper
1624 for mol
in IMP.atom.get_by_type(prot, IMP.atom.MOLECULE_TYPE):
1626 particle_dict[mol.get_name()] = sel.get_selected_particles()
1627 return particle_dict
1631 """Visualization of cross-links"""
1633 self.crosslinks = []
1634 self.external_csv_data =
None
1635 self.crosslinkedprots = set()
1636 self.mindist = +10000000.0
1637 self.maxdist = -10000000.0
1638 self.contactmap =
None
1640 def set_hierarchy(self, prot):
1641 self.prot_length_dict = {}
1642 self.model = prot.get_model()
1644 for i
in prot.get_children():
1646 residue_indexes = []
1650 if len(residue_indexes) != 0:
1651 self.prot_length_dict[name] = max(residue_indexes)
1653 def set_coordinates_for_contact_map(self, rmf_name, rmf_frame_index):
1654 from scipy.spatial.distance
import cdist
1656 rh = RMF.open_rmf_file_read_only(rmf_name)
1659 print(
"getting coordinates for frame %i rmf file %s"
1660 % (rmf_frame_index, rmf_name))
1669 self.index_dictionary = {}
1671 for name
in particles_dictionary:
1672 residue_indexes = []
1673 for p
in particles_dictionary[name]:
1677 if len(residue_indexes) != 0:
1679 for res
in range(min(residue_indexes),
1680 max(residue_indexes) + 1):
1683 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1685 radii.append(d.get_radius())
1686 if name
not in self.index_dictionary:
1687 self.index_dictionary[name] = [resindex]
1689 self.index_dictionary[name].append(resindex)
1692 coords = np.array(coords)
1693 radii = np.array(radii)
1695 distances = cdist(coords, coords)
1696 distances = (distances - radii).T - radii
1698 distances = np.where(distances <= 20.0, 1.0, 0)
1699 if self.contactmap
is None:
1700 self.contactmap = np.zeros((len(coords), len(coords)))
1701 self.contactmap += distances
1704 IMP.atom.destroy(prot)
1707 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1708 mapping=
None, filter_label=
None,
1710 filter_rmf_file_names=
None, external_csv_data_file=
None,
1711 external_csv_data_file_unique_id_key=
"Unique ID"):
1722 keys = po.get_keys()
1724 xl_keys = [k
for k
in keys
if search_label
in k]
1726 if filter_rmf_file_names
is not None:
1727 rmf_file_key =
"local_rmf_file_name"
1728 fs = po.get_fields(xl_keys+[rmf_file_key])
1730 fs = po.get_fields(xl_keys)
1733 self.cross_link_frequency = {}
1737 self.cross_link_distances = {}
1741 self.cross_link_distances_unique = {}
1743 if external_csv_data_file
is not None:
1746 self.external_csv_data = {}
1747 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1750 self.external_csv_data[
1751 xl[external_csv_data_file_unique_id_key]] = xl
1756 cross_link_frequency_list = []
1758 self.unique_cross_link_list = []
1762 keysplit = key.replace(
"_",
" ").replace(
"-",
" ").replace(
1765 if filter_label
is not None:
1766 if filter_label
not in keysplit:
1770 r1 = int(keysplit[5])
1772 r2 = int(keysplit[7])
1775 confidence = keysplit[12]
1779 unique_identifier = keysplit[3]
1781 unique_identifier =
'0'
1783 r1 = int(keysplit[mapping[
"Residue1"]])
1784 c1 = keysplit[mapping[
"Protein1"]]
1785 r2 = int(keysplit[mapping[
"Residue2"]])
1786 c2 = keysplit[mapping[
"Protein2"]]
1788 confidence = keysplit[mapping[
"Confidence"]]
1792 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1794 unique_identifier =
'0'
1796 self.crosslinkedprots.add(c1)
1797 self.crosslinkedprots.add(c2)
1803 if filter_rmf_file_names
is not None:
1804 for n, d
in enumerate(fs[key]):
1805 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1806 dists.append(float(d))
1808 dists = [float(f)
for f
in fs[key]]
1813 mdist = self.median(dists)
1815 stdv = np.std(np.array(dists))
1816 if self.mindist > mdist:
1817 self.mindist = mdist
1818 if self.maxdist < mdist:
1819 self.maxdist = mdist
1823 if (r1, c1, r2, c2, mdist)
not in cross_link_frequency_list:
1824 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1825 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1826 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1828 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1829 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1830 cross_link_frequency_list.append((r1, c1, r2, c2))
1831 cross_link_frequency_list.append((r2, c2, r1, c1))
1832 self.unique_cross_link_list.append(
1833 (r1, c1, r2, c2, mdist))
1835 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1836 self.cross_link_distances[(
1837 r1, c1, r2, c2, mdist, confidence)] = dists
1838 self.cross_link_distances[(
1839 r2, c2, r1, c1, mdist, confidence)] = dists
1840 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1842 self.cross_link_distances[(
1843 r2, c2, r1, c1, mdist, confidence)] += dists
1844 self.cross_link_distances[(
1845 r1, c1, r2, c2, mdist, confidence)] += dists
1847 self.crosslinks.append(
1848 (r1, c1, r2, c2, mdist, stdv, confidence, unique_identifier,
1850 self.crosslinks.append(
1851 (r2, c2, r1, c1, mdist, stdv, confidence, unique_identifier,
1854 self.cross_link_frequency_inverted = {}
1855 for xl
in self.unique_cross_link_list:
1856 (r1, c1, r2, c2, mdist) = xl
1857 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1858 if frequency
not in self.cross_link_frequency_inverted:
1859 self.cross_link_frequency_inverted[
1860 frequency] = [(r1, c1, r2, c2)]
1862 self.cross_link_frequency_inverted[
1863 frequency].append((r1, c1, r2, c2))
1867 def median(self, mylist):
1868 sorts = sorted(mylist)
1874 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1875 return sorts[length / 2]
1877 def set_threshold(self, threshold):
1878 self.threshold = threshold
1880 def set_tolerance(self, tolerance):
1881 self.tolerance = tolerance
1883 def colormap(self, dist):
1884 if dist < self.threshold - self.tolerance:
1886 elif dist >= self.threshold + self.tolerance:
1891 def write_cross_link_database(self, filename, format='csv'):
1895 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1896 "Median Distance",
"Standard Deviation",
"Confidence",
1897 "Frequency",
"Arrangement"]
1899 if self.external_csv_data
is not None:
1900 keys = list(self.external_csv_data.keys())
1901 innerkeys = list(self.external_csv_data[keys[0]].keys())
1903 fieldnames += innerkeys
1905 dw = csv.DictWriter(
1906 open(filename,
"w"),
1908 fieldnames=fieldnames)
1910 for xl
in self.crosslinks:
1911 (r1, c1, r2, c2, mdist, stdv, confidence,
1912 unique_identifier, descriptor) = xl
1913 if descriptor ==
'original':
1915 outdict[
"Unique ID"] = unique_identifier
1916 outdict[
"Protein1"] = c1
1917 outdict[
"Protein2"] = c2
1918 outdict[
"Residue1"] = r1
1919 outdict[
"Residue2"] = r2
1920 outdict[
"Median Distance"] = mdist
1921 outdict[
"Standard Deviation"] = stdv
1922 outdict[
"Confidence"] = confidence
1923 outdict[
"Frequency"] = self.cross_link_frequency[
1926 arrangement =
"Intra"
1928 arrangement =
"Inter"
1929 outdict[
"Arrangement"] = arrangement
1930 if self.external_csv_data
is not None:
1931 outdict.update(self.external_csv_data[unique_identifier])
1933 dw.writerow(outdict)
1935 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1936 no_confidence_info=
False, filter=
None, layout=
"whole",
1937 crosslinkedonly=
False, filename=
None, confidence_classes=
None,
1938 alphablend=0.1, scale_symbol_size=1.0, gap_between_components=0,
1939 rename_protein_map=
None):
1954 import matplotlib
as mpl
1956 import matplotlib.pyplot
as plt
1957 import matplotlib.cm
as cm
1959 fig = plt.figure(figsize=(10, 10))
1960 ax = fig.add_subplot(111)
1966 if prot_listx
is None:
1968 prot_listx = list(self.crosslinkedprots)
1970 prot_listx = list(self.prot_length_dict.keys())
1973 nresx = gap_between_components + \
1974 sum([self.prot_length_dict[name]
1975 + gap_between_components
for name
in prot_listx])
1979 if prot_listy
is None:
1981 prot_listy = list(self.crosslinkedprots)
1983 prot_listy = list(self.prot_length_dict.keys())
1986 nresy = gap_between_components + \
1987 sum([self.prot_length_dict[name]
1988 + gap_between_components
for name
in prot_listy])
1993 res = gap_between_components
1994 for prot
in prot_listx:
1995 resoffsetx[prot] = res
1996 res += self.prot_length_dict[prot]
1998 res += gap_between_components
2002 res = gap_between_components
2003 for prot
in prot_listy:
2004 resoffsety[prot] = res
2005 res += self.prot_length_dict[prot]
2007 res += gap_between_components
2009 resoffsetdiagonal = {}
2010 res = gap_between_components
2011 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2012 resoffsetdiagonal[prot] = res
2013 res += self.prot_length_dict[prot]
2014 res += gap_between_components
2020 for n, prot
in enumerate(prot_listx):
2021 res = resoffsetx[prot]
2023 for proty
in prot_listy:
2024 resy = resoffsety[proty]
2025 endy = resendy[proty]
2026 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2027 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2028 xticks.append((float(res) + float(end)) / 2)
2029 if rename_protein_map
is not None:
2030 if prot
in rename_protein_map:
2031 prot = rename_protein_map[prot]
2032 xlabels.append(prot)
2036 for n, prot
in enumerate(prot_listy):
2037 res = resoffsety[prot]
2039 for protx
in prot_listx:
2040 resx = resoffsetx[protx]
2041 endx = resendx[protx]
2042 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2043 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2044 yticks.append((float(res) + float(end)) / 2)
2045 if rename_protein_map
is not None:
2046 if prot
in rename_protein_map:
2047 prot = rename_protein_map[prot]
2048 ylabels.append(prot)
2051 print(prot_listx, prot_listy)
2053 if self.contactmap
is not None:
2054 tmp_array = np.zeros((nresx, nresy))
2056 for px
in prot_listx:
2058 for py
in prot_listy:
2060 resx = resoffsety[px]
2061 lengx = resendx[px] - 1
2062 resy = resoffsety[py]
2063 lengy = resendy[py] - 1
2064 indexes_x = self.index_dictionary[px]
2065 minx = min(indexes_x)
2066 maxx = max(indexes_x)
2067 indexes_y = self.index_dictionary[py]
2068 miny = min(indexes_y)
2069 maxy = max(indexes_y)
2071 print(px, py, minx, maxx, miny, maxy)
2076 resy:lengy] = self.contactmap[
2082 ax.imshow(tmp_array,
2085 interpolation=
'nearest')
2087 ax.set_xticks(xticks)
2088 ax.set_xticklabels(xlabels, rotation=90)
2089 ax.set_yticks(yticks)
2090 ax.set_yticklabels(ylabels)
2091 ax.set_xlim(0, nresx)
2092 ax.set_ylim(0, nresy)
2096 already_added_xls = []
2098 for xl
in self.crosslinks:
2100 (r1, c1, r2, c2, mdist, stdv, confidence,
2101 unique_identifier, descriptor) = xl
2103 if confidence_classes
is not None:
2104 if confidence
not in confidence_classes:
2108 pos1 = r1 + resoffsetx[c1]
2112 pos2 = r2 + resoffsety[c2]
2116 if filter
is not None:
2117 xldb = self.external_csv_data[unique_identifier]
2118 xldb.update({
"Distance": mdist})
2119 xldb.update({
"Distance_stdv": stdv})
2121 if filter[1] ==
">":
2122 if float(xldb[filter[0]]) <= float(filter[2]):
2125 if filter[1] ==
"<":
2126 if float(xldb[filter[0]]) >= float(filter[2]):
2129 if filter[1] ==
"==":
2130 if float(xldb[filter[0]]) != float(filter[2]):
2136 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2137 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2139 if layout ==
'lowerdiagonal':
2140 if pos_for_diagonal1 <= pos_for_diagonal2:
2142 if layout ==
'upperdiagonal':
2143 if pos_for_diagonal1 >= pos_for_diagonal2:
2146 already_added_xls.append((r1, c1, r2, c2))
2148 if not no_confidence_info:
2149 if confidence ==
'0.01':
2150 markersize = 14 * scale_symbol_size
2151 elif confidence ==
'0.05':
2152 markersize = 9 * scale_symbol_size
2153 elif confidence ==
'0.1':
2154 markersize = 6 * scale_symbol_size
2156 markersize = 15 * scale_symbol_size
2158 markersize = 5 * scale_symbol_size
2160 if not no_dist_info:
2161 color = self.colormap(mdist)
2171 markersize=markersize)
2173 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2175 for i
in ax.spines.values():
2176 i.set_linewidth(2.0)
2179 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2183 def get_frequency_statistics(self, prot_list,
2186 violated_histogram = {}
2187 satisfied_histogram = {}
2188 unique_cross_links = []
2190 for xl
in self.unique_cross_link_list:
2191 (r1, c1, r2, c2, mdist) = xl
2194 if prot_list2
is None:
2195 if c1
not in prot_list:
2197 if c2
not in prot_list:
2200 if c1
in prot_list
and c2
in prot_list2:
2202 elif c1
in prot_list2
and c2
in prot_list:
2207 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2209 if (r1, c1, r2, c2)
not in unique_cross_links:
2211 if frequency
not in violated_histogram:
2212 violated_histogram[frequency] = 1
2214 violated_histogram[frequency] += 1
2216 if frequency
not in satisfied_histogram:
2217 satisfied_histogram[frequency] = 1
2219 satisfied_histogram[frequency] += 1
2220 unique_cross_links.append((r1, c1, r2, c2))
2221 unique_cross_links.append((r2, c2, r1, c1))
2223 print(
"# satisfied")
2225 total_number_of_crosslinks = 0
2227 for i
in satisfied_histogram:
2231 if i
in violated_histogram:
2232 print(i, violated_histogram[i]+satisfied_histogram[i])
2234 print(i, satisfied_histogram[i])
2235 total_number_of_crosslinks += i*satisfied_histogram[i]
2239 for i
in violated_histogram:
2240 print(i, violated_histogram[i])
2241 total_number_of_crosslinks += i*violated_histogram[i]
2243 print(total_number_of_crosslinks)
2245 def print_cross_link_binary_symbols(self, prot_list,
2248 confidence_list = []
2249 for xl
in self.crosslinks:
2250 (r1, c1, r2, c2, mdist, stdv, confidence,
2251 unique_identifier, descriptor) = xl
2253 if prot_list2
is None:
2254 if c1
not in prot_list:
2256 if c2
not in prot_list:
2259 if c1
in prot_list
and c2
in prot_list2:
2261 elif c1
in prot_list2
and c2
in prot_list:
2266 if descriptor !=
"original":
2269 confidence_list.append(confidence)
2271 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2272 tmp_dist_binary = []
2275 tmp_dist_binary.append(1)
2277 tmp_dist_binary.append(0)
2278 tmp_matrix.append(tmp_dist_binary)
2280 matrix = list(zip(*tmp_matrix))
2282 satisfied_high_sum = 0
2283 satisfied_mid_sum = 0
2284 satisfied_low_sum = 0
2285 total_satisfied_sum = 0
2286 for k, m
in enumerate(matrix):
2295 for n, b
in enumerate(m):
2296 if confidence_list[n] ==
"0.01":
2300 satisfied_high_sum += 1
2301 elif confidence_list[n] ==
"0.05":
2305 satisfied_mid_sum += 1
2306 elif confidence_list[n] ==
"0.1":
2310 satisfied_low_sum += 1
2312 total_satisfied += 1
2313 total_satisfied_sum += 1
2315 print(k, satisfied_high, total_high)
2316 print(k, satisfied_mid, total_mid)
2317 print(k, satisfied_low, total_low)
2318 print(k, total_satisfied, total)
2319 print(float(satisfied_high_sum) / len(matrix))
2320 print(float(satisfied_mid_sum) / len(matrix))
2321 print(float(satisfied_low_sum) / len(matrix))
2324 def get_unique_crosslinks_statistics(self, prot_list,
2337 satisfied_string = []
2338 for xl
in self.crosslinks:
2339 (r1, c1, r2, c2, mdist, stdv, confidence,
2340 unique_identifier, descriptor) = xl
2342 if prot_list2
is None:
2343 if c1
not in prot_list:
2345 if c2
not in prot_list:
2348 if c1
in prot_list
and c2
in prot_list2:
2350 elif c1
in prot_list2
and c2
in prot_list:
2355 if descriptor !=
"original":
2359 if confidence ==
"0.01":
2363 if confidence ==
"0.05":
2367 if confidence ==
"0.1":
2372 satisfied_string.append(1)
2374 satisfied_string.append(0)
2376 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2377 tmp_dist_binary = []
2380 tmp_dist_binary.append(1)
2382 tmp_dist_binary.append(0)
2383 tmp_matrix.append(tmp_dist_binary)
2385 print(
"unique satisfied_high/total_high", satisfied_high,
"/",
2387 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2388 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2389 print(
"total", total)
2391 matrix = list(zip(*tmp_matrix))
2392 satisfied_models = 0
2394 for b
in satisfied_string:
2401 all_satisfied =
True
2403 for n, b
in enumerate(m):
2408 if b == 1
and satisfied_string[n] == 1:
2410 elif b == 1
and satisfied_string[n] == 0:
2412 elif b == 0
and satisfied_string[n] == 0:
2414 elif b == 0
and satisfied_string[n] == 1:
2415 all_satisfied =
False
2417 satisfied_models += 1
2419 print(satstr, all_satisfied)
2420 print(
"models that satisfies the median satisfied cross-links/total "
2421 "models", satisfied_models, len(matrix))
2423 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2426 import matplotlib
as mpl
2428 import matplotlib.pylab
as pl
2431 for kw
in self.cross_link_distances_unique:
2432 (r1, c1, r2, c2) = kw
2433 dists = self.cross_link_distances_unique[kw]
2435 if prot_list2
is None:
2436 if c1
not in prot_list:
2438 if c2
not in prot_list:
2441 if c1
in prot_list
and c2
in prot_list2:
2443 elif c1
in prot_list2
and c2
in prot_list:
2448 dists.append(sum(dists))
2449 tmp_matrix.append(dists)
2451 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2454 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2456 for i
in range(len(tmp_matrix)):
2457 for k
in range(len(tmp_matrix[i]) - 1):
2458 matrix[i][k] = tmp_matrix[i][k]
2463 ax = fig.add_subplot(211)
2465 cax = ax.imshow(matrix, interpolation=
'nearest')
2467 pl.savefig(figurename, dpi=300)
2476 arrangement=
"inter",
2477 confidence_input=
"None"):
2480 for xl
in self.cross_link_distances:
2481 (r1, c1, r2, c2, mdist, confidence) = xl
2482 if c1
in prots1
and c2
in prots2:
2483 if arrangement ==
"inter" and c1 == c2:
2485 if arrangement ==
"intra" and c1 != c2:
2487 if confidence_input == confidence:
2488 label = str(c1) +
":" + str(r1) + \
2489 "-" + str(c2) +
":" + str(r2)
2490 values = self.cross_link_distances[xl]
2491 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2492 data.append((label, values, mdist, frequency))
2494 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2495 sort_by_dist = list(zip(*sort_by_dist))
2496 values = sort_by_dist[1]
2497 positions = list(range(len(values)))
2498 labels = sort_by_dist[0]
2499 frequencies = list(map(float, sort_by_dist[3]))
2500 frequencies = [f * 10.0
for f
in frequencies]
2502 nchunks = int(float(len(values)) / nxl_per_row)
2503 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2504 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2507 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2510 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2512 for n, v
in enumerate(values_chunks):
2513 p = positions_chunks[n]
2514 f = frequencies_chunks[n]
2516 filename +
"." + str(n), v, p, f,
2517 valuename=
"Distance (Ang)",
2518 positionname=
"Unique " + arrangement +
" Crosslinks",
2519 xlabels=labels_chunks[n])
2521 def crosslink_distance_histogram(self, filename,
2524 confidence_classes=
None,
2530 if prot_list
is None:
2531 prot_list = list(self.prot_length_dict.keys())
2534 for xl
in self.crosslinks:
2535 (r1, c1, r2, c2, mdist, stdv, confidence,
2536 unique_identifier, descriptor) = xl
2538 if confidence_classes
is not None:
2539 if confidence
not in confidence_classes:
2542 if prot_list2
is None:
2543 if c1
not in prot_list:
2545 if c2
not in prot_list:
2548 if c1
in prot_list
and c2
in prot_list2:
2550 elif c1
in prot_list2
and c2
in prot_list:
2555 distances.append(mdist)
2558 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2559 bins=bins, color=color,
2561 reference_xline=35.0,
2562 yplotrange=yplotrange, normalized=normalized)
2564 def scatter_plot_xl_features(self, filename,
2570 reference_ylines=
None,
2571 distance_color=
True,
2573 import matplotlib
as mpl
2575 import matplotlib.pyplot
as plt
2577 fig = plt.figure(figsize=(10, 10))
2578 ax = fig.add_subplot(111)
2580 for xl
in self.crosslinks:
2581 (r1, c1, r2, c2, mdist, stdv, confidence,
2582 unique_identifier, arrangement) = xl
2584 if prot_list2
is None:
2585 if c1
not in prot_list:
2587 if c2
not in prot_list:
2590 if c1
in prot_list
and c2
in prot_list2:
2592 elif c1
in prot_list2
and c2
in prot_list:
2597 xldb = self.external_csv_data[unique_identifier]
2598 xldb.update({
"Distance": mdist})
2599 xldb.update({
"Distance_stdv": stdv})
2601 xvalue = float(xldb[feature1])
2602 yvalue = float(xldb[feature2])
2605 color = self.colormap(mdist)
2609 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2611 if yplotrange
is not None:
2612 ax.set_ylim(yplotrange)
2613 if reference_ylines
is not None:
2614 for rl
in reference_ylines:
2615 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2618 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
def get_output
Returns output for IMP.pmi.output.Output object.
Visualization of cross-links.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
void handle_use_deprecated(std::string message)
Break in this method in gdb to find deprecated uses at runtime.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.