3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
48 self.proteins = sorted(self.query.keys())
49 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
52 np = prots_uniq.count(p)
53 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
54 prmts = list(itertools.permutations(copies, len(copies)))
57 self.Product = list(itertools.product(*P.values()))
65 torder = sum([list(i)
for i
in self.Product[0]], [])
68 if self.weights
is not None:
69 weights += [i
for i
in self.weights[t]]
72 self.rmsd = 10000000000.
73 for comb
in self.Product:
77 order = sum([list(i)
for i
in comb], [])
87 if self.weights
is not None:
98 from scipy.spatial.distance
import cdist
103 torder = sum([list(i)
for i
in self.Product[0]], [])
108 self.rmsd, Transformation = 10000000000.,
''
109 for comb
in self.Product:
110 order = sum([list(i)
for i
in comb], [])
116 if len(template_xyz) != len(query_xyz):
117 raise ValueError(
'''the number of coordinates
118 in template and query does not match!''')
123 query_xyz_tr = [transformation.get_transformed(n)
127 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
130 Transformation = transformation
132 return (self.rmsd, Transformation)
# TEST for the alignment ###
"""
Proteins = {'a..1':np.array([np.array([-1., 1.])]),
            'a..2':np.array([np.array([1., 1.])]),
            'a..3':np.array([np.array([-2., 1.])]),
            'b':np.array([np.array([0., -1.])]),
            'c..1':np.array([np.array([-1., -1.])]),
            'c..2':np.array([np.array([1., -1.])]),
            'd':np.array([np.array([0., 0.])]),
            'e':np.array([np.array([0., 1.])])}

Ali = Alignment(Proteins, Proteins)
if Ali.get_rmsd() == 0.0:
    print('successful test!')
else:
    print('ERROR!')
    exit()
"""
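# Example (a minimal sketch): align a query onto a template when one protein
# is present in two copies.  The names 'prot..1'/'prot..2' and the coordinates
# are illustrative; any dictionary of (L,3) arrays keyed the same way works.
#
#   template = {'prot..1': np.array([[0., 0., 0.], [1., 0., 0.]]),
#               'prot..2': np.array([[0., 1., 0.], [1., 1., 0.]])}
#   query    = {'prot..1': np.array([[0., 1., 0.], [1., 1., 0.]]),
#               'prot..2': np.array([[0., 0., 0.], [1., 0., 0.]])}
#   ali = Alignment(template, query)
#   rmsd, transformation = ali.align()   # best fit over copy permutations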
class Violations(object):

    def __init__(self, filename):
        self.violation_thresholds = {}
        self.violation_counts = {}

        data = open(filename)
        D = data.readlines()
        data.close()

        for d in D:
            d = d.strip().split()
            self.violation_thresholds[d[0]] = float(d[1])

    def get_number_violated_restraints(self, rsts_dict):
        num_violated = 0
        for rst in self.violation_thresholds:
            if rst not in rsts_dict:
                continue
            if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
                num_violated += 1
                if rst not in self.violation_counts:
                    self.violation_counts[rst] = 1
                else:
                    self.violation_counts[rst] += 1
        return num_violated
185 """A class to cluster structures.
186 Uses scipy's cdist function to compute distance matrices
187 and sklearn's kmeans clustering module.
191 @param rmsd_weights Flat list of weights for each particle
195 from mpi4py
import MPI
196 self.comm = MPI.COMM_WORLD
197 self.rank = self.comm.Get_rank()
198 self.number_of_processes = self.comm.size
200 self.number_of_processes = 1
203 self.structure_cluster_ids =
None
204 self.tmpl_coords =
None
205 self.rmsd_weights=rmsd_weights
207 def set_template(self, part_coords):
209 self.tmpl_coords = part_coords
212 """Add coordinates for a single model."""
214 self.all_coords[frame] = Coords
216 def dist_matrix(self):
218 self.model_list_names = list(self.all_coords.keys())
219 self.model_indexes = list(range(len(self.model_list_names)))
220 self.model_indexes_dict = dict(
221 list(zip(self.model_list_names, self.model_indexes)))
222 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
224 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
225 model_indexes_unique_pairs,
226 self.number_of_processes)[self.rank]
228 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
230 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
232 my_model_indexes_unique_pairs)
234 if self.number_of_processes > 1:
237 pickable_transformations = self.get_pickable_transformation_distance_dict(
240 pickable_transformations)
241 self.set_transformation_distance_dict_from_pickable(
242 pickable_transformations)
244 self.raw_distance_matrix = np.zeros(
245 (len(self.model_list_names), len(self.model_list_names)))
246 for item
in raw_distance_dict:
248 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
249 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
251 def get_dist_matrix(self):
252 return self.raw_distance_matrix
255 """Run K-means clustering
256 @param number_of_clusters Num means
257 @param seed the random seed
259 from sklearn.cluster
import KMeans
264 kmeans = KMeans(n_clusters=number_of_clusters)
267 kmeans = KMeans(k=number_of_clusters)
268 kmeans.fit_predict(self.raw_distance_matrix)
270 self.structure_cluster_ids = kmeans.labels_
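    # Example (a minimal sketch; model_coords is an illustrative dictionary
    # mapping a frame name to {protein_name: coordinates}):
    #
    #   clustering = Clustering()
    #   for frame_name, coords in model_coords.items():
    #       clustering.fill(frame_name, coords)
    #   clustering.dist_matrix()                      # all-vs-all RMSD matrix
    #   clustering.do_cluster(number_of_clusters=3)
    #   for label in clustering.get_cluster_labels():
    #       print(label, clustering.get_cluster_label_names(label))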
    def get_pickable_transformation_distance_dict(self):
        pickable_transformations = {}
        for label in self.transformation_distance_dict:
            tr = self.transformation_distance_dict[label]
            trans = tuple(tr.get_translation())
            rot = tuple(tr.get_rotation().get_quaternion())
            pickable_transformations[label] = (rot, trans)
        return pickable_transformations
    def set_transformation_distance_dict_from_pickable(
            self,
            pickable_transformations):
        self.transformation_distance_dict = {}
        for label in pickable_transformations:
            tr = pickable_transformations[label]
            trans = IMP.algebra.Vector3D(tr[1])
            rot = IMP.algebra.Rotation3D(tr[0])
            self.transformation_distance_dict[
                label] = IMP.algebra.Transformation3D(rot, trans)
    def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
        import pickle

        outf = open(file_name + ".data", 'wb')

        # the transformation dictionary is converted to picklable tuples
        # before being written out
        pickable_transformations = self.get_pickable_transformation_distance_dict()
        pickle.dump(
            (self.structure_cluster_ids,
             self.model_list_names,
             pickable_transformations),
            outf)

        np.save(file_name + ".npy", self.raw_distance_matrix)
    def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
        import pickle

        inputf = open(file_name + ".data", 'rb')
        (self.structure_cluster_ids, self.model_list_names,
         pickable_transformations) = pickle.load(inputf)
        inputf.close()

        self.raw_distance_matrix = np.load(file_name + ".npy")

        self.set_transformation_distance_dict_from_pickable(
            pickable_transformations)
        self.model_indexes = list(range(len(self.model_list_names)))
        self.model_indexes_dict = dict(
            list(zip(self.model_list_names, self.model_indexes)))
    def plot_matrix(self, figurename="clustermatrix.pdf"):
        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pylab as pl
        from scipy.cluster import hierarchy as hrc

        fig = pl.figure(figsize=(10, 8))
        ax = fig.add_subplot(212)
        dendrogram = hrc.dendrogram(
            hrc.linkage(self.raw_distance_matrix),
            no_labels=True)
        leaves_order = dendrogram['leaves']
        ax.set_xlabel('Model')
        ax.set_ylabel('RMSD [Angstroms]')

        ax2 = fig.add_subplot(221)
        cax = ax2.imshow(
            self.raw_distance_matrix[leaves_order, :][:, leaves_order],
            interpolation='nearest')
        cb = fig.colorbar(cax)
        cb.set_label('RMSD [Angstroms]')
        ax2.set_xlabel('Model')
        ax2.set_ylabel('Model')

        pl.savefig(figurename, dpi=300)
        pl.close(fig)
    def get_model_index_from_name(self, name):
        return self.model_indexes_dict[name]

    def get_cluster_labels(self):
        return list(set(self.structure_cluster_ids))

    def get_number_of_clusters(self):
        return len(self.get_cluster_labels())

    def get_cluster_label_indexes(self, label):
        return (
            [i for i, l in enumerate(self.structure_cluster_ids) if l == label])

    def get_cluster_label_names(self, label):
        return (
            [self.model_list_names[i]
             for i in self.get_cluster_label_indexes(label)])

    def get_cluster_label_average_rmsd(self, label):
        indexes = self.get_cluster_label_indexes(label)

        if len(indexes) > 1:
            sub_distance_matrix = self.raw_distance_matrix[
                indexes, :][:, indexes]
            average_rmsd = np.sum(sub_distance_matrix) / \
                (len(sub_distance_matrix)
                 ** 2 - len(sub_distance_matrix))
        else:
            average_rmsd = 0.0
        return average_rmsd

    def get_cluster_label_size(self, label):
        return len(self.get_cluster_label_indexes(label))

    def get_transformation_to_first_member(
            self,
            cluster_label,
            structure_index):
        reference = self.get_cluster_label_indexes(cluster_label)[0]
        return self.transformation_distance_dict[(reference, structure_index)]
    def matrix_calculation(self, all_coords, template_coords, list_of_pairs):

        model_list_names = list(all_coords.keys())
        rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
        raw_distance_dict = {}
        transformation_distance_dict = {}
        if template_coords is None:
            do_alignment = False
        else:
            do_alignment = True
            alignment_template_protein_names = list(template_coords.keys())

        for (f1, f2) in list_of_pairs:

            if not do_alignment:
                # only compute the rmsd (no alignment), e.g. when an EM map
                # acts as the global reference frame
                transformation = IMP.algebra.get_identity_transformation_3d()

                coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
                                  for pr in rmsd_protein_names])
                coords_f2 = {}
                for pr in rmsd_protein_names:
                    coords_f2[pr] = all_coords[model_list_names[f2]][pr]

                Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
                rmsd = Ali.get_rmsd()

            else:
                # align the conformations on the template proteins first,
                # then compute the rmsd on the rmsd proteins
                coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
                                  for pr in alignment_template_protein_names])
                coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
                                  for pr in alignment_template_protein_names])

                Ali = Alignment(coords_f1, coords_f2)
                template_rmsd, transformation = Ali.align()

                coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
                                  for pr in rmsd_protein_names])
                coords_f2 = {}
                for pr in rmsd_protein_names:
                    coords_f2[pr] = [transformation.get_transformed(i)
                                     for i in all_coords[model_list_names[f2]][pr]]

                Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
                rmsd = Ali.get_rmsd()

            raw_distance_dict[(f1, f2)] = rmsd
            raw_distance_dict[(f2, f1)] = rmsd
            transformation_distance_dict[(f1, f2)] = transformation
            transformation_distance_dict[(f2, f1)] = transformation

        return raw_distance_dict, transformation_distance_dict
464 """A class to evaluate the precision of an ensemble.
466 Also can evaluate the cross-precision of multiple ensembles.
467 Supports MPI for coordinate reading.
468 Recommended procedure:
469 -# initialize object and pass the selection for evaluating precision
470 -# call add_structures() to read in the data (specify group name)
471 -# call get_precision() to evaluate inter/intra precision
472 -# call get_rmsf() to evaluate within-group fluctuations
476 selection_dictionary={}):
478 @param model The IMP Model
479 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
480 part of the hier name)
481 @param selection_dictionary Dictionary where keys are names for
482 selections and values are selection tuples for scoring
483 precision. "All" is automatically made as well
486 from mpi4py
import MPI
487 self.comm = MPI.COMM_WORLD
488 self.rank = self.comm.Get_rank()
489 self.number_of_processes = self.comm.size
491 self.number_of_processes = 1
494 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
495 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
496 self.style =
'pairwise_drmsd_k'
497 self.structures_dictionary = {}
498 self.reference_structures_dictionary = {}
500 self.protein_names =
None
501 self.len_particles_resolution_one =
None
503 self.rmf_names_frames = {}
504 self.reference_rmf_names_frames =
None
505 self.reference_structure =
None
506 self.reference_prot =
None
507 self.selection_dictionary = selection_dictionary
508 self.threshold = 40.0
509 self.residue_particle_index_map =
None
511 if resolution
in [1,10]:
512 self.resolution = resolution
514 raise KeyError(
"Currently only allow resolution 1 or 10")
    def _get_structure(self, rmf_frame_index, rmf_name):
        """Read an RMF file and return the particles"""
        rh = RMF.open_rmf_file_read_only(rmf_name)
        if not self.prots:
            print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
            self.prots = IMP.rmf.create_hierarchies(rh, self.model)
            IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
        else:
            print("linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
            IMP.rmf.link_hierarchies(rh, self.prots)
            IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
        del rh

        if self.resolution == 1:
            particle_dict = get_particles_at_resolution_one(self.prots[0])
        elif self.resolution == 10:
            particle_dict = get_particles_at_resolution_ten(self.prots[0])

        protein_names = list(particle_dict.keys())
        particles_resolution_one = []
        for k in particle_dict:
            particles_resolution_one += (particle_dict[k])

        if self.protein_names is None:
            self.protein_names = protein_names
        else:
            if self.protein_names != protein_names:
                print("Error: the protein names of the new coordinate set "
                      "are not compatible with the previous one")

        if self.len_particles_resolution_one is None:
            self.len_particles_resolution_one = len(particles_resolution_one)
        else:
            if self.len_particles_resolution_one != len(particles_resolution_one):
                raise ValueError("the new coordinate set is not compatible "
                                 "with the previous one")

        return particles_resolution_one
    def add_structure(self,
                      rmf_name,
                      rmf_frame_index,
                      structure_set_name,
                      setup_index_map=False):
        """ Read a structure into the ensemble and store (as coordinates).
        @param rmf_name The name of the RMF file
        @param rmf_frame_index The frame to read
        @param structure_set_name Name for the set that includes this structure
        @param setup_index_map if requested, set up a dictionary to help
               find residue indexes
        """

        # decide where to put this structure
        if structure_set_name in self.structures_dictionary:
            cdict = self.structures_dictionary[structure_set_name]
            rmflist = self.rmf_names_frames[structure_set_name]
        else:
            self.structures_dictionary[structure_set_name] = {}
            self.rmf_names_frames[structure_set_name] = []
            cdict = self.structures_dictionary[structure_set_name]
            rmflist = self.rmf_names_frames[structure_set_name]

        # read the particles
        try:
            particles_resolution_one = self._get_structure(rmf_frame_index, rmf_name)
        except ValueError:
            print("something wrong with the rmf")
            return 0

        self.selection_dictionary.update({"All": self.protein_names})

        for selection_name in self.selection_dictionary:
            selection_tuple = self.selection_dictionary[selection_name]
            coords = self._select_coordinates(selection_tuple, particles_resolution_one, self.prots[0])
            if selection_name not in cdict:
                cdict[selection_name] = [coords]
            else:
                cdict[selection_name].append(coords)

        rmflist.append((rmf_name, rmf_frame_index))

        # if requested, set up a map from residue numbers to particle indexes
        if setup_index_map:
            self.residue_particle_index_map = {}
            for prot_name in self.protein_names:
                self.residue_particle_index_map[prot_name] = \
                    self._get_residue_particle_index_map(
                        prot_name,
                        particles_resolution_one, self.prots[0])
    def add_structures(self,
                       rmf_name_frame_tuples,
                       structure_set_name):
        """Read a list of RMFs, supports parallel
        @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
        @param structure_set_name Name this set of structures (e.g. "cluster.1")
        """

        # split up the requested list to read in parallel
        my_rmf_name_frame_tuples = IMP.pmi.tools.chunk_list_into_segments(
            rmf_name_frame_tuples, self.number_of_processes)[self.rank]
        for nfr, tup in enumerate(my_rmf_name_frame_tuples):
            rmf_name = tup[0]
            rmf_frame_index = tup[1]
            # the first frame stores the map between residues and particles
            if self.residue_particle_index_map is None:
                setup_index_map = True
            else:
                setup_index_map = False
            self.add_structure(rmf_name,
                               rmf_frame_index,
                               structure_set_name,
                               setup_index_map)

        # synchronize the structures across processes
        if self.number_of_processes > 1:
            if self.rank != 0:
                self.comm.send(self.structures_dictionary, dest=0, tag=11)
            elif self.rank == 0:
                for i in range(1, self.number_of_processes):
                    data_tmp = self.comm.recv(source=i, tag=11)
                    for key in self.structures_dictionary:
                        self.structures_dictionary[key].update(data_tmp[key])
                for i in range(1, self.number_of_processes):
                    self.comm.send(self.structures_dictionary, dest=i, tag=11)
            if self.rank != 0:
                self.structures_dictionary = self.comm.recv(source=0, tag=11)
    def _get_residue_particle_index_map(self, prot_name, structure, hier):
        # map each particle (at the working resolution) to its residue numbers
        residue_particle_index_map = []
        s = IMP.atom.Selection(hier, molecules=[prot_name])
        all_selected_particles = s.get_selected_particles()
        intersection = list(set(all_selected_particles) & set(structure))
        sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
        for p in sorted_intersection:
            residue_particle_index_map.append(
                IMP.pmi.tools.get_residue_indexes(p))
        return residue_particle_index_map
    def _select_coordinates(self, tuple_selections, structure, prot):
        selected_coordinates = []
        for t in tuple_selections:
            if type(t) == tuple and len(t) == 3:
                s = IMP.atom.Selection(prot, molecules=[t[2]],
                                       residue_indexes=range(t[0], t[1] + 1))
                all_selected_particles = s.get_selected_particles()
                intersection = list(set(all_selected_particles) & set(structure))
                sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
                cc = [tuple(IMP.core.XYZ(p).get_coordinates())
                      for p in sorted_intersection]
                selected_coordinates += cc
            elif type(t) == str:
                s = IMP.atom.Selection(prot, molecules=[t])
                all_selected_particles = s.get_selected_particles()
                intersection = list(set(all_selected_particles) & set(structure))
                sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
                cc = [tuple(IMP.core.XYZ(p).get_coordinates())
                      for p in sorted_intersection]
                selected_coordinates += cc
            else:
                raise ValueError("Selection error")
        return selected_coordinates
    def set_threshold(self, threshold):
        self.threshold = threshold
    def _get_distance(self,
                      structure_set_name1,
                      structure_set_name2,
                      selection_name,
                      index1,
                      index2):
        """ Compute distance between structures with various metrics """
        c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
        c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]

        coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
        coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]

        if self.style == 'pairwise_drmsd_k':
            distance = IMP.atom.get_drmsd(coordinates1, coordinates2)
        if self.style == 'pairwise_drms_k':
            distance = IMP.atom.get_drms(coordinates1, coordinates2)
        if self.style == 'pairwise_drmsd_Q':
            distance = IMP.atom.get_drmsd_Q(coordinates1, coordinates2,
                                            self.threshold)
        if self.style == 'pairwise_rmsd':
            distance = IMP.algebra.get_rmsd(coordinates1, coordinates2)
        return distance

    def _get_particle_distances(self, structure_set_name1, structure_set_name2,
                                selection_name, index1, index2):
        c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
        c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]

        coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
        coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]

        distances = [np.linalg.norm(a - b)
                     for (a, b) in zip(coordinates1, coordinates2)]

        return distances
    def get_precision(self,
                      structure_set_name1,
                      structure_set_name2,
                      outfile=None,
                      skip=1,
                      selection_keywords=None):
        """ Evaluate the precision of two named structure groups. Supports MPI.
        When the structure_set_name1 is different from the structure_set_name2,
        this evaluates the cross-precision (average pairwise distances).
        @param outfile Name of the precision output file
        @param structure_set_name1 string name of the first structure set
        @param structure_set_name2 string name of the second structure set
        @param skip analyze every (skip) structure for the distance matrix calculation
        @param selection_keywords Specify the selection name you want to calculate on.
               By default this is computed for everything you provided in the constructor,
               plus all the subunits together.
        """
        if selection_keywords is None:
            sel_keys = list(self.selection_dictionary.keys())
        else:
            for k in selection_keywords:
                if k not in self.selection_dictionary:
                    raise KeyError("you are trying to find named selection "
                                   + k + " which was not requested in the constructor")
            sel_keys = selection_keywords

        if outfile is not None:
            of = open(outfile, "w")
        centroid_index = 0
        for selection_name in sel_keys:
            number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
            number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
            distances = {}
            structure_pointers_1 = list(range(0, number_of_structures_1, skip))
            structure_pointers_2 = list(range(0, number_of_structures_2, skip))
            pair_combination_list = list(itertools.product(structure_pointers_1, structure_pointers_2))
            if len(pair_combination_list) == 0:
                raise ValueError("no structure selected. Check the skip parameter.")

            # compute the pairwise distances in parallel
            my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
                pair_combination_list, self.number_of_processes)[self.rank]
            my_length = len(my_pair_combination_list)
            for n, pair in enumerate(my_pair_combination_list):
                progression = int(float(n) / my_length * 100.0)
                distances[pair] = self._get_distance(structure_set_name1, structure_set_name2,
                                                     selection_name, pair[0], pair[1])
            if self.number_of_processes > 1:
                distances = IMP.pmi.tools.scatter_and_gather(distances)

            if self.rank == 0:
                if structure_set_name1 == structure_set_name2:
                    structure_pointers = structure_pointers_1
                    number_of_structures = number_of_structures_1

                    # determine the centroid as the structure with the
                    # smallest average distance to all others
                    distance = 0.0
                    distances_to_structure = {}
                    distances_to_structure_normalization = {}

                    for n in structure_pointers:
                        distances_to_structure[n] = 0.0
                        distances_to_structure_normalization[n] = 0

                    for k in distances:
                        distance += distances[k]
                        distances_to_structure[k[0]] += distances[k]
                        distances_to_structure[k[1]] += distances[k]
                        distances_to_structure_normalization[k[0]] += 1
                        distances_to_structure_normalization[k[1]] += 1

                    for n in structure_pointers:
                        distances_to_structure[n] = distances_to_structure[n] / distances_to_structure_normalization[n]

                    min_distance = min([distances_to_structure[n]
                                        for n in distances_to_structure])
                    centroid_index = [k for k, v in distances_to_structure.items()
                                      if v == min_distance][0]
                    centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]

                    centroid_distance = 0.0
                    distance_list = []
                    for n in range(number_of_structures):
                        dist = self._get_distance(structure_set_name1, structure_set_name1,
                                                  selection_name, centroid_index, n)
                        centroid_distance += dist
                        distance_list.append(dist)

                    centroid_distance /= number_of_structures
                    if outfile is not None:
                        of.write(str(selection_name) + " " + structure_set_name1 +
                                 " average centroid distance " + str(centroid_distance) + "\n")
                        of.write(str(selection_name) + " " + structure_set_name1 +
                                 " centroid index " + str(centroid_index) + "\n")
                        of.write(str(selection_name) + " " + structure_set_name1 +
                                 " centroid rmf name " + str(centroid_rmf_name) + "\n")
                        of.write(str(selection_name) + " " + structure_set_name1 +
                                 " median centroid distance " + str(np.median(distance_list)) + "\n")

                average_pairwise_distances = sum(distances.values()) / len(list(distances.values()))
                if outfile is not None:
                    of.write(str(selection_name) + " " + structure_set_name1 + " " + structure_set_name2 +
                             " average pairwise distance " + str(average_pairwise_distances) + "\n")
        if outfile is not None:
            of.close()
        return centroid_index
    def get_rmsf(self,
                 structure_set_name,
                 outdir="./",
                 skip=1,
                 set_plot_yaxis_range=None):
        """ Calculate the residue mean square fluctuations (RMSF).
        Automatically outputs as data file and pdf
        @param structure_set_name Which structure set to calculate RMSF for
        @param outdir Where to write the files
        @param skip Skip this number of structures
        @param set_plot_yaxis_range In case you need to change the plot
        """
        # get the centroid structure for the whole complex
        centroid_index = self.get_precision(structure_set_name,
                                            structure_set_name,
                                            outfile=None,
                                            skip=skip)
        if self.rank == 0:
            for sel_name in self.protein_names:
                self.selection_dictionary.update({sel_name: [sel_name]})
                try:
                    number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
                except KeyError:
                    # that protein was not included in the selection
                    continue
                rpim = self.residue_particle_index_map[sel_name]
                outfile = outdir + "/rmsf." + sel_name + ".dat"
                of = open(outfile, "w")
                residue_distances = {}
                residue_nblock = {}
                for index in range(number_of_structures):
                    distances = self._get_particle_distances(structure_set_name,
                                                             structure_set_name,
                                                             sel_name,
                                                             centroid_index, index)
                    for nblock, block in enumerate(rpim):
                        for residue_number in block:
                            residue_nblock[residue_number] = nblock
                            if residue_number not in residue_distances:
                                residue_distances[residue_number] = [distances[nblock]]
                            else:
                                residue_distances[residue_number].append(distances[nblock])

                residues = []
                rmsfs = []
                for rn in residue_distances:
                    residues.append(rn)
                    rmsf = np.std(residue_distances[rn])
                    rmsfs.append(rmsf)
                    of.write(str(rn) + " " + str(residue_nblock[rn]) + " " + str(rmsf) + "\n")

                IMP.pmi.output.plot_xy_data(residues, rmsfs, title=sel_name,
                                            out_fn=outdir + "/rmsf." + sel_name, display=False,
                                            set_plot_yaxis_range=set_plot_yaxis_range,
                                            xlabel='Residue Number', ylabel='Standard error')
                of.close()
893 """Read in a structure used for reference computation.
894 Needed before calling get_average_distance_wrt_reference_structure()
895 @param rmf_name The RMF file to read the reference
896 @param rmf_frame_index The index in that file
898 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
899 self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
901 for selection_name
in self.selection_dictionary:
902 selection_tuple = self.selection_dictionary[selection_name]
903 coords = self._select_coordinates(selection_tuple,
904 particles_resolution_one,self.prots[0])
905 self.reference_structures_dictionary[selection_name] = coords
908 """First align then calculate RMSD
909 @param structure_set_name: the name of the structure set
910 @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
911 @return: for each structure in the structure set, returns the rmsd
913 if self.reference_structures_dictionary=={}:
914 print(
"Cannot compute until you set a reference structure")
917 align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
918 align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
920 for c
in align_coordinates:
922 transformation = Ali.align()[1]
923 transformations.append(transformation)
924 for selection_name
in self.selection_dictionary:
925 reference_coordinates = self.reference_structures_dictionary[selection_name]
928 for n,sc
in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
931 distances.append(distance)
932 print(selection_name,
"average rmsd",sum(distances)/len(distances),
"median",self._get_median(distances),
"minimum distance",min(distances))
934 def _get_median(self,list_of_values):
935 return np.median(np.array(list_of_values))
938 """Compare the structure set to the reference structure.
939 @param structure_set_name The structure set to compute this on
940 @note First call set_reference_structure()
943 if self.reference_structures_dictionary=={}:
944 print(
"Cannot compute until you set a reference structure")
946 for selection_name
in self.selection_dictionary:
947 reference_coordinates = self.reference_structures_dictionary[selection_name]
950 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
952 if self.style==
'pairwise_drmsd_k':
954 if self.style==
'pairwise_drms_k':
956 if self.style==
'pairwise_drmsd_Q':
958 if self.style==
'pairwise_rmsd':
960 distances.append(distance)
962 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances),
'nframes',len(distances))
963 ret[selection_name] = {
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
    def get_coordinates(self):
        pass

    def set_precision_style(self, style):
        if style in self.styles:
            self.style = style
        else:
            raise ValueError("No such style")
977 """Compute mean density maps from structures.
979 Keeps a dictionary of density maps,
980 keys are in the custom ranges. When you call add_subunits_density, it adds
981 particle coordinates to the existing density maps.
984 def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
986 @param custom_ranges Required. It's a dictionary, keys are the
987 density component names, values are selection tuples
988 e.g. {'kin28':[['kin28',1,-1]],
989 'density_name_1' :[('ccl1')],
990 'density_name_2' :[(1,142,'tfb3d1'),
992 @param representation PMI representation, for doing selections.
993 Not needed if you only pass hierarchies
994 @param resolution The MRC resolution of the output map (in Angstrom unit)
995 @param voxel The voxel size for the output map (lower is slower)
998 self.representation = representation
999 self.MRCresolution = resolution
1002 self.count_models = 0.0
1003 self.custom_ranges = custom_ranges
1006 """Add a frame to the densities.
1007 @param hierarchy Optionally read the hierarchy from somewhere.
1008 If not passed, will just read the representation.
1010 self.count_models += 1.0
1014 all_particles_by_resolution = []
1015 for name
in part_dict:
1016 all_particles_by_resolution += part_dict[name]
1018 for density_name
in self.custom_ranges:
1021 all_particles_by_segments = []
1023 for seg
in self.custom_ranges[density_name]:
1026 parts += IMP.tools.select_by_tuple(self.representation,
1027 seg, resolution=1, name_is_ambiguous=
False)
1031 for h
in hierarchy.get_children():
1035 if type(seg) == str:
1037 elif type(seg) == tuple:
1039 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1041 raise Exception(
'could not understand selection tuple '+str(seg))
1043 all_particles_by_segments += s.get_selected_particles()
1046 parts = all_particles_by_segments
1049 set(all_particles_by_segments) & set(all_particles_by_resolution))
1050 self._create_density_from_particles(parts, density_name)
1052 def normalize_density(self):
1055 def _create_density_from_particles(self, ps, name,
1056 kernel_type=
'GAUSSIAN'):
1057 '''Internal function for adding to densities.
1058 pass XYZR particles with mass and create a density from them.
1059 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1061 'GAUSSIAN': IMP.em.GAUSSIAN,
1062 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1063 'SPHERE': IMP.em.SPHERE}
1067 dmap.set_was_used(
True)
1068 if name
not in self.densities:
1069 self.densities[name] = dmap
1075 dmap3.set_was_used(
True)
1077 dmap3.add(self.densities[name])
1078 self.densities[name] = dmap3
1080 def get_density_keys(self):
1081 return list(self.densities.keys())
1084 """Get the current density for some component name"""
1085 if name
not in self.densities:
1088 return self.densities[name]
1090 def write_mrc(self, path="./"):
1091 for density_name
in self.densities:
1092 self.densities[density_name].
multiply(1. / self.count_models)
1094 self.densities[density_name],
1095 path +
"/" + density_name +
".mrc",
class GetContactMap(object):

    def __init__(self, distance=15.):
        self.distance = distance
        self.contactmap = ''
        self.namelist = []
        self.xlinks = 0
        self.XL = {}
        self.expanded = {}
        self.resmap = {}

    def set_prot(self, prot):
        from scipy.spatial.distance import cdist
        self.prot = prot
        self.protnames = []
        coords = []
        radii = []
        namelist = []

        particles_dictionary = get_particles_at_resolution_one(self.prot)

        for name in particles_dictionary:
            residue_indexes = []
            for p in particles_dictionary[name]:
                residue_indexes.extend(IMP.pmi.tools.get_residue_indexes(p))

            if len(residue_indexes) != 0:
                self.protnames.append(name)
                for res in range(min(residue_indexes), max(residue_indexes) + 1):
                    d = IMP.core.XYZR(p)
                    new_name = name + ":" + str(res)
                    if name not in self.resmap:
                        self.resmap[name] = {}
                    if res not in self.resmap:
                        self.resmap[name][res] = {}

                    self.resmap[name][res] = new_name
                    namelist.append(new_name)

                    crd = np.array([d.get_x(), d.get_y(), d.get_z()])
                    coords.append(crd)
                    radii.append(d.get_radius())

        coords = np.array(coords)
        radii = np.array(radii)

        if len(self.namelist) == 0:
            self.namelist = namelist
            self.contactmap = np.zeros((len(coords), len(coords)))

        distances = cdist(coords, coords)
        distances = (distances - radii).T - radii
        distances = distances <= self.distance
        self.contactmap += distances
    def get_subunit_coords(self, frame, align=0):
        from scipy.spatial.distance import cdist
        coords = []
        radii = []
        namelist = []
        test, testr = [], []
        for part in self.prot.get_children():
            SortedSegments = []
            for chl in part.get_children():
                start = IMP.atom.get_leaves(chl)[0]
                startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
                SortedSegments.append((chl, startres))
            SortedSegments = sorted(SortedSegments, key=itemgetter(1))

            for sgmnt in SortedSegments:
                for leaf in IMP.atom.get_leaves(sgmnt[0]):
                    p = IMP.core.XYZR(leaf)
                    crd = np.array([p.get_x(), p.get_y(), p.get_z()])

                    coords.append(crd)
                    radii.append(p.get_radius())

                    new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
                        '_' + str(IMP.atom.Fragment(leaf)
                                  .get_residue_indexes()[0])
                    namelist.append(new_name)
                    self.expanded[new_name] = len(
                        IMP.atom.Fragment(leaf).get_residue_indexes())
                    for res in IMP.atom.Fragment(leaf).get_residue_indexes():
                        if part.get_name() not in self.resmap:
                            self.resmap[part.get_name()] = {}
                        self.resmap[part.get_name()][res] = new_name

        coords = np.array(coords)
        radii = np.array(radii)
        if len(self.namelist) == 0:
            self.namelist = namelist
            self.contactmap = np.zeros((len(coords), len(coords)))
        distances = cdist(coords, coords)
        distances = (distances - radii).T - radii
        distances = distances <= self.distance
        self.contactmap += distances
    def add_xlinks(
            self,
            filename,
            identification_string='ISDCrossLinkMS_Distance_'):
        # identification_string marks the stat-file keys that carry
        # cross-link distances
        self.xlinks = 1
        data = open(filename)
        D = data.readlines()
        data.close()

        for d in D:
            if identification_string in d:
                d = d.replace("_", " ").replace("-", " ").replace(":", " ").split()

                t1, t2 = (d[0], d[1]), (d[1], d[0])
                if t1 not in self.XL:
                    self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
                    self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
                else:
                    self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
                    self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
    def dist_matrix(self, skip_cmap=0, skip_xl=1):
        K = self.namelist
        M = self.contactmap
        C, R = [], []
        L = sum(self.expanded.values())
        proteins = self.protnames

        # contact map, per protein pair
        if skip_cmap == 0:
            Matrices = {}
            proteins = [p.get_name() for p in self.prot.get_children()]
            missing = []
            for p1 in range(len(proteins)):
                for p2 in range(p1, len(proteins)):
                    pl1, pl2 = max(
                        self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
                    pn1, pn2 = proteins[p1], proteins[p2]
                    mtr = np.zeros((pl1 + 1, pl2 + 1))
                    print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
                    for i1 in range(1, pl1 + 1):
                        for i2 in range(1, pl2 + 1):
                            try:
                                r1 = K.index(self.resmap[pn1][i1])
                                r2 = K.index(self.resmap[pn2][i2])
                                r = M[r1, r2]
                                mtr[i1 - 1, i2 - 1] = r
                            except KeyError:
                                missing.append((pn1, pn2, i1, i2))
                    Matrices[(pn1, pn2)] = mtr

        # cross-link map, per protein pair
        if skip_xl == 0:
            if self.XL == {}:
                raise ValueError("cross-links were not provided, use add_xlinks function!")
            Matrices_xl = {}
            for p1 in range(len(proteins)):
                for p2 in range(p1, len(proteins)):
                    pl1, pl2 = max(
                        self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
                    pn1, pn2 = proteins[p1], proteins[p2]
                    mtr = np.zeros((pl1 + 1, pl2 + 1))
                    flg = 0
                    try:
                        xls = self.XL[(pn1, pn2)]
                    except KeyError:
                        try:
                            xls = self.XL[(pn2, pn1)]
                            flg = 1
                        except KeyError:
                            flg = 2
                    if flg == 0:
                        print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
                        for xl1, xl2 in xls:
                            if xl1 > pl1:
                                print('X' * 10, xl1, xl2)
                                xl1 = pl1
                            if xl2 > pl2:
                                print('X' * 10, xl1, xl2)
                                xl2 = pl2
                            mtr[xl1 - 1, xl2 - 1] = 100
                    elif flg == 1:
                        print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
                        for xl1, xl2 in xls:
                            if xl1 > pl1:
                                print('X' * 10, xl1, xl2)
                                xl1 = pl1
                            if xl2 > pl2:
                                print('X' * 10, xl1, xl2)
                                xl2 = pl2
                            mtr[xl2 - 1, xl1 - 1] = 100
                    else:
                        raise RuntimeError('WTF!')
                    Matrices_xl[(pn1, pn2)] = mtr
        # expand the contact map to individual residues
        NewM = []
        for x1 in range(len(K)):
            lst = []
            for x2 in range(len(K)):
                lst += [M[x1, x2]] * self.expanded[K[x2]]
            for i in range(self.expanded[K[x1]]):
                NewM.append(np.array(lst))
        NewM = np.array(NewM)

        # component boundaries along the axes
        C = proteins
        W = []
        for i, c in enumerate(C):
            cl = max(self.resmap[c].keys())
            W.append(cl)
            if i == 0:
                R.append(cl)
            else:
                R.append(R[-1] + cl)

        # plotting
        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        import matplotlib.gridspec as gridspec
        import scipy.sparse as sparse

        fig = plt.figure()
        gs = gridspec.GridSpec(len(W), len(W),
                               width_ratios=W,
                               height_ratios=W)

        cnt = 0
        for x1, r1 in enumerate(R):
            for x2, r2 in enumerate(R):
                ax = plt.subplot(gs[cnt])
                if skip_cmap == 0:
                    try:
                        mtr = Matrices[(C[x1], C[x2])]
                    except KeyError:
                        mtr = Matrices[(C[x2], C[x1])].T
                    ax.imshow(
                        np.log(mtr),
                        interpolation='nearest',
                        vmin=0.,
                        vmax=log(NewM.max()))
                if skip_xl == 0:
                    try:
                        mtr = Matrices_xl[(C[x1], C[x2])]
                    except KeyError:
                        mtr = Matrices_xl[(C[x2], C[x1])].T
                    ax.spy(
                        sparse.csr_matrix(mtr))
                ax.set_xticks([])
                ax.set_yticks([])
                cnt += 1
                if x2 == 0:
                    ax.set_ylabel(C[x1], rotation=90)
        plt.show()
# ------------------------------------------------------------------
# functions to read hierarchies and restraints back from RMF files

def get_hiers_from_rmf(model, frame_number, rmf_file):
    print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))

    # load the frame
    rh = RMF.open_rmf_file_read_only(rmf_file)

    try:
        prots = IMP.rmf.create_hierarchies(rh, model)
    except IOError:
        print("Unable to open rmf file %s" % (rmf_file))
        return None
    try:
        IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
    except IOError:
        print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
        return None
    model.update()
    del rh
    return prots


def link_hiers_to_rmf(model, hiers, frame_number, rmf_file):
    print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
    rh = RMF.open_rmf_file_read_only(rmf_file)
    IMP.rmf.link_hierarchies(rh, hiers)
    try:
        IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
    except IOError:
        print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
        return False
    model.update()
    del rh
    return True


def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
    print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))

    # load the frame
    rh = RMF.open_rmf_file_read_only(rmf_file)

    try:
        prots = IMP.rmf.create_hierarchies(rh, model)
        rs = IMP.rmf.create_restraints(rh, model)
    except IOError:
        print("Unable to open rmf file %s" % (rmf_file))
        return None, None
    try:
        IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
    except IOError:
        print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
        return None, None
    model.update()
    del rh
    return prots, rs


def link_hiers_and_restraints_to_rmf(model, hiers, rs, frame_number, rmf_file):
    print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
    rh = RMF.open_rmf_file_read_only(rmf_file)
    IMP.rmf.link_hierarchies(rh, hiers)
    IMP.rmf.link_restraints(rh, rs)
    try:
        IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
    except IOError:
        print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
        return False
    model.update()
    del rh
    return True


def get_hiers_from_rmf(model, frame_number, rmf_file):
    print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))

    rh = RMF.open_rmf_file_read_only(rmf_file)

    try:
        prots = IMP.rmf.create_hierarchies(rh, model)
    except IOError:
        print("Unable to open rmf file %s" % (rmf_file))
        prots = None
    try:
        IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
    except IOError:
        print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
        prots = None
    model.update()
    del rh
    return prots
1496 """Get particles at res 1, or any beads, based on the name.
1497 No Representation is needed. This is mainly used when the hierarchy
1498 is read from an RMF file.
1499 @return a dictionary of component names and their particles
1500 \note If the root node is named "System" or is a "State", do proper selection.
1506 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1508 particle_dict[mol.get_name()] = sel.get_selected_particles()
1511 for c
in prot.get_children():
1514 for s
in c.get_children():
1515 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1517 if "Beads" in s.get_name():
1521 for name
in particle_dict:
1522 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1523 list(set(particle_dict[name]) & set(allparticles)))
1524 return particle_dict
1527 """Get particles at res 10, or any beads, based on the name.
1528 No Representation is needed.
1529 This is mainly used when the hierarchy is read from an RMF file.
1530 @return a dictionary of component names and their particles
1531 \note If the root node is named "System" or is a "State", do proper selection.
1536 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1538 particle_dict[mol.get_name()] = sel.get_selected_particles()
1541 for c
in prot.get_children():
1544 for s
in c.get_children():
1545 if "_Res:10" in s.get_name():
1547 if "Beads" in s.get_name():
1550 for name
in particle_dict:
1551 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1552 list(set(particle_dict[name]) & set(allparticles)))
1553 return particle_dict
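# Example (a minimal sketch): extract per-component particles from a hierarchy
# that was read back from an RMF file; the file name and frame are illustrative.
#
#   model = IMP.Model()
#   hier = get_hiers_from_rmf(model, 0, "output/rmfs/0.rmf3")[0]
#   particle_dict = get_particles_at_resolution_one(hier)
#   for name, particles in particle_dict.items():
#       print(name, len(particles))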
def select_by_tuple(first_res_last_res_name_tuple):
    first_res = first_res_last_res_name_tuple[0]
    last_res = first_res_last_res_name_tuple[1]
    name = first_res_last_res_name_tuple[2]
1561 """Visualization of crosslinks"""
1563 self.crosslinks = []
1564 self.external_csv_data =
None
1565 self.crosslinkedprots = set()
1566 self.mindist = +10000000.0
1567 self.maxdist = -10000000.0
1568 self.contactmap =
None
1570 def set_hierarchy(self, prot):
1571 self.prot_length_dict = {}
1572 self.model=prot.get_model()
1574 for i
in prot.get_children():
1576 residue_indexes = []
1580 if len(residue_indexes) != 0:
1581 self.prot_length_dict[name] = max(residue_indexes)
    def set_coordinates_for_contact_map(self, rmf_name, rmf_frame_index):
        from scipy.spatial.distance import cdist

        rh = RMF.open_rmf_file_read_only(rmf_name)
        prots = IMP.rmf.create_hierarchies(rh, self.model)
        IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
        print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
        del rh

        coords = []
        radii = []

        particles_dictionary = get_particles_at_resolution_one(prots[0])

        resindex = 0
        self.index_dictionary = {}

        for name in particles_dictionary:
            residue_indexes = []
            for p in particles_dictionary[name]:
                residue_indexes = IMP.pmi.tools.get_residue_indexes(p)

                if len(residue_indexes) != 0:
                    for res in range(min(residue_indexes), max(residue_indexes) + 1):
                        d = IMP.core.XYZR(p)

                        crd = np.array([d.get_x(), d.get_y(), d.get_z()])
                        coords.append(crd)
                        radii.append(d.get_radius())
                        if name not in self.index_dictionary:
                            self.index_dictionary[name] = [resindex]
                        else:
                            self.index_dictionary[name].append(resindex)
                        resindex += 1

        coords = np.array(coords)
        radii = np.array(radii)

        distances = cdist(coords, coords)
        distances = (distances - radii).T - radii

        distances = np.where(distances <= 20.0, 1.0, 0)
        if self.contactmap is None:
            self.contactmap = np.zeros((len(coords), len(coords)))
        self.contactmap += distances

        for prot in prots:
            IMP.atom.destroy(prot)
    # NOTE: the name of this loader method is assumed here; it parses the
    # cross-link distance fields out of a PMI stat file.
    def set_crosslinks(
            self, data_file, search_label='ISDCrossLinkMS_Distance_',
            mapping=None,
            filter_label=None,
            # provide a list of rmf base names to filter the stat file
            filter_rmf_file_names=None,
            external_csv_data_file=None,
            external_csv_data_file_unique_id_key="Unique ID"):

        # mapping maps standard keywords ("Residue1", "Protein1", ...) to
        # positions in the key string; the external csv file carries further
        # per-crosslink information keyed by the unique identifier

        po = IMP.pmi.output.ProcessOutput(data_file)
        keys = po.get_keys()

        xl_keys = [k for k in keys if search_label in k]

        if filter_rmf_file_names is not None:
            rmf_file_key = "local_rmf_file_name"
            fs = po.get_fields(xl_keys + [rmf_file_key])
        else:
            fs = po.get_fields(xl_keys)

        # this dictionary stores the occurrence of given crosslinks
        self.cross_link_frequency = {}

        # this dictionary stores the series of distances for given
        # crosslinked residues
        self.cross_link_distances = {}

        # this dictionary stores the series of distances for given
        # crosslinked residues, keyed by the unique residue pair
        self.cross_link_distances_unique = {}

        if not external_csv_data_file is None:
            # further information on crosslinks, labeled by unique ID
            self.external_csv_data = {}
            xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)

            for xl in xldb:
                self.external_csv_data[
                    xl[external_csv_data_file_unique_id_key]] = xl

        # this list keeps track of the cross-link/sample pairs, so that the
        # same crosslinked residues are not counted twice in the same sample
        cross_link_frequency_list = []

        self.unique_cross_link_list = []

        for key in xl_keys:
            keysplit = key.replace(
                "_",
                " ").replace("-", " ").replace(":", " ").split()

            if filter_label != None:
                if filter_label not in keysplit:
                    continue

            if mapping is None:
                r1 = int(keysplit[5])
                c1 = keysplit[6]
                r2 = int(keysplit[7])
                c2 = keysplit[8]
                try:
                    confidence = keysplit[12]
                except IndexError:
                    confidence = '0.0'
                try:
                    unique_identifier = keysplit[3]
                except IndexError:
                    unique_identifier = '0'
            else:
                r1 = int(keysplit[mapping["Residue1"]])
                c1 = keysplit[mapping["Protein1"]]
                r2 = int(keysplit[mapping["Residue2"]])
                c2 = keysplit[mapping["Protein2"]]
                try:
                    confidence = keysplit[mapping["Confidence"]]
                except IndexError:
                    confidence = '0.0'
                try:
                    unique_identifier = keysplit[mapping["Unique Identifier"]]
                except IndexError:
                    unique_identifier = '0'

            self.crosslinkedprots.add(c1)
            self.crosslinkedprots.add(c2)

            # construct the list of distances corresponding to the input rmf
            # files
            dists = []
            if filter_rmf_file_names is not None:
                for n, d in enumerate(fs[key]):
                    if fs[rmf_file_key][n] in filter_rmf_file_names:
                        dists.append(float(d))
            else:
                dists = [float(f) for f in fs[key]]

            mdist = self.median(dists)
            stdv = np.std(np.array(dists))
            if self.mindist > mdist:
                self.mindist = mdist
            if self.maxdist < mdist:
                self.maxdist = mdist

            # calculate the frequency of unique crosslinks within the same
            # sample
            if not self.external_csv_data is None:
                sample = self.external_csv_data[unique_identifier]["Sample"]
            else:
                sample = "None"

            if (r1, c1, r2, c2, mdist) not in cross_link_frequency_list:
                if (r1, c1, r2, c2) not in self.cross_link_frequency:
                    self.cross_link_frequency[(r1, c1, r2, c2)] = 1
                    self.cross_link_frequency[(r2, c2, r1, c1)] = 1
                else:
                    self.cross_link_frequency[(r2, c2, r1, c1)] += 1
                    self.cross_link_frequency[(r1, c1, r2, c2)] += 1
                cross_link_frequency_list.append((r1, c1, r2, c2))
                cross_link_frequency_list.append((r2, c2, r1, c1))
                self.unique_cross_link_list.append(
                    (r1, c1, r2, c2, mdist))

            if (r1, c1, r2, c2) not in self.cross_link_distances:
                self.cross_link_distances[(
                    r1, c1, r2, c2, mdist, confidence)] = dists
                self.cross_link_distances[(
                    r2, c2, r1, c1, mdist, confidence)] = dists
                self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
            else:
                self.cross_link_distances[(
                    r2, c2, r1, c1, mdist, confidence)] += dists
                self.cross_link_distances[(
                    r1, c1, r2, c2, mdist, confidence)] += dists

            self.crosslinks.append(
                (r1, c1, r2, c2, mdist, stdv, confidence,
                 unique_identifier, 'original'))
            self.crosslinks.append(
                (r2, c2, r1, c1, mdist, stdv, confidence,
                 unique_identifier, 'reversed'))

        self.cross_link_frequency_inverted = {}
        for xl in self.unique_cross_link_list:
            (r1, c1, r2, c2, mdist) = xl
            frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
            if frequency not in self.cross_link_frequency_inverted:
                self.cross_link_frequency_inverted[
                    frequency] = [(r1, c1, r2, c2)]
            else:
                self.cross_link_frequency_inverted[
                    frequency].append((r1, c1, r2, c2))
    def median(self, mylist):
        sorts = sorted(mylist)
        length = len(sorts)
        if not length % 2:
            return (sorts[length // 2] + sorts[length // 2 - 1]) / 2.0
        return sorts[length // 2]
    def set_threshold(self, threshold):
        self.threshold = threshold

    def set_tolerance(self, tolerance):
        self.tolerance = tolerance

    def colormap(self, dist):
        if dist < self.threshold - self.tolerance:
            return "Green"
        elif dist >= self.threshold + self.tolerance:
            return "Orange"
        else:
            return "Red"
    def write_cross_link_database(self, filename, format='csv'):
        import csv

        fieldnames = [
            "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
            "Median Distance", "Standard Deviation", "Confidence",
            "Frequency", "Arrangement"]

        if not self.external_csv_data is None:
            keys = list(self.external_csv_data.keys())
            innerkeys = list(self.external_csv_data[keys[0]].keys())
            innerkeys.sort()
            fieldnames += innerkeys

        dw = csv.DictWriter(
            open(filename, "w"),
            delimiter=',',
            fieldnames=fieldnames)
        dw.writeheader()
        for xl in self.crosslinks:
            (r1, c1, r2, c2, mdist, stdv, confidence,
             unique_identifier, descriptor) = xl
            if descriptor == 'original':
                outdict = {}
                outdict["Unique ID"] = unique_identifier
                outdict["Protein1"] = c1
                outdict["Protein2"] = c2
                outdict["Residue1"] = r1
                outdict["Residue2"] = r2
                outdict["Median Distance"] = mdist
                outdict["Standard Deviation"] = stdv
                outdict["Confidence"] = confidence
                outdict["Frequency"] = self.cross_link_frequency[
                    (r1, c1, r2, c2)]
                if c1 == c2:
                    arrangement = "Intra"
                else:
                    arrangement = "Inter"
                outdict["Arrangement"] = arrangement
                if not self.external_csv_data is None:
                    outdict.update(self.external_csv_data[unique_identifier])

                dw.writerow(outdict)
    def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
             no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
             filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
             gap_between_components=0,
             rename_protein_map=None):
        # layout can be:
        #   "lowerdiagonal"  print only the lower diagonal plot
        #   "upperdiagonal"  print only the upper diagonal plot
        #   "whole"          print all
        # crosslinkedonly: plot only components that have cross-links
        # no_dist_info: if True will plot only the cross-links as grey spots
        # filter: a tuple (keyword, ">"|"=="|"<", value) applied to the
        #   external database
        # rename_protein_map: a dictionary to rename proteins

        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)

        ax.set_xticks([])
        ax.set_yticks([])

        # set the list of proteins on the x axis
        if prot_listx is None:
            if crosslinkedonly:
                prot_listx = list(self.crosslinkedprots)
            else:
                prot_listx = list(self.prot_length_dict.keys())
            prot_listx.sort()

        nresx = gap_between_components + \
            sum([self.prot_length_dict[name]
                 + gap_between_components for name in prot_listx])

        # set the list of proteins on the y axis
        if prot_listy is None:
            if crosslinkedonly:
                prot_listy = list(self.crosslinkedprots)
            else:
                prot_listy = list(self.prot_length_dict.keys())
            prot_listy.sort()

        nresy = gap_between_components + \
            sum([self.prot_length_dict[name]
                 + gap_between_components for name in prot_listy])

        # residue offsets for each protein along the axes
        resoffsetx = {}
        resendx = {}
        res = gap_between_components
        for prot in prot_listx:
            resoffsetx[prot] = res
            res += self.prot_length_dict[prot]
            resendx[prot] = res
            res += gap_between_components

        resoffsety = {}
        resendy = {}
        res = gap_between_components
        for prot in prot_listy:
            resoffsety[prot] = res
            res += self.prot_length_dict[prot]
            resendy[prot] = res
            res += gap_between_components

        resoffsetdiagonal = {}
        res = gap_between_components
        for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
            resoffsetdiagonal[prot] = res
            res += self.prot_length_dict[prot]
            res += gap_between_components

        # plot the protein boundaries
        xticks = []
        xlabels = []
        for n, prot in enumerate(prot_listx):
            res = resoffsetx[prot]
            end = resendx[prot]
            for proty in prot_listy:
                resy = resoffsety[proty]
                endy = resendy[proty]
                ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
                ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
            xticks.append((float(res) + float(end)) / 2)
            if rename_protein_map is not None:
                if prot in rename_protein_map:
                    prot = rename_protein_map[prot]
            xlabels.append(prot)

        yticks = []
        ylabels = []
        for n, prot in enumerate(prot_listy):
            res = resoffsety[prot]
            end = resendy[prot]
            for protx in prot_listx:
                resx = resoffsetx[protx]
                endx = resendx[protx]
                ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
                ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
            yticks.append((float(res) + float(end)) / 2)
            if rename_protein_map is not None:
                if prot in rename_protein_map:
                    prot = rename_protein_map[prot]
            ylabels.append(prot)

        # plot the contact map
        print(prot_listx, prot_listy)

        if not self.contactmap is None:
            import matplotlib.cm as cm
            tmp_array = np.zeros((nresx, nresy))

            for px in prot_listx:
                for py in prot_listy:
                    resx = resoffsetx[px]
                    lengx = resendx[px] - 1
                    resy = resoffsety[py]
                    lengy = resendy[py] - 1
                    indexes_x = self.index_dictionary[px]
                    minx = min(indexes_x)
                    maxx = max(indexes_x)
                    indexes_y = self.index_dictionary[py]
                    miny = min(indexes_y)
                    maxy = max(indexes_y)

                    print(px, py, minx, maxx, miny, maxy)

                    tmp_array[
                        resx:lengx,
                        resy:lengy] = self.contactmap[
                        minx:maxx,
                        miny:maxy]

            ax.imshow(tmp_array,
                      cmap=cm.binary,
                      origin='lower',
                      interpolation='nearest')

        ax.set_xticks(xticks)
        ax.set_xticklabels(xlabels, rotation=90)
        ax.set_yticks(yticks)
        ax.set_yticklabels(ylabels)
        ax.set_xlim(0, nresx)
        ax.set_ylim(0, nresy)

        # plot the cross-links
        already_added_xls = []

        for xl in self.crosslinks:

            (r1, c1, r2, c2, mdist, stdv, confidence,
             unique_identifier, descriptor) = xl

            if confidence_classes is not None:
                if confidence not in confidence_classes:
                    continue

            try:
                pos1 = r1 + resoffsetx[c1]
            except KeyError:
                continue
            try:
                pos2 = r2 + resoffsety[c2]
            except KeyError:
                continue

            if not filter is None:
                xldb = self.external_csv_data[unique_identifier]
                xldb.update({"Distance": mdist})
                xldb.update({"Distance_stdv": stdv})

                if filter[1] == ">":
                    if float(xldb[filter[0]]) <= float(filter[2]):
                        continue

                if filter[1] == "<":
                    if float(xldb[filter[0]]) >= float(filter[2]):
                        continue

                if filter[1] == "==":
                    if float(xldb[filter[0]]) != float(filter[2]):
                        continue

            # the diagonal positions are used when only the lower or upper
            # diagonal of a rectangular plot is requested
            pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
            pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]

            if layout == 'lowerdiagonal':
                if pos_for_diagonal1 <= pos_for_diagonal2:
                    continue
            if layout == 'upperdiagonal':
                if pos_for_diagonal1 >= pos_for_diagonal2:
                    continue

            already_added_xls.append((r1, c1, r2, c2))

            if not no_confidence_info:
                if confidence == '0.01':
                    markersize = 14 * scale_symbol_size
                elif confidence == '0.05':
                    markersize = 9 * scale_symbol_size
                elif confidence == '0.1':
                    markersize = 6 * scale_symbol_size
                else:
                    markersize = 15 * scale_symbol_size
            else:
                markersize = 5 * scale_symbol_size

            if not no_dist_info:
                color = self.colormap(mdist)
            else:
                color = "gray"

            ax.plot([pos1], [pos2], 'o',
                    c=color,
                    alpha=alphablend,
                    markersize=markersize)
            ax.plot([pos2], [pos1], 'o',
                    c=color,
                    alpha=alphablend,
                    markersize=markersize)

        fig.set_size_inches(0.004 * nresx, 0.004 * nresy)

        [i.set_linewidth(2.0) for i in ax.spines.values()]

        if filename:
            plt.savefig(filename + ".pdf", dpi=300, transparent="False")
        else:
            plt.show()
    def get_frequency_statistics(self, prot_list,
                                 prot_list2=None):

        violated_histogram = {}
        satisfied_histogram = {}
        unique_cross_links = []

        for xl in self.unique_cross_link_list:
            (r1, c1, r2, c2, mdist) = xl

            # filter by the protein lists
            if prot_list2 is None:
                if not c1 in prot_list:
                    continue
                if not c2 in prot_list:
                    continue
            else:
                if c1 in prot_list and c2 in prot_list2:
                    pass
                elif c1 in prot_list2 and c2 in prot_list:
                    pass
                else:
                    continue

            frequency = self.cross_link_frequency[(r1, c1, r2, c2)]

            if (r1, c1, r2, c2) not in unique_cross_links:
                if mdist > 35.0:
                    if frequency not in violated_histogram:
                        violated_histogram[frequency] = 1
                    else:
                        violated_histogram[frequency] += 1
                else:
                    if frequency not in satisfied_histogram:
                        satisfied_histogram[frequency] = 1
                    else:
                        satisfied_histogram[frequency] += 1
                unique_cross_links.append((r1, c1, r2, c2))
                unique_cross_links.append((r2, c2, r1, c1))

        print("# satisfied")
        total_number_of_crosslinks = 0
        for i in satisfied_histogram:
            if i in violated_histogram:
                print(i, violated_histogram[i] + satisfied_histogram[i])
            else:
                print(i, satisfied_histogram[i])
            total_number_of_crosslinks += i * satisfied_histogram[i]

        print("# violated")
        for i in violated_histogram:
            print(i, violated_histogram[i])
            total_number_of_crosslinks += i * violated_histogram[i]

        print(total_number_of_crosslinks)
    def print_cross_link_binary_symbols(self, prot_list,
                                        prot_list2=None):
        tmp_matrix = []
        confidence_list = []
        for xl in self.crosslinks:
            (r1, c1, r2, c2, mdist, stdv, confidence,
             unique_identifier, descriptor) = xl

            if prot_list2 is None:
                if not c1 in prot_list:
                    continue
                if not c2 in prot_list:
                    continue
            else:
                if c1 in prot_list and c2 in prot_list2:
                    pass
                elif c1 in prot_list2 and c2 in prot_list:
                    pass
                else:
                    continue

            if descriptor != "original":
                continue

            confidence_list.append(confidence)

            dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
            tmp_dist_binary = []
            for d in dists:
                if d < 35:
                    tmp_dist_binary.append(1)
                else:
                    tmp_dist_binary.append(0)
            tmp_matrix.append(tmp_dist_binary)

        matrix = list(zip(*tmp_matrix))

        satisfied_high_sum = 0
        satisfied_mid_sum = 0
        satisfied_low_sum = 0
        total_satisfied_sum = 0
        for k, m in enumerate(matrix):
            satisfied_high = 0
            total_high = 0
            satisfied_mid = 0
            total_mid = 0
            satisfied_low = 0
            total_low = 0
            total_satisfied = 0
            total = 0
            for n, b in enumerate(m):
                if confidence_list[n] == "0.01":
                    total_high += 1
                    if b == 1:
                        satisfied_high += 1
                        satisfied_high_sum += 1
                elif confidence_list[n] == "0.05":
                    total_mid += 1
                    if b == 1:
                        satisfied_mid += 1
                        satisfied_mid_sum += 1
                elif confidence_list[n] == "0.1":
                    total_low += 1
                    if b == 1:
                        satisfied_low += 1
                        satisfied_low_sum += 1
                if b == 1:
                    total_satisfied += 1
                    total_satisfied_sum += 1
                total += 1
            print(k, satisfied_high, total_high)
            print(k, satisfied_mid, total_mid)
            print(k, satisfied_low, total_low)
            print(k, total_satisfied, total)
        print(float(satisfied_high_sum) / len(matrix))
        print(float(satisfied_mid_sum) / len(matrix))
        print(float(satisfied_low_sum) / len(matrix))
    def get_unique_crosslinks_statistics(self, prot_list,
                                         prot_list2=None):

        satisfied_high = 0
        total_high = 0
        satisfied_mid = 0
        total_mid = 0
        satisfied_low = 0
        total_low = 0
        total = 0
        tmp_matrix = []
        satisfied_string = []
        for xl in self.crosslinks:
            (r1, c1, r2, c2, mdist, stdv, confidence,
             unique_identifier, descriptor) = xl

            if prot_list2 is None:
                if not c1 in prot_list:
                    continue
                if not c2 in prot_list:
                    continue
            else:
                if c1 in prot_list and c2 in prot_list2:
                    pass
                elif c1 in prot_list2 and c2 in prot_list:
                    pass
                else:
                    continue

            if descriptor != "original":
                continue

            total += 1
            if confidence == "0.01":
                total_high += 1
                if mdist <= 35:
                    satisfied_high += 1
            if confidence == "0.05":
                total_mid += 1
                if mdist <= 35:
                    satisfied_mid += 1
            if confidence == "0.1":
                total_low += 1
                if mdist <= 35:
                    satisfied_low += 1
            if mdist <= 35:
                satisfied_string.append(1)
            else:
                satisfied_string.append(0)

            dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
            tmp_dist_binary = []
            for d in dists:
                if d < 35:
                    tmp_dist_binary.append(1)
                else:
                    tmp_dist_binary.append(0)
            tmp_matrix.append(tmp_dist_binary)

        print("unique satisfied_high/total_high", satisfied_high, "/", total_high)
        print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)
        print("unique satisfied_low/total_low", satisfied_low, "/", total_low)
        print("total", total)

        matrix = list(zip(*tmp_matrix))
        satisfied_models = 0
        satstr = ""
        for b in satisfied_string:
            if b == 1:
                satstr += "*"
            else:
                satstr += "-"

        for m in matrix:
            all_satisfied = True
            for n, b in enumerate(m):
                if b == 1 and satisfied_string[n] == 1:
                    pass
                elif b == 1 and satisfied_string[n] == 0:
                    pass
                elif b == 0 and satisfied_string[n] == 0:
                    pass
                elif b == 0 and satisfied_string[n] == 1:
                    all_satisfied = False
            if all_satisfied:
                satisfied_models += 1
            print(satstr, all_satisfied)
        print("models that satisfies the median satisfied crosslinks/total models",
              satisfied_models, len(matrix))
    def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
                                                prot_list2=None):

        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pylab as pl

        tmp_matrix = []
        for kw in self.cross_link_distances_unique:
            (r1, c1, r2, c2) = kw
            dists = self.cross_link_distances_unique[kw]

            if prot_list2 is None:
                if not c1 in prot_list:
                    continue
                if not c2 in prot_list:
                    continue
            else:
                if c1 in prot_list and c2 in prot_list2:
                    pass
                elif c1 in prot_list2 and c2 in prot_list:
                    pass
                else:
                    continue
            # append the sum of dists, used to order the rows of the matrix
            dists.append(sum(dists))
            tmp_matrix.append(dists)

        tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))

        matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))

        for i in range(len(tmp_matrix)):
            for k in range(len(tmp_matrix[i]) - 1):
                matrix[i][k] = tmp_matrix[i][k]

        fig = pl.figure()
        ax = fig.add_subplot(211)

        cax = ax.imshow(matrix, interpolation='nearest')
        fig.colorbar(cax)
        pl.savefig(figurename, dpi=300)
        pl.close(fig)
    # NOTE: the name of this method and the nxl_per_row default are assumed;
    # it plots per-crosslink distance distributions as box plots, a few
    # cross-links per row.
    def plot_bars(
            self,
            filename,
            prots1,
            prots2,
            nxl_per_row=20,
            arrangement="inter",
            confidence_input="None"):

        data = []
        for xl in self.cross_link_distances:
            (r1, c1, r2, c2, mdist, confidence) = xl
            if c1 in prots1 and c2 in prots2:
                if arrangement == "inter" and c1 == c2:
                    continue
                if arrangement == "intra" and c1 != c2:
                    continue
                if confidence_input == confidence:
                    label = str(c1) + ":" + str(r1) + \
                        "-" + str(c2) + ":" + str(r2)
                    values = self.cross_link_distances[xl]
                    frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
                    data.append((label, values, mdist, frequency))

        sort_by_dist = sorted(data, key=lambda tup: tup[2])
        sort_by_dist = list(zip(*sort_by_dist))
        values = sort_by_dist[1]
        positions = list(range(len(values)))
        labels = sort_by_dist[0]
        frequencies = list(map(float, sort_by_dist[3]))
        frequencies = [f * 10.0 for f in frequencies]

        nchunks = int(float(len(values)) / nxl_per_row)
        values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
        positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
            positions, nchunks)
        frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
            frequencies, nchunks)
        labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)

        for n, v in enumerate(values_chunks):
            p = positions_chunks[n]
            f = frequencies_chunks[n]
            l = labels_chunks[n]
            IMP.pmi.output.plot_fields_box_plots(
                filename + "." + str(n), v, p, f,
                valuename="Distance (Ang)",
                positionname="Unique " + arrangement + " Crosslinks",
                xlabels=l)
    def crosslink_distance_histogram(self, filename,
                                     prot_list=None,
                                     prot_list2=None,
                                     confidence_classes=None,
                                     bins=40,
                                     color='#66CCCC',
                                     yplotrange=[0, 1],
                                     format="png",
                                     normalized=False):
        # (the default values of bins, color, yplotrange, format and
        #  normalized above are assumed)
        if prot_list is None:
            prot_list = list(self.prot_length_dict.keys())

        distances = []
        for xl in self.crosslinks:
            (r1, c1, r2, c2, mdist, stdv, confidence,
             unique_identifier, descriptor) = xl

            if not confidence_classes is None:
                if confidence not in confidence_classes:
                    continue

            if prot_list2 is None:
                if not c1 in prot_list:
                    continue
                if not c2 in prot_list:
                    continue
            else:
                if c1 in prot_list and c2 in prot_list2:
                    pass
                elif c1 in prot_list2 and c2 in prot_list:
                    pass
                else:
                    continue

            distances.append(mdist)

        IMP.pmi.output.plot_field_histogram(
            filename, distances, valuename="C-alpha C-alpha distance [Ang]",
            bins=bins, color=color,
            format=format,
            reference_xline=35.0,
            yplotrange=yplotrange, normalized=normalized)
    def scatter_plot_xl_features(self, filename,
                                 feature1=None,
                                 feature2=None,
                                 prot_list=None,
                                 prot_list2=None,
                                 yplotrange=None,
                                 reference_ylines=None,
                                 distance_color=True,
                                 format="png"):
        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)

        for xl in self.crosslinks:
            (r1, c1, r2, c2, mdist, stdv, confidence,
             unique_identifier, arrangement) = xl

            if prot_list2 is None:
                if not c1 in prot_list:
                    continue
                if not c2 in prot_list:
                    continue
            else:
                if c1 in prot_list and c2 in prot_list2:
                    pass
                elif c1 in prot_list2 and c2 in prot_list:
                    pass
                else:
                    continue

            xldb = self.external_csv_data[unique_identifier]
            xldb.update({"Distance": mdist})
            xldb.update({"Distance_stdv": stdv})

            xvalue = float(xldb[feature1])
            yvalue = float(xldb[feature2])

            if distance_color:
                color = self.colormap(mdist)
            else:
                color = "gray"

            ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)

        if not yplotrange is None:
            ax.set_ylim(yplotrange)
        if not reference_ylines is None:
            for rl in reference_ylines:
                ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)

        if filename:
            plt.savefig(filename + "." + format, dpi=150, transparent="False")
        else:
            plt.show()