3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
50 self.proteins = sorted(self.query.keys())
51 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
59 np = prots_uniq.count(p)
60 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
61 prmts = list(itertools.permutations(copies, len(copies)))
64 self.Product = list(itertools.product(*P.values()))
72 torder = sum([list(i)
for i
in self.Product[0]], [])
75 if self.weights
is not None:
76 weights += [i
for i
in self.weights[t]]
79 self.rmsd = 10000000000.
80 for comb
in self.Product:
82 order = sum([list(i)
for i
in comb], [])
92 if self.weights
is not None:
103 from scipy.spatial.distance
import cdist
111 torder = sum([list(i)
for i
in self.Product[0]], [])
115 self.rmsd, Transformation = 10000000000.,
''
118 for comb
in self.Product:
119 order = sum([list(i)
for i
in comb], [])
125 if len(template_xyz) != len(query_xyz):
126 raise ValueError(
'''the number of coordinates
127 in template and query does not match!''')
132 query_xyz_tr = [transformation.get_transformed(n)
136 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
139 Transformation = transformation
142 return (self.rmsd, Transformation)
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148 'a..2':np.array([np.array([1.,1.,])]),
149 'a..3':np.array([np.array([-2.,1.])]),
150 'b':np.array([np.array([0.,-1.])]),
151 'c..1':np.array([np.array([-1.,-1.])]),
152 'c..2':np.array([np.array([1.,-1.])]),
153 'd':np.array([np.array([0.,0.])]),
154 'e':np.array([np.array([0.,1.])])}
156 Ali = Alignment(Proteins, Proteins)
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
164 class Violations(object):
168 self.violation_thresholds = {}
169 self.violation_counts = {}
171 data = open(filename)
176 d = d.strip().split()
177 self.violation_thresholds[d[0]] = float(d[1])
179 def get_number_violated_restraints(self, rsts_dict):
181 for rst
in self.violation_thresholds:
182 if rst
not in rsts_dict:
184 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
186 if rst
not in self.violation_counts:
187 self.violation_counts[rst] = 1
189 self.violation_counts[rst] += 1
195 """A class to cluster structures.
196 Uses scipy's cdist function to compute distance matrices
197 and sklearn's kmeans clustering module.
201 @param rmsd_weights Flat list of weights for each particle
205 from mpi4py
import MPI
206 self.comm = MPI.COMM_WORLD
207 self.rank = self.comm.Get_rank()
208 self.number_of_processes = self.comm.size
210 self.number_of_processes = 1
213 self.structure_cluster_ids =
None
214 self.tmpl_coords =
None
215 self.rmsd_weights=rmsd_weights
217 def set_template(self, part_coords):
219 self.tmpl_coords = part_coords
222 """Add coordinates for a single model."""
224 self.all_coords[frame] = Coords
226 def dist_matrix(self):
228 self.model_list_names = list(self.all_coords.keys())
229 self.model_indexes = list(range(len(self.model_list_names)))
230 self.model_indexes_dict = dict(
231 list(zip(self.model_list_names, self.model_indexes)))
232 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
234 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235 model_indexes_unique_pairs,
236 self.number_of_processes)[self.rank]
238 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
240 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
242 my_model_indexes_unique_pairs)
244 if self.number_of_processes > 1:
247 pickable_transformations = self.get_pickable_transformation_distance_dict(
250 pickable_transformations)
251 self.set_transformation_distance_dict_from_pickable(
252 pickable_transformations)
254 self.raw_distance_matrix = np.zeros(
255 (len(self.model_list_names), len(self.model_list_names)))
256 for item
in raw_distance_dict:
258 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
261 def get_dist_matrix(self):
262 return self.raw_distance_matrix
265 """Run K-means clustering
266 @param number_of_clusters Num means
267 @param seed the random seed
269 from sklearn.cluster
import KMeans
274 kmeans = KMeans(n_clusters=number_of_clusters)
277 kmeans = KMeans(k=number_of_clusters)
278 kmeans.fit_predict(self.raw_distance_matrix)
280 self.structure_cluster_ids = kmeans.labels_
282 def get_pickable_transformation_distance_dict(self):
283 pickable_transformations = {}
284 for label
in self.transformation_distance_dict:
285 tr = self.transformation_distance_dict[label]
286 trans = tuple(tr.get_translation())
287 rot = tuple(tr.get_rotation().get_quaternion())
288 pickable_transformations[label] = (rot, trans)
289 return pickable_transformations
291 def set_transformation_distance_dict_from_pickable(
293 pickable_transformations):
294 self.transformation_distance_dict = {}
295 for label
in pickable_transformations:
296 tr = pickable_transformations[label]
299 self.transformation_distance_dict[
302 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
304 outf = open(file_name +
".data",
'wb')
310 pickable_transformations = self.get_pickable_transformation_distance_dict(
313 (self.structure_cluster_ids,
314 self.model_list_names,
315 pickable_transformations),
318 np.save(file_name +
".npy", self.raw_distance_matrix)
320 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
323 inputf = open(file_name +
".data",
'rb')
324 (self.structure_cluster_ids, self.model_list_names,
325 pickable_transformations) = pickle.load(inputf)
328 self.raw_distance_matrix = np.load(file_name +
".npy")
330 self.set_transformation_distance_dict_from_pickable(
331 pickable_transformations)
332 self.model_indexes = list(range(len(self.model_list_names)))
333 self.model_indexes_dict = dict(
334 list(zip(self.model_list_names, self.model_indexes)))
336 def plot_matrix(self, figurename="clustermatrix.pdf"):
337 import matplotlib
as mpl
339 import matplotlib.pylab
as pl
340 from scipy.cluster
import hierarchy
as hrc
342 fig = pl.figure(figsize=(10,8))
343 ax = fig.add_subplot(212)
344 dendrogram = hrc.dendrogram(
345 hrc.linkage(self.raw_distance_matrix),
348 leaves_order = dendrogram[
'leaves']
349 ax.set_xlabel(
'Model')
350 ax.set_ylabel(
'RMSD [Angstroms]')
352 ax2 = fig.add_subplot(221)
354 self.raw_distance_matrix[leaves_order,
357 interpolation=
'nearest')
358 cb = fig.colorbar(cax)
359 cb.set_label(
'RMSD [Angstroms]')
360 ax2.set_xlabel(
'Model')
361 ax2.set_ylabel(
'Model')
363 pl.savefig(figurename, dpi=300)
366 def get_model_index_from_name(self, name):
367 return self.model_indexes_dict[name]
369 def get_cluster_labels(self):
371 return list(set(self.structure_cluster_ids))
373 def get_number_of_clusters(self):
374 return len(self.get_cluster_labels())
376 def get_cluster_label_indexes(self, label):
378 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
381 def get_cluster_label_names(self, label):
383 [self.model_list_names[i]
384 for i
in self.get_cluster_label_indexes(label)]
387 def get_cluster_label_average_rmsd(self, label):
389 indexes = self.get_cluster_label_indexes(label)
392 sub_distance_matrix = self.raw_distance_matrix[
393 indexes, :][:, indexes]
394 average_rmsd = np.sum(sub_distance_matrix) / \
395 (len(sub_distance_matrix)
396 ** 2 - len(sub_distance_matrix))
401 def get_cluster_label_size(self, label):
402 return len(self.get_cluster_label_indexes(label))
404 def get_transformation_to_first_member(
408 reference = self.get_cluster_label_indexes(cluster_label)[0]
409 return self.transformation_distance_dict[(reference, structure_index)]
411 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
413 model_list_names = list(all_coords.keys())
414 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415 raw_distance_dict = {}
416 transformation_distance_dict = {}
417 if template_coords
is None:
421 alignment_template_protein_names = list(template_coords.keys())
423 for (f1, f2)
in list_of_pairs:
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in rmsd_protein_names])
434 for pr
in rmsd_protein_names:
435 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
437 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
438 rmsd = Ali.get_rmsd()
444 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445 for pr
in alignment_template_protein_names])
446 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447 for pr
in alignment_template_protein_names])
450 template_rmsd, transformation = Ali.align()
455 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456 for pr
in rmsd_protein_names])
458 for pr
in rmsd_protein_names:
459 coords_f2[pr] = [transformation.get_transformed(
460 i)
for i
in all_coords[model_list_names[f2]][pr]]
462 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
463 rmsd = Ali.get_rmsd()
465 raw_distance_dict[(f1, f2)] = rmsd
466 raw_distance_dict[(f2, f1)] = rmsd
467 transformation_distance_dict[(f1, f2)] = transformation
468 transformation_distance_dict[(f2, f1)] = transformation
470 return raw_distance_dict, transformation_distance_dict
474 """A class to evaluate the precision of an ensemble.
476 Also can evaluate the cross-precision of multiple ensembles.
477 Supports MPI for coordinate reading.
478 Recommended procedure:
479 -# initialize object and pass the selection for evaluating precision
480 -# call add_structures() to read in the data (specify group name)
481 -# call get_precision() to evaluate inter/intra precision
482 -# call get_rmsf() to evaluate within-group fluctuations
486 selection_dictionary={}):
488 @param model The IMP Model
489 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
490 part of the hier name)
491 @param selection_dictionary Dictionary where keys are names for
492 selections and values are selection tuples for scoring
493 precision. "All" is automatically made as well
496 from mpi4py
import MPI
497 self.comm = MPI.COMM_WORLD
498 self.rank = self.comm.Get_rank()
499 self.number_of_processes = self.comm.size
501 self.number_of_processes = 1
504 self.styles = [
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
505 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
506 self.style =
'pairwise_drmsd_k'
507 self.structures_dictionary = {}
508 self.reference_structures_dictionary = {}
510 self.protein_names =
None
511 self.len_particles_resolution_one =
None
513 self.rmf_names_frames = {}
514 self.reference_rmf_names_frames =
None
515 self.reference_structure =
None
516 self.reference_prot =
None
517 self.selection_dictionary = selection_dictionary
518 self.threshold = 40.0
519 self.residue_particle_index_map =
None
521 if resolution
in [1,10]:
522 self.resolution = resolution
524 raise KeyError(
"Currently only allow resolution 1 or 10")
526 def _get_structure(self,rmf_frame_index,rmf_name):
527 """Read an RMF file and return the particles"""
528 rh = RMF.open_rmf_file_read_only(rmf_name)
530 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
534 print(
"linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
539 if self.resolution==1:
541 elif self.resolution==10:
544 protein_names = list(particle_dict.keys())
545 particles_resolution_one = []
546 for k
in particle_dict:
547 particles_resolution_one += (particle_dict[k])
549 if self.protein_names==
None:
550 self.protein_names = protein_names
552 if self.protein_names!=protein_names:
553 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
555 if self.len_particles_resolution_one==
None:
556 self.len_particles_resolution_one = len(particles_resolution_one)
558 if self.len_particles_resolution_one!=len(particles_resolution_one):
559 raise ValueError(
"the new coordinate set is not compatible with the previous one")
561 return particles_resolution_one
567 setup_index_map=
False):
568 """ Read a structure into the ensemble and store (as coordinates).
569 @param rmf_name The name of the RMF file
570 @param rmf_frame_index The frame to read
571 @param structure_set_name Name for the set that includes this structure
573 @param setup_index_map if requested, set up a dictionary to help
578 if structure_set_name
in self.structures_dictionary:
579 cdict = self.structures_dictionary[structure_set_name]
580 rmflist = self.rmf_names_frames[structure_set_name]
582 self.structures_dictionary[structure_set_name]={}
583 self.rmf_names_frames[structure_set_name]=[]
584 cdict = self.structures_dictionary[structure_set_name]
585 rmflist = self.rmf_names_frames[structure_set_name]
589 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
591 print(
"something wrong with the rmf")
593 self.selection_dictionary.update({
"All":self.protein_names})
595 for selection_name
in self.selection_dictionary:
596 selection_tuple = self.selection_dictionary[selection_name]
597 coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
598 if selection_name
not in cdict:
599 cdict[selection_name] = [coords]
601 cdict[selection_name].append(coords)
603 rmflist.append((rmf_name,rmf_frame_index))
607 self.residue_particle_index_map = {}
608 for prot_name
in self.protein_names:
609 self.residue_particle_index_map[prot_name] = \
610 self._get_residue_particle_index_map(
612 particles_resolution_one,self.prots[0])
615 rmf_name_frame_tuples,
617 """Read a list of RMFs, supports parallel
618 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
619 @param structure_set_name Name this set of structures (e.g. "cluster.1")
623 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
624 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
625 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
627 rmf_frame_index=tup[1]
629 if self.residue_particle_index_map
is None:
630 setup_index_map =
True
632 setup_index_map =
False
639 if self.number_of_processes > 1:
642 self.comm.send(self.structures_dictionary, dest=0, tag=11)
644 for i
in range(1, self.number_of_processes):
645 data_tmp = self.comm.recv(source=i, tag=11)
646 for key
in self.structures_dictionary:
647 self.structures_dictionary[key].update(data_tmp[key])
648 for i
in range(1, self.number_of_processes):
649 self.comm.send(self.structures_dictionary, dest=i, tag=11)
651 self.structures_dictionary = self.comm.recv(source=0, tag=11)
653 def _get_residue_particle_index_map(self,prot_name,structure,hier):
655 residue_particle_index_map = []
661 all_selected_particles = s.get_selected_particles()
662 intersection = list(set(all_selected_particles) & set(structure))
663 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
664 for p
in sorted_intersection:
666 return residue_particle_index_map
669 def _select_coordinates(self,tuple_selections,structure,prot):
670 selected_coordinates=[]
671 for t
in tuple_selections:
672 if type(t)==tuple
and len(t)==3:
678 all_selected_particles = s.get_selected_particles()
679 intersection = list(set(all_selected_particles) & set(structure))
680 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
681 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
682 selected_coordinates += cc
688 all_selected_particles = s.get_selected_particles()
689 intersection = list(set(all_selected_particles) & set(structure))
690 sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
691 cc = [tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
692 selected_coordinates += cc
694 raise ValueError(
"Selection error")
695 return selected_coordinates
697 def set_threshold(self,threshold):
698 self.threshold = threshold
700 def _get_distance(self,
706 """ Compute distance between structures with various metrics """
707 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
708 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
712 if self.style==
'pairwise_drmsd_k':
714 if self.style==
'pairwise_drms_k':
716 if self.style==
'pairwise_drmsd_Q':
719 if self.style==
'pairwise_rmsd':
723 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
724 selection_name,index1,index2):
725 c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
726 c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
731 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
740 selection_keywords=
None):
741 """ Evaluate the precision of two named structure groups. Supports MPI.
742 When the structure_set_name1 is different from the structure_set_name2,
743 this evaluates the cross-precision (average pairwise distances).
744 @param outfile Name of the precision output file
745 @param structure_set_name1 string name of the first structure set
746 @param structure_set_name2 string name of the second structure set
747 @param skip analyze every (skip) structure for the distance matrix calculation
748 @param selection_keywords Specify the selection name you want to calculate on.
749 By default this is computed for everything you provided in the constructor,
750 plus all the subunits together.
752 if selection_keywords
is None:
753 sel_keys = list(self.selection_dictionary.keys())
755 for k
in selection_keywords:
756 if k
not in self.selection_dictionary:
757 raise KeyError(
"you are trying to find named selection " \
758 + k +
" which was not requested in the constructor")
759 sel_keys = selection_keywords
761 if outfile
is not None:
762 of = open(outfile,
"w")
764 for selection_name
in sel_keys:
765 number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
766 number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
768 structure_pointers_1 = list(range(0,number_of_structures_1,skip))
769 structure_pointers_2 = list(range(0,number_of_structures_2,skip))
770 pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
771 if len(pair_combination_list)==0:
772 raise ValueError(
"no structure selected. Check the skip parameter.")
775 my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
776 pair_combination_list,self.number_of_processes)[self.rank]
777 my_length = len(my_pair_combination_list)
778 for n,pair
in enumerate(my_pair_combination_list):
779 progression = int(float(n)/my_length*100.0)
780 distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
781 selection_name,pair[0],pair[1])
782 if self.number_of_processes > 1:
787 if structure_set_name1==structure_set_name2:
788 structure_pointers = structure_pointers_1
789 number_of_structures = number_of_structures_1
794 distances_to_structure = {}
795 distances_to_structure_normalization = {}
797 for n
in structure_pointers:
798 distances_to_structure[n] = 0.0
799 distances_to_structure_normalization[n]=0
802 distance += distances[k]
803 distances_to_structure[k[0]] += distances[k]
804 distances_to_structure[k[1]] += distances[k]
805 distances_to_structure_normalization[k[0]] += 1
806 distances_to_structure_normalization[k[1]] += 1
808 for n
in structure_pointers:
809 distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
811 min_distance = min([distances_to_structure[n]
for n
in distances_to_structure])
812 centroid_index = [k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
813 centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
815 centroid_distance = 0.0
817 for n
in range(number_of_structures):
818 dist = self._get_distance(structure_set_name1,structure_set_name1,
819 selection_name,centroid_index,n)
820 centroid_distance += dist
821 distance_list.append(dist)
824 centroid_distance /= number_of_structures
826 if outfile
is not None:
827 of.write(str(selection_name)+
" "+structure_set_name1+
828 " average centroid distance "+str(centroid_distance)+
"\n")
829 of.write(str(selection_name)+
" "+structure_set_name1+
830 " centroid index "+str(centroid_index)+
"\n")
831 of.write(str(selection_name)+
" "+structure_set_name1+
832 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
833 of.write(str(selection_name)+
" "+structure_set_name1+
834 " median centroid distance "+str(np.median(distance_list))+
"\n")
836 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
837 if outfile
is not None:
838 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
839 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
840 if outfile
is not None:
842 return centroid_index
848 set_plot_yaxis_range=
None):
849 """ Calculate the residue mean square fluctuations (RMSF).
850 Automatically outputs as data file and pdf
851 @param structure_set_name Which structure set to calculate RMSF for
852 @param outdir Where to write the files
853 @param skip Skip this number of structures
854 @param set_plot_yaxis_range In case you need to change the plot
862 for sel_name
in self.protein_names:
863 self.selection_dictionary.update({sel_name:[sel_name]})
865 number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
869 rpim = self.residue_particle_index_map[sel_name]
870 outfile = outdir+
"/rmsf."+sel_name+
".dat"
871 of = open(outfile,
"w")
872 residue_distances = {}
874 for index
in range(number_of_structures):
875 distances = self._get_particle_distances(structure_set_name,
878 centroid_index,index)
879 for nblock,block
in enumerate(rpim):
880 for residue_number
in block:
881 residue_nblock[residue_number] = nblock
882 if residue_number
not in residue_distances:
883 residue_distances[residue_number] = [distances[nblock]]
885 residue_distances[residue_number].append(distances[nblock])
889 for rn
in residue_distances:
891 rmsf = np.std(residue_distances[rn])
893 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
895 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
896 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
897 set_plot_yaxis_range=set_plot_yaxis_range,
898 xlabel=
'Residue Number',ylabel=
'Standard error')
903 """Read in a structure used for reference computation.
904 Needed before calling get_average_distance_wrt_reference_structure()
905 @param rmf_name The RMF file to read the reference
906 @param rmf_frame_index The index in that file
908 particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
909 self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
911 for selection_name
in self.selection_dictionary:
912 selection_tuple = self.selection_dictionary[selection_name]
913 coords = self._select_coordinates(selection_tuple,
914 particles_resolution_one,self.prots[0])
915 self.reference_structures_dictionary[selection_name] = coords
918 """First align then calculate RMSD
919 @param structure_set_name: the name of the structure set
920 @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
921 @return: for each structure in the structure set, returns the rmsd
923 if self.reference_structures_dictionary=={}:
924 print(
"Cannot compute until you set a reference structure")
927 align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
928 align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
930 for c
in align_coordinates:
932 transformation = Ali.align()[1]
933 transformations.append(transformation)
934 for selection_name
in self.selection_dictionary:
935 reference_coordinates = self.reference_structures_dictionary[selection_name]
938 for n,sc
in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
941 distances.append(distance)
942 print(selection_name,
"average rmsd",sum(distances)/len(distances),
"median",self._get_median(distances),
"minimum distance",min(distances))
944 def _get_median(self,list_of_values):
945 return np.median(np.array(list_of_values))
948 """Compare the structure set to the reference structure.
949 @param structure_set_name The structure set to compute this on
950 @note First call set_reference_structure()
953 if self.reference_structures_dictionary=={}:
954 print(
"Cannot compute until you set a reference structure")
956 for selection_name
in self.selection_dictionary:
957 reference_coordinates = self.reference_structures_dictionary[selection_name]
960 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
962 if self.style==
'pairwise_drmsd_k':
964 if self.style==
'pairwise_drms_k':
966 if self.style==
'pairwise_drmsd_Q':
968 if self.style==
'pairwise_rmsd':
970 distances.append(distance)
972 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances),
'nframes',len(distances))
973 ret[selection_name] = {
'average_distance':sum(distances)/len(distances),
'minimum_distance':min(distances)}
976 def get_coordinates(self):
979 def set_precision_style(self, style):
980 if style
in self.styles:
983 raise ValueError(
"No such style")
986 """This object can be added to output_objects in (for example in the ReplicaExchange0 macro).
987 It will load a frame from an rmf file and compute the rmsd of a hierarchy with respect to that reference structure.
988 The output can be parsed from the stat.*.out files.
992 def __init__(self, hier, rmf_file_name, rmf_frame_index, rmf_state_index, label):
994 @param hier hierarchy of the structure
995 @param rmf_file_name path to the rmf file containing the reference structure
996 @param rmf_frame_index the index of the frame containing the reference structure in the rmf file
997 @param voxel rmf_state_index the index of the state containing the reference structure in the rmf file
998 @param label label that will be used when parsing the stat file
1001 rmf_fh = RMF.open_rmf_file_read_only(rmf_file_name)
1003 self.rmf_state = hh[0].get_children()[rmf_state_index]
1004 self.current_state = hier.get_children()[rmf_state_index]
1006 self.current_moldict = {}
1007 self.rmf_moldict = {}
1008 for mol
in self.rmf_state.get_children():
1009 name = mol.get_name()
1010 if name
in self.rmf_moldict:
1011 self.rmf_moldict[name].append(mol)
1013 self.rmf_moldict[name] = [mol]
1014 for mol
in self.current_state.get_children():
1015 name = mol.get_name()
1016 if name
in self.current_moldict:
1017 self.current_moldict[name].append(mol)
1019 self.current_moldict[name] = [mol]
1022 def get_output(self):
1025 for molname, rmf_mols
in self.rmf_moldict.iteritems():
1027 rmf_mols, representation_type=IMP.atom.BALLS)
1028 N = len(sel_rmf.get_selected_particle_indexes())
1031 for current_mols
in itertools.permutations(self.current_moldict[molname]):
1033 current_mols, representation_type=IMP.atom.BALLS)
1036 total_rmsd += m * m * N
1037 return {self.label:
"%e" % (self.math.sqrt(total_rmsd / total_N))}
1041 """Compute mean density maps from structures.
1043 Keeps a dictionary of density maps,
1044 keys are in the custom ranges. When you call add_subunits_density, it adds
1045 particle coordinates to the existing density maps.
1048 def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1050 @param custom_ranges Required. It's a dictionary, keys are the
1051 density component names, values are selection tuples
1052 e.g. {'kin28':[['kin28',1,-1]],
1053 'density_name_1' :[('ccl1')],
1054 'density_name_2' :[(1,142,'tfb3d1'),
1055 (143,700,'tfb3d2')],
1056 @param representation PMI representation, for doing selections.
1057 Not needed if you only pass hierarchies
1058 @param resolution The MRC resolution of the output map (in Angstrom unit)
1059 @param voxel The voxel size for the output map (lower is slower)
1062 self.representation = representation
1063 self.MRCresolution = resolution
1066 self.count_models = 0.0
1067 self.custom_ranges = custom_ranges
1070 """Add a frame to the densities.
1071 @param hierarchy Optionally read the hierarchy from somewhere.
1072 If not passed, will just read the representation.
1074 self.count_models += 1.0
1078 all_particles_by_resolution = []
1079 for name
in part_dict:
1080 all_particles_by_resolution += part_dict[name]
1082 for density_name
in self.custom_ranges:
1085 all_particles_by_segments = []
1087 for seg
in self.custom_ranges[density_name]:
1090 parts += IMP.tools.select_by_tuple(self.representation,
1091 seg, resolution=1, name_is_ambiguous=
False)
1095 for h
in hierarchy.get_children():
1099 if type(seg) == str:
1101 elif type(seg) == tuple:
1103 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1105 raise Exception(
'could not understand selection tuple '+str(seg))
1107 all_particles_by_segments += s.get_selected_particles()
1110 parts = all_particles_by_segments
1113 set(all_particles_by_segments) & set(all_particles_by_resolution))
1114 self._create_density_from_particles(parts, density_name)
1116 def normalize_density(self):
1119 def _create_density_from_particles(self, ps, name,
1120 kernel_type=
'GAUSSIAN'):
1121 '''Internal function for adding to densities.
1122 pass XYZR particles with mass and create a density from them.
1123 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1125 'GAUSSIAN': IMP.em.GAUSSIAN,
1126 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1127 'SPHERE': IMP.em.SPHERE}
1131 dmap.set_was_used(
True)
1132 if name
not in self.densities:
1133 self.densities[name] = dmap
1139 dmap3.set_was_used(
True)
1141 dmap3.add(self.densities[name])
1142 self.densities[name] = dmap3
1144 def get_density_keys(self):
1145 return list(self.densities.keys())
1148 """Get the current density for some component name"""
1149 if name
not in self.densities:
1152 return self.densities[name]
1154 def write_mrc(self, path="./"):
1155 for density_name
in self.densities:
1156 self.densities[density_name].
multiply(1. / self.count_models)
1158 self.densities[density_name],
1159 path +
"/" + density_name +
".mrc",
1163 class GetContactMap(object):
1166 self.distance = distance
1167 self.contactmap =
''
1174 def set_prot(self, prot):
1175 from scipy.spatial.distance
import cdist
1184 for name
in particles_dictionary:
1185 residue_indexes = []
1186 for p
in particles_dictionary[name]:
1190 if len(residue_indexes) != 0:
1191 self.protnames.append(name)
1193 def get_subunit_coords(self, frame, align=0):
1194 from scipy.spatial.distance
import cdist
1198 test, testr = [], []
1199 for part
in self.prot.get_children():
1202 for chl
in part.get_children():
1208 SortedSegments.append((chl, startres))
1209 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1211 for sgmnt
in SortedSegments:
1214 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1217 radii.append(p.get_radius())
1219 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1222 .get_residue_indexes()[0])
1223 namelist.append(new_name)
1224 self.expanded[new_name] = len(
1226 if part.get_name()
not in self.resmap:
1227 self.resmap[part.get_name()] = {}
1229 self.resmap[part.get_name()][res] = new_name
1231 coords = np.array(coords)
1232 radii = np.array(radii)
1233 if len(self.namelist) == 0:
1234 self.namelist = namelist
1235 self.contactmap = np.zeros((len(coords), len(coords)))
1236 distances = cdist(coords, coords)
1237 distances = (distances - radii).T - radii
1238 distances = distances <= self.distance
1239 self.contactmap += distances
1244 identification_string=
'ISDCrossLinkMS_Distance_'):
1247 data = open(filname)
1248 D = data.readlines()
1252 if identification_string
in d:
1259 t1, t2 = (d[0], d[1]), (d[1], d[0])
1260 if t1
not in self.XL:
1261 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1262 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1264 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1265 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1267 def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1271 L = sum(self.expanded.values())
1272 proteins = self.protnames
1277 proteins = [p.get_name()
for p
in self.prot.get_children()]
1279 for p1
in range(len(proteins)):
1280 for p2
in range(p1, len(proteins)):
1282 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1283 pn1, pn2 = proteins[p1], proteins[p2]
1284 mtr = np.zeros((pl1 + 1, pl2 + 1))
1285 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1286 for i1
in range(1, pl1 + 1):
1287 for i2
in range(1, pl2 + 1):
1289 r1 = K.index(self.resmap[pn1][i1])
1290 r2 = K.index(self.resmap[pn2][i2])
1292 mtr[i1 - 1, i2 - 1] = r
1294 missing.append((pn1, pn2, i1, i2))
1296 Matrices[(pn1, pn2)] = mtr
1301 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1304 for p1
in range(len(proteins)):
1305 for p2
in range(p1, len(proteins)):
1307 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1308 pn1, pn2 = proteins[p1], proteins[p2]
1309 mtr = np.zeros((pl1 + 1, pl2 + 1))
1312 xls = self.XL[(pn1, pn2)]
1315 xls = self.XL[(pn2, pn1)]
1320 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1321 for xl1, xl2
in xls:
1323 print(
'X' * 10, xl1, xl2)
1326 print(
'X' * 10, xl1, xl2)
1328 mtr[xl1 - 1, xl2 - 1] = 100
1330 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1331 for xl1, xl2
in xls:
1333 print(
'X' * 10, xl1, xl2)
1336 print(
'X' * 10, xl1, xl2)
1338 mtr[xl2 - 1, xl1 - 1] = 100
1340 print(
'No cross links between: ', pn1, pn2)
1341 Matrices_xl[(pn1, pn2)] = mtr
1357 for i, c
in enumerate(C):
1358 cl = max(self.resmap[c].keys())
1363 R.append(R[-1] + cl)
1368 import matplotlib
as mpl
1370 import matplotlib.pyplot
as plt
1371 import matplotlib.gridspec
as gridspec
1372 import scipy.sparse
as sparse
1375 gs = gridspec.GridSpec(len(W), len(W),
1380 for x1, r1
in enumerate(R):
1385 for x2, r2
in enumerate(R):
1391 ax = plt.subplot(gs[cnt])
1394 mtr = Matrices[(C[x1], C[x2])]
1396 mtr = Matrices[(C[x2], C[x1])].T
1400 interpolation=
'nearest',
1402 vmax=log(mtr.max()))
1407 mtr = Matrices_xl[(C[x1], C[x2])]
1409 mtr = Matrices_xl[(C[x2], C[x1])].T
1411 sparse.csr_matrix(mtr),
1421 ax.set_ylabel(C[x1], rotation=90)
1423 plt.savefig(outname +
".pdf", dpi=300, transparent=
"False")
1431 def get_hiers_from_rmf(model, frame_number, rmf_file):
1433 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1436 rh = RMF.open_rmf_file_read_only(rmf_file)
1441 print(
"Unable to open rmf file %s" % (rmf_file))
1448 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1455 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1456 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1457 rh = RMF.open_rmf_file_read_only(rmf_file)
1462 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1469 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1471 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1474 rh = RMF.open_rmf_file_read_only(rmf_file)
1480 print(
"Unable to open rmf file %s" % (rmf_file))
1485 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1492 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1493 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1494 rh = RMF.open_rmf_file_read_only(rmf_file)
1500 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1507 def get_hiers_from_rmf(model, frame_number, rmf_file):
1508 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1511 rh = RMF.open_rmf_file_read_only(rmf_file)
1516 print(
"Unable to open rmf file %s" % (rmf_file))
1523 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1531 """Get particles at res 1, or any beads, based on the name.
1532 No Representation is needed. This is mainly used when the hierarchy
1533 is read from an RMF file.
1534 @return a dictionary of component names and their particles
1535 \note If the root node is named "System" or is a "State", do proper selection.
1541 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1543 particle_dict[mol.get_name()] = sel.get_selected_particles()
1546 for c
in prot.get_children():
1549 for s
in c.get_children():
1550 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1552 if "Beads" in s.get_name():
1556 for name
in particle_dict:
1557 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1558 list(set(particle_dict[name]) & set(allparticles)))
1559 return particle_dict
1562 """Get particles at res 10, or any beads, based on the name.
1563 No Representation is needed.
1564 This is mainly used when the hierarchy is read from an RMF file.
1565 @return a dictionary of component names and their particles
1566 \note If the root node is named "System" or is a "State", do proper selection.
1571 for mol
in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1573 particle_dict[mol.get_name()] = sel.get_selected_particles()
1576 for c
in prot.get_children():
1579 for s
in c.get_children():
1580 if "_Res:10" in s.get_name():
1582 if "Beads" in s.get_name():
1585 for name
in particle_dict:
1586 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1587 list(set(particle_dict[name]) & set(allparticles)))
1588 return particle_dict
1590 def select_by_tuple(first_res_last_res_name_tuple):
1591 first_res = first_res_last_res_hier_tuple[0]
1592 last_res = first_res_last_res_hier_tuple[1]
1593 name = first_res_last_res_hier_tuple[2]
1596 """Visualization of crosslinks"""
1598 self.crosslinks = []
1599 self.external_csv_data =
None
1600 self.crosslinkedprots = set()
1601 self.mindist = +10000000.0
1602 self.maxdist = -10000000.0
1603 self.contactmap =
None
1605 def set_hierarchy(self, prot):
1606 self.prot_length_dict = {}
1607 self.model=prot.get_model()
1609 for i
in prot.get_children():
1611 residue_indexes = []
1615 if len(residue_indexes) != 0:
1616 self.prot_length_dict[name] = max(residue_indexes)
1618 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1619 from scipy.spatial.distance
import cdist
1621 rh= RMF.open_rmf_file_read_only(rmf_name)
1624 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1635 self.index_dictionary = {}
1637 for name
in particles_dictionary:
1638 residue_indexes = []
1639 for p
in particles_dictionary[name]:
1644 if len(residue_indexes) != 0:
1646 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1649 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1651 radii.append(d.get_radius())
1652 if name
not in self.index_dictionary:
1653 self.index_dictionary[name] = [resindex]
1655 self.index_dictionary[name].append(resindex)
1658 coords = np.array(coords)
1659 radii = np.array(radii)
1661 distances = cdist(coords, coords)
1662 distances = (distances - radii).T - radii
1664 distances = np.where(distances <= 20.0, 1.0, 0)
1665 if self.contactmap
is None:
1666 self.contactmap = np.zeros((len(coords), len(coords)))
1667 self.contactmap += distances
1669 for prot
in prots: IMP.atom.destroy(prot)
1672 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1675 filter_rmf_file_names=
None,
1676 external_csv_data_file=
None,
1677 external_csv_data_file_unique_id_key=
"Unique ID"):
1686 keys = po.get_keys()
1688 xl_keys = [k
for k
in keys
if search_label
in k]
1690 if filter_rmf_file_names
is not None:
1691 rmf_file_key=
"local_rmf_file_name"
1692 fs = po.get_fields(xl_keys+[rmf_file_key])
1694 fs = po.get_fields(xl_keys)
1697 self.cross_link_frequency = {}
1701 self.cross_link_distances = {}
1705 self.cross_link_distances_unique = {}
1707 if not external_csv_data_file
is None:
1710 self.external_csv_data = {}
1711 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1714 self.external_csv_data[
1715 xl[external_csv_data_file_unique_id_key]] = xl
1720 cross_link_frequency_list = []
1722 self.unique_cross_link_list = []
1726 keysplit = key.replace(
1735 if filter_label!=
None:
1736 if filter_label
not in keysplit:
continue
1739 r1 = int(keysplit[5])
1741 r2 = int(keysplit[7])
1744 confidence = keysplit[12]
1748 unique_identifier = keysplit[3]
1750 unique_identifier =
'0'
1752 r1 = int(keysplit[mapping[
"Residue1"]])
1753 c1 = keysplit[mapping[
"Protein1"]]
1754 r2 = int(keysplit[mapping[
"Residue2"]])
1755 c2 = keysplit[mapping[
"Protein2"]]
1757 confidence = keysplit[mapping[
"Confidence"]]
1761 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1763 unique_identifier =
'0'
1765 self.crosslinkedprots.add(c1)
1766 self.crosslinkedprots.add(c2)
1772 if filter_rmf_file_names
is not None:
1773 for n,d
in enumerate(fs[key]):
1774 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1775 dists.append(float(d))
1777 dists=[float(f)
for f
in fs[key]]
1782 mdist = self.median(dists)
1784 stdv = np.std(np.array(dists))
1785 if self.mindist > mdist:
1786 self.mindist = mdist
1787 if self.maxdist < mdist:
1788 self.maxdist = mdist
1792 if not self.external_csv_data
is None:
1793 sample = self.external_csv_data[unique_identifier][
"Sample"]
1797 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1798 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1799 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1800 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1802 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1803 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1804 cross_link_frequency_list.append((r1, c1, r2, c2))
1805 cross_link_frequency_list.append((r2, c2, r1, c1))
1806 self.unique_cross_link_list.append(
1807 (r1, c1, r2, c2,mdist))
1809 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1810 self.cross_link_distances[(
1816 confidence)] = dists
1817 self.cross_link_distances[(
1823 confidence)] = dists
1824 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1826 self.cross_link_distances[(
1832 confidence)] += dists
1833 self.cross_link_distances[(
1839 confidence)] += dists
1841 self.crosslinks.append(
1851 self.crosslinks.append(
1862 self.cross_link_frequency_inverted = {}
1863 for xl
in self.unique_cross_link_list:
1864 (r1, c1, r2, c2, mdist) = xl
1865 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1866 if frequency
not in self.cross_link_frequency_inverted:
1867 self.cross_link_frequency_inverted[
1868 frequency] = [(r1, c1, r2, c2)]
1870 self.cross_link_frequency_inverted[
1871 frequency].append((r1, c1, r2, c2))
1875 def median(self, mylist):
1876 sorts = sorted(mylist)
1882 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1883 return sorts[length / 2]
1885 def set_threshold(self,threshold):
1886 self.threshold=threshold
1888 def set_tolerance(self,tolerance):
1889 self.tolerance=tolerance
1891 def colormap(self, dist):
1892 if dist < self.threshold - self.tolerance:
1894 elif dist >= self.threshold + self.tolerance:
1899 def write_cross_link_database(self, filename, format='csv'):
1903 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1904 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1906 if not self.external_csv_data
is None:
1907 keys = list(self.external_csv_data.keys())
1908 innerkeys = list(self.external_csv_data[keys[0]].keys())
1910 fieldnames += innerkeys
1912 dw = csv.DictWriter(
1916 fieldnames=fieldnames)
1918 for xl
in self.crosslinks:
1919 (r1, c1, r2, c2, mdist, stdv, confidence,
1920 unique_identifier, descriptor) = xl
1921 if descriptor ==
'original':
1923 outdict[
"Unique ID"] = unique_identifier
1924 outdict[
"Protein1"] = c1
1925 outdict[
"Protein2"] = c2
1926 outdict[
"Residue1"] = r1
1927 outdict[
"Residue2"] = r2
1928 outdict[
"Median Distance"] = mdist
1929 outdict[
"Standard Deviation"] = stdv
1930 outdict[
"Confidence"] = confidence
1931 outdict[
"Frequency"] = self.cross_link_frequency[
1934 arrangement =
"Intra"
1936 arrangement =
"Inter"
1937 outdict[
"Arrangement"] = arrangement
1938 if not self.external_csv_data
is None:
1939 outdict.update(self.external_csv_data[unique_identifier])
1941 dw.writerow(outdict)
1943 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1944 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
1945 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
1946 gap_between_components=0,
1947 rename_protein_map=
None):
1961 import matplotlib
as mpl
1963 import matplotlib.pyplot
as plt
1964 import matplotlib.cm
as cm
1966 fig = plt.figure(figsize=(10, 10))
1967 ax = fig.add_subplot(111)
1973 if prot_listx
is None:
1975 prot_listx = list(self.crosslinkedprots)
1977 prot_listx = list(self.prot_length_dict.keys())
1980 nresx = gap_between_components + \
1981 sum([self.prot_length_dict[name]
1982 + gap_between_components
for name
in prot_listx])
1986 if prot_listy
is None:
1988 prot_listy = list(self.crosslinkedprots)
1990 prot_listy = list(self.prot_length_dict.keys())
1993 nresy = gap_between_components + \
1994 sum([self.prot_length_dict[name]
1995 + gap_between_components
for name
in prot_listy])
2000 res = gap_between_components
2001 for prot
in prot_listx:
2002 resoffsetx[prot] = res
2003 res += self.prot_length_dict[prot]
2005 res += gap_between_components
2009 res = gap_between_components
2010 for prot
in prot_listy:
2011 resoffsety[prot] = res
2012 res += self.prot_length_dict[prot]
2014 res += gap_between_components
2016 resoffsetdiagonal = {}
2017 res = gap_between_components
2018 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2019 resoffsetdiagonal[prot] = res
2020 res += self.prot_length_dict[prot]
2021 res += gap_between_components
2027 for n, prot
in enumerate(prot_listx):
2028 res = resoffsetx[prot]
2030 for proty
in prot_listy:
2031 resy = resoffsety[proty]
2032 endy = resendy[proty]
2033 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
2034 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
2035 xticks.append((float(res) + float(end)) / 2)
2036 if rename_protein_map
is not None:
2037 if prot
in rename_protein_map:
2038 prot=rename_protein_map[prot]
2039 xlabels.append(prot)
2043 for n, prot
in enumerate(prot_listy):
2044 res = resoffsety[prot]
2046 for protx
in prot_listx:
2047 resx = resoffsetx[protx]
2048 endx = resendx[protx]
2049 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
2050 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
2051 yticks.append((float(res) + float(end)) / 2)
2052 if rename_protein_map
is not None:
2053 if prot
in rename_protein_map:
2054 prot=rename_protein_map[prot]
2055 ylabels.append(prot)
2058 print(prot_listx, prot_listy)
2060 if not self.contactmap
is None:
2061 import matplotlib.cm
as cm
2062 tmp_array = np.zeros((nresx, nresy))
2064 for px
in prot_listx:
2066 for py
in prot_listy:
2068 resx = resoffsety[px]
2069 lengx = resendx[px] - 1
2070 resy = resoffsety[py]
2071 lengy = resendy[py] - 1
2072 indexes_x = self.index_dictionary[px]
2073 minx = min(indexes_x)
2074 maxx = max(indexes_x)
2075 indexes_y = self.index_dictionary[py]
2076 miny = min(indexes_y)
2077 maxy = max(indexes_y)
2079 print(px, py, minx, maxx, miny, maxy)
2084 resy:lengy] = self.contactmap[
2091 ax.imshow(tmp_array,
2094 interpolation=
'nearest')
2096 ax.set_xticks(xticks)
2097 ax.set_xticklabels(xlabels, rotation=90)
2098 ax.set_yticks(yticks)
2099 ax.set_yticklabels(ylabels)
2100 ax.set_xlim(0,nresx)
2101 ax.set_ylim(0,nresy)
2106 already_added_xls = []
2108 for xl
in self.crosslinks:
2110 (r1, c1, r2, c2, mdist, stdv, confidence,
2111 unique_identifier, descriptor) = xl
2113 if confidence_classes
is not None:
2114 if confidence
not in confidence_classes:
2118 pos1 = r1 + resoffsetx[c1]
2122 pos2 = r2 + resoffsety[c2]
2126 if not filter
is None:
2127 xldb = self.external_csv_data[unique_identifier]
2128 xldb.update({
"Distance": mdist})
2129 xldb.update({
"Distance_stdv": stdv})
2131 if filter[1] ==
">":
2132 if float(xldb[filter[0]]) <= float(filter[2]):
2135 if filter[1] ==
"<":
2136 if float(xldb[filter[0]]) >= float(filter[2]):
2139 if filter[1] ==
"==":
2140 if float(xldb[filter[0]]) != float(filter[2]):
2146 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2147 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2149 if layout ==
'lowerdiagonal':
2150 if pos_for_diagonal1 <= pos_for_diagonal2:
2152 if layout ==
'upperdiagonal':
2153 if pos_for_diagonal1 >= pos_for_diagonal2:
2156 already_added_xls.append((r1, c1, r2, c2))
2158 if not no_confidence_info:
2159 if confidence ==
'0.01':
2160 markersize = 14 * scale_symbol_size
2161 elif confidence ==
'0.05':
2162 markersize = 9 * scale_symbol_size
2163 elif confidence ==
'0.1':
2164 markersize = 6 * scale_symbol_size
2166 markersize = 15 * scale_symbol_size
2168 markersize = 5 * scale_symbol_size
2170 if not no_dist_info:
2171 color = self.colormap(mdist)
2181 markersize=markersize)
2185 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2187 [i.set_linewidth(2.0)
for i
in ax.spines.values()]
2192 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2196 def get_frequency_statistics(self, prot_list,
2199 violated_histogram = {}
2200 satisfied_histogram = {}
2201 unique_cross_links = []
2203 for xl
in self.unique_cross_link_list:
2204 (r1, c1, r2, c2, mdist) = xl
2207 if prot_list2
is None:
2208 if not c1
in prot_list:
2210 if not c2
in prot_list:
2213 if c1
in prot_list
and c2
in prot_list2:
2215 elif c1
in prot_list2
and c2
in prot_list:
2220 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2222 if (r1, c1, r2, c2)
not in unique_cross_links:
2224 if frequency
not in violated_histogram:
2225 violated_histogram[frequency] = 1
2227 violated_histogram[frequency] += 1
2229 if frequency
not in satisfied_histogram:
2230 satisfied_histogram[frequency] = 1
2232 satisfied_histogram[frequency] += 1
2233 unique_cross_links.append((r1, c1, r2, c2))
2234 unique_cross_links.append((r2, c2, r1, c1))
2236 print(
"# satisfied")
2238 total_number_of_crosslinks=0
2240 for i
in satisfied_histogram:
2244 if i
in violated_histogram:
2245 print(i, violated_histogram[i]+satisfied_histogram[i])
2247 print(i, satisfied_histogram[i])
2248 total_number_of_crosslinks+=i*satisfied_histogram[i]
2252 for i
in violated_histogram:
2253 print(i, violated_histogram[i])
2254 total_number_of_crosslinks+=i*violated_histogram[i]
2256 print(total_number_of_crosslinks)
2260 def print_cross_link_binary_symbols(self, prot_list,
2263 confidence_list = []
2264 for xl
in self.crosslinks:
2265 (r1, c1, r2, c2, mdist, stdv, confidence,
2266 unique_identifier, descriptor) = xl
2268 if prot_list2
is None:
2269 if not c1
in prot_list:
2271 if not c2
in prot_list:
2274 if c1
in prot_list
and c2
in prot_list2:
2276 elif c1
in prot_list2
and c2
in prot_list:
2281 if descriptor !=
"original":
2284 confidence_list.append(confidence)
2286 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2287 tmp_dist_binary = []
2290 tmp_dist_binary.append(1)
2292 tmp_dist_binary.append(0)
2293 tmp_matrix.append(tmp_dist_binary)
2295 matrix = list(zip(*tmp_matrix))
2297 satisfied_high_sum = 0
2298 satisfied_mid_sum = 0
2299 satisfied_low_sum = 0
2300 total_satisfied_sum = 0
2301 for k, m
in enumerate(matrix):
2310 for n, b
in enumerate(m):
2311 if confidence_list[n] ==
"0.01":
2315 satisfied_high_sum += 1
2316 elif confidence_list[n] ==
"0.05":
2320 satisfied_mid_sum += 1
2321 elif confidence_list[n] ==
"0.1":
2325 satisfied_low_sum += 1
2327 total_satisfied += 1
2328 total_satisfied_sum += 1
2330 print(k, satisfied_high, total_high)
2331 print(k, satisfied_mid, total_mid)
2332 print(k, satisfied_low, total_low)
2333 print(k, total_satisfied, total)
2334 print(float(satisfied_high_sum) / len(matrix))
2335 print(float(satisfied_mid_sum) / len(matrix))
2336 print(float(satisfied_low_sum) / len(matrix))
2339 def get_unique_crosslinks_statistics(self, prot_list,
2352 satisfied_string = []
2353 for xl
in self.crosslinks:
2354 (r1, c1, r2, c2, mdist, stdv, confidence,
2355 unique_identifier, descriptor) = xl
2357 if prot_list2
is None:
2358 if not c1
in prot_list:
2360 if not c2
in prot_list:
2363 if c1
in prot_list
and c2
in prot_list2:
2365 elif c1
in prot_list2
and c2
in prot_list:
2370 if descriptor !=
"original":
2374 if confidence ==
"0.01":
2378 if confidence ==
"0.05":
2382 if confidence ==
"0.1":
2387 satisfied_string.append(1)
2389 satisfied_string.append(0)
2391 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2392 tmp_dist_binary = []
2395 tmp_dist_binary.append(1)
2397 tmp_dist_binary.append(0)
2398 tmp_matrix.append(tmp_dist_binary)
2400 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2401 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2402 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2403 print(
"total", total)
2405 matrix = list(zip(*tmp_matrix))
2406 satisfied_models = 0
2408 for b
in satisfied_string:
2415 all_satisfied =
True
2417 for n, b
in enumerate(m):
2422 if b == 1
and satisfied_string[n] == 1:
2424 elif b == 1
and satisfied_string[n] == 0:
2426 elif b == 0
and satisfied_string[n] == 0:
2428 elif b == 0
and satisfied_string[n] == 1:
2429 all_satisfied =
False
2431 satisfied_models += 1
2433 print(satstr, all_satisfied)
2434 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2436 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2439 import matplotlib
as mpl
2441 import matplotlib.pylab
as pl
2444 for kw
in self.cross_link_distances_unique:
2445 (r1, c1, r2, c2) = kw
2446 dists = self.cross_link_distances_unique[kw]
2448 if prot_list2
is None:
2449 if not c1
in prot_list:
2451 if not c2
in prot_list:
2454 if c1
in prot_list
and c2
in prot_list2:
2456 elif c1
in prot_list2
and c2
in prot_list:
2461 dists.append(sum(dists))
2462 tmp_matrix.append(dists)
2464 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2467 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2469 for i
in range(len(tmp_matrix)):
2470 for k
in range(len(tmp_matrix[i]) - 1):
2471 matrix[i][k] = tmp_matrix[i][k]
2476 ax = fig.add_subplot(211)
2478 cax = ax.imshow(matrix, interpolation=
'nearest')
2482 pl.savefig(figurename, dpi=300)
2491 arrangement=
"inter",
2492 confidence_input=
"None"):
2495 for xl
in self.cross_link_distances:
2496 (r1, c1, r2, c2, mdist, confidence) = xl
2497 if c1
in prots1
and c2
in prots2:
2498 if arrangement ==
"inter" and c1 == c2:
2500 if arrangement ==
"intra" and c1 != c2:
2502 if confidence_input == confidence:
2503 label = str(c1) +
":" + str(r1) + \
2504 "-" + str(c2) +
":" + str(r2)
2505 values = self.cross_link_distances[xl]
2506 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2507 data.append((label, values, mdist, frequency))
2509 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2510 sort_by_dist = list(zip(*sort_by_dist))
2511 values = sort_by_dist[1]
2512 positions = list(range(len(values)))
2513 labels = sort_by_dist[0]
2514 frequencies = list(map(float, sort_by_dist[3]))
2515 frequencies = [f * 10.0
for f
in frequencies]
2517 nchunks = int(float(len(values)) / nxl_per_row)
2518 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2519 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2522 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2525 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2527 for n, v
in enumerate(values_chunks):
2528 p = positions_chunks[n]
2529 f = frequencies_chunks[n]
2530 l = labels_chunks[n]
2532 filename +
"." + str(n), v, p, f,
2533 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2535 def crosslink_distance_histogram(self, filename,
2538 confidence_classes=
None,
2544 if prot_list
is None:
2545 prot_list = list(self.prot_length_dict.keys())
2548 for xl
in self.crosslinks:
2549 (r1, c1, r2, c2, mdist, stdv, confidence,
2550 unique_identifier, descriptor) = xl
2552 if not confidence_classes
is None:
2553 if confidence
not in confidence_classes:
2556 if prot_list2
is None:
2557 if not c1
in prot_list:
2559 if not c2
in prot_list:
2562 if c1
in prot_list
and c2
in prot_list2:
2564 elif c1
in prot_list2
and c2
in prot_list:
2569 distances.append(mdist)
2572 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2573 bins=bins, color=color,
2575 reference_xline=35.0,
2576 yplotrange=yplotrange, normalized=normalized)
2578 def scatter_plot_xl_features(self, filename,
2584 reference_ylines=
None,
2585 distance_color=
True,
2587 import matplotlib
as mpl
2589 import matplotlib.pyplot
as plt
2590 import matplotlib.cm
as cm
2592 fig = plt.figure(figsize=(10, 10))
2593 ax = fig.add_subplot(111)
2595 for xl
in self.crosslinks:
2596 (r1, c1, r2, c2, mdist, stdv, confidence,
2597 unique_identifier, arrangement) = xl
2599 if prot_list2
is None:
2600 if not c1
in prot_list:
2602 if not c2
in prot_list:
2605 if c1
in prot_list
and c2
in prot_list2:
2607 elif c1
in prot_list2
and c2
in prot_list:
2612 xldb = self.external_csv_data[unique_identifier]
2613 xldb.update({
"Distance": mdist})
2614 xldb.update({
"Distance_stdv": stdv})
2616 xvalue = float(xldb[feature1])
2617 yvalue = float(xldb[feature2])
2620 color = self.colormap(mdist)
2624 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2626 if not yplotrange
is None:
2627 ax.set_ylim(yplotrange)
2628 if not reference_ylines
is None:
2629 for rl
in reference_ylines:
2630 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2633 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Visualization of crosslinks.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
static Molecule setup_particle(Model *m, ParticleIndex pi)
Class for sampling a density map from particles.
double get_rmsd(const Selection &s0, const Selection &s1)
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
def do_cluster
Run K-means clustering.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
This object can be added to output_objects in (for example in the ReplicaExchange0 macro)...
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.