3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
38 self.template = template
41 if len(self.query.keys()) != len(self.template.keys()):
42 raise ValueError(
'''the number of proteins
43 in template and query does not match!''')
47 self.proteins = sorted(self.query.keys())
48 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
51 np = prots_uniq.count(p)
52 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
53 prmts = list(itertools.permutations(copies, len(copies)))
56 self.Product = list(itertools.product(*P.values()))
64 torder = sum([list(i)
for i
in self.Product[0]], [])
67 if self.weights
is not None:
68 weights += [i
for i
in self.weights[t]]
71 self.rmsd = 10000000000.
72 for comb
in self.Product:
76 order = sum([list(i)
for i
in comb], [])
86 if self.weights
is not None:
97 from scipy.spatial.distance
import cdist
102 torder = sum([list(i)
for i
in self.Product[0]], [])
107 self.rmsd, Transformation = 10000000000.,
''
108 for comb
in self.Product:
109 order = sum([list(i)
for i
in comb], [])
115 if len(template_xyz) != len(query_xyz):
116 raise ValueError(
'''the number of coordinates
117 in template and query does not match!''')
122 query_xyz_tr = [transformation.get_transformed(n)
126 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
129 Transformation = transformation
131 return (self.rmsd, Transformation)
136 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
137 'a..2':np.array([np.array([1.,1.,])]),
138 'a..3':np.array([np.array([-2.,1.])]),
139 'b':np.array([np.array([0.,-1.])]),
140 'c..1':np.array([np.array([-1.,-1.])]),
141 'c..2':np.array([np.array([1.,-1.])]),
142 'd':np.array([np.array([0.,0.])]),
143 'e':np.array([np.array([0.,1.])])}
145 Ali = Alignment(Proteins, Proteins)
147 if Ali.get_rmsd() == 0.0: print 'successful test!'
148 else: print 'ERROR!'; exit()
153 class Violations(object):
157 self.violation_thresholds = {}
158 self.violation_counts = {}
160 data = open(filename)
165 d = d.strip().split()
166 self.violation_thresholds[d[0]] = float(d[1])
168 def get_number_violated_restraints(self, rsts_dict):
170 for rst
in self.violation_thresholds:
171 if rst
not in rsts_dict:
173 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
175 if rst
not in self.violation_counts:
176 self.violation_counts[rst] = 1
178 self.violation_counts[rst] += 1
184 """A class to cluster structures.
185 Uses scipy's cdist function to compute distance matrices
186 and sklearn's kmeans clustering module.
190 @param rmsd_weights Flat list of weights for each particle
194 from mpi4py
import MPI
195 self.comm = MPI.COMM_WORLD
196 self.rank = self.comm.Get_rank()
197 self.number_of_processes = self.comm.size
199 self.number_of_processes = 1
202 self.structure_cluster_ids =
None
203 self.tmpl_coords =
None
204 self.rmsd_weights=rmsd_weights
206 def set_template(self, part_coords):
208 self.tmpl_coords = part_coords
211 """Add coordinates for a single model."""
213 self.all_coords[frame] = Coords
215 def dist_matrix(self):
217 self.model_list_names = list(self.all_coords.keys())
218 self.model_indexes = list(range(len(self.model_list_names)))
219 self.model_indexes_dict = dict(
220 list(zip(self.model_list_names, self.model_indexes)))
221 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
223 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
224 model_indexes_unique_pairs,
225 self.number_of_processes)[self.rank]
227 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
229 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
231 my_model_indexes_unique_pairs)
233 if self.number_of_processes > 1:
236 pickable_transformations = self.get_pickable_transformation_distance_dict(
239 pickable_transformations)
240 self.set_transformation_distance_dict_from_pickable(
241 pickable_transformations)
243 self.raw_distance_matrix = np.zeros(
244 (len(self.model_list_names), len(self.model_list_names)))
245 for item
in raw_distance_dict:
247 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
248 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
250 def get_dist_matrix(self):
251 return self.raw_distance_matrix
254 """Run K-means clustering
255 @param number_of_clusters Num means
256 @param seed the random seed
258 from sklearn.cluster
import KMeans
263 kmeans = KMeans(n_clusters=number_of_clusters)
266 kmeans = KMeans(k=number_of_clusters)
267 kmeans.fit_predict(self.raw_distance_matrix)
269 self.structure_cluster_ids = kmeans.labels_
271 def get_pickable_transformation_distance_dict(self):
272 pickable_transformations = {}
273 for label
in self.transformation_distance_dict:
274 tr = self.transformation_distance_dict[label]
275 trans = tuple(tr.get_translation())
276 rot = tuple(tr.get_rotation().get_quaternion())
277 pickable_transformations[label] = (rot, trans)
278 return pickable_transformations
280 def set_transformation_distance_dict_from_pickable(
282 pickable_transformations):
283 self.transformation_distance_dict = {}
284 for label
in pickable_transformations:
285 tr = pickable_transformations[label]
288 self.transformation_distance_dict[
291 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
293 outf = open(file_name +
".data",
'wb')
299 pickable_transformations = self.get_pickable_transformation_distance_dict(
302 (self.structure_cluster_ids,
303 self.model_list_names,
304 pickable_transformations),
307 np.save(file_name +
".npy", self.raw_distance_matrix)
309 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
312 inputf = open(file_name +
".data",
'rb')
313 (self.structure_cluster_ids, self.model_list_names,
314 pickable_transformations) = pickle.load(inputf)
317 self.raw_distance_matrix = np.load(file_name +
".npy")
319 self.set_transformation_distance_dict_from_pickable(
320 pickable_transformations)
321 self.model_indexes = list(range(len(self.model_list_names)))
322 self.model_indexes_dict = dict(
323 list(zip(self.model_list_names, self.model_indexes)))
325 def plot_matrix(self, figurename="clustermatrix.pdf"):
327 from scipy.cluster
import hierarchy
as hrc
329 fig = pl.figure(figsize=(10,8))
330 ax = fig.add_subplot(212)
331 dendrogram = hrc.dendrogram(
332 hrc.linkage(self.raw_distance_matrix),
335 leaves_order = dendrogram[
'leaves']
336 ax.set_xlabel(
'Model')
337 ax.set_ylabel(
'RMSD [Angstroms]')
339 ax2 = fig.add_subplot(221)
341 self.raw_distance_matrix[leaves_order,
344 interpolation=
'nearest')
345 cb = fig.colorbar(cax)
346 cb.set_label(
'RMSD [Angstroms]')
347 ax2.set_xlabel(
'Model')
348 ax2.set_ylabel(
'Model')
350 pl.savefig(figurename, dpi=300)
353 def get_model_index_from_name(self, name):
354 return self.model_indexes_dict[name]
356 def get_cluster_labels(self):
358 return list(set(self.structure_cluster_ids))
360 def get_number_of_clusters(self):
361 return len(self.get_cluster_labels())
363 def get_cluster_label_indexes(self, label):
365 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
368 def get_cluster_label_names(self, label):
370 [self.model_list_names[i]
371 for i
in self.get_cluster_label_indexes(label)]
374 def get_cluster_label_average_rmsd(self, label):
376 indexes = self.get_cluster_label_indexes(label)
379 sub_distance_matrix = self.raw_distance_matrix[
380 indexes, :][:, indexes]
381 average_rmsd = np.sum(sub_distance_matrix) / \
382 (len(sub_distance_matrix)
383 ** 2 - len(sub_distance_matrix))
388 def get_cluster_label_size(self, label):
389 return len(self.get_cluster_label_indexes(label))
391 def get_transformation_to_first_member(
395 reference = self.get_cluster_label_indexes(cluster_label)[0]
396 return self.transformation_distance_dict[(reference, structure_index)]
398 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
400 model_list_names = list(all_coords.keys())
401 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
402 raw_distance_dict = {}
403 transformation_distance_dict = {}
404 if template_coords
is None:
408 alignment_template_protein_names = list(template_coords.keys())
410 for (f1, f2)
in list_of_pairs:
418 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
419 for pr
in rmsd_protein_names])
421 for pr
in rmsd_protein_names:
422 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
424 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
425 rmsd = Ali.get_rmsd()
431 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432 for pr
in alignment_template_protein_names])
433 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
434 for pr
in alignment_template_protein_names])
437 template_rmsd, transformation = Ali.align()
442 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
443 for pr
in rmsd_protein_names])
445 for pr
in rmsd_protein_names:
446 coords_f2[pr] = [transformation.get_transformed(
447 i)
for i
in all_coords[model_list_names[f2]][pr]]
449 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
450 rmsd = Ali.get_rmsd()
452 raw_distance_dict[(f1, f2)] = rmsd
453 raw_distance_dict[(f2, f1)] = rmsd
454 transformation_distance_dict[(f1, f2)] = transformation
455 transformation_distance_dict[(f2, f1)] = transformation
457 return raw_distance_dict, transformation_distance_dict
461 """A class to evaluate the precision of an ensemble.
463 Also can evaluate the cross-precision of multiple ensembles.
464 Supports MPI for coordinate reading.
465 Recommended procedure:
466 -# initialize object and pass the selection for evaluating precision
467 -# call add_structures() to read in the data (specify group name)
468 -# call get_precision() to evaluate inter/intra precision
469 -# call get_rmsf() to evaluate within-group fluctuations
473 selection_dictionary={}):
475 @param model The IMP Model
476 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
477 part of the hier name)
478 @param selection_dictionary Dictionary where keys are names for
479 selections and values are selection tuples for scoring
480 precision. "All" is automatically made as well
483 from mpi4py
import MPI
484 self.comm = MPI.COMM_WORLD
485 self.rank = self.comm.Get_rank()
486 self.number_of_processes = self.comm.size
488 self.number_of_processes=1
491 self.styles=[
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
492 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
493 self.style=
'pairwise_drmsd_k'
494 self.structures_dictionary={}
495 self.reference_structures_dictionary={}
497 self.protein_names=
None
498 self.len_particles_resolution_one=
None
500 self.rmf_names_frames={}
501 self.reference_rmf_names_frames=
None
502 self.reference_structure=
None
503 self.reference_prot=
None
504 self.selection_dictionary=selection_dictionary
506 self.residue_particle_index_map=
None
507 if resolution
in [1,10]:
508 self.resolution=resolution
510 raise KeyError(
"no such resolution")
512 def _get_structure(self,rmf_frame_index,rmf_name):
513 """Read an RMF file and return the particles"""
514 rh= RMF.open_rmf_file_read_only(rmf_name)
517 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
520 if self.resolution==1:
522 elif self.resolution==10:
525 protein_names=list(particle_dict.keys())
526 particles_resolution_one=[]
527 for k
in particle_dict:
528 particles_resolution_one+=(particle_dict[k])
530 if self.protein_names==
None:
531 self.protein_names=protein_names
533 if self.protein_names!=protein_names:
534 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
536 if self.len_particles_resolution_one==
None:
537 self.len_particles_resolution_one=len(particles_resolution_one)
539 if self.len_particles_resolution_one!=len(particles_resolution_one):
540 raise ValueError(
"the new coordinate set is not compatible with the previous one")
542 return particles_resolution_one,prots
548 setup_index_map=
False):
549 """ Read a structure into the ensemble and store (as coordinates).
550 @param rmf_name The name of the RMF file
551 @param rmf_frame_index The frame to read
552 @param structure_set_name Name for the set that includes this structure
557 if structure_set_name
in self.structures_dictionary:
558 cdict=self.structures_dictionary[structure_set_name]
559 rmflist=self.rmf_names_frames[structure_set_name]
561 self.structures_dictionary[structure_set_name]={}
562 self.rmf_names_frames[structure_set_name]=[]
563 cdict=self.structures_dictionary[structure_set_name]
564 rmflist=self.rmf_names_frames[structure_set_name]
568 (particles_resolution_one, prots)=self._get_structure(rmf_frame_index,rmf_name)
570 print(
"something wrong with the rmf")
573 self.selection_dictionary.update({
"All":self.protein_names})
575 for selection_name
in self.selection_dictionary:
576 selection_tuple=self.selection_dictionary[selection_name]
577 coords=self._select_coordinates(selection_tuple,particles_resolution_one,prots[0])
578 if selection_name
not in cdict:
579 cdict[selection_name]=[coords]
581 cdict[selection_name].append(coords)
583 rmflist.append((rmf_name,rmf_frame_index))
587 self.residue_particle_index_map={}
588 for prot_name
in self.protein_names:
589 self.residue_particle_index_map[prot_name] = \
590 self._get_residue_particle_index_map(
592 particles_resolution_one,prots[0])
597 rmf_name_frame_tuples,
599 """Read a list of RMFs, supports parallel
600 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
601 @param structure_set_name Name this set of structures (e.g. "cluster.1")
605 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
606 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
607 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
609 rmf_frame_index=tup[1]
611 if self.residue_particle_index_map
is None:
614 setup_index_map=
False
621 if self.number_of_processes > 1:
624 self.comm.send(self.structures_dictionary, dest=0, tag=11)
626 for i
in range(1, self.number_of_processes):
627 data_tmp = self.comm.recv(source=i, tag=11)
628 for key
in self.structures_dictionary:
629 self.structures_dictionary[key].update(data_tmp[key])
630 for i
in range(1, self.number_of_processes):
631 self.comm.send(self.structures_dictionary, dest=i, tag=11)
633 self.structures_dictionary = self.comm.recv(source=0, tag=11)
635 def _get_residue_particle_index_map(self,prot_name,structure,hier):
636 residue_particle_index_map=[]
638 all_selected_particles=s.get_selected_particles()
639 intersection=list(set(all_selected_particles) & set(structure))
640 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
641 for p
in sorted_intersection:
643 return residue_particle_index_map
646 def _select_coordinates(self,tuple_selections,structure,prot):
647 selected_coordinates=[]
648 for t
in tuple_selections:
649 if type(t)==tuple
and len(t)==3:
651 all_selected_particles=s.get_selected_particles()
652 intersection=list(set(all_selected_particles) & set(structure))
653 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
654 cc=[tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
655 selected_coordinates+=cc
659 all_selected_particles=s.get_selected_particles()
660 intersection=list(set(all_selected_particles) & set(structure))
661 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
662 cc=[tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
663 selected_coordinates+=cc
665 raise ValueError(
"Selection error")
666 return selected_coordinates
668 def set_threshold(self,threshold):
669 self.threshold=threshold
671 def _get_distance(self,
677 """ Compute distance between structures with various metrics """
678 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
679 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
684 if self.style==
'pairwise_drmsd_k':
686 if self.style==
'pairwise_drms_k':
688 if self.style==
'pairwise_drmsd_Q':
691 if self.style==
'pairwise_rmsd':
695 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
696 selection_name,index1,index2):
698 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
699 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
704 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
713 selection_keywords=
None):
714 """ Evaluate the precision of two named structure groups. Supports MPI.
715 When the structure_set_name1 is different from the structure_set_name2,
716 this evaluates the cross-precision (average pairwise distances).
717 @param outfile Name of the precision output file
718 @param structure_set_name1 string name of the first structure set
719 @param structure_set_name2 string name of the second structure set
720 @param skip analyze every (skip) structure for the distance matrix calculation
721 @param selection_keywords Specify the selection name you want to calculate on.
722 By default this is computed for everything you provided in the constructor,
723 plus all the subunits together.
725 if selection_keywords
is None:
726 sel_keys=list(self.selection_dictionary.keys())
728 for k
in selection_keywords:
729 if k
not in self.selection_dictionary:
730 raise KeyError(
"you are trying to find named selection " \
731 + k +
" which was not requested in the constructor")
732 sel_keys=selection_keywords
734 if outfile
is not None:
737 for selection_name
in sel_keys:
738 number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
739 number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
742 structure_pointers_1=list(range(0,number_of_structures_1,skip))
743 structure_pointers_2=list(range(0,number_of_structures_2,skip))
745 pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
747 if len(pair_combination_list)==0:
748 raise ValueError(
"no structure selected. Check the skip parameter.")
750 my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(
751 pair_combination_list,self.number_of_processes)[self.rank]
752 my_length=len(my_pair_combination_list)
753 for n,pair
in enumerate(my_pair_combination_list):
755 progression=int(float(n)/my_length*100.0)
756 distances[pair]=self._get_distance(structure_set_name1,structure_set_name2,
757 selection_name,pair[0],pair[1])
759 if self.number_of_processes > 1:
762 if structure_set_name1==structure_set_name2:
763 structure_pointers=structure_pointers_1
764 number_of_structures=number_of_structures_1
770 distances_to_structure={}
771 distances_to_structure_normalization={}
773 for n
in structure_pointers:
774 distances_to_structure[n]=0.0
775 distances_to_structure_normalization[n]=0
778 distance+=distances[k]
779 distances_to_structure[k[0]]+=distances[k]
780 distances_to_structure[k[1]]+=distances[k]
781 distances_to_structure_normalization[k[0]]+=1
782 distances_to_structure_normalization[k[1]]+=1
784 for n
in structure_pointers:
785 distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
787 min_distance=min([distances_to_structure[n]
for n
in distances_to_structure])
788 centroid_index=[k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
789 centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
791 centroid_distance=0.0
792 for n
in range(number_of_structures):
793 centroid_distance+=self._get_distance(structure_set_name1,structure_set_name1,
794 selection_name,centroid_index,n)
797 centroid_distance/=number_of_structures
799 if outfile
is not None:
800 of.write(str(selection_name)+
" "+structure_set_name1+
801 " average centroid distance "+str(centroid_distance)+
"\n")
802 of.write(str(selection_name)+
" "+structure_set_name1+
803 " centroid index "+str(centroid_index)+
"\n")
804 of.write(str(selection_name)+
" "+structure_set_name1+
805 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
807 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
808 if outfile
is not None:
809 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
810 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
811 if outfile
is not None:
813 return centroid_index
819 set_plot_yaxis_range=
None):
820 """ Calculate the residue mean square fluctuations (RMSF).
821 Automatically outputs as data file and pdf
822 @param structure_set_name Which structure set to calculate RMSF for
823 @param outdir Where to write the files
824 @param skip Skip this number of structures
825 @param set_plot_yaxis_range In case you need to change the plot
834 for sel_name
in self.protein_names:
835 self.selection_dictionary.update({sel_name:[sel_name]})
837 number_of_structures=len(self.structures_dictionary[structure_set_name][sel_name])
841 rpim=self.residue_particle_index_map[sel_name]
842 outfile=outdir+
"/rmsf."+sel_name+
".dat"
846 for index
in range(number_of_structures):
847 distances=self._get_particle_distances(structure_set_name,
850 centroid_index,index)
851 for nblock,block
in enumerate(rpim):
852 for residue_number
in block:
853 residue_nblock[residue_number]=nblock
854 if residue_number
not in residue_distances:
855 residue_distances[residue_number]=[distances[nblock]]
857 residue_distances[residue_number].append(distances[nblock])
861 for rn
in residue_distances:
863 rmsf=np.std(residue_distances[rn])
865 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
867 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
868 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
869 set_plot_yaxis_range=set_plot_yaxis_range,
870 xlabel=
'Residue Number',ylabel=
'Standard error')
875 """Read in a structure used for reference computation.
876 Needed before calling get_average_distance_wrt_reference_structure()
877 @param rmf_name The RMF file to read the reference
878 @param rmf_frame_index The index in that file
880 (particles_resolution_one, prot)=self._get_structure(rmf_frame_index,rmf_name)
881 self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
884 for selection_name
in self.selection_dictionary:
885 selection_tuple=self.selection_dictionary[selection_name]
886 coords=self._select_coordinates(selection_tuple,
887 particles_resolution_one,prot)
888 self.reference_structures_dictionary[selection_name]=coords
892 """Compare the structure set to the reference structure.
893 @param structure_set_name The structure set to compute this on
894 @note First call set_reference_structure()
896 if self.reference_structures_dictionary=={}:
897 print(
"Cannot compute until you set a reference structure")
899 for selection_name
in self.selection_dictionary:
900 reference_coordinates=self.reference_structures_dictionary[selection_name]
904 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
906 if self.style==
'pairwise_drmsd_k':
908 if self.style==
'pairwise_drms_k':
910 if self.style==
'pairwise_drmsd_Q':
912 if self.style==
'pairwise_rmsd':
915 distances.append(distance)
917 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances))
919 def get_coordinates(self):
922 def set_precision_style(self, style):
923 if style
in self.styles:
926 raise ValueError(
"No such style")
930 """Compute mean density maps from structures.
932 Keeps a dictionary of density maps,
933 keys are in the custom ranges. When you call add_subunits_density, it adds
934 particle coordinates to the existing density maps.
937 def __init__(self, custom_ranges, representation=None, voxel=5.0):
939 @param custom_ranges Required. It's a dictionary, keys are the
940 density component names, values are selection tuples
941 e.g. {'kin28':[['kin28',1,-1]],
942 'density_name_1' :[('ccl1')],
943 'density_name_2' :[(1,142,'tfb3d1'),
945 @param representation PMI representation, for doing selections.
946 Not needed if you only pass hierarchies
947 @param voxel The voxel size for the output map (lower is slower)
950 self.representation = representation
953 self.count_models = 0.0
954 self.custom_ranges = custom_ranges
957 """Add a frame to the densities.
958 @param hierarchy Optionally read the hierarchy from somewhere.
959 If not passed, will just read the representation.
961 self.count_models += 1.0
965 all_particles_by_resolution = []
966 for name
in part_dict:
967 all_particles_by_resolution += part_dict[name]
970 for density_name
in self.custom_ranges:
973 all_particles_by_segments = []
975 for seg
in self.custom_ranges[density_name]:
977 parts += IMP.tools.select_by_tuple(self.representation,
978 seg, resolution=1, name_is_ambiguous=
False)
982 elif type(seg) == tuple:
984 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
986 raise Exception(
'could not understand selection tuple '+str(seg))
988 all_particles_by_segments += s.get_selected_particles()
992 set(all_particles_by_segments) & set(all_particles_by_resolution))
994 self._create_density_from_particles(parts, density_name)
996 def normalize_density(self):
999 def _create_density_from_particles(self, ps, name,
1001 kernel_type=
'GAUSSIAN'):
1002 '''Internal function for adding to densities.
1003 pass XYZR particles with mass and create a density from them.
1004 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1006 'GAUSSIAN': IMP.em.GAUSSIAN,
1007 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1008 'SPHERE': IMP.em.SPHERE}
1012 if name
not in self.densities:
1013 self.densities[name] = dmap
1020 dmap3.add(self.densities[name])
1021 self.densities[name] = dmap3
1023 def get_density_keys(self):
1024 return list(self.densities.keys())
1027 """Get the current density for some component name"""
1028 if name
not in self.densities:
1031 return self.densities[name]
1033 def write_mrc(self, path="./"):
1034 for density_name
in self.densities:
1035 self.densities[density_name].
multiply(1. / self.count_models)
1037 self.densities[density_name],
1038 path +
"/" + density_name +
".mrc",
1042 class GetContactMap(object):
1045 self.distance = distance
1046 self.contactmap =
''
1053 def set_prot(self, prot):
1054 from scipy.spatial.distance
import cdist
1063 for name
in particles_dictionary:
1064 residue_indexes = []
1065 for p
in particles_dictionary[name]:
1070 if len(residue_indexes) != 0:
1071 self.protnames.append(name)
1072 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1074 new_name = name +
":" + str(res)
1075 if name
not in self.resmap:
1076 self.resmap[name] = {}
1077 if res
not in self.resmap:
1078 self.resmap[name][res] = {}
1080 self.resmap[name][res] = new_name
1081 namelist.append(new_name)
1083 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1085 radii.append(d.get_radius())
1087 coords = np.array(coords)
1088 radii = np.array(radii)
1090 if len(self.namelist) == 0:
1091 self.namelist = namelist
1092 self.contactmap = np.zeros((len(coords), len(coords)))
1094 distances = cdist(coords, coords)
1095 distances = (distances - radii).T - radii
1096 distances = distances <= self.distance
1102 self.contactmap += distances
1104 def get_subunit_coords(self, frame, align=0):
1105 from scipy.spatial.distance
import cdist
1109 test, testr = [], []
1110 for part
in self.prot.get_children():
1113 for chl
in part.get_children():
1119 SortedSegments.append((chl, startres))
1120 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1122 for sgmnt
in SortedSegments:
1125 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1128 radii.append(p.get_radius())
1130 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1133 .get_residue_indexes()[0])
1134 namelist.append(new_name)
1135 self.expanded[new_name] = len(
1137 if part.get_name()
not in self.resmap:
1138 self.resmap[part.get_name()] = {}
1140 self.resmap[part.get_name()][res] = new_name
1142 coords = np.array(coords)
1143 radii = np.array(radii)
1144 if len(self.namelist) == 0:
1145 self.namelist = namelist
1146 self.contactmap = np.zeros((len(coords), len(coords)))
1147 distances = cdist(coords, coords)
1148 distances = (distances - radii).T - radii
1149 distances = distances <= self.distance
1150 self.contactmap += distances
1155 identification_string=
'ISDCrossLinkMS_Distance_'):
1158 data = open(filname)
1159 D = data.readlines()
1163 if identification_string
in d:
1170 t1, t2 = (d[0], d[1]), (d[1], d[0])
1171 if t1
not in self.XL:
1172 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1173 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1175 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1176 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1178 def dist_matrix(self, skip_cmap=0, skip_xl=1):
1182 L = sum(self.expanded.values())
1183 proteins = self.protnames
1188 proteins = [p.get_name()
for p
in self.prot.get_children()]
1190 for p1
in range(len(proteins)):
1191 for p2
in range(p1, len(proteins)):
1193 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1194 pn1, pn2 = proteins[p1], proteins[p2]
1195 mtr = np.zeros((pl1 + 1, pl2 + 1))
1196 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1197 for i1
in range(1, pl1 + 1):
1198 for i2
in range(1, pl2 + 1):
1200 r1 = K.index(self.resmap[pn1][i1])
1201 r2 = K.index(self.resmap[pn2][i2])
1203 mtr[i1 - 1, i2 - 1] = r
1205 missing.append((pn1, pn2, i1, i2))
1207 Matrices[(pn1, pn2)] = mtr
1212 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1215 for p1
in range(len(proteins)):
1216 for p2
in range(p1, len(proteins)):
1218 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1219 pn1, pn2 = proteins[p1], proteins[p2]
1220 mtr = np.zeros((pl1 + 1, pl2 + 1))
1223 xls = self.XL[(pn1, pn2)]
1226 xls = self.XL[(pn2, pn1)]
1231 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1232 for xl1, xl2
in xls:
1234 print(
'X' * 10, xl1, xl2)
1237 print(
'X' * 10, xl1, xl2)
1239 mtr[xl1 - 1, xl2 - 1] = 100
1241 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1242 for xl1, xl2
in xls:
1244 print(
'X' * 10, xl1, xl2)
1247 print(
'X' * 10, xl1, xl2)
1249 mtr[xl2 - 1, xl1 - 1] = 100
1251 raise RuntimeError(
'WTF!')
1252 Matrices_xl[(pn1, pn2)] = mtr
1268 for i, c
in enumerate(C):
1269 cl = max(self.resmap[c].keys())
1274 R.append(R[-1] + cl)
1279 import matplotlib
as mpl
1281 import matplotlib.pyplot
as plt
1282 import matplotlib.gridspec
as gridspec
1283 import scipy.sparse
as sparse
1286 gs = gridspec.GridSpec(len(W), len(W),
1291 for x1, r1
in enumerate(R):
1296 for x2, r2
in enumerate(R):
1302 ax = plt.subplot(gs[cnt])
1305 mtr = Matrices[(C[x1], C[x2])]
1307 mtr = Matrices[(C[x2], C[x1])].T
1311 interpolation=
'nearest',
1313 vmax=log(NewM.max()))
1318 mtr = Matrices_xl[(C[x1], C[x2])]
1320 mtr = Matrices_xl[(C[x2], C[x1])].T
1322 sparse.csr_matrix(mtr),
1332 ax.set_ylabel(C[x1], rotation=90)
1339 def get_hier_from_rmf(model, frame_number, rmf_file,state_number=0):
1341 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1344 rh = RMF.open_rmf_file_read_only(rmf_file)
1349 print(
"Unable to open rmf file %s" % (rmf_file))
1353 prot = prots[state_number]
1357 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1363 def get_hier_and_restraints_from_rmf(model, frame_number, rmf_file, state_number=0):
1365 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1368 rh = RMF.open_rmf_file_read_only(rmf_file)
1374 print(
"Unable to open rmf file %s" % (rmf_file))
1379 prot = prots[state_number]
1383 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1389 def get_hiers_from_rmf(model, frame_number, rmf_file):
1390 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1393 rh = RMF.open_rmf_file_read_only(rmf_file)
1398 print(
"Unable to open rmf file %s" % (rmf_file))
1405 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1414 Get particles at res 1, or any beads, based on the name.
1415 No Representation is needed. This is mainly used when the hierarchy
1416 is read from an RMF file.
1417 @return a dictionary of component names and their particles
1421 for c
in prot.get_children():
1424 for s
in c.get_children():
1425 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1427 if "Beads" in s.get_name():
1431 for name
in particle_dict:
1432 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1433 list(set(particle_dict[name]) & set(allparticles)))
1434 return particle_dict
1438 Get particles at res 10, or any beads, based on the name.
1439 No Representation is needed.
1440 This is mainly used when the hierarchy is read from an RMF file.
1441 @return a dictionary of component names and their particles
1445 for c
in prot.get_children():
1448 for s
in c.get_children():
1449 if "_Res:10" in s.get_name():
1451 if "Beads" in s.get_name():
1454 for name
in particle_dict:
1455 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1456 list(set(particle_dict[name]) & set(allparticles)))
1457 return particle_dict
1461 def select_by_tuple(first_res_last_res_name_tuple):
1462 first_res = first_res_last_res_hier_tuple[0]
1463 last_res = first_res_last_res_hier_tuple[1]
1464 name = first_res_last_res_hier_tuple[2]
1467 """Visualization of crosslinks"""
1469 self.crosslinks = []
1470 self.external_csv_data =
None
1471 self.crosslinkedprots = set()
1472 self.mindist = +10000000.0
1473 self.maxdist = -10000000.0
1474 self.contactmap =
None
1476 def set_hierarchy(self, prot):
1477 self.prot_length_dict = {}
1478 self.model=prot.get_model()
1480 for i
in prot.get_children():
1482 residue_indexes = []
1486 if len(residue_indexes) != 0:
1487 self.prot_length_dict[name] = max(residue_indexes)
1489 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1490 from scipy.spatial.distance
import cdist
1492 rh= RMF.open_rmf_file_read_only(rmf_name)
1495 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1506 self.index_dictionary = {}
1508 for name
in particles_dictionary:
1509 residue_indexes = []
1510 for p
in particles_dictionary[name]:
1515 if len(residue_indexes) != 0:
1517 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1520 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1522 radii.append(d.get_radius())
1523 if name
not in self.index_dictionary:
1524 self.index_dictionary[name] = [resindex]
1526 self.index_dictionary[name].append(resindex)
1529 coords = np.array(coords)
1530 radii = np.array(radii)
1532 distances = cdist(coords, coords)
1533 distances = (distances - radii).T - radii
1535 distances = np.where(distances <= 20.0, 1.0, 0)
1536 if self.contactmap
is None:
1537 self.contactmap = np.zeros((len(coords), len(coords)))
1538 self.contactmap += distances
1543 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1546 filter_rmf_file_names=
None,
1547 external_csv_data_file=
None,
1548 external_csv_data_file_unique_id_key=
"Unique ID"):
1557 keys = po.get_keys()
1559 xl_keys = [k
for k
in keys
if search_label
in k]
1561 if filter_rmf_file_names
is not None:
1562 rmf_file_key=
"local_rmf_file_name"
1563 fs = po.get_fields(xl_keys+[rmf_file_key])
1565 fs = po.get_fields(xl_keys)
1568 self.cross_link_frequency = {}
1572 self.cross_link_distances = {}
1576 self.cross_link_distances_unique = {}
1578 if not external_csv_data_file
is None:
1581 self.external_csv_data = {}
1582 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1585 self.external_csv_data[
1586 xl[external_csv_data_file_unique_id_key]] = xl
1591 cross_link_frequency_list = []
1593 self.unique_cross_link_list = []
1597 keysplit = key.replace(
1606 if filter_label!=
None:
1607 if filter_label
not in keysplit:
continue
1610 r1 = int(keysplit[5])
1612 r2 = int(keysplit[7])
1615 confidence = keysplit[12]
1619 unique_identifier = keysplit[3]
1621 unique_identifier =
'0'
1623 r1 = int(keysplit[mapping[
"Residue1"]])
1624 c1 = keysplit[mapping[
"Protein1"]]
1625 r2 = int(keysplit[mapping[
"Residue2"]])
1626 c2 = keysplit[mapping[
"Protein2"]]
1628 confidence = keysplit[mapping[
"Confidence"]]
1632 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1634 unique_identifier =
'0'
1636 self.crosslinkedprots.add(c1)
1637 self.crosslinkedprots.add(c2)
1643 if filter_rmf_file_names
is not None:
1644 for n,d
in enumerate(fs[key]):
1645 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1646 dists.append(float(d))
1648 dists=[float(f)
for f
in fs[key]]
1653 mdist = self.median(dists)
1655 stdv = np.std(np.array(dists))
1656 if self.mindist > mdist:
1657 self.mindist = mdist
1658 if self.maxdist < mdist:
1659 self.maxdist = mdist
1663 if not self.external_csv_data
is None:
1664 sample = self.external_csv_data[unique_identifier][
"Sample"]
1668 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1669 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1670 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1671 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1673 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1674 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1675 cross_link_frequency_list.append((r1, c1, r2, c2))
1676 cross_link_frequency_list.append((r2, c2, r1, c1))
1677 self.unique_cross_link_list.append(
1678 (r1, c1, r2, c2,mdist))
1680 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1681 self.cross_link_distances[(
1687 confidence)] = dists
1688 self.cross_link_distances[(
1694 confidence)] = dists
1695 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1697 self.cross_link_distances[(
1703 confidence)] += dists
1704 self.cross_link_distances[(
1710 confidence)] += dists
1712 self.crosslinks.append(
1722 self.crosslinks.append(
1733 self.cross_link_frequency_inverted = {}
1734 for xl
in self.unique_cross_link_list:
1735 (r1, c1, r2, c2, mdist) = xl
1736 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1737 if frequency
not in self.cross_link_frequency_inverted:
1738 self.cross_link_frequency_inverted[
1739 frequency] = [(r1, c1, r2, c2)]
1741 self.cross_link_frequency_inverted[
1742 frequency].append((r1, c1, r2, c2))
1746 def median(self, mylist):
1747 sorts = sorted(mylist)
1753 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1754 return sorts[length / 2]
1756 def set_threshold(self,threshold):
1757 self.threshold=threshold
1759 def set_tolerance(self,tolerance):
1760 self.tolerance=tolerance
1762 def colormap(self, dist):
1763 if dist < self.threshold - self.tolerance:
1765 elif dist >= self.threshold + self.tolerance:
1770 def write_cross_link_database(self, filename, format='csv'):
1774 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1775 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1777 if not self.external_csv_data
is None:
1778 keys = list(self.external_csv_data.keys())
1779 innerkeys = list(self.external_csv_data[keys[0]].keys())
1781 fieldnames += innerkeys
1783 dw = csv.DictWriter(
1787 fieldnames=fieldnames)
1789 for xl
in self.crosslinks:
1790 (r1, c1, r2, c2, mdist, stdv, confidence,
1791 unique_identifier, descriptor) = xl
1792 if descriptor ==
'original':
1794 outdict[
"Unique ID"] = unique_identifier
1795 outdict[
"Protein1"] = c1
1796 outdict[
"Protein2"] = c2
1797 outdict[
"Residue1"] = r1
1798 outdict[
"Residue2"] = r2
1799 outdict[
"Median Distance"] = mdist
1800 outdict[
"Standard Deviation"] = stdv
1801 outdict[
"Confidence"] = confidence
1802 outdict[
"Frequency"] = self.cross_link_frequency[
1805 arrangement =
"Intra"
1807 arrangement =
"Inter"
1808 outdict[
"Arrangement"] = arrangement
1809 if not self.external_csv_data
is None:
1810 outdict.update(self.external_csv_data[unique_identifier])
1812 dw.writerow(outdict)
1814 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1815 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
1816 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
1817 gap_between_components=0,
1818 rename_protein_map=
None):
1832 import matplotlib.pyplot
as plt
1833 import matplotlib.cm
as cm
1835 fig = plt.figure(figsize=(10, 10))
1836 ax = fig.add_subplot(111)
1842 if prot_listx
is None:
1844 prot_listx = list(self.crosslinkedprots)
1846 prot_listx = list(self.prot_length_dict.keys())
1849 nresx = gap_between_components + \
1850 sum([self.prot_length_dict[name]
1851 + gap_between_components
for name
in prot_listx])
1855 if prot_listy
is None:
1857 prot_listy = list(self.crosslinkedprots)
1859 prot_listy = list(self.prot_length_dict.keys())
1862 nresy = gap_between_components + \
1863 sum([self.prot_length_dict[name]
1864 + gap_between_components
for name
in prot_listy])
1869 res = gap_between_components
1870 for prot
in prot_listx:
1871 resoffsetx[prot] = res
1872 res += self.prot_length_dict[prot]
1874 res += gap_between_components
1878 res = gap_between_components
1879 for prot
in prot_listy:
1880 resoffsety[prot] = res
1881 res += self.prot_length_dict[prot]
1883 res += gap_between_components
1885 resoffsetdiagonal = {}
1886 res = gap_between_components
1887 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1888 resoffsetdiagonal[prot] = res
1889 res += self.prot_length_dict[prot]
1890 res += gap_between_components
1896 for n, prot
in enumerate(prot_listx):
1897 res = resoffsetx[prot]
1899 for proty
in prot_listy:
1900 resy = resoffsety[proty]
1901 endy = resendy[proty]
1902 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
1903 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
1904 xticks.append((float(res) + float(end)) / 2)
1905 if rename_protein_map
is not None:
1906 if prot
in rename_protein_map:
1907 prot=rename_protein_map[prot]
1908 xlabels.append(prot)
1912 for n, prot
in enumerate(prot_listy):
1913 res = resoffsety[prot]
1915 for protx
in prot_listx:
1916 resx = resoffsetx[protx]
1917 endx = resendx[protx]
1918 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
1919 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
1920 yticks.append((float(res) + float(end)) / 2)
1921 if rename_protein_map
is not None:
1922 if prot
in rename_protein_map:
1923 prot=rename_protein_map[prot]
1924 ylabels.append(prot)
1927 print(prot_listx, prot_listy)
1929 if not self.contactmap
is None:
1930 import matplotlib.cm
as cm
1931 tmp_array = np.zeros((nresx, nresy))
1933 for px
in prot_listx:
1935 for py
in prot_listy:
1937 resx = resoffsety[px]
1938 lengx = resendx[px] - 1
1939 resy = resoffsety[py]
1940 lengy = resendy[py] - 1
1941 indexes_x = self.index_dictionary[px]
1942 minx = min(indexes_x)
1943 maxx = max(indexes_x)
1944 indexes_y = self.index_dictionary[py]
1945 miny = min(indexes_y)
1946 maxy = max(indexes_y)
1948 print(px, py, minx, maxx, miny, maxy)
1953 resy:lengy] = self.contactmap[
1960 ax.imshow(tmp_array,
1963 interpolation=
'nearest')
1965 ax.set_xticks(xticks)
1966 ax.set_xticklabels(xlabels, rotation=90)
1967 ax.set_yticks(yticks)
1968 ax.set_yticklabels(ylabels)
1969 ax.set_xlim(0,nresx)
1970 ax.set_ylim(0,nresy)
1975 already_added_xls = []
1977 for xl
in self.crosslinks:
1979 (r1, c1, r2, c2, mdist, stdv, confidence,
1980 unique_identifier, descriptor) = xl
1982 if confidence_classes
is not None:
1983 if confidence
not in confidence_classes:
1987 pos1 = r1 + resoffsetx[c1]
1991 pos2 = r2 + resoffsety[c2]
1995 if not filter
is None:
1996 xldb = self.external_csv_data[unique_identifier]
1997 xldb.update({
"Distance": mdist})
1998 xldb.update({
"Distance_stdv": stdv})
2000 if filter[1] ==
">":
2001 if float(xldb[filter[0]]) <= float(filter[2]):
2004 if filter[1] ==
"<":
2005 if float(xldb[filter[0]]) >= float(filter[2]):
2008 if filter[1] ==
"==":
2009 if float(xldb[filter[0]]) != float(filter[2]):
2015 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2016 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2018 if layout ==
'lowerdiagonal':
2019 if pos_for_diagonal1 <= pos_for_diagonal2:
2021 if layout ==
'upperdiagonal':
2022 if pos_for_diagonal1 >= pos_for_diagonal2:
2025 already_added_xls.append((r1, c1, r2, c2))
2027 if not no_confidence_info:
2028 if confidence ==
'0.01':
2029 markersize = 14 * scale_symbol_size
2030 elif confidence ==
'0.05':
2031 markersize = 9 * scale_symbol_size
2032 elif confidence ==
'0.1':
2033 markersize = 6 * scale_symbol_size
2035 markersize = 15 * scale_symbol_size
2037 markersize = 5 * scale_symbol_size
2039 if not no_dist_info:
2040 color = self.colormap(mdist)
2050 markersize=markersize)
2054 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2056 [i.set_linewidth(2.0)
for i
in ax.spines.values()]
2061 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2065 def get_frequency_statistics(self, prot_list,
2068 violated_histogram = {}
2069 satisfied_histogram = {}
2070 unique_cross_links = []
2072 for xl
in self.unique_cross_link_list:
2073 (r1, c1, r2, c2, mdist) = xl
2076 if prot_list2
is None:
2077 if not c1
in prot_list:
2079 if not c2
in prot_list:
2082 if c1
in prot_list
and c2
in prot_list2:
2084 elif c1
in prot_list2
and c2
in prot_list:
2089 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2091 if (r1, c1, r2, c2)
not in unique_cross_links:
2093 if frequency
not in violated_histogram:
2094 violated_histogram[frequency] = 1
2096 violated_histogram[frequency] += 1
2098 if frequency
not in satisfied_histogram:
2099 satisfied_histogram[frequency] = 1
2101 satisfied_histogram[frequency] += 1
2102 unique_cross_links.append((r1, c1, r2, c2))
2103 unique_cross_links.append((r2, c2, r1, c1))
2105 print(
"# satisfied")
2107 total_number_of_crosslinks=0
2109 for i
in satisfied_histogram:
2113 if i
in violated_histogram:
2114 print(i, violated_histogram[i]+satisfied_histogram[i])
2116 print(i, satisfied_histogram[i])
2117 total_number_of_crosslinks+=i*satisfied_histogram[i]
2121 for i
in violated_histogram:
2122 print(i, violated_histogram[i])
2123 total_number_of_crosslinks+=i*violated_histogram[i]
2125 print(total_number_of_crosslinks)
2129 def print_cross_link_binary_symbols(self, prot_list,
2132 confidence_list = []
2133 for xl
in self.crosslinks:
2134 (r1, c1, r2, c2, mdist, stdv, confidence,
2135 unique_identifier, descriptor) = xl
2137 if prot_list2
is None:
2138 if not c1
in prot_list:
2140 if not c2
in prot_list:
2143 if c1
in prot_list
and c2
in prot_list2:
2145 elif c1
in prot_list2
and c2
in prot_list:
2150 if descriptor !=
"original":
2153 confidence_list.append(confidence)
2155 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2156 tmp_dist_binary = []
2159 tmp_dist_binary.append(1)
2161 tmp_dist_binary.append(0)
2162 tmp_matrix.append(tmp_dist_binary)
2164 matrix = list(zip(*tmp_matrix))
2166 satisfied_high_sum = 0
2167 satisfied_mid_sum = 0
2168 satisfied_low_sum = 0
2169 total_satisfied_sum = 0
2170 for k, m
in enumerate(matrix):
2179 for n, b
in enumerate(m):
2180 if confidence_list[n] ==
"0.01":
2184 satisfied_high_sum += 1
2185 elif confidence_list[n] ==
"0.05":
2189 satisfied_mid_sum += 1
2190 elif confidence_list[n] ==
"0.1":
2194 satisfied_low_sum += 1
2196 total_satisfied += 1
2197 total_satisfied_sum += 1
2199 print(k, satisfied_high, total_high)
2200 print(k, satisfied_mid, total_mid)
2201 print(k, satisfied_low, total_low)
2202 print(k, total_satisfied, total)
2203 print(float(satisfied_high_sum) / len(matrix))
2204 print(float(satisfied_mid_sum) / len(matrix))
2205 print(float(satisfied_low_sum) / len(matrix))
2208 def get_unique_crosslinks_statistics(self, prot_list,
2221 satisfied_string = []
2222 for xl
in self.crosslinks:
2223 (r1, c1, r2, c2, mdist, stdv, confidence,
2224 unique_identifier, descriptor) = xl
2226 if prot_list2
is None:
2227 if not c1
in prot_list:
2229 if not c2
in prot_list:
2232 if c1
in prot_list
and c2
in prot_list2:
2234 elif c1
in prot_list2
and c2
in prot_list:
2239 if descriptor !=
"original":
2243 if confidence ==
"0.01":
2247 if confidence ==
"0.05":
2251 if confidence ==
"0.1":
2256 satisfied_string.append(1)
2258 satisfied_string.append(0)
2260 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2261 tmp_dist_binary = []
2264 tmp_dist_binary.append(1)
2266 tmp_dist_binary.append(0)
2267 tmp_matrix.append(tmp_dist_binary)
2269 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2270 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2271 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2272 print(
"total", total)
2274 matrix = list(zip(*tmp_matrix))
2275 satisfied_models = 0
2277 for b
in satisfied_string:
2284 all_satisfied =
True
2286 for n, b
in enumerate(m):
2291 if b == 1
and satisfied_string[n] == 1:
2293 elif b == 1
and satisfied_string[n] == 0:
2295 elif b == 0
and satisfied_string[n] == 0:
2297 elif b == 0
and satisfied_string[n] == 1:
2298 all_satisfied =
False
2300 satisfied_models += 1
2302 print(satstr, all_satisfied)
2303 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2305 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2311 for kw
in self.cross_link_distances_unique:
2312 (r1, c1, r2, c2) = kw
2313 dists = self.cross_link_distances_unique[kw]
2315 if prot_list2
is None:
2316 if not c1
in prot_list:
2318 if not c2
in prot_list:
2321 if c1
in prot_list
and c2
in prot_list2:
2323 elif c1
in prot_list2
and c2
in prot_list:
2328 dists.append(sum(dists))
2329 tmp_matrix.append(dists)
2331 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2334 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2336 for i
in range(len(tmp_matrix)):
2337 for k
in range(len(tmp_matrix[i]) - 1):
2338 matrix[i][k] = tmp_matrix[i][k]
2343 ax = fig.add_subplot(211)
2345 cax = ax.imshow(matrix, interpolation=
'nearest')
2349 pl.savefig(figurename, dpi=300)
2358 arrangement=
"inter",
2359 confidence_input=
"None"):
2362 for xl
in self.cross_link_distances:
2363 (r1, c1, r2, c2, mdist, confidence) = xl
2364 if c1
in prots1
and c2
in prots2:
2365 if arrangement ==
"inter" and c1 == c2:
2367 if arrangement ==
"intra" and c1 != c2:
2369 if confidence_input == confidence:
2370 label = str(c1) +
":" + str(r1) + \
2371 "-" + str(c2) +
":" + str(r2)
2372 values = self.cross_link_distances[xl]
2373 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2374 data.append((label, values, mdist, frequency))
2376 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2377 sort_by_dist = list(zip(*sort_by_dist))
2378 values = sort_by_dist[1]
2379 positions = list(range(len(values)))
2380 labels = sort_by_dist[0]
2381 frequencies = list(map(float, sort_by_dist[3]))
2382 frequencies = [f * 10.0
for f
in frequencies]
2384 nchunks = int(float(len(values)) / nxl_per_row)
2385 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2386 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2389 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2392 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2394 for n, v
in enumerate(values_chunks):
2395 p = positions_chunks[n]
2396 f = frequencies_chunks[n]
2397 l = labels_chunks[n]
2399 filename +
"." + str(n), v, p, f,
2400 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2402 def crosslink_distance_histogram(self, filename,
2405 confidence_classes=
None,
2411 if prot_list
is None:
2412 prot_list = list(self.prot_length_dict.keys())
2415 for xl
in self.crosslinks:
2416 (r1, c1, r2, c2, mdist, stdv, confidence,
2417 unique_identifier, descriptor) = xl
2419 if not confidence_classes
is None:
2420 if confidence
not in confidence_classes:
2423 if prot_list2
is None:
2424 if not c1
in prot_list:
2426 if not c2
in prot_list:
2429 if c1
in prot_list
and c2
in prot_list2:
2431 elif c1
in prot_list2
and c2
in prot_list:
2436 distances.append(mdist)
2439 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2440 bins=bins, color=color,
2442 reference_xline=35.0,
2443 yplotrange=yplotrange, normalized=normalized)
2445 def scatter_plot_xl_features(self, filename,
2451 reference_ylines=
None,
2452 distance_color=
True,
2454 import matplotlib.pyplot
as plt
2455 import matplotlib.cm
as cm
2457 fig = plt.figure(figsize=(10, 10))
2458 ax = fig.add_subplot(111)
2460 for xl
in self.crosslinks:
2461 (r1, c1, r2, c2, mdist, stdv, confidence,
2462 unique_identifier, arrangement) = xl
2464 if prot_list2
is None:
2465 if not c1
in prot_list:
2467 if not c2
in prot_list:
2470 if c1
in prot_list
and c2
in prot_list2:
2472 elif c1
in prot_list2
and c2
in prot_list:
2477 xldb = self.external_csv_data[unique_identifier]
2478 xldb.update({
"Distance": mdist})
2479 xldb.update({
"Distance_stdv": stdv})
2481 xvalue = float(xldb[feature1])
2482 yvalue = float(xldb[feature2])
2485 color = self.colormap(mdist)
2489 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2491 if not yplotrange
is None:
2492 ax.set_ylim(yplotrange)
2493 if not reference_ylines
is None:
2494 for rl
in reference_ylines:
2495 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2498 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
Visualization of crosslinks.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
kernel::Restraints create_restraints(RMF::FileConstHandle fh, kernel::Model *m)
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
A class to evaluate the precision of an ensemble.
void load_frame(RMF::FileConstHandle file, unsigned int frame)
double get_rmsd(const core::XYZs &s0, const core::XYZs &s1, const IMP::algebra::Transformation3D &tr_for_second)
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
Class for sampling a density map from particles.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
def get_precision
Evaluate the precision of two named structure groups.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.