3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
6 from __future__
import print_function
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
24 """Performs alignment and RMSD calculation for two sets of coordinates
26 The class also takes into account non-equal stoichiometry of the proteins.
27 If this is the case, the protein names of proteins in multiple copies
28 should be specified in the following form:
29 nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
34 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36 @param weights optional weights for each set of coordinates
39 self.template = template
42 if len(self.query.keys()) != len(self.template.keys()):
43 raise ValueError(
'''the number of proteins
44 in template and query does not match!''')
48 self.proteins = sorted(self.query.keys())
49 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
52 np = prots_uniq.count(p)
53 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
54 prmts = list(itertools.permutations(copies, len(copies)))
57 self.Product = list(itertools.product(*P.values()))
65 torder = sum([list(i)
for i
in self.Product[0]], [])
68 if self.weights
is not None:
69 weights += [i
for i
in self.weights[t]]
72 self.rmsd = 10000000000.
73 for comb
in self.Product:
77 order = sum([list(i)
for i
in comb], [])
87 if self.weights
is not None:
98 from scipy.spatial.distance
import cdist
103 torder = sum([list(i)
for i
in self.Product[0]], [])
108 self.rmsd, Transformation = 10000000000.,
''
109 for comb
in self.Product:
110 order = sum([list(i)
for i
in comb], [])
116 if len(template_xyz) != len(query_xyz):
117 raise ValueError(
'''the number of coordinates
118 in template and query does not match!''')
123 query_xyz_tr = [transformation.get_transformed(n)
127 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
130 Transformation = transformation
132 return (self.rmsd, Transformation)
137 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
138 'a..2':np.array([np.array([1.,1.,])]),
139 'a..3':np.array([np.array([-2.,1.])]),
140 'b':np.array([np.array([0.,-1.])]),
141 'c..1':np.array([np.array([-1.,-1.])]),
142 'c..2':np.array([np.array([1.,-1.])]),
143 'd':np.array([np.array([0.,0.])]),
144 'e':np.array([np.array([0.,1.])])}
146 Ali = Alignment(Proteins, Proteins)
148 if Ali.get_rmsd() == 0.0: print 'successful test!'
149 else: print 'ERROR!'; exit()
154 class Violations(object):
158 self.violation_thresholds = {}
159 self.violation_counts = {}
161 data = open(filename)
166 d = d.strip().split()
167 self.violation_thresholds[d[0]] = float(d[1])
169 def get_number_violated_restraints(self, rsts_dict):
171 for rst
in self.violation_thresholds:
172 if rst
not in rsts_dict:
174 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
176 if rst
not in self.violation_counts:
177 self.violation_counts[rst] = 1
179 self.violation_counts[rst] += 1
185 """A class to cluster structures.
186 Uses scipy's cdist function to compute distance matrices
187 and sklearn's kmeans clustering module.
191 @param rmsd_weights Flat list of weights for each particle
195 from mpi4py
import MPI
196 self.comm = MPI.COMM_WORLD
197 self.rank = self.comm.Get_rank()
198 self.number_of_processes = self.comm.size
200 self.number_of_processes = 1
203 self.structure_cluster_ids =
None
204 self.tmpl_coords =
None
205 self.rmsd_weights=rmsd_weights
207 def set_template(self, part_coords):
209 self.tmpl_coords = part_coords
212 """Add coordinates for a single model."""
214 self.all_coords[frame] = Coords
216 def dist_matrix(self):
218 self.model_list_names = list(self.all_coords.keys())
219 self.model_indexes = list(range(len(self.model_list_names)))
220 self.model_indexes_dict = dict(
221 list(zip(self.model_list_names, self.model_indexes)))
222 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
224 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
225 model_indexes_unique_pairs,
226 self.number_of_processes)[self.rank]
228 print(
"process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
230 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
232 my_model_indexes_unique_pairs)
234 if self.number_of_processes > 1:
237 pickable_transformations = self.get_pickable_transformation_distance_dict(
240 pickable_transformations)
241 self.set_transformation_distance_dict_from_pickable(
242 pickable_transformations)
244 self.raw_distance_matrix = np.zeros(
245 (len(self.model_list_names), len(self.model_list_names)))
246 for item
in raw_distance_dict:
248 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
249 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
251 def get_dist_matrix(self):
252 return self.raw_distance_matrix
255 """Run K-means clustering
256 @param number_of_clusters Num means
257 @param seed the random seed
259 from sklearn.cluster
import KMeans
264 kmeans = KMeans(n_clusters=number_of_clusters)
267 kmeans = KMeans(k=number_of_clusters)
268 kmeans.fit_predict(self.raw_distance_matrix)
270 self.structure_cluster_ids = kmeans.labels_
272 def get_pickable_transformation_distance_dict(self):
273 pickable_transformations = {}
274 for label
in self.transformation_distance_dict:
275 tr = self.transformation_distance_dict[label]
276 trans = tuple(tr.get_translation())
277 rot = tuple(tr.get_rotation().get_quaternion())
278 pickable_transformations[label] = (rot, trans)
279 return pickable_transformations
281 def set_transformation_distance_dict_from_pickable(
283 pickable_transformations):
284 self.transformation_distance_dict = {}
285 for label
in pickable_transformations:
286 tr = pickable_transformations[label]
289 self.transformation_distance_dict[
292 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
294 outf = open(file_name +
".data",
'wb')
300 pickable_transformations = self.get_pickable_transformation_distance_dict(
303 (self.structure_cluster_ids,
304 self.model_list_names,
305 pickable_transformations),
308 np.save(file_name +
".npy", self.raw_distance_matrix)
310 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
313 inputf = open(file_name +
".data",
'rb')
314 (self.structure_cluster_ids, self.model_list_names,
315 pickable_transformations) = pickle.load(inputf)
318 self.raw_distance_matrix = np.load(file_name +
".npy")
320 self.set_transformation_distance_dict_from_pickable(
321 pickable_transformations)
322 self.model_indexes = list(range(len(self.model_list_names)))
323 self.model_indexes_dict = dict(
324 list(zip(self.model_list_names, self.model_indexes)))
326 def plot_matrix(self, figurename="clustermatrix.pdf"):
328 from scipy.cluster
import hierarchy
as hrc
330 fig = pl.figure(figsize=(10,8))
331 ax = fig.add_subplot(212)
332 dendrogram = hrc.dendrogram(
333 hrc.linkage(self.raw_distance_matrix),
336 leaves_order = dendrogram[
'leaves']
337 ax.set_xlabel(
'Model')
338 ax.set_ylabel(
'RMSD [Angstroms]')
340 ax2 = fig.add_subplot(221)
342 self.raw_distance_matrix[leaves_order,
345 interpolation=
'nearest')
346 cb = fig.colorbar(cax)
347 cb.set_label(
'RMSD [Angstroms]')
348 ax2.set_xlabel(
'Model')
349 ax2.set_ylabel(
'Model')
351 pl.savefig(figurename, dpi=300)
354 def get_model_index_from_name(self, name):
355 return self.model_indexes_dict[name]
357 def get_cluster_labels(self):
359 return list(set(self.structure_cluster_ids))
361 def get_number_of_clusters(self):
362 return len(self.get_cluster_labels())
364 def get_cluster_label_indexes(self, label):
366 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
369 def get_cluster_label_names(self, label):
371 [self.model_list_names[i]
372 for i
in self.get_cluster_label_indexes(label)]
375 def get_cluster_label_average_rmsd(self, label):
377 indexes = self.get_cluster_label_indexes(label)
380 sub_distance_matrix = self.raw_distance_matrix[
381 indexes, :][:, indexes]
382 average_rmsd = np.sum(sub_distance_matrix) / \
383 (len(sub_distance_matrix)
384 ** 2 - len(sub_distance_matrix))
389 def get_cluster_label_size(self, label):
390 return len(self.get_cluster_label_indexes(label))
392 def get_transformation_to_first_member(
396 reference = self.get_cluster_label_indexes(cluster_label)[0]
397 return self.transformation_distance_dict[(reference, structure_index)]
399 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
401 model_list_names = list(all_coords.keys())
402 rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
403 raw_distance_dict = {}
404 transformation_distance_dict = {}
405 if template_coords
is None:
409 alignment_template_protein_names = list(template_coords.keys())
411 for (f1, f2)
in list_of_pairs:
419 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
420 for pr
in rmsd_protein_names])
422 for pr
in rmsd_protein_names:
423 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
425 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
426 rmsd = Ali.get_rmsd()
432 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
433 for pr
in alignment_template_protein_names])
434 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
435 for pr
in alignment_template_protein_names])
438 template_rmsd, transformation = Ali.align()
443 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
444 for pr
in rmsd_protein_names])
446 for pr
in rmsd_protein_names:
447 coords_f2[pr] = [transformation.get_transformed(
448 i)
for i
in all_coords[model_list_names[f2]][pr]]
450 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
451 rmsd = Ali.get_rmsd()
453 raw_distance_dict[(f1, f2)] = rmsd
454 raw_distance_dict[(f2, f1)] = rmsd
455 transformation_distance_dict[(f1, f2)] = transformation
456 transformation_distance_dict[(f2, f1)] = transformation
458 return raw_distance_dict, transformation_distance_dict
462 """A class to evaluate the precision of an ensemble.
464 Also can evaluate the cross-precision of multiple ensembles.
465 Supports MPI for coordinate reading.
466 Recommended procedure:
467 -# initialize object and pass the selection for evaluating precision
468 -# call add_structures() to read in the data (specify group name)
469 -# call get_precision() to evaluate inter/intra precision
470 -# call get_rmsf() to evaluate within-group fluctuations
474 selection_dictionary={}):
476 @param model The IMP Model
477 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
478 part of the hier name)
479 @param selection_dictionary Dictionary where keys are names for
480 selections and values are selection tuples for scoring
481 precision. "All" is automatically made as well
484 from mpi4py
import MPI
485 self.comm = MPI.COMM_WORLD
486 self.rank = self.comm.Get_rank()
487 self.number_of_processes = self.comm.size
489 self.number_of_processes=1
492 self.styles=[
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
493 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
494 self.style=
'pairwise_drmsd_k'
495 self.structures_dictionary={}
496 self.reference_structures_dictionary={}
498 self.protein_names=
None
499 self.len_particles_resolution_one=
None
501 self.rmf_names_frames={}
502 self.reference_rmf_names_frames=
None
503 self.reference_structure=
None
504 self.reference_prot=
None
505 self.selection_dictionary=selection_dictionary
507 self.residue_particle_index_map=
None
508 if resolution
in [1,10]:
509 self.resolution=resolution
511 raise KeyError(
"no such resolution")
513 def _get_structure(self,rmf_frame_index,rmf_name):
514 """Read an RMF file and return the particles"""
515 rh= RMF.open_rmf_file_read_only(rmf_name)
518 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
521 if self.resolution==1:
523 elif self.resolution==10:
526 protein_names=list(particle_dict.keys())
527 particles_resolution_one=[]
528 for k
in particle_dict:
529 particles_resolution_one+=(particle_dict[k])
531 if self.protein_names==
None:
532 self.protein_names=protein_names
534 if self.protein_names!=protein_names:
535 print(
"Error: the protein names of the new coordinate set is not compatible with the previous one")
537 if self.len_particles_resolution_one==
None:
538 self.len_particles_resolution_one=len(particles_resolution_one)
540 if self.len_particles_resolution_one!=len(particles_resolution_one):
541 raise ValueError(
"the new coordinate set is not compatible with the previous one")
543 return particles_resolution_one,prots
549 setup_index_map=
False):
550 """ Read a structure into the ensemble and store (as coordinates).
551 @param rmf_name The name of the RMF file
552 @param rmf_frame_index The frame to read
553 @param structure_set_name Name for the set that includes this structure
555 @param setup_index_map if requested, set up a dictionary to help
560 if structure_set_name
in self.structures_dictionary:
561 cdict=self.structures_dictionary[structure_set_name]
562 rmflist=self.rmf_names_frames[structure_set_name]
564 self.structures_dictionary[structure_set_name]={}
565 self.rmf_names_frames[structure_set_name]=[]
566 cdict=self.structures_dictionary[structure_set_name]
567 rmflist=self.rmf_names_frames[structure_set_name]
571 (particles_resolution_one, prots)=self._get_structure(rmf_frame_index,rmf_name)
573 print(
"something wrong with the rmf")
576 self.selection_dictionary.update({
"All":self.protein_names})
578 for selection_name
in self.selection_dictionary:
579 selection_tuple=self.selection_dictionary[selection_name]
580 coords=self._select_coordinates(selection_tuple,particles_resolution_one,prots[0])
581 if selection_name
not in cdict:
582 cdict[selection_name]=[coords]
584 cdict[selection_name].append(coords)
586 rmflist.append((rmf_name,rmf_frame_index))
590 self.residue_particle_index_map={}
591 for prot_name
in self.protein_names:
592 self.residue_particle_index_map[prot_name] = \
593 self._get_residue_particle_index_map(
595 particles_resolution_one,prots[0])
600 rmf_name_frame_tuples,
602 """Read a list of RMFs, supports parallel
603 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
604 @param structure_set_name Name this set of structures (e.g. "cluster.1")
608 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
609 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
610 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
612 rmf_frame_index=tup[1]
614 if self.residue_particle_index_map
is None:
617 setup_index_map=
False
624 if self.number_of_processes > 1:
627 self.comm.send(self.structures_dictionary, dest=0, tag=11)
629 for i
in range(1, self.number_of_processes):
630 data_tmp = self.comm.recv(source=i, tag=11)
631 for key
in self.structures_dictionary:
632 self.structures_dictionary[key].update(data_tmp[key])
633 for i
in range(1, self.number_of_processes):
634 self.comm.send(self.structures_dictionary, dest=i, tag=11)
636 self.structures_dictionary = self.comm.recv(source=0, tag=11)
638 def _get_residue_particle_index_map(self,prot_name,structure,hier):
639 residue_particle_index_map=[]
641 all_selected_particles=s.get_selected_particles()
642 intersection=list(set(all_selected_particles) & set(structure))
643 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
644 for p
in sorted_intersection:
646 return residue_particle_index_map
649 def _select_coordinates(self,tuple_selections,structure,prot):
650 selected_coordinates=[]
651 for t
in tuple_selections:
652 if type(t)==tuple
and len(t)==3:
654 all_selected_particles=s.get_selected_particles()
655 intersection=list(set(all_selected_particles) & set(structure))
656 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
657 cc=[tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
658 selected_coordinates+=cc
662 all_selected_particles=s.get_selected_particles()
663 intersection=list(set(all_selected_particles) & set(structure))
664 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
665 cc=[tuple(
IMP.core.XYZ(p).get_coordinates())
for p
in sorted_intersection]
666 selected_coordinates+=cc
668 raise ValueError(
"Selection error")
669 return selected_coordinates
671 def set_threshold(self,threshold):
672 self.threshold=threshold
674 def _get_distance(self,
680 """ Compute distance between structures with various metrics """
681 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
682 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
687 if self.style==
'pairwise_drmsd_k':
689 if self.style==
'pairwise_drms_k':
691 if self.style==
'pairwise_drmsd_Q':
694 if self.style==
'pairwise_rmsd':
698 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
699 selection_name,index1,index2):
701 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
702 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
707 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
716 selection_keywords=
None):
717 """ Evaluate the precision of two named structure groups. Supports MPI.
718 When the structure_set_name1 is different from the structure_set_name2,
719 this evaluates the cross-precision (average pairwise distances).
720 @param outfile Name of the precision output file
721 @param structure_set_name1 string name of the first structure set
722 @param structure_set_name2 string name of the second structure set
723 @param skip analyze every (skip) structure for the distance matrix calculation
724 @param selection_keywords Specify the selection name you want to calculate on.
725 By default this is computed for everything you provided in the constructor,
726 plus all the subunits together.
728 if selection_keywords
is None:
729 sel_keys=list(self.selection_dictionary.keys())
731 for k
in selection_keywords:
732 if k
not in self.selection_dictionary:
733 raise KeyError(
"you are trying to find named selection " \
734 + k +
" which was not requested in the constructor")
735 sel_keys=selection_keywords
737 if outfile
is not None:
740 for selection_name
in sel_keys:
741 number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
742 number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
745 structure_pointers_1=list(range(0,number_of_structures_1,skip))
746 structure_pointers_2=list(range(0,number_of_structures_2,skip))
748 pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
750 if len(pair_combination_list)==0:
751 raise ValueError(
"no structure selected. Check the skip parameter.")
753 my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(
754 pair_combination_list,self.number_of_processes)[self.rank]
755 my_length=len(my_pair_combination_list)
756 for n,pair
in enumerate(my_pair_combination_list):
758 progression=int(float(n)/my_length*100.0)
759 distances[pair]=self._get_distance(structure_set_name1,structure_set_name2,
760 selection_name,pair[0],pair[1])
762 if self.number_of_processes > 1:
765 if structure_set_name1==structure_set_name2:
766 structure_pointers=structure_pointers_1
767 number_of_structures=number_of_structures_1
773 distances_to_structure={}
774 distances_to_structure_normalization={}
776 for n
in structure_pointers:
777 distances_to_structure[n]=0.0
778 distances_to_structure_normalization[n]=0
781 distance+=distances[k]
782 distances_to_structure[k[0]]+=distances[k]
783 distances_to_structure[k[1]]+=distances[k]
784 distances_to_structure_normalization[k[0]]+=1
785 distances_to_structure_normalization[k[1]]+=1
787 for n
in structure_pointers:
788 distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
790 min_distance=min([distances_to_structure[n]
for n
in distances_to_structure])
791 centroid_index=[k
for k, v
in distances_to_structure.items()
if v == min_distance][0]
792 centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
794 centroid_distance=0.0
795 for n
in range(number_of_structures):
796 centroid_distance+=self._get_distance(structure_set_name1,structure_set_name1,
797 selection_name,centroid_index,n)
800 centroid_distance/=number_of_structures
802 if outfile
is not None:
803 of.write(str(selection_name)+
" "+structure_set_name1+
804 " average centroid distance "+str(centroid_distance)+
"\n")
805 of.write(str(selection_name)+
" "+structure_set_name1+
806 " centroid index "+str(centroid_index)+
"\n")
807 of.write(str(selection_name)+
" "+structure_set_name1+
808 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
810 average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
811 if outfile
is not None:
812 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
813 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
814 if outfile
is not None:
816 return centroid_index
822 set_plot_yaxis_range=
None):
823 """ Calculate the residue mean square fluctuations (RMSF).
824 Automatically outputs as data file and pdf
825 @param structure_set_name Which structure set to calculate RMSF for
826 @param outdir Where to write the files
827 @param skip Skip this number of structures
828 @param set_plot_yaxis_range In case you need to change the plot
837 for sel_name
in self.protein_names:
838 self.selection_dictionary.update({sel_name:[sel_name]})
840 number_of_structures=len(self.structures_dictionary[structure_set_name][sel_name])
844 rpim=self.residue_particle_index_map[sel_name]
845 outfile=outdir+
"/rmsf."+sel_name+
".dat"
849 for index
in range(number_of_structures):
850 distances=self._get_particle_distances(structure_set_name,
853 centroid_index,index)
854 for nblock,block
in enumerate(rpim):
855 for residue_number
in block:
856 residue_nblock[residue_number]=nblock
857 if residue_number
not in residue_distances:
858 residue_distances[residue_number]=[distances[nblock]]
860 residue_distances[residue_number].append(distances[nblock])
864 for rn
in residue_distances:
866 rmsf=np.std(residue_distances[rn])
868 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
870 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
871 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
872 set_plot_yaxis_range=set_plot_yaxis_range,
873 xlabel=
'Residue Number',ylabel=
'Standard error')
878 """Read in a structure used for reference computation.
879 Needed before calling get_average_distance_wrt_reference_structure()
880 @param rmf_name The RMF file to read the reference
881 @param rmf_frame_index The index in that file
883 (particles_resolution_one, prot)=self._get_structure(rmf_frame_index,rmf_name)
884 self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
887 for selection_name
in self.selection_dictionary:
888 selection_tuple=self.selection_dictionary[selection_name]
889 coords=self._select_coordinates(selection_tuple,
890 particles_resolution_one,prot)
891 self.reference_structures_dictionary[selection_name]=coords
895 """Compare the structure set to the reference structure.
896 @param structure_set_name The structure set to compute this on
897 @note First call set_reference_structure()
899 if self.reference_structures_dictionary=={}:
900 print(
"Cannot compute until you set a reference structure")
902 for selection_name
in self.selection_dictionary:
903 reference_coordinates=self.reference_structures_dictionary[selection_name]
907 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
909 if self.style==
'pairwise_drmsd_k':
911 if self.style==
'pairwise_drms_k':
913 if self.style==
'pairwise_drmsd_Q':
915 if self.style==
'pairwise_rmsd':
917 distances.append(distance)
919 print(selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances))
921 def get_coordinates(self):
924 def set_precision_style(self, style):
925 if style
in self.styles:
928 raise ValueError(
"No such style")
932 """Compute mean density maps from structures.
934 Keeps a dictionary of density maps,
935 keys are in the custom ranges. When you call add_subunits_density, it adds
936 particle coordinates to the existing density maps.
939 def __init__(self, custom_ranges, representation=None, voxel=5.0):
941 @param custom_ranges Required. It's a dictionary, keys are the
942 density component names, values are selection tuples
943 e.g. {'kin28':[['kin28',1,-1]],
944 'density_name_1' :[('ccl1')],
945 'density_name_2' :[(1,142,'tfb3d1'),
947 @param representation PMI representation, for doing selections.
948 Not needed if you only pass hierarchies
949 @param voxel The voxel size for the output map (lower is slower)
952 self.representation = representation
955 self.count_models = 0.0
956 self.custom_ranges = custom_ranges
959 """Add a frame to the densities.
960 @param hierarchy Optionally read the hierarchy from somewhere.
961 If not passed, will just read the representation.
963 self.count_models += 1.0
967 all_particles_by_resolution = []
968 for name
in part_dict:
969 all_particles_by_resolution += part_dict[name]
972 for density_name
in self.custom_ranges:
975 all_particles_by_segments = []
977 for seg
in self.custom_ranges[density_name]:
979 parts += IMP.tools.select_by_tuple(self.representation,
980 seg, resolution=1, name_is_ambiguous=
False)
984 elif type(seg) == tuple:
986 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
988 raise Exception(
'could not understand selection tuple '+str(seg))
990 all_particles_by_segments += s.get_selected_particles()
994 set(all_particles_by_segments) & set(all_particles_by_resolution))
996 self._create_density_from_particles(parts, density_name)
998 def normalize_density(self):
1001 def _create_density_from_particles(self, ps, name,
1003 kernel_type=
'GAUSSIAN'):
1004 '''Internal function for adding to densities.
1005 pass XYZR particles with mass and create a density from them.
1006 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1008 'GAUSSIAN': IMP.em.GAUSSIAN,
1009 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1010 'SPHERE': IMP.em.SPHERE}
1014 if name
not in self.densities:
1015 self.densities[name] = dmap
1022 dmap3.add(self.densities[name])
1023 self.densities[name] = dmap3
1025 def get_density_keys(self):
1026 return list(self.densities.keys())
1029 """Get the current density for some component name"""
1030 if name
not in self.densities:
1033 return self.densities[name]
1035 def write_mrc(self, path="./"):
1036 for density_name
in self.densities:
1037 self.densities[density_name].
multiply(1. / self.count_models)
1039 self.densities[density_name],
1040 path +
"/" + density_name +
".mrc",
1044 class GetContactMap(object):
1047 self.distance = distance
1048 self.contactmap =
''
1055 def set_prot(self, prot):
1056 from scipy.spatial.distance
import cdist
1065 for name
in particles_dictionary:
1066 residue_indexes = []
1067 for p
in particles_dictionary[name]:
1072 if len(residue_indexes) != 0:
1073 self.protnames.append(name)
1074 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1076 new_name = name +
":" + str(res)
1077 if name
not in self.resmap:
1078 self.resmap[name] = {}
1079 if res
not in self.resmap:
1080 self.resmap[name][res] = {}
1082 self.resmap[name][res] = new_name
1083 namelist.append(new_name)
1085 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1087 radii.append(d.get_radius())
1089 coords = np.array(coords)
1090 radii = np.array(radii)
1092 if len(self.namelist) == 0:
1093 self.namelist = namelist
1094 self.contactmap = np.zeros((len(coords), len(coords)))
1096 distances = cdist(coords, coords)
1097 distances = (distances - radii).T - radii
1098 distances = distances <= self.distance
1104 self.contactmap += distances
1106 def get_subunit_coords(self, frame, align=0):
1107 from scipy.spatial.distance
import cdist
1111 test, testr = [], []
1112 for part
in self.prot.get_children():
1115 for chl
in part.get_children():
1121 SortedSegments.append((chl, startres))
1122 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1124 for sgmnt
in SortedSegments:
1127 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1130 radii.append(p.get_radius())
1132 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1135 .get_residue_indexes()[0])
1136 namelist.append(new_name)
1137 self.expanded[new_name] = len(
1139 if part.get_name()
not in self.resmap:
1140 self.resmap[part.get_name()] = {}
1142 self.resmap[part.get_name()][res] = new_name
1144 coords = np.array(coords)
1145 radii = np.array(radii)
1146 if len(self.namelist) == 0:
1147 self.namelist = namelist
1148 self.contactmap = np.zeros((len(coords), len(coords)))
1149 distances = cdist(coords, coords)
1150 distances = (distances - radii).T - radii
1151 distances = distances <= self.distance
1152 self.contactmap += distances
1157 identification_string=
'ISDCrossLinkMS_Distance_'):
1160 data = open(filname)
1161 D = data.readlines()
1165 if identification_string
in d:
1172 t1, t2 = (d[0], d[1]), (d[1], d[0])
1173 if t1
not in self.XL:
1174 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1175 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1177 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1178 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1180 def dist_matrix(self, skip_cmap=0, skip_xl=1):
1184 L = sum(self.expanded.values())
1185 proteins = self.protnames
1190 proteins = [p.get_name()
for p
in self.prot.get_children()]
1192 for p1
in range(len(proteins)):
1193 for p2
in range(p1, len(proteins)):
1195 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1196 pn1, pn2 = proteins[p1], proteins[p2]
1197 mtr = np.zeros((pl1 + 1, pl2 + 1))
1198 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1199 for i1
in range(1, pl1 + 1):
1200 for i2
in range(1, pl2 + 1):
1202 r1 = K.index(self.resmap[pn1][i1])
1203 r2 = K.index(self.resmap[pn2][i2])
1205 mtr[i1 - 1, i2 - 1] = r
1207 missing.append((pn1, pn2, i1, i2))
1209 Matrices[(pn1, pn2)] = mtr
1214 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1217 for p1
in range(len(proteins)):
1218 for p2
in range(p1, len(proteins)):
1220 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1221 pn1, pn2 = proteins[p1], proteins[p2]
1222 mtr = np.zeros((pl1 + 1, pl2 + 1))
1225 xls = self.XL[(pn1, pn2)]
1228 xls = self.XL[(pn2, pn1)]
1233 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1234 for xl1, xl2
in xls:
1236 print(
'X' * 10, xl1, xl2)
1239 print(
'X' * 10, xl1, xl2)
1241 mtr[xl1 - 1, xl2 - 1] = 100
1243 print(
'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1244 for xl1, xl2
in xls:
1246 print(
'X' * 10, xl1, xl2)
1249 print(
'X' * 10, xl1, xl2)
1251 mtr[xl2 - 1, xl1 - 1] = 100
1253 raise RuntimeError(
'WTF!')
1254 Matrices_xl[(pn1, pn2)] = mtr
1270 for i, c
in enumerate(C):
1271 cl = max(self.resmap[c].keys())
1276 R.append(R[-1] + cl)
1281 import matplotlib
as mpl
1283 import matplotlib.pyplot
as plt
1284 import matplotlib.gridspec
as gridspec
1285 import scipy.sparse
as sparse
1288 gs = gridspec.GridSpec(len(W), len(W),
1293 for x1, r1
in enumerate(R):
1298 for x2, r2
in enumerate(R):
1304 ax = plt.subplot(gs[cnt])
1307 mtr = Matrices[(C[x1], C[x2])]
1309 mtr = Matrices[(C[x2], C[x1])].T
1313 interpolation=
'nearest',
1315 vmax=log(NewM.max()))
1320 mtr = Matrices_xl[(C[x1], C[x2])]
1322 mtr = Matrices_xl[(C[x2], C[x1])].T
1324 sparse.csr_matrix(mtr),
1334 ax.set_ylabel(C[x1], rotation=90)
1341 def get_hiers_from_rmf(model, frame_number, rmf_file):
1343 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1346 rh = RMF.open_rmf_file_read_only(rmf_file)
1351 print(
"Unable to open rmf file %s" % (rmf_file))
1355 prot = prots[state_number]
1359 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1366 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1367 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1368 rh = RMF.open_rmf_file_read_only(rmf_file)
1374 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1376 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1379 rh = RMF.open_rmf_file_read_only(rmf_file)
1385 print(
"Unable to open rmf file %s" % (rmf_file))
1392 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1400 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1401 print(
"linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1402 rh = RMF.open_rmf_file_read_only(rmf_file)
1409 def get_hiers_from_rmf(model, frame_number, rmf_file):
1410 print(
"getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1413 rh = RMF.open_rmf_file_read_only(rmf_file)
1418 print(
"Unable to open rmf file %s" % (rmf_file))
1425 print(
"Unable to open frame %i of file %s" % (frame_number, rmf_file))
1434 Get particles at res 1, or any beads, based on the name.
1435 No Representation is needed. This is mainly used when the hierarchy
1436 is read from an RMF file.
1437 @return a dictionary of component names and their particles
1441 for c
in prot.get_children():
1444 for s
in c.get_children():
1445 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1447 if "Beads" in s.get_name():
1451 for name
in particle_dict:
1452 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1453 list(set(particle_dict[name]) & set(allparticles)))
1454 return particle_dict
1458 Get particles at res 10, or any beads, based on the name.
1459 No Representation is needed.
1460 This is mainly used when the hierarchy is read from an RMF file.
1461 @return a dictionary of component names and their particles
1465 for c
in prot.get_children():
1468 for s
in c.get_children():
1469 if "_Res:10" in s.get_name():
1471 if "Beads" in s.get_name():
1474 for name
in particle_dict:
1475 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1476 list(set(particle_dict[name]) & set(allparticles)))
1477 return particle_dict
1481 def select_by_tuple(first_res_last_res_name_tuple):
1482 first_res = first_res_last_res_hier_tuple[0]
1483 last_res = first_res_last_res_hier_tuple[1]
1484 name = first_res_last_res_hier_tuple[2]
1487 """Visualization of crosslinks"""
1489 self.crosslinks = []
1490 self.external_csv_data =
None
1491 self.crosslinkedprots = set()
1492 self.mindist = +10000000.0
1493 self.maxdist = -10000000.0
1494 self.contactmap =
None
1496 def set_hierarchy(self, prot):
1497 self.prot_length_dict = {}
1498 self.model=prot.get_model()
1500 for i
in prot.get_children():
1502 residue_indexes = []
1506 if len(residue_indexes) != 0:
1507 self.prot_length_dict[name] = max(residue_indexes)
1509 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1510 from scipy.spatial.distance
import cdist
1512 rh= RMF.open_rmf_file_read_only(rmf_name)
1515 print(
"getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1526 self.index_dictionary = {}
1528 for name
in particles_dictionary:
1529 residue_indexes = []
1530 for p
in particles_dictionary[name]:
1535 if len(residue_indexes) != 0:
1537 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1540 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1542 radii.append(d.get_radius())
1543 if name
not in self.index_dictionary:
1544 self.index_dictionary[name] = [resindex]
1546 self.index_dictionary[name].append(resindex)
1549 coords = np.array(coords)
1550 radii = np.array(radii)
1552 distances = cdist(coords, coords)
1553 distances = (distances - radii).T - radii
1555 distances = np.where(distances <= 20.0, 1.0, 0)
1556 if self.contactmap
is None:
1557 self.contactmap = np.zeros((len(coords), len(coords)))
1558 self.contactmap += distances
1563 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1566 filter_rmf_file_names=
None,
1567 external_csv_data_file=
None,
1568 external_csv_data_file_unique_id_key=
"Unique ID"):
1577 keys = po.get_keys()
1579 xl_keys = [k
for k
in keys
if search_label
in k]
1581 if filter_rmf_file_names
is not None:
1582 rmf_file_key=
"local_rmf_file_name"
1583 fs = po.get_fields(xl_keys+[rmf_file_key])
1585 fs = po.get_fields(xl_keys)
1588 self.cross_link_frequency = {}
1592 self.cross_link_distances = {}
1596 self.cross_link_distances_unique = {}
1598 if not external_csv_data_file
is None:
1601 self.external_csv_data = {}
1602 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1605 self.external_csv_data[
1606 xl[external_csv_data_file_unique_id_key]] = xl
1611 cross_link_frequency_list = []
1613 self.unique_cross_link_list = []
1617 keysplit = key.replace(
1626 if filter_label!=
None:
1627 if filter_label
not in keysplit:
continue
1630 r1 = int(keysplit[5])
1632 r2 = int(keysplit[7])
1635 confidence = keysplit[12]
1639 unique_identifier = keysplit[3]
1641 unique_identifier =
'0'
1643 r1 = int(keysplit[mapping[
"Residue1"]])
1644 c1 = keysplit[mapping[
"Protein1"]]
1645 r2 = int(keysplit[mapping[
"Residue2"]])
1646 c2 = keysplit[mapping[
"Protein2"]]
1648 confidence = keysplit[mapping[
"Confidence"]]
1652 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1654 unique_identifier =
'0'
1656 self.crosslinkedprots.add(c1)
1657 self.crosslinkedprots.add(c2)
1663 if filter_rmf_file_names
is not None:
1664 for n,d
in enumerate(fs[key]):
1665 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1666 dists.append(float(d))
1668 dists=[float(f)
for f
in fs[key]]
1673 mdist = self.median(dists)
1675 stdv = np.std(np.array(dists))
1676 if self.mindist > mdist:
1677 self.mindist = mdist
1678 if self.maxdist < mdist:
1679 self.maxdist = mdist
1683 if not self.external_csv_data
is None:
1684 sample = self.external_csv_data[unique_identifier][
"Sample"]
1688 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1689 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1690 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1691 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1693 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1694 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1695 cross_link_frequency_list.append((r1, c1, r2, c2))
1696 cross_link_frequency_list.append((r2, c2, r1, c1))
1697 self.unique_cross_link_list.append(
1698 (r1, c1, r2, c2,mdist))
1700 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1701 self.cross_link_distances[(
1707 confidence)] = dists
1708 self.cross_link_distances[(
1714 confidence)] = dists
1715 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1717 self.cross_link_distances[(
1723 confidence)] += dists
1724 self.cross_link_distances[(
1730 confidence)] += dists
1732 self.crosslinks.append(
1742 self.crosslinks.append(
1753 self.cross_link_frequency_inverted = {}
1754 for xl
in self.unique_cross_link_list:
1755 (r1, c1, r2, c2, mdist) = xl
1756 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1757 if frequency
not in self.cross_link_frequency_inverted:
1758 self.cross_link_frequency_inverted[
1759 frequency] = [(r1, c1, r2, c2)]
1761 self.cross_link_frequency_inverted[
1762 frequency].append((r1, c1, r2, c2))
1766 def median(self, mylist):
1767 sorts = sorted(mylist)
1773 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1774 return sorts[length / 2]
1776 def set_threshold(self,threshold):
1777 self.threshold=threshold
1779 def set_tolerance(self,tolerance):
1780 self.tolerance=tolerance
1782 def colormap(self, dist):
1783 if dist < self.threshold - self.tolerance:
1785 elif dist >= self.threshold + self.tolerance:
1790 def write_cross_link_database(self, filename, format='csv'):
1794 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1795 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1797 if not self.external_csv_data
is None:
1798 keys = list(self.external_csv_data.keys())
1799 innerkeys = list(self.external_csv_data[keys[0]].keys())
1801 fieldnames += innerkeys
1803 dw = csv.DictWriter(
1807 fieldnames=fieldnames)
1809 for xl
in self.crosslinks:
1810 (r1, c1, r2, c2, mdist, stdv, confidence,
1811 unique_identifier, descriptor) = xl
1812 if descriptor ==
'original':
1814 outdict[
"Unique ID"] = unique_identifier
1815 outdict[
"Protein1"] = c1
1816 outdict[
"Protein2"] = c2
1817 outdict[
"Residue1"] = r1
1818 outdict[
"Residue2"] = r2
1819 outdict[
"Median Distance"] = mdist
1820 outdict[
"Standard Deviation"] = stdv
1821 outdict[
"Confidence"] = confidence
1822 outdict[
"Frequency"] = self.cross_link_frequency[
1825 arrangement =
"Intra"
1827 arrangement =
"Inter"
1828 outdict[
"Arrangement"] = arrangement
1829 if not self.external_csv_data
is None:
1830 outdict.update(self.external_csv_data[unique_identifier])
1832 dw.writerow(outdict)
1834 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1835 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
1836 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
1837 gap_between_components=0,
1838 rename_protein_map=
None):
1852 import matplotlib.pyplot
as plt
1853 import matplotlib.cm
as cm
1855 fig = plt.figure(figsize=(10, 10))
1856 ax = fig.add_subplot(111)
1862 if prot_listx
is None:
1864 prot_listx = list(self.crosslinkedprots)
1866 prot_listx = list(self.prot_length_dict.keys())
1869 nresx = gap_between_components + \
1870 sum([self.prot_length_dict[name]
1871 + gap_between_components
for name
in prot_listx])
1875 if prot_listy
is None:
1877 prot_listy = list(self.crosslinkedprots)
1879 prot_listy = list(self.prot_length_dict.keys())
1882 nresy = gap_between_components + \
1883 sum([self.prot_length_dict[name]
1884 + gap_between_components
for name
in prot_listy])
1889 res = gap_between_components
1890 for prot
in prot_listx:
1891 resoffsetx[prot] = res
1892 res += self.prot_length_dict[prot]
1894 res += gap_between_components
1898 res = gap_between_components
1899 for prot
in prot_listy:
1900 resoffsety[prot] = res
1901 res += self.prot_length_dict[prot]
1903 res += gap_between_components
1905 resoffsetdiagonal = {}
1906 res = gap_between_components
1907 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1908 resoffsetdiagonal[prot] = res
1909 res += self.prot_length_dict[prot]
1910 res += gap_between_components
1916 for n, prot
in enumerate(prot_listx):
1917 res = resoffsetx[prot]
1919 for proty
in prot_listy:
1920 resy = resoffsety[proty]
1921 endy = resendy[proty]
1922 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
1923 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
1924 xticks.append((float(res) + float(end)) / 2)
1925 if rename_protein_map
is not None:
1926 if prot
in rename_protein_map:
1927 prot=rename_protein_map[prot]
1928 xlabels.append(prot)
1932 for n, prot
in enumerate(prot_listy):
1933 res = resoffsety[prot]
1935 for protx
in prot_listx:
1936 resx = resoffsetx[protx]
1937 endx = resendx[protx]
1938 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
1939 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
1940 yticks.append((float(res) + float(end)) / 2)
1941 if rename_protein_map
is not None:
1942 if prot
in rename_protein_map:
1943 prot=rename_protein_map[prot]
1944 ylabels.append(prot)
1947 print(prot_listx, prot_listy)
1949 if not self.contactmap
is None:
1950 import matplotlib.cm
as cm
1951 tmp_array = np.zeros((nresx, nresy))
1953 for px
in prot_listx:
1955 for py
in prot_listy:
1957 resx = resoffsety[px]
1958 lengx = resendx[px] - 1
1959 resy = resoffsety[py]
1960 lengy = resendy[py] - 1
1961 indexes_x = self.index_dictionary[px]
1962 minx = min(indexes_x)
1963 maxx = max(indexes_x)
1964 indexes_y = self.index_dictionary[py]
1965 miny = min(indexes_y)
1966 maxy = max(indexes_y)
1968 print(px, py, minx, maxx, miny, maxy)
1973 resy:lengy] = self.contactmap[
1980 ax.imshow(tmp_array,
1983 interpolation=
'nearest')
1985 ax.set_xticks(xticks)
1986 ax.set_xticklabels(xlabels, rotation=90)
1987 ax.set_yticks(yticks)
1988 ax.set_yticklabels(ylabels)
1989 ax.set_xlim(0,nresx)
1990 ax.set_ylim(0,nresy)
1995 already_added_xls = []
1997 for xl
in self.crosslinks:
1999 (r1, c1, r2, c2, mdist, stdv, confidence,
2000 unique_identifier, descriptor) = xl
2002 if confidence_classes
is not None:
2003 if confidence
not in confidence_classes:
2007 pos1 = r1 + resoffsetx[c1]
2011 pos2 = r2 + resoffsety[c2]
2015 if not filter
is None:
2016 xldb = self.external_csv_data[unique_identifier]
2017 xldb.update({
"Distance": mdist})
2018 xldb.update({
"Distance_stdv": stdv})
2020 if filter[1] ==
">":
2021 if float(xldb[filter[0]]) <= float(filter[2]):
2024 if filter[1] ==
"<":
2025 if float(xldb[filter[0]]) >= float(filter[2]):
2028 if filter[1] ==
"==":
2029 if float(xldb[filter[0]]) != float(filter[2]):
2035 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2036 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2038 if layout ==
'lowerdiagonal':
2039 if pos_for_diagonal1 <= pos_for_diagonal2:
2041 if layout ==
'upperdiagonal':
2042 if pos_for_diagonal1 >= pos_for_diagonal2:
2045 already_added_xls.append((r1, c1, r2, c2))
2047 if not no_confidence_info:
2048 if confidence ==
'0.01':
2049 markersize = 14 * scale_symbol_size
2050 elif confidence ==
'0.05':
2051 markersize = 9 * scale_symbol_size
2052 elif confidence ==
'0.1':
2053 markersize = 6 * scale_symbol_size
2055 markersize = 15 * scale_symbol_size
2057 markersize = 5 * scale_symbol_size
2059 if not no_dist_info:
2060 color = self.colormap(mdist)
2070 markersize=markersize)
2074 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2076 [i.set_linewidth(2.0)
for i
in ax.spines.values()]
2081 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2085 def get_frequency_statistics(self, prot_list,
2088 violated_histogram = {}
2089 satisfied_histogram = {}
2090 unique_cross_links = []
2092 for xl
in self.unique_cross_link_list:
2093 (r1, c1, r2, c2, mdist) = xl
2096 if prot_list2
is None:
2097 if not c1
in prot_list:
2099 if not c2
in prot_list:
2102 if c1
in prot_list
and c2
in prot_list2:
2104 elif c1
in prot_list2
and c2
in prot_list:
2109 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2111 if (r1, c1, r2, c2)
not in unique_cross_links:
2113 if frequency
not in violated_histogram:
2114 violated_histogram[frequency] = 1
2116 violated_histogram[frequency] += 1
2118 if frequency
not in satisfied_histogram:
2119 satisfied_histogram[frequency] = 1
2121 satisfied_histogram[frequency] += 1
2122 unique_cross_links.append((r1, c1, r2, c2))
2123 unique_cross_links.append((r2, c2, r1, c1))
2125 print(
"# satisfied")
2127 total_number_of_crosslinks=0
2129 for i
in satisfied_histogram:
2133 if i
in violated_histogram:
2134 print(i, violated_histogram[i]+satisfied_histogram[i])
2136 print(i, satisfied_histogram[i])
2137 total_number_of_crosslinks+=i*satisfied_histogram[i]
2141 for i
in violated_histogram:
2142 print(i, violated_histogram[i])
2143 total_number_of_crosslinks+=i*violated_histogram[i]
2145 print(total_number_of_crosslinks)
2149 def print_cross_link_binary_symbols(self, prot_list,
2152 confidence_list = []
2153 for xl
in self.crosslinks:
2154 (r1, c1, r2, c2, mdist, stdv, confidence,
2155 unique_identifier, descriptor) = xl
2157 if prot_list2
is None:
2158 if not c1
in prot_list:
2160 if not c2
in prot_list:
2163 if c1
in prot_list
and c2
in prot_list2:
2165 elif c1
in prot_list2
and c2
in prot_list:
2170 if descriptor !=
"original":
2173 confidence_list.append(confidence)
2175 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2176 tmp_dist_binary = []
2179 tmp_dist_binary.append(1)
2181 tmp_dist_binary.append(0)
2182 tmp_matrix.append(tmp_dist_binary)
2184 matrix = list(zip(*tmp_matrix))
2186 satisfied_high_sum = 0
2187 satisfied_mid_sum = 0
2188 satisfied_low_sum = 0
2189 total_satisfied_sum = 0
2190 for k, m
in enumerate(matrix):
2199 for n, b
in enumerate(m):
2200 if confidence_list[n] ==
"0.01":
2204 satisfied_high_sum += 1
2205 elif confidence_list[n] ==
"0.05":
2209 satisfied_mid_sum += 1
2210 elif confidence_list[n] ==
"0.1":
2214 satisfied_low_sum += 1
2216 total_satisfied += 1
2217 total_satisfied_sum += 1
2219 print(k, satisfied_high, total_high)
2220 print(k, satisfied_mid, total_mid)
2221 print(k, satisfied_low, total_low)
2222 print(k, total_satisfied, total)
2223 print(float(satisfied_high_sum) / len(matrix))
2224 print(float(satisfied_mid_sum) / len(matrix))
2225 print(float(satisfied_low_sum) / len(matrix))
2228 def get_unique_crosslinks_statistics(self, prot_list,
2241 satisfied_string = []
2242 for xl
in self.crosslinks:
2243 (r1, c1, r2, c2, mdist, stdv, confidence,
2244 unique_identifier, descriptor) = xl
2246 if prot_list2
is None:
2247 if not c1
in prot_list:
2249 if not c2
in prot_list:
2252 if c1
in prot_list
and c2
in prot_list2:
2254 elif c1
in prot_list2
and c2
in prot_list:
2259 if descriptor !=
"original":
2263 if confidence ==
"0.01":
2267 if confidence ==
"0.05":
2271 if confidence ==
"0.1":
2276 satisfied_string.append(1)
2278 satisfied_string.append(0)
2280 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2281 tmp_dist_binary = []
2284 tmp_dist_binary.append(1)
2286 tmp_dist_binary.append(0)
2287 tmp_matrix.append(tmp_dist_binary)
2289 print(
"unique satisfied_high/total_high", satisfied_high,
"/", total_high)
2290 print(
"unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid)
2291 print(
"unique satisfied_low/total_low", satisfied_low,
"/", total_low)
2292 print(
"total", total)
2294 matrix = list(zip(*tmp_matrix))
2295 satisfied_models = 0
2297 for b
in satisfied_string:
2304 all_satisfied =
True
2306 for n, b
in enumerate(m):
2311 if b == 1
and satisfied_string[n] == 1:
2313 elif b == 1
and satisfied_string[n] == 0:
2315 elif b == 0
and satisfied_string[n] == 0:
2317 elif b == 0
and satisfied_string[n] == 1:
2318 all_satisfied =
False
2320 satisfied_models += 1
2322 print(satstr, all_satisfied)
2323 print(
"models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2325 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2331 for kw
in self.cross_link_distances_unique:
2332 (r1, c1, r2, c2) = kw
2333 dists = self.cross_link_distances_unique[kw]
2335 if prot_list2
is None:
2336 if not c1
in prot_list:
2338 if not c2
in prot_list:
2341 if c1
in prot_list
and c2
in prot_list2:
2343 elif c1
in prot_list2
and c2
in prot_list:
2348 dists.append(sum(dists))
2349 tmp_matrix.append(dists)
2351 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2354 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2356 for i
in range(len(tmp_matrix)):
2357 for k
in range(len(tmp_matrix[i]) - 1):
2358 matrix[i][k] = tmp_matrix[i][k]
2363 ax = fig.add_subplot(211)
2365 cax = ax.imshow(matrix, interpolation=
'nearest')
2369 pl.savefig(figurename, dpi=300)
2378 arrangement=
"inter",
2379 confidence_input=
"None"):
2382 for xl
in self.cross_link_distances:
2383 (r1, c1, r2, c2, mdist, confidence) = xl
2384 if c1
in prots1
and c2
in prots2:
2385 if arrangement ==
"inter" and c1 == c2:
2387 if arrangement ==
"intra" and c1 != c2:
2389 if confidence_input == confidence:
2390 label = str(c1) +
":" + str(r1) + \
2391 "-" + str(c2) +
":" + str(r2)
2392 values = self.cross_link_distances[xl]
2393 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2394 data.append((label, values, mdist, frequency))
2396 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2397 sort_by_dist = list(zip(*sort_by_dist))
2398 values = sort_by_dist[1]
2399 positions = list(range(len(values)))
2400 labels = sort_by_dist[0]
2401 frequencies = list(map(float, sort_by_dist[3]))
2402 frequencies = [f * 10.0
for f
in frequencies]
2404 nchunks = int(float(len(values)) / nxl_per_row)
2405 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2406 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2409 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2412 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2414 for n, v
in enumerate(values_chunks):
2415 p = positions_chunks[n]
2416 f = frequencies_chunks[n]
2417 l = labels_chunks[n]
2419 filename +
"." + str(n), v, p, f,
2420 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2422 def crosslink_distance_histogram(self, filename,
2425 confidence_classes=
None,
2431 if prot_list
is None:
2432 prot_list = list(self.prot_length_dict.keys())
2435 for xl
in self.crosslinks:
2436 (r1, c1, r2, c2, mdist, stdv, confidence,
2437 unique_identifier, descriptor) = xl
2439 if not confidence_classes
is None:
2440 if confidence
not in confidence_classes:
2443 if prot_list2
is None:
2444 if not c1
in prot_list:
2446 if not c2
in prot_list:
2449 if c1
in prot_list
and c2
in prot_list2:
2451 elif c1
in prot_list2
and c2
in prot_list:
2456 distances.append(mdist)
2459 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2460 bins=bins, color=color,
2462 reference_xline=35.0,
2463 yplotrange=yplotrange, normalized=normalized)
2465 def scatter_plot_xl_features(self, filename,
2471 reference_ylines=
None,
2472 distance_color=
True,
2474 import matplotlib.pyplot
as plt
2475 import matplotlib.cm
as cm
2477 fig = plt.figure(figsize=(10, 10))
2478 ax = fig.add_subplot(111)
2480 for xl
in self.crosslinks:
2481 (r1, c1, r2, c2, mdist, stdv, confidence,
2482 unique_identifier, arrangement) = xl
2484 if prot_list2
is None:
2485 if not c1
in prot_list:
2487 if not c2
in prot_list:
2490 if c1
in prot_list
and c2
in prot_list2:
2492 elif c1
in prot_list2
and c2
in prot_list:
2497 xldb = self.external_csv_data[unique_identifier]
2498 xldb.update({
"Distance": mdist})
2499 xldb.update({
"Distance_stdv": stdv})
2501 xvalue = float(xldb[feature1])
2502 yvalue = float(xldb[feature2])
2505 color = self.colormap(mdist)
2509 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2511 if not yplotrange
is None:
2512 ax.set_ylim(yplotrange)
2513 if not reference_ylines
is None:
2514 for rl
in reference_ylines:
2515 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2518 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
Visualization of crosslinks.
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
Class for sampling a density map from particles.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
def get_precision
Evaluate the precision of two named structure groups.
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.