3 """@namespace IMP.pmi.analysis
4 Tools for clustering and cluster analysis
15 from operator
import itemgetter
16 from copy
import deepcopy
17 from math
import log,sqrt
23 """Performs alignment and RMSD calculation for two sets of coordinates
24 @param query {'p1':coords(L,3), 'p2':coords(L,3)}
25 @param template {'p1':coords(L,3), 'p2':coords(L,3)}
27 The class also takes into accout non-equal stoichiometry of the proteins. If this
28 is the case, the protein names of protein in multiple copies should be delivered
29 in the following form: nameA..1, nameA..2 (note two dots).
32 def __init__(self, template, query, weights=None):
35 self.template = template
38 if len(self.query.keys()) != len(self.template.keys()):
39 raise ValueError(
'''the number of proteins
40 in template and query does not match!''')
44 self.proteins = sorted(self.query.keys())
45 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
48 np = prots_uniq.count(p)
49 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
50 prmts = list(itertools.permutations(copies, len(copies)))
53 self.Product = list(itertools.product(*P.values()))
61 torder = sum([list(i)
for i
in self.Product[0]], [])
64 if self.weights
is not None:
65 weights += [i
for i
in self.weights[t]]
68 self.rmsd = 10000000000.
69 for comb
in self.Product:
73 order = sum([list(i)
for i
in comb], [])
83 if self.weights
is not None:
94 from scipy.spatial.distance
import cdist
99 torder = sum([list(i)
for i
in self.Product[0]], [])
104 self.rmsd, Transformation = 10000000000.,
''
105 for comb
in self.Product:
106 order = sum([list(i)
for i
in comb], [])
112 if len(template_xyz) != len(query_xyz):
113 raise ValueError(
'''the number of coordinates
114 in template and query does not match!''')
119 query_xyz_tr = [transformation.get_transformed(n)
123 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
126 Transformation = transformation
128 return (self.rmsd, Transformation)
133 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
134 'a..2':np.array([np.array([1.,1.,])]),
135 'a..3':np.array([np.array([-2.,1.])]),
136 'b':np.array([np.array([0.,-1.])]),
137 'c..1':np.array([np.array([-1.,-1.])]),
138 'c..2':np.array([np.array([1.,-1.])]),
139 'd':np.array([np.array([0.,0.])]),
140 'e':np.array([np.array([0.,1.])])}
142 Ali = Alignment(Proteins, Proteins)
144 if Ali.get_rmsd() == 0.0: print 'successful test!'
145 else: print 'ERROR!'; exit()
150 class Violations(object):
152 def __init__(self, filename):
154 self.violation_thresholds = {}
155 self.violation_counts = {}
157 data = open(filename)
162 d = d.strip().split()
163 self.violation_thresholds[d[0]] = float(d[1])
165 def get_number_violated_restraints(self, rsts_dict):
167 for rst
in self.violation_thresholds:
168 if rst
not in rsts_dict:
170 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
172 if rst
not in self.violation_counts:
173 self.violation_counts[rst] = 1
175 self.violation_counts[rst] += 1
181 """A class to cluster structures.
182 Uses scipy's cdist function to compute distance matrices
183 And sklearn's kmeans clustering module.
184 @param rmsd_weights Flat list of weights for each particle (if they're coarse)
186 def __init__(self,rmsd_weights=None):
188 from mpi4py
import MPI
189 self.comm = MPI.COMM_WORLD
190 self.rank = self.comm.Get_rank()
191 self.number_of_processes = self.comm.size
193 self.number_of_processes = 1
196 self.structure_cluster_ids =
None
197 self.tmpl_coords =
None
198 self.rmsd_weights=rmsd_weights
200 def set_template(self, part_coords):
202 self.tmpl_coords = part_coords
206 fill stores coordinates of a model into a dictionary all_coords,
207 containing coordinates for all models.
210 self.all_coords[frame] = Coords
212 def dist_matrix(self):
214 self.model_list_names = self.all_coords.keys()
215 self.model_indexes = range(len(self.model_list_names))
216 self.model_indexes_dict = dict(
217 zip(self.model_list_names, self.model_indexes))
218 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
220 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
221 model_indexes_unique_pairs,
222 self.number_of_processes)[self.rank]
224 print "process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs)))
226 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
228 my_model_indexes_unique_pairs)
230 if self.number_of_processes > 1:
233 pickable_transformations = self.get_pickable_transformation_distance_dict(
236 pickable_transformations)
237 self.set_transformation_distance_dict_from_pickable(
238 pickable_transformations)
240 self.raw_distance_matrix = np.zeros(
241 (len(self.model_list_names), len(self.model_list_names)))
242 for item
in raw_distance_dict:
244 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
245 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
247 def get_dist_matrix(self):
248 return self.raw_distance_matrix
251 """Run K-means clustering
252 @param number_of_clusters Num means
253 @param seed the random seed
255 from sklearn.cluster
import KMeans
260 kmeans = KMeans(n_clusters=number_of_clusters)
263 kmeans = KMeans(k=number_of_clusters)
264 kmeans.fit_predict(self.raw_distance_matrix)
266 self.structure_cluster_ids = kmeans.labels_
268 def get_pickable_transformation_distance_dict(self):
269 pickable_transformations = {}
270 for label
in self.transformation_distance_dict:
271 tr = self.transformation_distance_dict[label]
272 trans = tuple(tr.get_translation())
273 rot = tuple(tr.get_rotation().get_quaternion())
274 pickable_transformations[label] = (rot, trans)
275 return pickable_transformations
277 def set_transformation_distance_dict_from_pickable(
279 pickable_transformations):
280 self.transformation_distance_dict = {}
281 for label
in pickable_transformations:
282 tr = pickable_transformations[label]
285 self.transformation_distance_dict[
288 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
290 outf = open(file_name +
".data",
'w')
296 pickable_transformations = self.get_pickable_transformation_distance_dict(
299 (self.structure_cluster_ids,
300 self.model_list_names,
301 pickable_transformations),
304 np.save(file_name +
".npy", self.raw_distance_matrix)
306 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
309 inputf = open(file_name +
".data",
'r')
310 (self.structure_cluster_ids, self.model_list_names,
311 pickable_transformations) = pickle.load(inputf)
314 self.raw_distance_matrix = np.load(file_name + ".npy")
316 self.set_transformation_distance_dict_from_pickable(
317 pickable_transformations)
318 self.model_indexes = range(len(self.model_list_names))
319 self.model_indexes_dict = dict(
320 zip(self.model_list_names, self.model_indexes))
322 def plot_matrix(self, figurename="clustermatrix.pdf"):
324 from scipy.cluster
import hierarchy
as hrc
326 fig = pl.figure(figsize=(10,8))
327 ax = fig.add_subplot(212)
328 dendrogram = hrc.dendrogram(
329 hrc.linkage(self.raw_distance_matrix),
332 leaves_order = dendrogram[
'leaves']
333 ax.set_xlabel(
'Model')
334 ax.set_ylabel(
'RMSD [Angstroms]')
336 ax2 = fig.add_subplot(221)
338 self.raw_distance_matrix[leaves_order,
341 interpolation=
'nearest')
342 cb = fig.colorbar(cax)
343 cb.set_label(
'RMSD [Angstroms]')
344 ax2.set_xlabel(
'Model')
345 ax2.set_ylabel(
'Model')
347 pl.savefig(figurename, dpi=300)
350 def get_model_index_from_name(self, name):
351 return self.model_indexes_dict[name]
353 def get_cluster_labels(self):
355 return list(set(self.structure_cluster_ids))
357 def get_number_of_clusters(self):
358 return len(self.get_cluster_labels())
360 def get_cluster_label_indexes(self, label):
362 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
365 def get_cluster_label_names(self, label):
367 [self.model_list_names[i]
368 for i
in self.get_cluster_label_indexes(label)]
371 def get_cluster_label_average_rmsd(self, label):
373 indexes = self.get_cluster_label_indexes(label)
376 sub_distance_matrix = self.raw_distance_matrix[
377 indexes, :][:, indexes]
378 average_rmsd = np.sum(sub_distance_matrix) / \
379 (len(sub_distance_matrix)
380 ** 2 - len(sub_distance_matrix))
385 def get_cluster_label_size(self, label):
386 return len(self.get_cluster_label_indexes(label))
388 def get_transformation_to_first_member(
392 reference = self.get_cluster_label_indexes(cluster_label)[0]
393 return self.transformation_distance_dict[(reference, structure_index)]
395 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
397 model_list_names = all_coords.keys()
398 rmsd_protein_names = all_coords[model_list_names[0]].keys()
399 raw_distance_dict = {}
400 transformation_distance_dict = {}
401 if template_coords
is None:
405 alignment_template_protein_names = template_coords.keys()
407 for (f1, f2)
in list_of_pairs:
415 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
416 for pr
in rmsd_protein_names])
418 for pr
in rmsd_protein_names:
419 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
421 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
422 rmsd = Ali.get_rmsd()
428 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
429 for pr
in alignment_template_protein_names])
430 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
431 for pr
in alignment_template_protein_names])
434 template_rmsd, transformation = Ali.align()
439 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
440 for pr
in rmsd_protein_names])
442 for pr
in rmsd_protein_names:
443 coords_f2[pr] = [transformation.get_transformed(
444 i)
for i
in all_coords[model_list_names[f2]][pr]]
446 Ali =
Alignment(coords_f1, coords_f2, self.rmsd_weights)
447 rmsd = Ali.get_rmsd()
449 raw_distance_dict[(f1, f2)] = rmsd
450 raw_distance_dict[(f2, f1)] = rmsd
451 transformation_distance_dict[(f1, f2)] = transformation
452 transformation_distance_dict[(f2, f1)] = transformation
454 return raw_distance_dict, transformation_distance_dict
458 """ A class to evaluate the precision of an ensemble.
459 Also can evaluate the cross-precision of multiple ensembles.
460 Supports MPI for coordinate reading.
461 Recommended procedure:
462 -# initialize object and pass the selection for evaluating precision
463 -# call add_structures() to read in the data (specify group name)"
464 -# call get_precision() to evaluate inter/intra precision
465 -# call get_rmsf() to evaluate within-group fluctuations
466 @param model The IMP Model
467 @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is part of the hier name)
468 @param selection_dictionary Dictionary where keys are names for selections
469 and values are selection tuples for scoring precision.
470 "All" is automatically made as well
472 def __init__(self,model,
474 selection_dictionary={}):
476 from mpi4py
import MPI
477 self.comm = MPI.COMM_WORLD
478 self.rank = self.comm.Get_rank()
479 self.number_of_processes = self.comm.size
481 self.number_of_processes=1
484 self.styles=[
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
485 'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
486 self.style=
'pairwise_drmsd_k'
487 self.structures_dictionary={}
488 self.reference_structures_dictionary={}
490 self.protein_names=
None
491 self.len_particles_resolution_one=
None
493 self.rmf_names_frames={}
494 self.reference_rmf_names_frames=
None
495 self.reference_structure=
None
496 self.reference_prot=
None
497 self.selection_dictionary=selection_dictionary
499 self.residue_particle_index_map=
None
500 if resolution
in [1,10]:
501 self.resolution=resolution
503 raise KeyError(
"no such resolution")
505 def _get_structure(self,rmf_frame_index,rmf_name):
506 """Read an RMF file and return the particles"""
507 rh= RMF.open_rmf_file_read_only(rmf_name)
510 print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
513 if self.resolution==1:
515 elif self.resolution==10:
518 protein_names=particle_dict.keys()
519 particles_resolution_one=[]
520 for k
in particle_dict:
521 particles_resolution_one+=(particle_dict[k])
523 if self.protein_names==
None:
524 self.protein_names=protein_names
526 if self.protein_names!=protein_names:
527 print "Error: the protein names of the new coordinate set is not compatible with the previous one"
529 if self.len_particles_resolution_one==
None:
530 self.len_particles_resolution_one=len(particles_resolution_one)
532 if self.len_particles_resolution_one!=len(particles_resolution_one):
533 raise ValueError(
"the new coordinate set is not compatible with the previous one")
535 return particles_resolution_one,prots
541 setup_index_map=
False):
542 """ Read a structure into the ensemble and store (as coordinates).
543 @param rmf_name The name of the RMF file
544 @param rmf_frame_index The frame to read
545 @param structure_set_name Name for the set that includes this structure
550 if structure_set_name
in self.structures_dictionary:
551 cdict=self.structures_dictionary[structure_set_name]
552 rmflist=self.rmf_names_frames[structure_set_name]
554 self.structures_dictionary[structure_set_name]={}
555 self.rmf_names_frames[structure_set_name]=[]
556 cdict=self.structures_dictionary[structure_set_name]
557 rmflist=self.rmf_names_frames[structure_set_name]
561 (particles_resolution_one, prots)=self._get_structure(rmf_frame_index,rmf_name)
563 print "something wrong with the rmf"
566 self.selection_dictionary.update({
"All":self.protein_names})
568 for selection_name
in self.selection_dictionary:
569 selection_tuple=self.selection_dictionary[selection_name]
570 coords=self._select_coordinates(selection_tuple,particles_resolution_one,prots[0])
571 if selection_name
not in cdict:
572 cdict[selection_name]=[coords]
574 cdict[selection_name].append(coords)
576 rmflist.append((rmf_name,rmf_frame_index))
580 self.residue_particle_index_map={}
581 for prot_name
in self.protein_names:
582 self.residue_particle_index_map[prot_name] = \
583 self._get_residue_particle_index_map(
585 particles_resolution_one,prots[0])
590 rmf_name_frame_tuples,
592 """Read a list of RMFs, supports parallel
593 @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
594 @param structure_set_name Name this set of structures (e.g. "cluster.1")
598 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
599 rmf_name_frame_tuples,self.number_of_processes)[self.rank]
600 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
602 rmf_frame_index=tup[1]
604 if self.residue_particle_index_map
is None:
607 setup_index_map=
False
614 if self.number_of_processes > 1:
617 self.comm.send(self.structures_dictionary, dest=0, tag=11)
619 for i
in range(1, self.number_of_processes):
620 data_tmp = self.comm.recv(source=i, tag=11)
621 for key
in self.structures_dictionary:
622 self.structures_dictionary[key].update(data_tmp[key])
623 for i
in range(1, self.number_of_processes):
624 self.comm.send(self.structures_dictionary, dest=i, tag=11)
626 self.structures_dictionary = self.comm.recv(source=0, tag=11)
628 def _get_residue_particle_index_map(self,prot_name,structure,hier):
629 residue_particle_index_map=[]
631 all_selected_particles=s.get_selected_particles()
632 intersection=list(set(all_selected_particles) & set(structure))
633 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
634 for p
in sorted_intersection:
636 return residue_particle_index_map
639 def _select_coordinates(self,tuple_selections,structure,prot):
640 selected_coordinates=[]
641 for t
in tuple_selections:
642 if type(t)==tuple
and len(t)==3:
644 all_selected_particles=s.get_selected_particles()
645 intersection=list(set(all_selected_particles) & set(structure))
646 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
647 cc=map(
lambda p: tuple(
IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
648 selected_coordinates+=cc
652 all_selected_particles=s.get_selected_particles()
653 intersection=list(set(all_selected_particles) & set(structure))
654 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
655 cc=map(
lambda p: tuple(
IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
656 selected_coordinates+=cc
658 raise ValueError(
"Selection error")
659 return selected_coordinates
661 def set_threshold(self,threshold):
662 self.threshold=threshold
664 def _get_distance(self,
670 """ Compute distance between structures with various metrics """
671 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
672 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
677 if self.style==
'pairwise_drmsd_k':
679 if self.style==
'pairwise_drms_k':
681 if self.style==
'pairwise_drmsd_Q':
684 if self.style==
'pairwise_rmsd':
688 def _get_particle_distances(self,structure_set_name1,structure_set_name2,
689 selection_name,index1,index2):
691 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
692 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
697 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
706 selection_keywords=
None):
707 """ Evaluate the precision of two named structure groups. Supports MPI.
708 When the structure_set_name1 is different from the structure_set_name2,
709 this evaluates the cross-precision (average pairwise distances).
710 @param outfile Name of the precision output file
711 @param structure_set_name1 string name of the first structure set
712 @param structure_set_name2 string name of the second structure set
713 @param skip analyze every (skip) structure for the distance matrix calculation
714 @param selection_keywords Specify the selection name you want to calculate on.
715 By default this is computed for everything you provided in the constructor,
716 plus all the subunits together.
718 if selection_keywords
is None:
719 sel_keys=self.selection_dictionary.keys()
721 for k
in selection_keywords:
722 if k
not in self.selection_dictionary:
723 raise KeyError(
"you are trying to find named selection " \
724 + k +
" which was not requested in the constructor")
725 sel_keys=selection_keywords
727 if outfile
is not None:
730 for selection_name
in sel_keys:
731 number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
732 number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
735 structure_pointers_1=range(0,number_of_structures_1,skip)
736 structure_pointers_2=range(0,number_of_structures_2,skip)
738 pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
740 if len(pair_combination_list)==0:
741 raise ValueError(
"no structure selected. Check the skip parameter.")
743 my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(
744 pair_combination_list,self.number_of_processes)[self.rank]
745 my_length=len(my_pair_combination_list)
746 for n,pair
in enumerate(my_pair_combination_list):
748 progression=int(float(n)/my_length*100.0)
749 distances[pair]=self._get_distance(structure_set_name1,structure_set_name2,
750 selection_name,pair[0],pair[1])
752 if self.number_of_processes > 1:
755 if structure_set_name1==structure_set_name2:
756 structure_pointers=structure_pointers_1
757 number_of_structures=number_of_structures_1
763 distances_to_structure={}
764 distances_to_structure_normalization={}
766 for n
in structure_pointers:
767 distances_to_structure[n]=0.0
768 distances_to_structure_normalization[n]=0
771 distance+=distances[k]
772 distances_to_structure[k[0]]+=distances[k]
773 distances_to_structure[k[1]]+=distances[k]
774 distances_to_structure_normalization[k[0]]+=1
775 distances_to_structure_normalization[k[1]]+=1
777 for n
in structure_pointers:
778 distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
780 min_distance=min([distances_to_structure[n]
for n
in distances_to_structure])
781 centroid_index=[k
for k, v
in distances_to_structure.iteritems()
if v == min_distance][0]
782 centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
784 centroid_distance=0.0
785 for n
in range(number_of_structures):
786 centroid_distance+=self._get_distance(structure_set_name1,structure_set_name1,
787 selection_name,centroid_index,n)
790 centroid_distance/=number_of_structures
792 if outfile
is not None:
793 of.write(str(selection_name)+
" "+structure_set_name1+
794 " average centroid distance "+str(centroid_distance)+
"\n")
795 of.write(str(selection_name)+
" "+structure_set_name1+
796 " centroid index "+str(centroid_index)+
"\n")
797 of.write(str(selection_name)+
" "+structure_set_name1+
798 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
800 average_pairwise_distances=sum(distances.values())/len(distances.values())
801 if outfile
is not None:
802 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
803 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
804 if outfile
is not None:
806 return centroid_index
812 set_plot_yaxis_range=
None):
813 """ Calculate the residue mean square fluctuations (RMSF).
814 Automatically outputs as data file and pdf
815 @param structure_set_name Which structure set to calculate RMSF for
816 @param outdir Where to write the files
817 @param skip Skip this number of structures
818 @param set_plot_yaxis_range In case you need to change the plot
827 for sel_name
in self.protein_names:
828 self.selection_dictionary.update({sel_name:[sel_name]})
830 number_of_structures=len(self.structures_dictionary[structure_set_name][sel_name])
834 rpim=self.residue_particle_index_map[sel_name]
835 outfile=outdir+
"/rmsf."+sel_name+
".dat"
839 for index
in range(number_of_structures):
840 distances=self._get_particle_distances(structure_set_name,
843 centroid_index,index)
844 for nblock,block
in enumerate(rpim):
845 for residue_number
in block:
846 residue_nblock[residue_number]=nblock
847 if residue_number
not in residue_distances:
848 residue_distances[residue_number]=[distances[nblock]]
850 residue_distances[residue_number].append(distances[nblock])
854 for rn
in residue_distances:
856 rmsf=np.std(residue_distances[rn])
858 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
860 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
861 out_fn=outdir+
"/rmsf."+sel_name,display=
False,
862 set_plot_yaxis_range=set_plot_yaxis_range,
863 xlabel=
'Residue Number',ylabel=
'Standard error')
868 """Read in a structure used for reference computation.
869 Needed before calling get_average_distance_wrt_reference_structure()
870 @param rmf_name The RMF file to read the reference
871 @param rmf_frame_index The index in that file
873 (particles_resolution_one, prot)=self._get_structure(rmf_frame_index,rmf_name)
874 self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
877 for selection_name
in self.selection_dictionary:
878 selection_tuple=self.selection_dictionary[selection_name]
879 coords=self._select_coordinates(selection_tuple,
880 particles_resolution_one,prot)
881 self.reference_structures_dictionary[selection_name]=coords
885 """Compare the structure set to the reference structure.
886 @param structure_set_name The structure set to compute this on
887 \note First call set_reference_structure()
889 if self.reference_structures_dictionary=={}:
890 print "Cannot compute until you set a reference structure"
892 for selection_name
in self.selection_dictionary:
893 reference_coordinates=self.reference_structures_dictionary[selection_name]
897 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
899 if self.style==
'pairwise_drmsd_k':
901 if self.style==
'pairwise_drms_k':
903 if self.style==
'pairwise_drmsd_Q':
905 if self.style==
'pairwise_rmsd':
908 distances.append(distance)
910 print selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances)
912 def get_coordinates(self):
915 def set_precision_style(self, style):
916 if style
in self.styles:
919 raise ValueError(
"No such style")
923 """A class to compute mean density maps from structures
924 Keeps a dictionary of density maps,
925 keys are in the custom ranges. When you call add_subunits_density, it adds
926 particle coordinates to the existing density maps.
927 @param custom_ranges Required. It's a dictionary, keys are the density component names,
928 values are selection tuples
929 e.g. {'kin28':[['kin28',1,-1]],
930 'density_name_1' :[('ccl1')],
931 'density_name_2' :[(1,142,'tfb3d1'),(143,700,'tfb3d2')],
932 @param representation PMI representation, for doing selections.
933 not needed if you only pass hierarchies
934 @param voxel The voxel size for the output map (lower is slower)
937 def __init__(self, custom_ranges, representation=None, voxel=5.0):
939 self.representation = representation
942 self.count_models = 0.0
943 self.custom_ranges = custom_ranges
946 """Add a frame to the densities.
947 @param hierarchy Optionally read the hierarchy from somewhere.
948 If not passed, will just read the representation.
950 self.count_models += 1.0
954 all_particles_by_resolution = []
955 for name
in part_dict:
956 all_particles_by_resolution += part_dict[name]
959 for density_name
in self.custom_ranges:
962 all_particles_by_segments = []
964 for seg
in self.custom_ranges[density_name]:
966 parts += IMP.tools.select_by_tuple(self.representation,
967 seg, resolution=1, name_is_ambiguous=
False)
971 elif type(seg) == tuple:
973 hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
975 raise Exception(
'could not understand selection tuple '+str(seg))
977 all_particles_by_segments += s.get_selected_particles()
981 set(all_particles_by_segments) & set(all_particles_by_resolution))
983 self._create_density_from_particles(parts, density_name)
985 def normalize_density(self):
988 def _create_density_from_particles(self, ps, name,
990 kernel_type=
'GAUSSIAN'):
991 '''Internal function for adding to densities.
992 pass XYZR particles with mass and create a density from them.
993 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
995 'GAUSSIAN': IMP.em.GAUSSIAN,
996 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
997 'SPHERE': IMP.em.SPHERE}
1001 if name
not in self.densities:
1002 self.densities[name] = dmap
1009 dmap3.add(self.densities[name])
1010 self.densities[name] = dmap3
1012 def get_density_keys(self):
1013 return self.densities.keys()
1016 """Get the current density for some component name"""
1017 if name
not in self.densities:
1020 return self.densities[name]
1022 def write_mrc(self, path="./"):
1023 for density_name
in self.densities:
1024 self.densities[density_name].
multiply(1. / self.count_models)
1026 self.densities[density_name],
1027 path +
"/" + density_name +
".mrc",
1031 class GetContactMap(object):
1033 def __init__(self, distance=15.):
1034 self.distance = distance
1035 self.contactmap =
''
1042 def set_prot(self, prot):
1043 from scipy.spatial.distance
import cdist
1052 for name
in particles_dictionary:
1053 residue_indexes = []
1054 for p
in particles_dictionary[name]:
1059 if len(residue_indexes) != 0:
1060 self.protnames.append(name)
1061 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1063 new_name = name +
":" + str(res)
1064 if name
not in self.resmap:
1065 self.resmap[name] = {}
1066 if res
not in self.resmap:
1067 self.resmap[name][res] = {}
1069 self.resmap[name][res] = new_name
1070 namelist.append(new_name)
1072 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1074 radii.append(d.get_radius())
1076 coords = np.array(coords)
1077 radii = np.array(radii)
1079 if len(self.namelist) == 0:
1080 self.namelist = namelist
1081 self.contactmap = np.zeros((len(coords), len(coords)))
1083 distances = cdist(coords, coords)
1084 distances = (distances - radii).T - radii
1085 distances = distances <= self.distance
1091 self.contactmap += distances
1093 def get_subunit_coords(self, frame, align=0):
1094 from scipy.spatial.distance
import cdist
1098 test, testr = [], []
1099 for part
in self.prot.get_children():
1102 for chl
in part.get_children():
1108 SortedSegments.append((chl, startres))
1109 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1111 for sgmnt
in SortedSegments:
1114 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1117 radii.append(p.get_radius())
1119 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
1122 .get_residue_indexes()[0])
1123 namelist.append(new_name)
1124 self.expanded[new_name] = len(
1126 if part.get_name()
not in self.resmap:
1127 self.resmap[part.get_name()] = {}
1129 self.resmap[part.get_name()][res] = new_name
1131 coords = np.array(coords)
1132 radii = np.array(radii)
1133 if len(self.namelist) == 0:
1134 self.namelist = namelist
1135 self.contactmap = np.zeros((len(coords), len(coords)))
1136 distances = cdist(coords, coords)
1137 distances = (distances - radii).T - radii
1138 distances = distances <= self.distance
1139 self.contactmap += distances
1144 identification_string=
'ISDCrossLinkMS_Distance_'):
1147 data = open(filname)
1148 D = data.readlines()
1152 if identification_string
in d:
1159 t1, t2 = (d[0], d[1]), (d[1], d[0])
1160 if t1
not in self.XL:
1161 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1162 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1164 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1165 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1167 def dist_matrix(self, skip_cmap=0, skip_xl=1):
1171 L = sum(self.expanded.values())
1172 proteins = self.protnames
1177 proteins = [p.get_name()
for p
in self.prot.get_children()]
1179 for p1
in xrange(len(proteins)):
1180 for p2
in xrange(p1, len(proteins)):
1182 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1183 pn1, pn2 = proteins[p1], proteins[p2]
1184 mtr = np.zeros((pl1 + 1, pl2 + 1))
1185 print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
1186 for i1
in xrange(1, pl1 + 1):
1187 for i2
in xrange(1, pl2 + 1):
1189 r1 = K.index(self.resmap[pn1][i1])
1190 r2 = K.index(self.resmap[pn2][i2])
1192 mtr[i1 - 1, i2 - 1] = r
1194 missing.append((pn1, pn2, i1, i2))
1196 Matrices[(pn1, pn2)] = mtr
1201 raise ValueError(
"cross-links were not provided, use add_xlinks function!")
1204 for p1
in xrange(len(proteins)):
1205 for p2
in xrange(p1, len(proteins)):
1207 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1208 pn1, pn2 = proteins[p1], proteins[p2]
1209 mtr = np.zeros((pl1 + 1, pl2 + 1))
1212 xls = self.XL[(pn1, pn2)]
1215 xls = self.XL[(pn2, pn1)]
1220 print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
1221 for xl1, xl2
in xls:
1223 print 'X' * 10, xl1, xl2
1226 print 'X' * 10, xl1, xl2
1228 mtr[xl1 - 1, xl2 - 1] = 100
1230 print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
1231 for xl1, xl2
in xls:
1233 print 'X' * 10, xl1, xl2
1236 print 'X' * 10, xl1, xl2
1238 mtr[xl2 - 1, xl1 - 1] = 100
1240 raise RuntimeError(
'WTF!')
1241 Matrices_xl[(pn1, pn2)] = mtr
1257 for i, c
in enumerate(C):
1258 cl = max(self.resmap[c].keys())
1263 R.append(R[-1] + cl)
1266 import matplotlib.pyplot
as plt
1267 import matplotlib.gridspec
as gridspec
1268 import scipy.sparse
as sparse
1271 gs = gridspec.GridSpec(len(W), len(W),
1276 for x1, r1
in enumerate(R):
1281 for x2, r2
in enumerate(R):
1287 ax = plt.subplot(gs[cnt])
1290 mtr = Matrices[(C[x1], C[x2])]
1292 mtr = Matrices[(C[x2], C[x1])].T
1296 interpolation=
'nearest',
1298 vmax=log(NewM.max()))
1303 mtr = Matrices_xl[(C[x1], C[x2])]
1305 mtr = Matrices_xl[(C[x2], C[x1])].T
1307 sparse.csr_matrix(mtr),
1317 ax.set_ylabel(C[x1], rotation=90)
1324 def get_hier_from_rmf(model, frame_number, rmf_file):
1326 print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
1329 rh = RMF.open_rmf_file_read_only(rmf_file)
1334 print "Unable to open rmf file %s" % (rmf_file)
1342 print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
1348 def get_hier_and_restraints_from_rmf(model, frame_number, rmf_file):
1350 print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
1353 rh = RMF.open_rmf_file_read_only(rmf_file)
1359 print "Unable to open rmf file %s" % (rmf_file)
1368 print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
1374 def get_hiers_from_rmf(model, frame_number, rmf_file):
1375 print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
1378 rh = RMF.open_rmf_file_read_only(rmf_file)
1383 print "Unable to open rmf file %s" % (rmf_file)
1390 print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
1399 Get particles at res 1, or any beads, based on the name (no Representation needed)
1400 It is mainly used when the hierarchy is read from an rmf file
1401 Returns a dictionary of component names and their particles
1405 for c
in prot.get_children():
1408 for s
in c.get_children():
1409 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
1411 if "Beads" in s.get_name():
1415 for name
in particle_dict:
1416 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1417 list(set(particle_dict[name]) & set(allparticles)))
1418 return particle_dict
1422 Get particles at res 10, or any beads, based on the name (no Representation needed)
1423 It is mainly used when the hierarchy is read from an rmf file
1424 Returns a dictionary of component names and their particles
1428 for c
in prot.get_children():
1431 for s
in c.get_children():
1432 if "_Res:10" in s.get_name():
1434 if "Beads" in s.get_name():
1437 for name
in particle_dict:
1438 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1439 list(set(particle_dict[name]) & set(allparticles)))
1440 return particle_dict
1444 def select_by_tuple(first_res_last_res_name_tuple):
1445 first_res = first_res_last_res_hier_tuple[0]
1446 last_res = first_res_last_res_hier_tuple[1]
1447 name = first_res_last_res_hier_tuple[2]
1450 """Visualization of crosslinks"""
1452 self.crosslinks = []
1453 self.external_csv_data =
None
1454 self.crosslinkedprots = set()
1455 self.mindist = +10000000.0
1456 self.maxdist = -10000000.0
1457 self.contactmap =
None
1459 def set_hierarchy(self, prot):
1460 self.prot_length_dict = {}
1461 self.model=prot.get_model()
1463 for i
in prot.get_children():
1465 residue_indexes = []
1469 if len(residue_indexes) != 0:
1470 self.prot_length_dict[name] = max(residue_indexes)
1472 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1473 from scipy.spatial.distance
import cdist
1475 rh= RMF.open_rmf_file_read_only(rmf_name)
1478 print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
1489 self.index_dictionary = {}
1491 for name
in particles_dictionary:
1492 residue_indexes = []
1493 for p
in particles_dictionary[name]:
1498 if len(residue_indexes) != 0:
1500 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
1503 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1505 radii.append(d.get_radius())
1506 if name
not in self.index_dictionary:
1507 self.index_dictionary[name] = [resindex]
1509 self.index_dictionary[name].append(resindex)
1512 coords = np.array(coords)
1513 radii = np.array(radii)
1515 distances = cdist(coords, coords)
1516 distances = (distances - radii).T - radii
1518 distances = np.where(distances <= 20.0, 1.0, 0)
1519 if self.contactmap
is None:
1520 self.contactmap = np.zeros((len(coords), len(coords)))
1521 self.contactmap += distances
1526 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
1529 filter_rmf_file_names=
None,
1530 external_csv_data_file=
None,
1531 external_csv_data_file_unique_id_key=
"Unique ID"):
1540 keys = po.get_keys()
1542 xl_keys = [k
for k
in keys
if search_label
in k]
1544 if filter_rmf_file_names
is not None:
1545 rmf_file_key=
"local_rmf_file_name"
1546 fs = po.get_fields(xl_keys+[rmf_file_key])
1548 fs = po.get_fields(xl_keys)
1551 self.cross_link_frequency = {}
1555 self.cross_link_distances = {}
1559 self.cross_link_distances_unique = {}
1561 if not external_csv_data_file
is None:
1564 self.external_csv_data = {}
1565 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1568 self.external_csv_data[
1569 xl[external_csv_data_file_unique_id_key]] = xl
1574 cross_link_frequency_list = []
1576 self.unique_cross_link_list = []
1580 keysplit = key.replace(
1589 if filter_label!=
None:
1590 if filter_label
not in keysplit:
continue
1593 r1 = int(keysplit[5])
1595 r2 = int(keysplit[7])
1598 confidence = keysplit[12]
1602 unique_identifier = keysplit[3]
1604 unique_identifier =
'0'
1606 r1 = int(keysplit[mapping[
"Residue1"]])
1607 c1 = keysplit[mapping[
"Protein1"]]
1608 r2 = int(keysplit[mapping[
"Residue2"]])
1609 c2 = keysplit[mapping[
"Protein2"]]
1611 confidence = keysplit[mapping[
"Confidence"]]
1615 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1617 unique_identifier =
'0'
1619 self.crosslinkedprots.add(c1)
1620 self.crosslinkedprots.add(c2)
1626 if filter_rmf_file_names
is not None:
1627 for n,d
in enumerate(fs[key]):
1628 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1629 dists.append(float(d))
1631 dists=[float(f)
for f
in fs[key]]
1636 mdist = self.median(dists)
1638 stdv = np.std(np.array(dists))
1639 if self.mindist > mdist:
1640 self.mindist = mdist
1641 if self.maxdist < mdist:
1642 self.maxdist = mdist
1646 if not self.external_csv_data
is None:
1647 sample = self.external_csv_data[unique_identifier][
"Sample"]
1651 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1652 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1653 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1654 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1656 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1657 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1658 cross_link_frequency_list.append((r1, c1, r2, c2))
1659 cross_link_frequency_list.append((r2, c2, r1, c1))
1660 self.unique_cross_link_list.append(
1661 (r1, c1, r2, c2,mdist))
1663 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1664 self.cross_link_distances[(
1670 confidence)] = dists
1671 self.cross_link_distances[(
1677 confidence)] = dists
1678 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1680 self.cross_link_distances[(
1686 confidence)] += dists
1687 self.cross_link_distances[(
1693 confidence)] += dists
1695 self.crosslinks.append(
1705 self.crosslinks.append(
1716 self.cross_link_frequency_inverted = {}
1717 for xl
in self.unique_cross_link_list:
1718 (r1, c1, r2, c2, mdist) = xl
1719 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1720 if frequency
not in self.cross_link_frequency_inverted:
1721 self.cross_link_frequency_inverted[
1722 frequency] = [(r1, c1, r2, c2)]
1724 self.cross_link_frequency_inverted[
1725 frequency].append((r1, c1, r2, c2))
1729 def median(self, mylist):
1730 sorts = sorted(mylist)
1736 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1737 return sorts[length / 2]
1739 def set_threshold(self,threshold):
1740 self.threshold=threshold
1742 def set_tolerance(self,tolerance):
1743 self.tolerance=tolerance
1745 def colormap(self, dist):
1746 if dist < self.threshold - self.tolerance:
1748 elif dist >= self.threshold + self.tolerance:
1753 def write_cross_link_database(self, filename, format='csv'):
1757 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1758 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1760 if not self.external_csv_data
is None:
1761 keys = self.external_csv_data.keys()
1762 innerkeys = self.external_csv_data[keys[0]].keys()
1764 fieldnames += innerkeys
1766 dw = csv.DictWriter(
1770 fieldnames=fieldnames)
1772 for xl
in self.crosslinks:
1773 (r1, c1, r2, c2, mdist, stdv, confidence,
1774 unique_identifier, descriptor) = xl
1775 if descriptor ==
'original':
1777 outdict[
"Unique ID"] = unique_identifier
1778 outdict[
"Protein1"] = c1
1779 outdict[
"Protein2"] = c2
1780 outdict[
"Residue1"] = r1
1781 outdict[
"Residue2"] = r2
1782 outdict[
"Median Distance"] = mdist
1783 outdict[
"Standard Deviation"] = stdv
1784 outdict[
"Confidence"] = confidence
1785 outdict[
"Frequency"] = self.cross_link_frequency[
1788 arrangement =
"Intra"
1790 arrangement =
"Inter"
1791 outdict[
"Arrangement"] = arrangement
1792 if not self.external_csv_data
is None:
1793 outdict.update(self.external_csv_data[unique_identifier])
1795 dw.writerow(outdict)
1797 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1798 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
1799 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
1800 gap_between_components=0,
1801 rename_protein_map=
None):
1815 import matplotlib.pyplot
as plt
1816 import matplotlib.cm
as cm
1818 fig = plt.figure(figsize=(10, 10))
1819 ax = fig.add_subplot(111)
1825 if prot_listx
is None:
1827 prot_listx = list(self.crosslinkedprots)
1829 prot_listx = self.prot_length_dict.keys()
1832 nresx = gap_between_components + \
1833 sum([self.prot_length_dict[name]
1834 + gap_between_components
for name
in prot_listx])
1838 if prot_listy
is None:
1840 prot_listy = list(self.crosslinkedprots)
1842 prot_listy = self.prot_length_dict.keys()
1845 nresy = gap_between_components + \
1846 sum([self.prot_length_dict[name]
1847 + gap_between_components
for name
in prot_listy])
1852 res = gap_between_components
1853 for prot
in prot_listx:
1854 resoffsetx[prot] = res
1855 res += self.prot_length_dict[prot]
1857 res += gap_between_components
1861 res = gap_between_components
1862 for prot
in prot_listy:
1863 resoffsety[prot] = res
1864 res += self.prot_length_dict[prot]
1866 res += gap_between_components
1868 resoffsetdiagonal = {}
1869 res = gap_between_components
1870 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1871 resoffsetdiagonal[prot] = res
1872 res += self.prot_length_dict[prot]
1873 res += gap_between_components
1879 for n, prot
in enumerate(prot_listx):
1880 res = resoffsetx[prot]
1882 for proty
in prot_listy:
1883 resy = resoffsety[proty]
1884 endy = resendy[proty]
1885 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
1886 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
1887 xticks.append((float(res) + float(end)) / 2)
1888 if rename_protein_map
is not None:
1889 if prot
in rename_protein_map:
1890 prot=rename_protein_map[prot]
1891 xlabels.append(prot)
1895 for n, prot
in enumerate(prot_listy):
1896 res = resoffsety[prot]
1898 for protx
in prot_listx:
1899 resx = resoffsetx[protx]
1900 endx = resendx[protx]
1901 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
1902 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
1903 yticks.append((float(res) + float(end)) / 2)
1904 if rename_protein_map
is not None:
1905 if prot
in rename_protein_map:
1906 prot=rename_protein_map[prot]
1907 ylabels.append(prot)
1910 print prot_listx, prot_listy
1912 if not self.contactmap
is None:
1913 import matplotlib.cm
as cm
1914 tmp_array = np.zeros((nresx, nresy))
1916 for px
in prot_listx:
1918 for py
in prot_listy:
1920 resx = resoffsety[px]
1921 lengx = resendx[px] - 1
1922 resy = resoffsety[py]
1923 lengy = resendy[py] - 1
1924 indexes_x = self.index_dictionary[px]
1925 minx = min(indexes_x)
1926 maxx = max(indexes_x)
1927 indexes_y = self.index_dictionary[py]
1928 miny = min(indexes_y)
1929 maxy = max(indexes_y)
1931 print px, py, minx, maxx, miny, maxy
1936 resy:lengy] = self.contactmap[
1943 ax.imshow(tmp_array,
1946 interpolation=
'nearest')
1948 ax.set_xticks(xticks)
1949 ax.set_xticklabels(xlabels, rotation=90)
1950 ax.set_yticks(yticks)
1951 ax.set_yticklabels(ylabels)
1952 ax.set_xlim(0,nresx)
1953 ax.set_ylim(0,nresy)
1958 already_added_xls = []
1960 for xl
in self.crosslinks:
1962 (r1, c1, r2, c2, mdist, stdv, confidence,
1963 unique_identifier, descriptor) = xl
1965 if confidence_classes
is not None:
1966 if confidence
not in confidence_classes:
1970 pos1 = r1 + resoffsetx[c1]
1974 pos2 = r2 + resoffsety[c2]
1978 if not filter
is None:
1979 xldb = self.external_csv_data[unique_identifier]
1980 xldb.update({
"Distance": mdist})
1981 xldb.update({
"Distance_stdv": stdv})
1983 if filter[1] ==
">":
1984 if float(xldb[filter[0]]) <= float(filter[2]):
1987 if filter[1] ==
"<":
1988 if float(xldb[filter[0]]) >= float(filter[2]):
1991 if filter[1] ==
"==":
1992 if float(xldb[filter[0]]) != float(filter[2]):
1998 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
1999 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2001 if layout ==
'lowerdiagonal':
2002 if pos_for_diagonal1 <= pos_for_diagonal2:
2004 if layout ==
'upperdiagonal':
2005 if pos_for_diagonal1 >= pos_for_diagonal2:
2008 already_added_xls.append((r1, c1, r2, c2))
2010 if not no_confidence_info:
2011 if confidence ==
'0.01':
2012 markersize = 14 * scale_symbol_size
2013 elif confidence ==
'0.05':
2014 markersize = 9 * scale_symbol_size
2015 elif confidence ==
'0.1':
2016 markersize = 6 * scale_symbol_size
2018 markersize = 15 * scale_symbol_size
2020 markersize = 5 * scale_symbol_size
2022 if not no_dist_info:
2023 color = self.colormap(mdist)
2033 markersize=markersize)
2037 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2039 [i.set_linewidth(2.0)
for i
in ax.spines.itervalues()]
2044 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
2049 def get_frequency_statistics(self, prot_list,
2052 violated_histogram = {}
2053 satisfied_histogram = {}
2054 unique_cross_links = []
2056 for xl
in self.unique_cross_link_list:
2057 (r1, c1, r2, c2, mdist) = xl
2060 if prot_list2
is None:
2061 if not c1
in prot_list:
2063 if not c2
in prot_list:
2066 if c1
in prot_list
and c2
in prot_list2:
2068 elif c1
in prot_list2
and c2
in prot_list:
2073 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2075 if (r1, c1, r2, c2)
not in unique_cross_links:
2077 if frequency
not in violated_histogram:
2078 violated_histogram[frequency] = 1
2080 violated_histogram[frequency] += 1
2082 if frequency
not in satisfied_histogram:
2083 satisfied_histogram[frequency] = 1
2085 satisfied_histogram[frequency] += 1
2086 unique_cross_links.append((r1, c1, r2, c2))
2087 unique_cross_links.append((r2, c2, r1, c1))
2091 total_number_of_crosslinks=0
2093 for i
in satisfied_histogram:
2097 if i
in violated_histogram:
2098 print i, violated_histogram[i]+satisfied_histogram[i]
2100 print i, satisfied_histogram[i]
2101 total_number_of_crosslinks+=i*satisfied_histogram[i]
2105 for i
in violated_histogram:
2106 print i, violated_histogram[i]
2107 total_number_of_crosslinks+=i*violated_histogram[i]
2109 print total_number_of_crosslinks
2113 def print_cross_link_binary_symbols(self, prot_list,
2116 confidence_list = []
2117 for xl
in self.crosslinks:
2118 (r1, c1, r2, c2, mdist, stdv, confidence,
2119 unique_identifier, descriptor) = xl
2121 if prot_list2
is None:
2122 if not c1
in prot_list:
2124 if not c2
in prot_list:
2127 if c1
in prot_list
and c2
in prot_list2:
2129 elif c1
in prot_list2
and c2
in prot_list:
2134 if descriptor !=
"original":
2137 confidence_list.append(confidence)
2139 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2140 tmp_dist_binary = []
2143 tmp_dist_binary.append(1)
2145 tmp_dist_binary.append(0)
2146 tmp_matrix.append(tmp_dist_binary)
2148 matrix = zip(*tmp_matrix)
2150 satisfied_high_sum = 0
2151 satisfied_mid_sum = 0
2152 satisfied_low_sum = 0
2153 total_satisfied_sum = 0
2154 for k, m
in enumerate(matrix):
2163 for n, b
in enumerate(m):
2164 if confidence_list[n] ==
"0.01":
2168 satisfied_high_sum += 1
2169 elif confidence_list[n] ==
"0.05":
2173 satisfied_mid_sum += 1
2174 elif confidence_list[n] ==
"0.1":
2178 satisfied_low_sum += 1
2180 total_satisfied += 1
2181 total_satisfied_sum += 1
2183 print k, satisfied_high, total_high
2184 print k, satisfied_mid, total_mid
2185 print k, satisfied_low, total_low
2186 print k, total_satisfied, total
2187 print float(satisfied_high_sum) / len(matrix)
2188 print float(satisfied_mid_sum) / len(matrix)
2189 print float(satisfied_low_sum) / len(matrix)
2192 def get_unique_crosslinks_statistics(self, prot_list,
2205 satisfied_string = []
2206 for xl
in self.crosslinks:
2207 (r1, c1, r2, c2, mdist, stdv, confidence,
2208 unique_identifier, descriptor) = xl
2210 if prot_list2
is None:
2211 if not c1
in prot_list:
2213 if not c2
in prot_list:
2216 if c1
in prot_list
and c2
in prot_list2:
2218 elif c1
in prot_list2
and c2
in prot_list:
2223 if descriptor !=
"original":
2227 if confidence ==
"0.01":
2231 if confidence ==
"0.05":
2235 if confidence ==
"0.1":
2240 satisfied_string.append(1)
2242 satisfied_string.append(0)
2244 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2245 tmp_dist_binary = []
2248 tmp_dist_binary.append(1)
2250 tmp_dist_binary.append(0)
2251 tmp_matrix.append(tmp_dist_binary)
2253 print "unique satisfied_high/total_high", satisfied_high,
"/", total_high
2254 print "unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid
2255 print "unique satisfied_low/total_low", satisfied_low,
"/", total_low
2256 print "total", total
2258 matrix = zip(*tmp_matrix)
2259 satisfied_models = 0
2261 for b
in satisfied_string:
2268 all_satisfied =
True
2270 for n, b
in enumerate(m):
2275 if b == 1
and satisfied_string[n] == 1:
2277 elif b == 1
and satisfied_string[n] == 0:
2279 elif b == 0
and satisfied_string[n] == 0:
2281 elif b == 0
and satisfied_string[n] == 1:
2282 all_satisfied =
False
2284 satisfied_models += 1
2286 print satstr, all_satisfied
2287 print "models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix)
2289 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2295 for kw
in self.cross_link_distances_unique:
2296 (r1, c1, r2, c2) = kw
2297 dists = self.cross_link_distances_unique[kw]
2299 if prot_list2
is None:
2300 if not c1
in prot_list:
2302 if not c2
in prot_list:
2305 if c1
in prot_list
and c2
in prot_list2:
2307 elif c1
in prot_list2
and c2
in prot_list:
2312 dists.append(sum(dists))
2313 tmp_matrix.append(dists)
2315 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2318 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2320 for i
in range(len(tmp_matrix)):
2321 for k
in range(len(tmp_matrix[i]) - 1):
2322 matrix[i][k] = tmp_matrix[i][k]
2327 ax = fig.add_subplot(211)
2329 cax = ax.imshow(matrix, interpolation=
'nearest')
2333 pl.savefig(figurename, dpi=300)
2342 arrangement=
"inter",
2343 confidence_input=
"None"):
2346 for xl
in self.cross_link_distances:
2347 (r1, c1, r2, c2, mdist, confidence) = xl
2348 if c1
in prots1
and c2
in prots2:
2349 if arrangement ==
"inter" and c1 == c2:
2351 if arrangement ==
"intra" and c1 != c2:
2353 if confidence_input == confidence:
2354 label = str(c1) +
":" + str(r1) + \
2355 "-" + str(c2) +
":" + str(r2)
2356 values = self.cross_link_distances[xl]
2357 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2358 data.append((label, values, mdist, frequency))
2360 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
2361 sort_by_dist = zip(*sort_by_dist)
2362 values = sort_by_dist[1]
2363 positions = range(len(values))
2364 labels = sort_by_dist[0]
2365 frequencies = map(float, sort_by_dist[3])
2366 frequencies = [f * 10.0
for f
in frequencies]
2368 nchunks = int(float(len(values)) / nxl_per_row)
2369 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2370 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2373 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2376 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2378 for n, v
in enumerate(values_chunks):
2379 p = positions_chunks[n]
2380 f = frequencies_chunks[n]
2381 l = labels_chunks[n]
2383 filename +
"." + str(n), v, p, f,
2384 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
2386 def crosslink_distance_histogram(self, filename,
2389 confidence_classes=
None,
2395 if prot_list
is None:
2396 prot_list = self.prot_length_dict.keys()
2399 for xl
in self.crosslinks:
2400 (r1, c1, r2, c2, mdist, stdv, confidence,
2401 unique_identifier, descriptor) = xl
2403 if not confidence_classes
is None:
2404 if confidence
not in confidence_classes:
2407 if prot_list2
is None:
2408 if not c1
in prot_list:
2410 if not c2
in prot_list:
2413 if c1
in prot_list
and c2
in prot_list2:
2415 elif c1
in prot_list2
and c2
in prot_list:
2420 distances.append(mdist)
2423 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
2424 bins=bins, color=color,
2426 reference_xline=35.0,
2427 yplotrange=yplotrange, normalized=normalized)
2429 def scatter_plot_xl_features(self, filename,
2435 reference_ylines=
None,
2436 distance_color=
True,
2438 import matplotlib.pyplot
as plt
2439 import matplotlib.cm
as cm
2441 fig = plt.figure(figsize=(10, 10))
2442 ax = fig.add_subplot(111)
2444 for xl
in self.crosslinks:
2445 (r1, c1, r2, c2, mdist, stdv, confidence,
2446 unique_identifier, arrangement) = xl
2448 if prot_list2
is None:
2449 if not c1
in prot_list:
2451 if not c2
in prot_list:
2454 if c1
in prot_list
and c2
in prot_list2:
2456 elif c1
in prot_list2
and c2
in prot_list:
2461 xldb = self.external_csv_data[unique_identifier]
2462 xldb.update({
"Distance": mdist})
2463 xldb.update({
"Distance_stdv": stdv})
2465 xvalue = float(xldb[feature1])
2466 yvalue = float(xldb[feature2])
2469 color = self.colormap(mdist)
2473 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
2475 if not yplotrange
is None:
2476 ax.set_ylim(yplotrange)
2477 if not reference_ylines
is None:
2478 for rl
in reference_ylines:
2479 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
2482 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
Visualization of crosslinks.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
def plot_field_histogram
This function is plotting a list of histograms from a value list.
def plot_fields_box_plots
This function plots time series as boxplots fields is a list of time series, positions are the x-valu...
kernel::Restraints create_restraints(RMF::FileConstHandle fh, kernel::Model *m)
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
A class to evaluate the precision of an ensemble.
void load_frame(RMF::FileConstHandle file, unsigned int frame)
double get_rmsd(const core::XYZs &s0, const core::XYZs &s1, const IMP::algebra::Transformation3D &tr_for_second)
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
Class for sampling a density map from particles.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name (no Representation needed) It is mainly used ...
def fill
fill stores coordinates of a model into a dictionary all_coords, containing coordinates for all model...
Tools for clustering and cluster analysis.
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
def get_precision
Evaluate the precision of two named structure groups.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name (no Representation needed) It is mainly used...
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
A class to compute mean density maps from structures Keeps a dictionary of density maps...
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.