3 """@namespace IMP.pmi.analysis
16 from operator
import itemgetter
17 from copy
import deepcopy
18 from math
import log,sqrt
21 from scipy.spatial.distance
import cdist
25 This class performs alignment and RMSD calculation for two sets of coordinates
29 - query = {'p1':coords(L,3), 'p2':coords(L,3)}
30 - template = {'p1':coords(L,3), 'p2':coords(L,3)}
32 The class also takes into accout non-equal stoichiometry of the proteins. If this
33 is the case, the protein names of protein in multiple copies should be delivered
34 in the following form: nameA..1, nameA..2 (note two dots).
37 def __init__(self, template, query, weights=None):
40 self.template = template
43 if len(self.query.keys()) != len(self.template.keys()):
44 print '''ERROR: the number of proteins
45 in template and query does not match!'''
50 self.proteins = sorted(self.query.keys())
51 prots_uniq = [i.split(
'..')[0]
for i
in self.proteins]
54 np = prots_uniq.count(p)
55 copies = [i
for i
in self.proteins
if i.split(
'..')[0] == p]
56 prmts = list(itertools.permutations(copies, len(copies)))
59 self.Product = list(itertools.product(*P.values()))
67 torder = sum([list(i)
for i
in self.Product[0]], [])
70 if self.weights
is not None:
71 weights += [i
for i
in self.weights[t]]
74 self.rmsd = 10000000000.
75 for comb
in self.Product:
79 order = sum([list(i)
for i
in comb], [])
89 if self.weights
is not None:
104 torder = sum([list(i)
for i
in self.Product[0]], [])
109 self.rmsd, Transformation = 10000000000.,
''
110 for comb
in self.Product:
111 order = sum([list(i)
for i
in comb], [])
117 if len(template_xyz) != len(query_xyz):
118 print '''ERROR: the number of coordinates
119 in template and query does not match!'''
125 query_xyz_tr = [transformation.get_transformed(n)
129 sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
132 Transformation = transformation
134 return (self.rmsd, Transformation)
139 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
140 'a..2':np.array([np.array([1.,1.,])]),
141 'a..3':np.array([np.array([-2.,1.])]),
142 'b':np.array([np.array([0.,-1.])]),
143 'c..1':np.array([np.array([-1.,-1.])]),
144 'c..2':np.array([np.array([1.,-1.])]),
145 'd':np.array([np.array([0.,0.])]),
146 'e':np.array([np.array([0.,1.])])}
148 Ali = Alignment(Proteins, Proteins)
150 if Ali.get_rmsd() == 0.0: print 'successful test!'
151 else: print 'ERROR!'; exit()
156 class Violations(object):
158 def __init__(self, filename):
160 self.violation_thresholds = {}
161 self.violation_counts = {}
163 data = open(filename)
168 d = d.strip().split()
169 self.violation_thresholds[d[0]] = float(d[1])
171 def get_number_violated_restraints(self, rsts_dict):
173 for rst
in self.violation_thresholds:
174 if rst
not in rsts_dict:
176 if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
178 if rst
not in self.violation_counts:
179 self.violation_counts[rst] = 1
181 self.violation_counts[rst] += 1
187 """A class to cluster structures.
188 Uses scipy's cdist function to compute distance matrices
189 And sklearn's kmeans clustering module.
191 def __init__(self,rmsd_weights=None):
194 self.structure_cluster_ids =
None
195 self.tmpl_coords =
None
196 self.rmsd_weights=rmsd_weights
198 def set_template(self, part_coords):
200 self.tmpl_coords = part_coords
204 fill stores coordinates of a model into a dictionary all_coords,
205 containing coordinates for all models.
208 self.all_coords[frame] = Coords
210 def dist_matrix(self, is_mpi=False):
213 from mpi4py
import MPI
214 comm = MPI.COMM_WORLD
215 rank = comm.Get_rank()
216 number_of_processes = comm.size
218 number_of_processes = 1
221 self.model_list_names = self.all_coords.keys()
222 self.model_indexes = range(len(self.model_list_names))
223 self.model_indexes_dict = dict(
224 zip(self.model_list_names, self.model_indexes))
225 model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
227 my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
228 model_indexes_unique_pairs,
229 number_of_processes)[rank]
231 print "process %s assigned with %s pairs" % (str(rank), str(len(my_model_indexes_unique_pairs)))
233 (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
235 my_model_indexes_unique_pairs)
237 if number_of_processes > 1:
240 pickable_transformations = self.get_pickable_transformation_distance_dict(
243 pickable_transformations)
244 self.set_transformation_distance_dict_from_pickable(
245 pickable_transformations)
247 self.raw_distance_matrix = np.zeros(
248 (len(self.model_list_names), len(self.model_list_names)))
249 for item
in raw_distance_dict:
251 self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
252 self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
254 def get_dist_matrix(self):
255 return self.raw_distance_matrix
258 """Run K-means clustering
259 @param number_of_clusters Num means
260 @param seed the random seed
262 from sklearn.cluster
import KMeans
265 kmeans = KMeans(n_clusters=number_of_clusters)
266 kmeans.fit_predict(self.raw_distance_matrix)
268 self.structure_cluster_ids = kmeans.labels_
270 def get_pickable_transformation_distance_dict(self):
271 pickable_transformations = {}
272 for label
in self.transformation_distance_dict:
273 tr = self.transformation_distance_dict[label]
274 trans = tuple(tr.get_translation())
275 rot = tuple(tr.get_rotation().get_quaternion())
276 pickable_transformations[label] = (rot, trans)
277 return pickable_transformations
279 def set_transformation_distance_dict_from_pickable(
281 pickable_transformations):
282 self.transformation_distance_dict = {}
283 for label
in pickable_transformations:
284 tr = pickable_transformations[label]
287 self.transformation_distance_dict[
290 def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
292 outf = open(file_name +
".data",
'w')
298 pickable_transformations = self.get_pickable_transformation_distance_dict(
301 (self.structure_cluster_ids,
302 self.model_list_names,
303 pickable_transformations),
306 np.save(file_name +
".npy", self.raw_distance_matrix)
308 def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
311 inputf = open(file_name +
".data",
'r')
312 (self.structure_cluster_ids, self.model_list_names,
313 pickable_transformations) = pickle.load(inputf)
316 self.raw_distance_matrix = np.load(file_name + ".npy")
318 self.set_transformation_distance_dict_from_pickable(
319 pickable_transformations)
320 self.model_indexes = range(len(self.model_list_names))
321 self.model_indexes_dict = dict(
322 zip(self.model_list_names, self.model_indexes))
324 def plot_matrix(self, figurename="clustermatrix.pdf"):
326 from scipy.cluster
import hierarchy
as hrc
329 ax = fig.add_subplot(211)
330 dendrogram = hrc.dendrogram(
331 hrc.linkage(self.raw_distance_matrix),
334 leaves_order = dendrogram[
'leaves']
336 ax = fig.add_subplot(212)
338 self.raw_distance_matrix[leaves_order,
341 interpolation=
'nearest')
345 pl.savefig(figurename, dpi=300)
348 def get_model_index_from_name(self, name):
349 return self.model_indexes_dict[name]
351 def get_cluster_labels(self):
353 return list(set(self.structure_cluster_ids))
355 def get_number_of_clusters(self):
356 return len(self.get_cluster_labels())
358 def get_cluster_label_indexes(self, label):
360 [i
for i, l
in enumerate(self.structure_cluster_ids)
if l == label]
363 def get_cluster_label_names(self, label):
365 [self.model_list_names[i]
366 for i
in self.get_cluster_label_indexes(label)]
369 def get_cluster_label_average_rmsd(self, label):
371 indexes = self.get_cluster_label_indexes(label)
374 sub_distance_matrix = self.raw_distance_matrix[
375 indexes, :][:, indexes]
376 average_rmsd = np.sum(sub_distance_matrix) / \
377 (len(sub_distance_matrix)
378 ** 2 - len(sub_distance_matrix))
383 def get_cluster_label_size(self, label):
384 return len(self.get_cluster_label_indexes(label))
386 def get_transformation_to_first_member(
390 reference = self.get_cluster_label_indexes(cluster_label)[0]
391 return self.transformation_distance_dict[(reference, structure_index)]
393 def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
395 model_list_names = all_coords.keys()
396 rmsd_protein_names = all_coords[model_list_names[0]].keys()
397 raw_distance_dict = {}
398 transformation_distance_dict = {}
399 if template_coords
is None:
403 alignment_template_protein_names = template_coords.keys()
405 for (f1, f2)
in list_of_pairs:
413 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
414 for pr
in rmsd_protein_names])
416 for pr
in rmsd_protein_names:
417 coords_f2[pr] = all_coords[model_list_names[f2]][pr]
420 rmsd = Ali.get_rmsd()
426 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
427 for pr
in alignment_template_protein_names])
428 coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
429 for pr
in alignment_template_protein_names])
432 template_rmsd, transformation = Ali.align()
437 coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
438 for pr
in rmsd_protein_names])
440 for pr
in rmsd_protein_names:
441 coords_f2[pr] = [transformation.get_transformed(
442 i)
for i
in all_coords[model_list_names[f2]][pr]]
445 rmsd = Ali.get_rmsd()
447 raw_distance_dict[(f1, f2)] = rmsd
448 raw_distance_dict[(f2, f1)] = rmsd
449 transformation_distance_dict[(f1, f2)] = transformation
450 transformation_distance_dict[(f2, f1)] = transformation
452 return raw_distance_dict, transformation_distance_dict
458 """A class to compute mean density maps from structures"""
459 def __init__(self, custom_ranges, representation=None, voxel=5.0):
460 """Setup GetModelDensity object. Keeps a dictionary of density maps,
461 keys are in the custom ranges. When you call add_subunits_density, it adds
462 particle coordinates to the existing density maps.
463 @param custom_ranges Required. It's a dictionary, keys are the density component names,
464 values are selection tuples
465 e.g. {'kin28':[['kin28',1,-1]],
466 'density_name_1' :[('ccl1')],
467 'density_name_2' :[(1,142,'tfb3d1'),(143,700,'tfb3d2')],
468 @param representation PMI representation, for doing selections.
469 not needed if you only pass hierarchies
470 @param voxel The voxel size for the output map (lower is slower)
474 self.representation = representation
477 self.count_models = 0.0
478 self.custom_ranges = custom_ranges
481 """Add a frame to the densities.
482 @param hierarchy Optionally read the hierarchy from somewhere.
483 If not passed, will just read the representation.
485 self.count_models += 1.0
486 for density_name
in self.custom_ranges:
489 all_particles_by_segments = []
491 for seg
in self.custom_ranges[density_name]:
493 parts += IMP.tools.select_by_tuple(self.representation,
494 seg, resolution=1, name_is_ambiguous=
False)
497 children = [child
for child
in hierarchy.get_children()
498 if child.get_name() == seg]
500 elif type(seg) == tuple:
501 children = [child
for child
in hierarchy.get_children()
502 if child.get_name() == seg[2]]
504 children, residue_indexes=range(seg[0], seg[1] + 1))
506 raise Exception(
'could not understand selection tuple '+str(seg))
508 all_particles_by_segments += s.get_selected_particles()
512 all_particles_by_resolution = []
513 for name
in part_dict:
514 all_particles_by_resolution += part_dict[name]
516 set(all_particles_by_segments) & set(all_particles_by_resolution))
518 self._create_density_from_particles(parts, density_name)
520 def normalize_density(self):
523 def _create_density_from_particles(self, ps, name,
525 kernel_type=
'GAUSSIAN'):
526 '''Internal function for adding to densities.
527 pass XYZR particles with mass and create a density from them.
528 kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
530 'GAUSSIAN': IMP.em.GAUSSIAN,
531 'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
532 'SPHERE': IMP.em.SPHERE}
536 if name
not in self.densities:
537 self.densities[name] = dmap
544 dmap3.add(self.densities[name])
545 self.densities[name] = dmap3
547 def get_density_keys(self):
548 return self.densities.keys()
551 """Get the current density for some component name"""
552 if name
not in self.densities:
555 return self.densities[name]
557 def write_mrc(self, path="./"):
558 for density_name
in self.densities:
559 self.densities[density_name].
multiply(1. / self.count_models)
561 self.densities[density_name],
562 path +
"/" + density_name +
".mrc",
568 class GetContactMap(object):
570 def __init__(self, distance=15.):
571 self.distance = distance
579 def set_prot(self, prot):
588 for name
in particles_dictionary:
590 for p
in particles_dictionary[name]:
595 if len(residue_indexes) != 0:
596 self.protnames.append(name)
597 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
599 new_name = name +
":" + str(res)
600 if name
not in self.resmap:
601 self.resmap[name] = {}
602 if res
not in self.resmap:
603 self.resmap[name][res] = {}
605 self.resmap[name][res] = new_name
606 namelist.append(new_name)
608 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
610 radii.append(d.get_radius())
612 coords = np.array(coords)
613 radii = np.array(radii)
615 if len(self.namelist) == 0:
616 self.namelist = namelist
617 self.contactmap = np.zeros((len(coords), len(coords)))
619 distances = cdist(coords, coords)
620 distances = (distances - radii).T - radii
621 distances = distances <= self.distance
627 self.contactmap += distances
629 def get_subunit_coords(self, frame, align=0):
634 for part
in self.prot.get_children():
637 for chl
in part.get_children():
643 SortedSegments.append((chl, startres))
644 SortedSegments = sorted(SortedSegments, key=itemgetter(1))
646 for sgmnt
in SortedSegments:
649 crd = np.array([p.get_x(), p.get_y(), p.get_z()])
652 radii.append(p.get_radius())
654 new_name = part.get_name() +
'_' + sgmnt[0].get_name() +\
657 .get_residue_indexes()[0])
658 namelist.append(new_name)
659 self.expanded[new_name] = len(
661 if part.get_name()
not in self.resmap:
662 self.resmap[part.get_name()] = {}
664 self.resmap[part.get_name()][res] = new_name
666 coords = np.array(coords)
667 radii = np.array(radii)
668 if len(self.namelist) == 0:
669 self.namelist = namelist
670 self.contactmap = np.zeros((len(coords), len(coords)))
671 distances = cdist(coords, coords)
672 distances = (distances - radii).T - radii
673 distances = distances <= self.distance
674 self.contactmap += distances
679 identification_string=
'ISDCrossLinkMS_Distance_'):
687 if identification_string
in d:
694 t1, t2 = (d[0], d[1]), (d[1], d[0])
695 if t1
not in self.XL:
696 self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
697 self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
699 self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
700 self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
702 def dist_matrix(self, skip_cmap=0, skip_xl=1):
706 L = sum(self.expanded.values())
707 proteins = self.protnames
712 proteins = [p.get_name()
for p
in self.prot.get_children()]
714 for p1
in xrange(len(proteins)):
715 for p2
in xrange(p1, len(proteins)):
717 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
718 pn1, pn2 = proteins[p1], proteins[p2]
719 mtr = np.zeros((pl1 + 1, pl2 + 1))
720 print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
721 for i1
in xrange(1, pl1 + 1):
722 for i2
in xrange(1, pl2 + 1):
724 r1 = K.index(self.resmap[pn1][i1])
725 r2 = K.index(self.resmap[pn2][i2])
727 mtr[i1 - 1, i2 - 1] = r
729 missing.append((pn1, pn2, i1, i2))
731 Matrices[(pn1, pn2)] = mtr
736 print "ERROR: cross-links were not provided, use add_xlinks function!"
740 for p1
in xrange(len(proteins)):
741 for p2
in xrange(p1, len(proteins)):
743 self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
744 pn1, pn2 = proteins[p1], proteins[p2]
745 mtr = np.zeros((pl1 + 1, pl2 + 1))
748 xls = self.XL[(pn1, pn2)]
751 xls = self.XL[(pn2, pn1)]
756 print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
759 print 'X' * 10, xl1, xl2
762 print 'X' * 10, xl1, xl2
764 mtr[xl1 - 1, xl2 - 1] = 100
766 print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
769 print 'X' * 10, xl1, xl2
772 print 'X' * 10, xl1, xl2
774 mtr[xl2 - 1, xl1 - 1] = 100
778 Matrices_xl[(pn1, pn2)] = mtr
794 for i, c
in enumerate(C):
795 cl = max(self.resmap[c].keys())
803 import matplotlib.pyplot
as plt
804 import matplotlib.gridspec
as gridspec
805 import scipy.sparse
as sparse
808 gs = gridspec.GridSpec(len(W), len(W),
813 for x1, r1
in enumerate(R):
818 for x2, r2
in enumerate(R):
824 ax = plt.subplot(gs[cnt])
827 mtr = Matrices[(C[x1], C[x2])]
829 mtr = Matrices[(C[x2], C[x1])].T
833 interpolation=
'nearest',
835 vmax=log(NewM.max()))
840 mtr = Matrices_xl[(C[x1], C[x2])]
842 mtr = Matrices_xl[(C[x2], C[x1])].T
844 sparse.csr_matrix(mtr),
854 ax.set_ylabel(C[x1], rotation=90)
860 class CrossLinkTable(object):
864 self.external_csv_data =
None
865 self.crosslinkedprots = set()
866 self.mindist = +10000000.0
867 self.maxdist = -10000000.0
868 self.contactmap =
None
870 def set_hierarchy(self, prot):
871 self.prot_length_dict = {}
872 self.model=prot.get_model()
874 for i
in prot.get_children():
880 if len(residue_indexes) != 0:
881 self.prot_length_dict[name] = max(residue_indexes)
883 def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
885 rh= RMF.open_rmf_file_read_only(rmf_name)
888 print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
899 self.index_dictionary = {}
901 for name
in particles_dictionary:
903 for p
in particles_dictionary[name]:
908 if len(residue_indexes) != 0:
910 for res
in range(min(residue_indexes), max(residue_indexes) + 1):
913 crd = np.array([d.get_x(), d.get_y(), d.get_z()])
915 radii.append(d.get_radius())
916 if name
not in self.index_dictionary:
917 self.index_dictionary[name] = [resindex]
919 self.index_dictionary[name].append(resindex)
922 coords = np.array(coords)
923 radii = np.array(radii)
925 distances = cdist(coords, coords)
926 distances = (distances - radii).T - radii
928 distances = np.where(distances <= 20.0, 1.0, 0)
929 if self.contactmap
is None:
930 self.contactmap = np.zeros((len(coords), len(coords)))
931 self.contactmap += distances
936 self, data_file, search_label=
'ISDCrossLinkMS_Distance_',
939 filter_rmf_file_names=
None,
940 external_csv_data_file=
None,
941 external_csv_data_file_unique_id_key=
"Unique ID"):
952 xl_keys = [k
for k
in keys
if search_label
in k]
954 if filter_rmf_file_names
is not None:
955 rmf_file_key=
"local_rmf_file_name"
956 fs = po.get_fields(xl_keys+[rmf_file_key])
958 fs = po.get_fields(xl_keys)
961 self.cross_link_frequency = {}
965 self.cross_link_distances = {}
969 self.cross_link_distances_unique = {}
971 if not external_csv_data_file
is None:
974 self.external_csv_data = {}
975 xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
978 self.external_csv_data[
979 xl[external_csv_data_file_unique_id_key]] = xl
984 cross_link_frequency_list = []
986 self.unique_cross_link_list = []
990 keysplit = key.replace(
999 if filter_label!=
None:
1000 if filter_label
not in keysplit:
continue
1003 r1 = int(keysplit[5])
1005 r2 = int(keysplit[7])
1008 confidence = keysplit[12]
1012 unique_identifier = keysplit[3]
1014 unique_identifier =
'0'
1016 r1 = int(keysplit[mapping[
"Residue1"]])
1017 c1 = keysplit[mapping[
"Protein1"]]
1018 r2 = int(keysplit[mapping[
"Residue2"]])
1019 c2 = keysplit[mapping[
"Protein2"]]
1021 confidence = keysplit[mapping[
"Confidence"]]
1025 unique_identifier = keysplit[mapping[
"Unique Identifier"]]
1027 unique_identifier =
'0'
1029 self.crosslinkedprots.add(c1)
1030 self.crosslinkedprots.add(c2)
1036 if filter_rmf_file_names
is not None:
1037 for n,d
in enumerate(fs[key]):
1038 if fs[rmf_file_key][n]
in filter_rmf_file_names:
1039 dists.append(float(d))
1041 dists=[float(f)
for f
in fs[key]]
1046 mdist = self.median(dists)
1048 stdv = np.std(np.array(dists))
1049 if self.mindist > mdist:
1050 self.mindist = mdist
1051 if self.maxdist < mdist:
1052 self.maxdist = mdist
1056 if not self.external_csv_data
is None:
1057 sample = self.external_csv_data[unique_identifier][
"Sample"]
1061 if (r1, c1, r2, c2,mdist)
not in cross_link_frequency_list:
1062 if (r1, c1, r2, c2)
not in self.cross_link_frequency:
1063 self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1064 self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1066 self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1067 self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1068 cross_link_frequency_list.append((r1, c1, r2, c2))
1069 cross_link_frequency_list.append((r2, c2, r1, c1))
1070 self.unique_cross_link_list.append(
1071 (r1, c1, r2, c2,mdist))
1073 if (r1, c1, r2, c2)
not in self.cross_link_distances:
1074 self.cross_link_distances[(
1080 confidence)] = dists
1081 self.cross_link_distances[(
1087 confidence)] = dists
1088 self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1090 self.cross_link_distances[(
1096 confidence)] += dists
1097 self.cross_link_distances[(
1103 confidence)] += dists
1105 self.crosslinks.append(
1115 self.crosslinks.append(
1126 self.cross_link_frequency_inverted = {}
1127 for xl
in self.unique_cross_link_list:
1128 (r1, c1, r2, c2, mdist) = xl
1129 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1130 if frequency
not in self.cross_link_frequency_inverted:
1131 self.cross_link_frequency_inverted[
1132 frequency] = [(r1, c1, r2, c2)]
1134 self.cross_link_frequency_inverted[
1135 frequency].append((r1, c1, r2, c2))
1139 def median(self, mylist):
1140 sorts = sorted(mylist)
1146 return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1147 return sorts[length / 2]
1149 def set_threshold(self,threshold):
1150 self.threshold=threshold
1152 def set_tolerance(self,tolerance):
1153 self.tolerance=tolerance
1155 def colormap(self, dist):
1156 if dist < self.threshold - self.tolerance:
1158 elif dist >= self.threshold + self.tolerance:
1163 def write_cross_link_database(self, filename, format='csv'):
1167 "Unique ID",
"Protein1",
"Residue1",
"Protein2",
"Residue2",
1168 "Median Distance",
"Standard Deviation",
"Confidence",
"Frequency",
"Arrangement"]
1170 if not self.external_csv_data
is None:
1171 keys = self.external_csv_data.keys()
1172 innerkeys = self.external_csv_data[keys[0]].keys()
1174 fieldnames += innerkeys
1176 dw = csv.DictWriter(
1180 fieldnames=fieldnames)
1182 for xl
in self.crosslinks:
1183 (r1, c1, r2, c2, mdist, stdv, confidence,
1184 unique_identifier, descriptor) = xl
1185 if descriptor ==
'original':
1187 outdict[
"Unique ID"] = unique_identifier
1188 outdict[
"Protein1"] = c1
1189 outdict[
"Protein2"] = c2
1190 outdict[
"Residue1"] = r1
1191 outdict[
"Residue2"] = r2
1192 outdict[
"Median Distance"] = mdist
1193 outdict[
"Standard Deviation"] = stdv
1194 outdict[
"Confidence"] = confidence
1195 outdict[
"Frequency"] = self.cross_link_frequency[
1198 arrangement =
"Intra"
1200 arrangement =
"Inter"
1201 outdict[
"Arrangement"] = arrangement
1202 if not self.external_csv_data
is None:
1203 outdict.update(self.external_csv_data[unique_identifier])
1205 dw.writerow(outdict)
1207 def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1208 no_confidence_info=
False, filter=
None, layout=
"whole", crosslinkedonly=
False,
1209 filename=
None, confidence_classes=
None, alphablend=0.1, scale_symbol_size=1.0,
1210 gap_between_components=0,
1211 rename_protein_map=
None):
1225 import matplotlib.pyplot
as plt
1226 import matplotlib.cm
as cm
1228 fig = plt.figure(figsize=(10, 10))
1229 ax = fig.add_subplot(111)
1235 if prot_listx
is None:
1237 prot_listx = list(self.crosslinkedprots)
1239 prot_listx = self.prot_length_dict.keys()
1242 nresx = gap_between_components + \
1243 sum([self.prot_length_dict[name]
1244 + gap_between_components
for name
in prot_listx])
1248 if prot_listy
is None:
1250 prot_listy = list(self.crosslinkedprots)
1252 prot_listy = self.prot_length_dict.keys()
1255 nresy = gap_between_components + \
1256 sum([self.prot_length_dict[name]
1257 + gap_between_components
for name
in prot_listy])
1262 res = gap_between_components
1263 for prot
in prot_listx:
1264 resoffsetx[prot] = res
1265 res += self.prot_length_dict[prot]
1267 res += gap_between_components
1271 res = gap_between_components
1272 for prot
in prot_listy:
1273 resoffsety[prot] = res
1274 res += self.prot_length_dict[prot]
1276 res += gap_between_components
1278 resoffsetdiagonal = {}
1279 res = gap_between_components
1280 for prot
in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1281 resoffsetdiagonal[prot] = res
1282 res += self.prot_length_dict[prot]
1283 res += gap_between_components
1289 for n, prot
in enumerate(prot_listx):
1290 res = resoffsetx[prot]
1292 for proty
in prot_listy:
1293 resy = resoffsety[proty]
1294 endy = resendy[proty]
1295 ax.plot([res, res], [resy, endy],
'k-', lw=0.4)
1296 ax.plot([end, end], [resy, endy],
'k-', lw=0.4)
1297 xticks.append((float(res) + float(end)) / 2)
1298 if rename_protein_map
is not None:
1299 if prot
in rename_protein_map:
1300 prot=rename_protein_map[prot]
1301 xlabels.append(prot)
1305 for n, prot
in enumerate(prot_listy):
1306 res = resoffsety[prot]
1308 for protx
in prot_listx:
1309 resx = resoffsetx[protx]
1310 endx = resendx[protx]
1311 ax.plot([resx, endx], [res, res],
'k-', lw=0.4)
1312 ax.plot([resx, endx], [end, end],
'k-', lw=0.4)
1313 yticks.append((float(res) + float(end)) / 2)
1314 if rename_protein_map
is not None:
1315 if prot
in rename_protein_map:
1316 prot=rename_protein_map[prot]
1317 ylabels.append(prot)
1320 print prot_listx, prot_listy
1322 if not self.contactmap
is None:
1323 import matplotlib.cm
as cm
1324 tmp_array = np.zeros((nresx, nresy))
1326 for px
in prot_listx:
1328 for py
in prot_listy:
1330 resx = resoffsety[px]
1331 lengx = resendx[px] - 1
1332 resy = resoffsety[py]
1333 lengy = resendy[py] - 1
1334 indexes_x = self.index_dictionary[px]
1335 minx = min(indexes_x)
1336 maxx = max(indexes_x)
1337 indexes_y = self.index_dictionary[py]
1338 miny = min(indexes_y)
1339 maxy = max(indexes_y)
1341 print px, py, minx, maxx, miny, maxy
1346 resy:lengy] = self.contactmap[
1353 ax.imshow(tmp_array,
1356 interpolation=
'nearest')
1358 ax.set_xticks(xticks)
1359 ax.set_xticklabels(xlabels, rotation=90)
1360 ax.set_yticks(yticks)
1361 ax.set_yticklabels(ylabels)
1362 ax.set_xlim(0,nresx)
1363 ax.set_ylim(0,nresy)
1368 already_added_xls = []
1370 for xl
in self.crosslinks:
1372 (r1, c1, r2, c2, mdist, stdv, confidence,
1373 unique_identifier, descriptor) = xl
1375 if confidence_classes
is not None:
1376 if confidence
not in confidence_classes:
1380 pos1 = r1 + resoffsetx[c1]
1384 pos2 = r2 + resoffsety[c2]
1388 if not filter
is None:
1389 xldb = self.external_csv_data[unique_identifier]
1390 xldb.update({
"Distance": mdist})
1391 xldb.update({
"Distance_stdv": stdv})
1393 if filter[1] ==
">":
1394 if float(xldb[filter[0]]) <= float(filter[2]):
1397 if filter[1] ==
"<":
1398 if float(xldb[filter[0]]) >= float(filter[2]):
1401 if filter[1] ==
"==":
1402 if float(xldb[filter[0]]) != float(filter[2]):
1408 pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
1409 pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
1411 if layout ==
'lowerdiagonal':
1412 if pos_for_diagonal1 <= pos_for_diagonal2:
1414 if layout ==
'upperdiagonal':
1415 if pos_for_diagonal1 >= pos_for_diagonal2:
1418 already_added_xls.append((r1, c1, r2, c2))
1420 if not no_confidence_info:
1421 if confidence ==
'0.01':
1422 markersize = 14 * scale_symbol_size
1423 elif confidence ==
'0.05':
1424 markersize = 9 * scale_symbol_size
1425 elif confidence ==
'0.1':
1426 markersize = 6 * scale_symbol_size
1428 markersize = 15 * scale_symbol_size
1430 markersize = 5 * scale_symbol_size
1432 if not no_dist_info:
1433 color = self.colormap(mdist)
1443 markersize=markersize)
1447 fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
1449 [i.set_linewidth(2.0)
for i
in ax.spines.itervalues()]
1454 plt.savefig(filename +
".pdf", dpi=300, transparent=
"False")
1459 def get_frequency_statistics(self, prot_list,
1462 violated_histogram = {}
1463 satisfied_histogram = {}
1464 unique_cross_links = []
1466 for xl
in self.unique_cross_link_list:
1467 (r1, c1, r2, c2, mdist) = xl
1470 if prot_list2
is None:
1471 if not c1
in prot_list:
1473 if not c2
in prot_list:
1476 if c1
in prot_list
and c2
in prot_list2:
1478 elif c1
in prot_list2
and c2
in prot_list:
1483 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1485 if (r1, c1, r2, c2)
not in unique_cross_links:
1487 if frequency
not in violated_histogram:
1488 violated_histogram[frequency] = 1
1490 violated_histogram[frequency] += 1
1492 if frequency
not in satisfied_histogram:
1493 satisfied_histogram[frequency] = 1
1495 satisfied_histogram[frequency] += 1
1496 unique_cross_links.append((r1, c1, r2, c2))
1497 unique_cross_links.append((r2, c2, r1, c1))
1501 total_number_of_crosslinks=0
1503 for i
in satisfied_histogram:
1507 if i
in violated_histogram:
1508 print i, violated_histogram[i]+satisfied_histogram[i]
1510 print i, satisfied_histogram[i]
1511 total_number_of_crosslinks+=i*satisfied_histogram[i]
1515 for i
in violated_histogram:
1516 print i, violated_histogram[i]
1517 total_number_of_crosslinks+=i*violated_histogram[i]
1519 print total_number_of_crosslinks
1523 def print_cross_link_binary_symbols(self, prot_list,
1526 confidence_list = []
1527 for xl
in self.crosslinks:
1528 (r1, c1, r2, c2, mdist, stdv, confidence,
1529 unique_identifier, descriptor) = xl
1531 if prot_list2
is None:
1532 if not c1
in prot_list:
1534 if not c2
in prot_list:
1537 if c1
in prot_list
and c2
in prot_list2:
1539 elif c1
in prot_list2
and c2
in prot_list:
1544 if descriptor !=
"original":
1547 confidence_list.append(confidence)
1549 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
1550 tmp_dist_binary = []
1553 tmp_dist_binary.append(1)
1555 tmp_dist_binary.append(0)
1556 tmp_matrix.append(tmp_dist_binary)
1558 matrix = zip(*tmp_matrix)
1560 satisfied_high_sum = 0
1561 satisfied_mid_sum = 0
1562 satisfied_low_sum = 0
1563 total_satisfied_sum = 0
1564 for k, m
in enumerate(matrix):
1573 for n, b
in enumerate(m):
1574 if confidence_list[n] ==
"0.01":
1578 satisfied_high_sum += 1
1579 elif confidence_list[n] ==
"0.05":
1583 satisfied_mid_sum += 1
1584 elif confidence_list[n] ==
"0.1":
1588 satisfied_low_sum += 1
1590 total_satisfied += 1
1591 total_satisfied_sum += 1
1593 print k, satisfied_high, total_high
1594 print k, satisfied_mid, total_mid
1595 print k, satisfied_low, total_low
1596 print k, total_satisfied, total
1597 print float(satisfied_high_sum) / len(matrix)
1598 print float(satisfied_mid_sum) / len(matrix)
1599 print float(satisfied_low_sum) / len(matrix)
1602 def get_unique_crosslinks_statistics(self, prot_list,
1615 satisfied_string = []
1616 for xl
in self.crosslinks:
1617 (r1, c1, r2, c2, mdist, stdv, confidence,
1618 unique_identifier, descriptor) = xl
1620 if prot_list2
is None:
1621 if not c1
in prot_list:
1623 if not c2
in prot_list:
1626 if c1
in prot_list
and c2
in prot_list2:
1628 elif c1
in prot_list2
and c2
in prot_list:
1633 if descriptor !=
"original":
1637 if confidence ==
"0.01":
1641 if confidence ==
"0.05":
1645 if confidence ==
"0.1":
1650 satisfied_string.append(1)
1652 satisfied_string.append(0)
1654 dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
1655 tmp_dist_binary = []
1658 tmp_dist_binary.append(1)
1660 tmp_dist_binary.append(0)
1661 tmp_matrix.append(tmp_dist_binary)
1663 print "unique satisfied_high/total_high", satisfied_high,
"/", total_high
1664 print "unique satisfied_mid/total_mid", satisfied_mid,
"/", total_mid
1665 print "unique satisfied_low/total_low", satisfied_low,
"/", total_low
1666 print "total", total
1668 matrix = zip(*tmp_matrix)
1669 satisfied_models = 0
1671 for b
in satisfied_string:
1678 all_satisfied =
True
1680 for n, b
in enumerate(m):
1685 if b == 1
and satisfied_string[n] == 1:
1687 elif b == 1
and satisfied_string[n] == 0:
1689 elif b == 0
and satisfied_string[n] == 0:
1691 elif b == 0
and satisfied_string[n] == 1:
1692 all_satisfied =
False
1694 satisfied_models += 1
1696 print satstr, all_satisfied
1697 print "models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix)
1699 def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
1705 for kw
in self.cross_link_distances_unique:
1706 (r1, c1, r2, c2) = kw
1707 dists = self.cross_link_distances_unique[kw]
1709 if prot_list2
is None:
1710 if not c1
in prot_list:
1712 if not c2
in prot_list:
1715 if c1
in prot_list
and c2
in prot_list2:
1717 elif c1
in prot_list2
and c2
in prot_list:
1722 dists.append(sum(dists))
1723 tmp_matrix.append(dists)
1725 tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
1728 matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
1730 for i
in range(len(tmp_matrix)):
1731 for k
in range(len(tmp_matrix[i]) - 1):
1732 matrix[i][k] = tmp_matrix[i][k]
1737 ax = fig.add_subplot(211)
1739 cax = ax.imshow(matrix, interpolation=
'nearest')
1743 pl.savefig(figurename, dpi=300)
1752 arrangement=
"inter",
1753 confidence_input=
"None"):
1756 for xl
in self.cross_link_distances:
1757 (r1, c1, r2, c2, mdist, confidence) = xl
1758 if c1
in prots1
and c2
in prots2:
1759 if arrangement ==
"inter" and c1 == c2:
1761 if arrangement ==
"intra" and c1 != c2:
1763 if confidence_input == confidence:
1764 label = str(c1) +
":" + str(r1) + \
1765 "-" + str(c2) +
":" + str(r2)
1766 values = self.cross_link_distances[xl]
1767 frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1768 data.append((label, values, mdist, frequency))
1770 sort_by_dist = sorted(data, key=
lambda tup: tup[2])
1771 sort_by_dist = zip(*sort_by_dist)
1772 values = sort_by_dist[1]
1773 positions = range(len(values))
1774 labels = sort_by_dist[0]
1775 frequencies = map(float, sort_by_dist[3])
1776 frequencies = [f * 10.0
for f
in frequencies]
1778 nchunks = int(float(len(values)) / nxl_per_row)
1779 values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
1780 positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
1783 frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
1786 labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
1788 for n, v
in enumerate(values_chunks):
1789 p = positions_chunks[n]
1790 f = frequencies_chunks[n]
1791 l = labels_chunks[n]
1793 filename +
"." + str(n), v, p, f,
1794 valuename=
"Distance (Ang)", positionname=
"Unique " + arrangement +
" Crosslinks", xlabels=l)
1796 def crosslink_distance_histogram(self, filename,
1799 confidence_classes=
None,
1805 if prot_list
is None:
1806 prot_list = self.prot_length_dict.keys()
1809 for xl
in self.crosslinks:
1810 (r1, c1, r2, c2, mdist, stdv, confidence,
1811 unique_identifier, descriptor) = xl
1813 if not confidence_classes
is None:
1814 if confidence
not in confidence_classes:
1817 if prot_list2
is None:
1818 if not c1
in prot_list:
1820 if not c2
in prot_list:
1823 if c1
in prot_list
and c2
in prot_list2:
1825 elif c1
in prot_list2
and c2
in prot_list:
1830 distances.append(mdist)
1833 filename, distances, valuename=
"C-alpha C-alpha distance [Ang]",
1834 bins=bins, color=color,
1836 reference_xline=35.0,
1837 yplotrange=yplotrange, normalized=normalized)
1839 def scatter_plot_xl_features(self, filename,
1845 reference_ylines=
None,
1846 distance_color=
True,
1848 import matplotlib.pyplot
as plt
1849 import matplotlib.cm
as cm
1851 fig = plt.figure(figsize=(10, 10))
1852 ax = fig.add_subplot(111)
1854 for xl
in self.crosslinks:
1855 (r1, c1, r2, c2, mdist, stdv, confidence,
1856 unique_identifier, arrangement) = xl
1858 if prot_list2
is None:
1859 if not c1
in prot_list:
1861 if not c2
in prot_list:
1864 if c1
in prot_list
and c2
in prot_list2:
1866 elif c1
in prot_list2
and c2
in prot_list:
1871 xldb = self.external_csv_data[unique_identifier]
1872 xldb.update({
"Distance": mdist})
1873 xldb.update({
"Distance_stdv": stdv})
1875 xvalue = float(xldb[feature1])
1876 yvalue = float(xldb[feature2])
1879 color = self.colormap(mdist)
1883 ax.plot([xvalue], [yvalue],
'o', c=color, alpha=0.1, markersize=7)
1885 if not yplotrange
is None:
1886 ax.set_ylim(yplotrange)
1887 if not reference_ylines
is None:
1888 for rl
in reference_ylines:
1889 ax.axhline(rl, color=
'red', linestyle=
'dashed', linewidth=1)
1892 plt.savefig(filename +
"." + format, dpi=150, transparent=
"False")
1900 class Precision(object):
1901 def __init__(self,model,resolution='one',selection_dictionary=None):
1903 ''' selection_dictionary is a dictionary where we store coordinates
1904 selection_dictionary = {"Selection_name_1":selection_tuple1,
1905 "Selection_name_2":selection_tuple2}'''
1908 self.styles=[
'pairwise_rmsd',
'pairwise_drmsd_k',
'pairwise_drmsd_Q',
'pairwise_drms_k',
'pairwise_rmsd',
'drmsd_from_center']
1909 self.style=
'pairwise_drmsd_k'
1910 self.structures_dictionary={}
1911 self.reference_structures_dictionary={}
1913 self.protein_names=
None
1914 self.len_particles_resolution_one=
None
1916 self.rmf_names_frames={}
1917 self.reference_rmf_names_frames=
None
1918 self.reference_structure=
None
1919 self.reference_prot=
None
1920 self.selection_dictionary=selection_dictionary
1923 self.residue_particle_index_map=
None
1925 if resolution
in [
"one",
"ten"]:
1926 self.resolution=resolution
1928 print "no such resolution"; exit()
1932 def get_structure(self,rmf_frame_index,rmf_name):
1934 rh= RMF.open_rmf_file_read_only(rmf_name)
1937 print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
1940 if self.resolution==
'one':
1942 elif self.resolution==
'ten':
1945 protein_names=particle_dict.keys()
1946 particles_resolution_one=[]
1947 for k
in particle_dict:
1948 particles_resolution_one+=(particle_dict[k])
1950 if self.protein_names==
None:
1951 self.protein_names=protein_names
1953 if self.protein_names!=protein_names:
1954 print "Error: the protein names of the new coordinate set is not compatible with the previous one"
1957 if self.len_particles_resolution_one==
None:
1958 self.len_particles_resolution_one=len(particles_resolution_one)
1960 if self.len_particles_resolution_one!=len(particles_resolution_one):
1961 print "Error: the new coordinate set is not compatible with the previous one"
1964 return particles_resolution_one, prots
1967 def add_structures(self,rmf_name_frame_tuples,structure_set_name,is_mpi=False):
1968 '''this function helps reading a list of rmfs, the input is a list of
1969 tuples, containing the file name and the frame number'''
1972 from mpi4py
import MPI
1973 comm = MPI.COMM_WORLD
1974 rank = comm.Get_rank()
1975 number_of_processes = comm.size
1977 number_of_processes=1
1980 my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(rmf_name_frame_tuples,number_of_processes)[rank]
1983 for nfr,tup
in enumerate(my_rmf_name_frame_tuples):
1985 rmf_frame_index=tup[1]
1987 self.add_structure(rmf_name,rmf_frame_index,structure_set_name)
1989 (a,b)=self.get_structure(rmf_frame_index,rmf_name)
1991 print "something wrong with the rmf"
1993 if self.residue_particle_index_map
is None:
1997 (particles_resolution_one, prots)=self.get_structure(rmf_frame_index,rmf_name)
1999 print "something wrong with the rmf"
2002 self.residue_particle_index_map={}
2004 for prot_name
in self.protein_names:
2006 self.residue_particle_index_map[prot_name]=self.get_residue_particle_index_map(prot_name,particles_resolution_one,prots[0])
2009 if number_of_processes > 1:
2014 comm.send(self.structures_dictionary, dest=0, tag=11)
2017 for i
in range(1, number_of_processes):
2018 data_tmp = comm.recv(source=i, tag=11)
2020 for key
in self.structures_dictionary:
2022 self.structures_dictionary[key]+=data_tmp[key]
2024 for i
in range(1, number_of_processes):
2025 comm.send(self.structures_dictionary, dest=i, tag=11)
2028 self.structures_dictionary = comm.recv(source=0, tag=11)
2033 def add_structure(self,rmf_name,rmf_frame_index,structure_set_name):
2035 @structure_set_name string specify a name for the structure set. This is used for the precision and
2038 if structure_set_name
in self.structures_dictionary:
2039 cdict=self.structures_dictionary[structure_set_name]
2040 rmflist=self.rmf_names_frames[structure_set_name]
2042 self.structures_dictionary[structure_set_name]={}
2043 self.rmf_names_frames[structure_set_name]=[]
2044 cdict=self.structures_dictionary[structure_set_name]
2045 rmflist=self.rmf_names_frames[structure_set_name]
2048 (particles_resolution_one, prots)=self.get_structure(rmf_frame_index,rmf_name)
2050 print "something wrong with the rmf"
2053 self.selection_dictionary.update({
"All":self.protein_names})
2055 for selection_name
in self.selection_dictionary:
2056 selection_tuple=self.selection_dictionary[selection_name]
2057 coords=self.select_coordinates(selection_tuple,particles_resolution_one,prots[0])
2059 if selection_name
not in cdict:
2060 cdict[selection_name]=[coords]
2062 cdict[selection_name].append(coords)
2064 rmflist.append((rmf_name,rmf_frame_index))
2070 def get_residue_particle_index_map(self,prot_name,structure,hier):
2071 residue_particle_index_map=[]
2073 all_selected_particles=s.get_selected_particles()
2074 intersection=list(set(all_selected_particles) & set(structure))
2075 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
2076 for p
in sorted_intersection:
2078 return residue_particle_index_map
2081 def select_coordinates(self,tuple_selections,structure,prot):
2082 selected_coordinates=[]
2083 for t
in tuple_selections:
2084 if type(t)==tuple
and len(t)==3:
2086 all_selected_particles=s.get_selected_particles()
2087 intersection=list(set(all_selected_particles) & set(structure))
2088 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
2089 cc=map(
lambda p: tuple(
IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
2090 selected_coordinates+=cc
2094 all_selected_particles=s.get_selected_particles()
2095 intersection=list(set(all_selected_particles) & set(structure))
2096 sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
2097 cc=map(
lambda p: tuple(
IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
2098 selected_coordinates+=cc
2100 print "Selection error"
2102 return selected_coordinates
2104 def set_threshold(self,threshold):
2105 self.threshold=threshold
2108 def get_distance(self,structure_set_name1,structure_set_name2,selection_name,index1,index2):
2109 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
2110 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
2115 if self.style==
'pairwise_drmsd_k':
2117 if self.style==
'pairwise_drms_k':
2119 if self.style==
'pairwise_drmsd_Q':
2122 if self.style==
'pairwise_rmsd':
2127 def get_particle_distances(self,structure_set_name1,structure_set_name2,
2128 selection_name,index1,index2):
2130 c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
2131 c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
2136 distances=[np.linalg.norm(a-b)
for (a,b)
in zip(coordinates1,coordinates2)]
2140 def get_precision(self,outfile,structure_set_name1,structure_set_name2,
2141 is_mpi=
False,skip=
None,selection_keywords=
None):
2144 when the structure_set_name1 is different from the structure_set_name2, you get the
2146 structure_set_name1 string name of the first structure set
2147 structure_set_name2 string name of the second structure set
2148 skip Int analyse every skip structure for the distance matrix calculation
2149 outfile str the output file name
2150 is_mpi bool speed up calculation using openmpi
2151 skip int skip every n frames
2156 from mpi4py
import MPI
2157 comm = MPI.COMM_WORLD
2158 rank = comm.Get_rank()
2159 number_of_processes = comm.size
2161 number_of_processes=1
2164 of=open(outfile,
"w")
2166 if selection_keywords
is None:
2167 sel_keys=self.selection_dictionary.keys()
2169 sel_keys=selection_keywords
2171 for selection_name
in sel_keys:
2173 number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
2174 number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
2178 structure_pointers_1=range(number_of_structures_1)
2179 structure_pointers_2=range(number_of_structures_2)
2181 if skip
is not None:
2182 structure_pointers_1=structure_pointers_1[0::skip]
2183 structure_pointers_2=structure_pointers_2[0::skip]
2185 pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
2187 if len(pair_combination_list)==0:
2188 print "get_precision> no structure selected. Check the skip parameter."
2191 my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(pair_combination_list,number_of_processes)[rank]
2192 my_length=len(my_pair_combination_list)
2194 for n,pair
in enumerate(my_pair_combination_list):
2195 progression=int(float(n)/my_length*100.0)
2197 distances[pair]=self.get_distance(structure_set_name1,structure_set_name2,
2198 selection_name,pair[0],pair[1])
2200 if number_of_processes > 1:
2205 if structure_set_name1==structure_set_name2:
2206 structure_pointers=structure_pointers_1
2207 number_of_structures=number_of_structures_1
2213 distances_to_structure={}
2214 distances_to_structure_normalization={}
2216 for n
in structure_pointers:
2217 distances_to_structure[n]=0.0
2218 distances_to_structure_normalization[n]=0
2221 distance+=distances[k]
2222 distances_to_structure[k[0]]+=distances[k]
2223 distances_to_structure[k[1]]+=distances[k]
2224 distances_to_structure_normalization[k[0]]+=1
2225 distances_to_structure_normalization[k[1]]+=1
2227 for n
in structure_pointers:
2228 distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
2232 min_distance=min([distances_to_structure[n]
for n
in distances_to_structure])
2233 centroid_index=[k
for k, v
in distances_to_structure.iteritems()
if v == min_distance][0]
2234 centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
2236 centroid_distance=0.0
2237 for n
in range(number_of_structures):
2238 centroid_distance+=self.get_distance(structure_set_name1,structure_set_name1,
2239 selection_name,centroid_index,n)
2243 centroid_distance/=number_of_structures
2245 of.write(str(selection_name)+
" "+structure_set_name1+
2246 " average centroid distance "+str(centroid_distance)+
"\n")
2247 of.write(str(selection_name)+
" "+structure_set_name1+
2248 " centroid index "+str(centroid_index)+
"\n")
2249 of.write(str(selection_name)+
" "+structure_set_name1+
2250 " centroid rmf name "+str(centroid_rmf_name)+
"\n")
2252 average_pairwise_distances=sum(distances.values())/len(distances.values())
2253 of.write(str(selection_name)+
" "+structure_set_name1+
" "+structure_set_name2+
2254 " average pairwise distance "+str(average_pairwise_distances)+
"\n")
2258 return centroid_index
2260 def get_rmsf(self,structure_set_name,outdir="./",
2261 is_mpi=
False,skip=
None,
2262 make_plot=
False,set_plot_yaxis_range=
None):
2264 outfile=outdir+
"/rmsf.dat"
2266 centroid_index=self.get_precision(outfile,structure_set_name,structure_set_name,
2267 is_mpi=is_mpi,skip=skip,selection_keywords=[
"All"])
2269 for p
in self.protein_names:
2270 outfile=outdir+
"/rmsf."+p+
".dat"
2271 of=open(outfile,
"w")
2273 self.selection_dictionary.update({selection_name:[p]})
2274 number_of_structures=len(self.structures_dictionary[structure_set_name][selection_name])
2276 rpim=self.residue_particle_index_map[p]
2278 residue_distances={}
2280 for index
in range(number_of_structures):
2283 distances=self.get_particle_distances(structure_set_name,
2286 centroid_index,index)
2287 for nblock,block
in enumerate(rpim):
2289 for residue_number
in block:
2290 residue_nblock[residue_number]=nblock
2291 if residue_number
not in residue_distances:
2292 residue_distances[residue_number]=[distances[nblock]]
2294 residue_distances[residue_number].append(distances[nblock])
2298 for rn
in residue_distances:
2300 rmsf=np.std(residue_distances[rn])
2302 of.write(str(rn)+
" "+str(residue_nblock[rn])+
" "+str(rmsf)+
"\n")
2304 IMP.pmi.output.plot_xy_data(residues,rmsfs,title=outdir+
"/rmsf."+p,display=
False,
2305 set_plot_yaxis_range=
None)
2311 def set_reference_structure(self,rmf_name,rmf_frame_index):
2312 (particles_resolution_one, prot)=self.get_structure(rmf_frame_index,rmf_name)
2313 self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
2316 for selection_name
in self.selection_dictionary:
2317 selection_tuple=self.selection_dictionary[selection_name]
2318 coords=self.select_coordinates(selection_tuple,
2319 particles_resolution_one,prot)
2320 self.reference_structures_dictionary[selection_name]=coords
2323 def get_average_distance_wrt_reference_structure(self,structure_set_name,tuple_selections=None):
2325 for selection_name
in self.selection_dictionary:
2326 reference_coordinates=self.reference_structures_dictionary[selection_name]
2330 for sc
in self.structures_dictionary[structure_set_name][selection_name]:
2332 for rc
in self.reference_structures_dictionary[selection_name]:
2337 if self.style==
'pairwise_drmsd_k':
2339 if self.style==
'pairwise_drms_k':
2341 if self.style==
'pairwise_drmsd_Q':
2344 distances.append(distance)
2347 print selection_name,
"average distance",sum(distances)/len(distances),
"minimum distance",min(distances)
2350 def get_coordinates(self):
2353 def set_precision_style(self, style):
2354 if style
in self.styles:
2357 print "No such style"; exit()
2361 def get_hier_from_rmf(model, frame_number, rmf_file):
2363 print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
2366 rh = RMF.open_rmf_file_read_only(rmf_file)
2371 print "Unable to open rmf file %s" % (rmf_file)
2379 print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
2385 def get_hier_and_restraints_from_rmf(model, frame_number, rmf_file):
2387 print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
2390 rh = RMF.open_rmf_file_read_only(rmf_file)
2396 print "Unable to open rmf file %s" % (rmf_file)
2405 print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
2411 def get_hiers_from_rmf(model, frame_number, rmf_file):
2412 print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
2415 rh = RMF.open_rmf_file_read_only(rmf_file)
2420 print "Unable to open rmf file %s" % (rmf_file)
2427 print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
2436 Get particles at res 1, or any beads, based on the name (no Representation needed)
2437 It is mainly used when the hierarchy is read from an rmf file
2438 Returns a dictionary of component names and their particles
2442 for c
in prot.get_children():
2445 for s
in c.get_children():
2446 if "_Res:1" in s.get_name()
and "_Res:10" not in s.get_name():
2448 if "Beads" in s.get_name():
2452 for name
in particle_dict:
2453 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
2454 list(set(particle_dict[name]) & set(allparticles)))
2455 return particle_dict
2459 Get particles at res 10, or any beads, based on the name (no Representation needed)
2460 It is mainly used when the hierarchy is read from an rmf file
2461 Returns a dictionary of component names and their particles
2465 for c
in prot.get_children():
2468 for s
in c.get_children():
2469 if "_Res:10" in s.get_name():
2471 if "Beads" in s.get_name():
2474 for name
in particle_dict:
2475 particle_dict[name] = IMP.pmi.tools.sort_by_residues(
2476 list(set(particle_dict[name]) & set(allparticles)))
2477 return particle_dict
2481 def select_by_tuple(first_res_last_res_name_tuple):
2482 first_res = first_res_last_res_hier_tuple[0]
2483 last_res = first_res_last_res_hier_tuple[1]
2484 name = first_res_last_res_hier_tuple[2]
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
A decorator to associate a particle with a part of a protein/DNA/RNA.
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
def plot_field_histogram
This function is plotting a list of histograms from a value list.
def plot_fields_box_plots
This function plots time series as boxplots fields is a list of time series, positions are the x-valu...
kernel::Restraints create_restraints(RMF::FileConstHandle fh, kernel::Model *m)
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
void load_frame(RMF::FileConstHandle file, unsigned int frame)
double get_rmsd(const core::XYZs &s0, const core::XYZs &s1, const IMP::algebra::Transformation3D &tr_for_second)
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
Class for sampling a density map from particles.
double get_distance(const Plane3D &pln, const Vector3D &p)
Return the distance between a plane and a point in 3D.
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
This class performs alignment and RMSD calculation for two sets of coordinates.
def add_subunits_density
Add a frame to the densities.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name (no Representation needed) It is mainly used ...
def fill
fill stores coordinates of a model into a dictionary all_coords, containing coordinates for all model...
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name (no Representation needed) It is mainly used...
Python classes to represent, score, sample and analyze models.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
A class to compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.