IMP  2.3.0
The Integrative Modeling Platform
pmi/Analysis.py
1 #!/usr/bin/env python
2 
3 """@namespace IMP.pmi.analysis
4  Analysis tools.
5 """
6 
7 import IMP
8 import IMP.algebra
9 import IMP.em
10 import IMP.pmi
11 import IMP.pmi.tools
12 import IMP.pmi.analysis
13 import IMP.pmi.output
14 import IMP.rmf
15 import RMF
16 from operator import itemgetter
17 from copy import deepcopy
18 from math import log,sqrt
19 import itertools
20 import numpy as np
21 from scipy.spatial.distance import cdist
22 
23 class Alignment(object):
24  """
25  This class performs alignment and RMSD calculation for two sets of coordinates
26 
27  Inputs:
28 
29  - query = {'p1':coords(L,3), 'p2':coords(L,3)}
30  - template = {'p1':coords(L,3), 'p2':coords(L,3)}
31 
32  The class also takes into accout non-equal stoichiometry of the proteins. If this
33  is the case, the protein names of protein in multiple copies should be delivered
34  in the following form: nameA..1, nameA..2 (note two dots).
35  """
36 
37  def __init__(self, template, query, weights=None):
38 
39  self.query = query
40  self.template = template
41  self.weights=weights
42 
43  if len(self.query.keys()) != len(self.template.keys()):
44  print '''ERROR: the number of proteins
45  in template and query does not match!'''
46  exit()
47 
48  def permute(self):
49 
50  self.proteins = sorted(self.query.keys())
51  prots_uniq = [i.split('..')[0] for i in self.proteins]
52  P = {}
53  for p in prots_uniq:
54  np = prots_uniq.count(p)
55  copies = [i for i in self.proteins if i.split('..')[0] == p]
56  prmts = list(itertools.permutations(copies, len(copies)))
57  P[p] = prmts
58  self.P = P
59  self.Product = list(itertools.product(*P.values()))
60 
61  def get_rmsd(self):
62 
63  self.permute()
64 
65  template_xyz = []
66  weights = []
67  torder = sum([list(i) for i in self.Product[0]], [])
68  for t in torder:
69  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
70  if self.weights is not None:
71  weights += [i for i in self.weights[t]]
72  #template_xyz = np.array(template_xyz)
73 
74  self.rmsd = 10000000000.
75  for comb in self.Product:
76 
77 
78 
79  order = sum([list(i) for i in comb], [])
80  query_xyz = []
81  for p in order:
82  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
83  #query_xyz = np.array(query_xyz)
84  #if len(template_xyz) != len(query_xyz):
85  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
86  # in template and query does not match!'''
87  # exit()
88 
89  if self.weights is not None:
90  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
91  else:
92  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
93  #dist = sqrt(
94  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
95  if dist < self.rmsd:
96  self.rmsd = dist
97  return self.rmsd
98 
99  def align(self):
100 
101  self.permute()
102 
103  template_xyz = []
104  torder = sum([list(i) for i in self.Product[0]], [])
105  for t in torder:
106  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
107  #template_xyz = np.array(template_xyz)
108 
109  self.rmsd, Transformation = 10000000000., ''
110  for comb in self.Product:
111  order = sum([list(i) for i in comb], [])
112  query_xyz = []
113  for p in order:
114  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
115  #query_xyz = np.array(query_xyz)
116 
117  if len(template_xyz) != len(query_xyz):
118  print '''ERROR: the number of coordinates
119  in template and query does not match!'''
120  exit()
121 
123  query_xyz,
124  template_xyz)
125  query_xyz_tr = [transformation.get_transformed(n)
126  for n in query_xyz]
127 
128  dist = sqrt(
129  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
130  if dist < self.rmsd:
131  self.rmsd = dist
132  Transformation = transformation
133 
134  return (self.rmsd, Transformation)
135 
136 
137 # TEST for the alignment ###
138 """
139 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
140  'a..2':np.array([np.array([1.,1.,])]),
141  'a..3':np.array([np.array([-2.,1.])]),
142  'b':np.array([np.array([0.,-1.])]),
143  'c..1':np.array([np.array([-1.,-1.])]),
144  'c..2':np.array([np.array([1.,-1.])]),
145  'd':np.array([np.array([0.,0.])]),
146  'e':np.array([np.array([0.,1.])])}
147 
148 Ali = Alignment(Proteins, Proteins)
149 Ali.permute()
150 if Ali.get_rmsd() == 0.0: print 'successful test!'
151 else: print 'ERROR!'; exit()
152 """
153 
154 
155 # ----------------------------------
156 class Violations(object):
157 
158  def __init__(self, filename):
159 
160  self.violation_thresholds = {}
161  self.violation_counts = {}
162 
163  data = open(filename)
164  D = data.readlines()
165  data.close()
166 
167  for d in D:
168  d = d.strip().split()
169  self.violation_thresholds[d[0]] = float(d[1])
170 
171  def get_number_violated_restraints(self, rsts_dict):
172  num_violated = 0
173  for rst in self.violation_thresholds:
174  if rst not in rsts_dict:
175  continue # print rst;
176  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
177  num_violated += 1
178  if rst not in self.violation_counts:
179  self.violation_counts[rst] = 1
180  else:
181  self.violation_counts[rst] += 1
182  return num_violated
183 
184 
185 # ----------------------------------
186 class Clustering(object):
187  """A class to cluster structures.
188  Uses scipy's cdist function to compute distance matrices
189  And sklearn's kmeans clustering module.
190  """
191  def __init__(self,rmsd_weights=None):
192 
193  self.all_coords = {}
194  self.structure_cluster_ids = None
195  self.tmpl_coords = None
196  self.rmsd_weights=rmsd_weights
197 
198  def set_template(self, part_coords):
199 
200  self.tmpl_coords = part_coords
201 
202  def fill(self, frame, Coords):
203  """
204  fill stores coordinates of a model into a dictionary all_coords,
205  containing coordinates for all models.
206  """
207 
208  self.all_coords[frame] = Coords
209 
210  def dist_matrix(self, is_mpi=False):
211 
212  if is_mpi:
213  from mpi4py import MPI
214  comm = MPI.COMM_WORLD
215  rank = comm.Get_rank()
216  number_of_processes = comm.size
217  else:
218  number_of_processes = 1
219  rank = 0
220 
221  self.model_list_names = self.all_coords.keys()
222  self.model_indexes = range(len(self.model_list_names))
223  self.model_indexes_dict = dict(
224  zip(self.model_list_names, self.model_indexes))
225  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
226 
227  my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
228  model_indexes_unique_pairs,
229  number_of_processes)[rank]
230 
231  print "process %s assigned with %s pairs" % (str(rank), str(len(my_model_indexes_unique_pairs)))
232 
233  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
234  self.tmpl_coords,
235  my_model_indexes_unique_pairs)
236 
237  if number_of_processes > 1:
238  raw_distance_dict = IMP.pmi.tools.scatter_and_gather(
239  raw_distance_dict)
240  pickable_transformations = self.get_pickable_transformation_distance_dict(
241  )
242  pickable_transformations = IMP.pmi.tools.scatter_and_gather(
243  pickable_transformations)
244  self.set_transformation_distance_dict_from_pickable(
245  pickable_transformations)
246 
247  self.raw_distance_matrix = np.zeros(
248  (len(self.model_list_names), len(self.model_list_names)))
249  for item in raw_distance_dict:
250  (f1, f2) = item
251  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
252  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
253 
254  def get_dist_matrix(self):
255  return self.raw_distance_matrix
256 
257  def do_cluster(self, number_of_clusters,seed=None):
258  """Run K-means clustering
259  @param number_of_clusters Num means
260  @param seed the random seed
261  """
262  from sklearn.cluster import KMeans
263  if seed is not None:
264  np.random.seed(seed)
265  kmeans = KMeans(n_clusters=number_of_clusters)
266  kmeans.fit_predict(self.raw_distance_matrix)
267 
268  self.structure_cluster_ids = kmeans.labels_
269 
270  def get_pickable_transformation_distance_dict(self):
271  pickable_transformations = {}
272  for label in self.transformation_distance_dict:
273  tr = self.transformation_distance_dict[label]
274  trans = tuple(tr.get_translation())
275  rot = tuple(tr.get_rotation().get_quaternion())
276  pickable_transformations[label] = (rot, trans)
277  return pickable_transformations
278 
279  def set_transformation_distance_dict_from_pickable(
280  self,
281  pickable_transformations):
282  self.transformation_distance_dict = {}
283  for label in pickable_transformations:
284  tr = pickable_transformations[label]
285  trans = IMP.algebra.Vector3D(tr[1])
286  rot = IMP.algebra.Rotation3D(tr[0])
287  self.transformation_distance_dict[
288  label] = IMP.algebra.Transformation3D(rot, trans)
289 
290  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
291  import pickle
292  outf = open(file_name + ".data", 'w')
293 
294  # to pickle the transformation dictionary
295  # you have to save the arrays correposnding to
296  # the transformations
297 
298  pickable_transformations = self.get_pickable_transformation_distance_dict(
299  )
300  pickle.dump(
301  (self.structure_cluster_ids,
302  self.model_list_names,
303  pickable_transformations),
304  outf)
305 
306  np.save(file_name + ".npy", self.raw_distance_matrix)
307 
308  def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
309  import pickle
310 
311  inputf = open(file_name + ".data", 'r')
312  (self.structure_cluster_ids, self.model_list_names,
313  pickable_transformations) = pickle.load(inputf)
314  inputf.close()
315 
316  self.raw_distance_matrix = np.load(file_name + ".npy")
317 
318  self.set_transformation_distance_dict_from_pickable(
319  pickable_transformations)
320  self.model_indexes = range(len(self.model_list_names))
321  self.model_indexes_dict = dict(
322  zip(self.model_list_names, self.model_indexes))
323 
324  def plot_matrix(self, figurename="clustermatrix.pdf"):
325  import pylab as pl
326  from scipy.cluster import hierarchy as hrc
327 
328  fig = pl.figure()
329  ax = fig.add_subplot(211)
330  dendrogram = hrc.dendrogram(
331  hrc.linkage(self.raw_distance_matrix),
332  color_threshold=7,
333  no_labels=True)
334  leaves_order = dendrogram['leaves']
335 
336  ax = fig.add_subplot(212)
337  cax = ax.imshow(
338  self.raw_distance_matrix[leaves_order,
339  :][:,
340  leaves_order],
341  interpolation='nearest')
342  # ax.set_yticks(range(len(self.model_list_names)))
343  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
344  fig.colorbar(cax)
345  pl.savefig(figurename, dpi=300)
346  #pl.show()
347 
348  def get_model_index_from_name(self, name):
349  return self.model_indexes_dict[name]
350 
351  def get_cluster_labels(self):
352  # this list
353  return list(set(self.structure_cluster_ids))
354 
355  def get_number_of_clusters(self):
356  return len(self.get_cluster_labels())
357 
358  def get_cluster_label_indexes(self, label):
359  return (
360  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
361  )
362 
363  def get_cluster_label_names(self, label):
364  return (
365  [self.model_list_names[i]
366  for i in self.get_cluster_label_indexes(label)]
367  )
368 
369  def get_cluster_label_average_rmsd(self, label):
370 
371  indexes = self.get_cluster_label_indexes(label)
372 
373  if len(indexes) > 1:
374  sub_distance_matrix = self.raw_distance_matrix[
375  indexes, :][:, indexes]
376  average_rmsd = np.sum(sub_distance_matrix) / \
377  (len(sub_distance_matrix)
378  ** 2 - len(sub_distance_matrix))
379  else:
380  average_rmsd = 0.0
381  return average_rmsd
382 
383  def get_cluster_label_size(self, label):
384  return len(self.get_cluster_label_indexes(label))
385 
386  def get_transformation_to_first_member(
387  self,
388  cluster_label,
389  structure_index):
390  reference = self.get_cluster_label_indexes(cluster_label)[0]
391  return self.transformation_distance_dict[(reference, structure_index)]
392 
393  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
394 
395  model_list_names = all_coords.keys()
396  rmsd_protein_names = all_coords[model_list_names[0]].keys()
397  raw_distance_dict = {}
398  transformation_distance_dict = {}
399  if template_coords is None:
400  do_alignment = False
401  else:
402  do_alignment = True
403  alignment_template_protein_names = template_coords.keys()
404 
405  for (f1, f2) in list_of_pairs:
406 
407  if not do_alignment:
408  # here we only get the rmsd,
409  # we need that for instance when you want to cluster conformations
410  # globally, eg the EM map is a reference
412 
413  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
414  for pr in rmsd_protein_names])
415  coords_f2 = {}
416  for pr in rmsd_protein_names:
417  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
418 
419  Ali = IMP.pmi.analysis.Alignment(coords_f1, coords_f2, self.rmsd_weights)
420  rmsd = Ali.get_rmsd()
421 
422  elif do_alignment:
423  # here we actually align the conformations first
424  # and than calculate the rmsd. We need that when the
425  # protein(s) is the reference
426  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
427  for pr in alignment_template_protein_names])
428  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
429  for pr in alignment_template_protein_names])
430 
431  Ali = IMP.pmi.analysis.Alignment(coords_f1, coords_f2)
432  template_rmsd, transformation = Ali.align()
433 
434  # here we calculate the rmsd
435  # we will align two models based n the nuber of subunits provided
436  # and transform coordinates of model 2 to model 1
437  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
438  for pr in rmsd_protein_names])
439  coords_f2 = {}
440  for pr in rmsd_protein_names:
441  coords_f2[pr] = [transformation.get_transformed(
442  i) for i in all_coords[model_list_names[f2]][pr]]
443 
444  Ali = IMP.pmi.analysis.Alignment(coords_f1, coords_f2)
445  rmsd = Ali.get_rmsd()
446 
447  raw_distance_dict[(f1, f2)] = rmsd
448  raw_distance_dict[(f2, f1)] = rmsd
449  transformation_distance_dict[(f1, f2)] = transformation
450  transformation_distance_dict[(f2, f1)] = transformation
451 
452  return raw_distance_dict, transformation_distance_dict
453 
454 
455 # ----------------------------------
456 
457 class GetModelDensity(object):
458  """A class to compute mean density maps from structures"""
459  def __init__(self, custom_ranges, representation=None, voxel=5.0):
460  """Setup GetModelDensity object. Keeps a dictionary of density maps,
461  keys are in the custom ranges. When you call add_subunits_density, it adds
462  particle coordinates to the existing density maps.
463  @param custom_ranges Required. It's a dictionary, keys are the density component names,
464  values are selection tuples
465  e.g. {'kin28':[['kin28',1,-1]],
466  'density_name_1' :[('ccl1')],
467  'density_name_2' :[(1,142,'tfb3d1'),(143,700,'tfb3d2')],
468  @param representation PMI representation, for doing selections.
469  not needed if you only pass hierarchies
470  @param voxel The voxel size for the output map (lower is slower)
471 
472  """
473 
474  self.representation = representation
475  self.voxel = voxel
476  self.densities = {}
477  self.count_models = 0.0
478  self.custom_ranges = custom_ranges
479 
480  def add_subunits_density(self, hierarchy=None):
481  """Add a frame to the densities.
482  @param hierarchy Optionally read the hierarchy from somewhere.
483  If not passed, will just read the representation.
484  """
485  self.count_models += 1.0
486  for density_name in self.custom_ranges:
487  parts = []
488  if hierarchy:
489  all_particles_by_segments = []
490 
491  for seg in self.custom_ranges[density_name]:
492  if not hierarchy:
493  parts += IMP.tools.select_by_tuple(self.representation,
494  seg, resolution=1, name_is_ambiguous=False)
495  else:
496  if type(seg) == str:
497  children = [child for child in hierarchy.get_children()
498  if child.get_name() == seg]
499  s = IMP.atom.Selection(children)
500  elif type(seg) == tuple:
501  children = [child for child in hierarchy.get_children()
502  if child.get_name() == seg[2]]
503  s = IMP.atom.Selection(
504  children, residue_indexes=range(seg[0], seg[1] + 1))
505  else:
506  raise Exception('could not understand selection tuple '+str(seg))
507 
508  all_particles_by_segments += s.get_selected_particles()
509 
510  if hierarchy:
511  part_dict = get_particles_at_resolution_one(hierarchy)
512  all_particles_by_resolution = []
513  for name in part_dict:
514  all_particles_by_resolution += part_dict[name]
515  parts = list(
516  set(all_particles_by_segments) & set(all_particles_by_resolution))
517 
518  self._create_density_from_particles(parts, density_name)
519 
520  def normalize_density(self):
521  pass
522 
523  def _create_density_from_particles(self, ps, name,
524  resolution=1,
525  kernel_type='GAUSSIAN'):
526  '''Internal function for adding to densities.
527  pass XYZR particles with mass and create a density from them.
528  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
529  kd = {
530  'GAUSSIAN': IMP.em.GAUSSIAN,
531  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
532  'SPHERE': IMP.em.SPHERE}
533 
534  dmap = IMP.em.SampledDensityMap(ps, resolution, self.voxel)
535  dmap.calcRMS()
536  if name not in self.densities:
537  self.densities[name] = dmap
538  else:
539  bbox1 = IMP.em.get_bounding_box(self.densities[name])
540  bbox2 = IMP.em.get_bounding_box(dmap)
541  bbox1 += bbox2
542  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
543  dmap3.add(dmap)
544  dmap3.add(self.densities[name])
545  self.densities[name] = dmap3
546 
547  def get_density_keys(self):
548  return self.densities.keys()
549 
550  def get_density(self,name):
551  """Get the current density for some component name"""
552  if name not in self.densities:
553  return None
554  else:
555  return self.densities[name]
556 
557  def write_mrc(self, path="./"):
558  for density_name in self.densities:
559  self.densities[density_name].multiply(1. / self.count_models)
561  self.densities[density_name],
562  path + "/" + density_name + ".mrc",
564 
565 # ----------------------------------
566 
567 
568 class GetContactMap(object):
569 
570  def __init__(self, distance=15.):
571  self.distance = distance
572  self.contactmap = ''
573  self.namelist = []
574  self.xlinks = 0
575  self.XL = {}
576  self.expanded = {}
577  self.resmap = {}
578 
579  def set_prot(self, prot):
580  self.prot = prot
581  self.protnames = []
582  coords = []
583  radii = []
584  namelist = []
585 
586  particles_dictionary = get_particles_at_resolution_one(self.prot)
587 
588  for name in particles_dictionary:
589  residue_indexes = []
590  for p in particles_dictionary[name]:
591  print p.get_name()
592  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
593  #residue_indexes.add( )
594 
595  if len(residue_indexes) != 0:
596  self.protnames.append(name)
597  for res in range(min(residue_indexes), max(residue_indexes) + 1):
598  d = IMP.core.XYZR(p)
599  new_name = name + ":" + str(res)
600  if name not in self.resmap:
601  self.resmap[name] = {}
602  if res not in self.resmap:
603  self.resmap[name][res] = {}
604 
605  self.resmap[name][res] = new_name
606  namelist.append(new_name)
607 
608  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
609  coords.append(crd)
610  radii.append(d.get_radius())
611 
612  coords = np.array(coords)
613  radii = np.array(radii)
614 
615  if len(self.namelist) == 0:
616  self.namelist = namelist
617  self.contactmap = np.zeros((len(coords), len(coords)))
618 
619  distances = cdist(coords, coords)
620  distances = (distances - radii).T - radii
621  distances = distances <= self.distance
622 
623  print coords
624  print radii
625  print distances
626 
627  self.contactmap += distances
628 
629  def get_subunit_coords(self, frame, align=0):
630  coords = []
631  radii = []
632  namelist = []
633  test, testr = [], []
634  for part in self.prot.get_children():
635  SortedSegments = []
636  print part
637  for chl in part.get_children():
638  start = IMP.atom.get_leaves(chl)[0]
639  end = IMP.atom.get_leaves(chl)[-1]
640 
641  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
642  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
643  SortedSegments.append((chl, startres))
644  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
645 
646  for sgmnt in SortedSegments:
647  for leaf in IMP.atom.get_leaves(sgmnt[0]):
648  p = IMP.core.XYZR(leaf)
649  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
650 
651  coords.append(crd)
652  radii.append(p.get_radius())
653 
654  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
655  '_' + \
656  str(IMP.atom.Fragment(leaf)
657  .get_residue_indexes()[0])
658  namelist.append(new_name)
659  self.expanded[new_name] = len(
660  IMP.atom.Fragment(leaf).get_residue_indexes())
661  if part.get_name() not in self.resmap:
662  self.resmap[part.get_name()] = {}
663  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
664  self.resmap[part.get_name()][res] = new_name
665 
666  coords = np.array(coords)
667  radii = np.array(radii)
668  if len(self.namelist) == 0:
669  self.namelist = namelist
670  self.contactmap = np.zeros((len(coords), len(coords)))
671  distances = cdist(coords, coords)
672  distances = (distances - radii).T - radii
673  distances = distances <= self.distance
674  self.contactmap += distances
675 
676  def add_xlinks(
677  self,
678  filname,
679  identification_string='ISDCrossLinkMS_Distance_'):
680  # 'ISDCrossLinkMS_Distance_interrb_6629-State:0-20:RPS30_218:eIF3j-1-1-0.1_None'
681  self.xlinks = 1
682  data = open(filname)
683  D = data.readlines()
684  data.close()
685 
686  for d in D:
687  if identification_string in d:
688  d = d.replace(
689  "_",
690  " ").replace("-",
691  " ").replace(":",
692  " ").split()
693 
694  t1, t2 = (d[0], d[1]), (d[1], d[0])
695  if t1 not in self.XL:
696  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
697  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
698  else:
699  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
700  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
701 
702  def dist_matrix(self, skip_cmap=0, skip_xl=1):
703  K = self.namelist
704  M = self.contactmap
705  C, R = [], []
706  L = sum(self.expanded.values())
707  proteins = self.protnames
708 
709  # exp new
710  if skip_cmap == 0:
711  Matrices = {}
712  proteins = [p.get_name() for p in self.prot.get_children()]
713  missing = []
714  for p1 in xrange(len(proteins)):
715  for p2 in xrange(p1, len(proteins)):
716  pl1, pl2 = max(
717  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
718  pn1, pn2 = proteins[p1], proteins[p2]
719  mtr = np.zeros((pl1 + 1, pl2 + 1))
720  print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
721  for i1 in xrange(1, pl1 + 1):
722  for i2 in xrange(1, pl2 + 1):
723  try:
724  r1 = K.index(self.resmap[pn1][i1])
725  r2 = K.index(self.resmap[pn2][i2])
726  r = M[r1, r2]
727  mtr[i1 - 1, i2 - 1] = r
728  except KeyError:
729  missing.append((pn1, pn2, i1, i2))
730  pass
731  Matrices[(pn1, pn2)] = mtr
732 
733  # add cross-links
734  if skip_xl == 0:
735  if self.XL == {}:
736  print "ERROR: cross-links were not provided, use add_xlinks function!"
737  exit()
738  Matrices_xl = {}
739  missing_xl = []
740  for p1 in xrange(len(proteins)):
741  for p2 in xrange(p1, len(proteins)):
742  pl1, pl2 = max(
743  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
744  pn1, pn2 = proteins[p1], proteins[p2]
745  mtr = np.zeros((pl1 + 1, pl2 + 1))
746  flg = 0
747  try:
748  xls = self.XL[(pn1, pn2)]
749  except KeyError:
750  try:
751  xls = self.XL[(pn2, pn1)]
752  flg = 1
753  except KeyError:
754  flg = 2
755  if flg == 0:
756  print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
757  for xl1, xl2 in xls:
758  if xl1 > pl1:
759  print 'X' * 10, xl1, xl2
760  xl1 = pl1
761  if xl2 > pl2:
762  print 'X' * 10, xl1, xl2
763  xl2 = pl2
764  mtr[xl1 - 1, xl2 - 1] = 100
765  elif flg == 1:
766  print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
767  for xl1, xl2 in xls:
768  if xl1 > pl1:
769  print 'X' * 10, xl1, xl2
770  xl1 = pl1
771  if xl2 > pl2:
772  print 'X' * 10, xl1, xl2
773  xl2 = pl2
774  mtr[xl2 - 1, xl1 - 1] = 100
775  else:
776  print 'WTF!'
777  exit()
778  Matrices_xl[(pn1, pn2)] = mtr
779 
780  # expand the matrix to individual residues
781  #NewM = []
782  # for x1 in xrange(len(K)):
783  # lst = []
784  # for x2 in xrange(len(K)):
785  # lst += [M[x1,x2]]*self.expanded[K[x2]]
786  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
787  #NewM = np.array(NewM)
788 
789  # make list of protein names and create coordinate lists
790  C = proteins
791  # W is the component length list,
792  # R is the contiguous coordinates list
793  W, R = [], []
794  for i, c in enumerate(C):
795  cl = max(self.resmap[c].keys())
796  W.append(cl)
797  if i == 0:
798  R.append(cl)
799  else:
800  R.append(R[-1] + cl)
801 
802  # start plotting
803  import matplotlib.pyplot as plt
804  import matplotlib.gridspec as gridspec
805  import scipy.sparse as sparse
806 
807  f = plt.figure()
808  gs = gridspec.GridSpec(len(W), len(W),
809  width_ratios=W,
810  height_ratios=W)
811 
812  cnt = 0
813  for x1, r1 in enumerate(R):
814  if x1 == 0:
815  s1 = 0
816  else:
817  s1 = R[x1 - 1]
818  for x2, r2 in enumerate(R):
819  if x2 == 0:
820  s2 = 0
821  else:
822  s2 = R[x2 - 1]
823 
824  ax = plt.subplot(gs[cnt])
825  if skip_cmap == 0:
826  try:
827  mtr = Matrices[(C[x1], C[x2])]
828  except KeyError:
829  mtr = Matrices[(C[x2], C[x1])].T
830  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
831  cax = ax.imshow(
832  log(mtr),
833  interpolation='nearest',
834  vmin=0.,
835  vmax=log(NewM.max()))
836  ax.set_xticks([])
837  ax.set_yticks([])
838  if skip_xl == 0:
839  try:
840  mtr = Matrices_xl[(C[x1], C[x2])]
841  except KeyError:
842  mtr = Matrices_xl[(C[x2], C[x1])].T
843  cax = ax.spy(
844  sparse.csr_matrix(mtr),
845  markersize=10,
846  color='white',
847  linewidth=100,
848  alpha=0.5)
849  ax.set_xticks([])
850  ax.set_yticks([])
851 
852  cnt += 1
853  if x2 == 0:
854  ax.set_ylabel(C[x1], rotation=90)
855  plt.show()
856 
857 
858 # ------------------------------------------------------------------
859 
860 class CrossLinkTable(object):
861 
862  def __init__(self):
863  self.crosslinks = []
864  self.external_csv_data = None
865  self.crosslinkedprots = set()
866  self.mindist = +10000000.0
867  self.maxdist = -10000000.0
868  self.contactmap = None
869 
870  def set_hierarchy(self, prot):
871  self.prot_length_dict = {}
872  self.model=prot.get_model()
873 
874  for i in prot.get_children():
875  name = i.get_name()
876  residue_indexes = []
877  for p in IMP.atom.get_leaves(i):
878  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
879 
880  if len(residue_indexes) != 0:
881  self.prot_length_dict[name] = max(residue_indexes)
882 
883  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
884 
885  rh= RMF.open_rmf_file_read_only(rmf_name)
886  prots=IMP.rmf.create_hierarchies(rh, self.model)
887  IMP.rmf.load_frame(rh, rmf_frame_index)
888  print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
889  del rh
890 
891 
892  coords = []
893  radii = []
894  namelist = []
895 
896  particles_dictionary = get_particles_at_resolution_one(prots[0])
897 
898  resindex = 0
899  self.index_dictionary = {}
900 
901  for name in particles_dictionary:
902  residue_indexes = []
903  for p in particles_dictionary[name]:
904  print p.get_name()
905  residue_indexes = IMP.pmi.tools.get_residue_indexes(p)
906  #residue_indexes.add( )
907 
908  if len(residue_indexes) != 0:
909 
910  for res in range(min(residue_indexes), max(residue_indexes) + 1):
911  d = IMP.core.XYZR(p)
912 
913  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
914  coords.append(crd)
915  radii.append(d.get_radius())
916  if name not in self.index_dictionary:
917  self.index_dictionary[name] = [resindex]
918  else:
919  self.index_dictionary[name].append(resindex)
920  resindex += 1
921 
922  coords = np.array(coords)
923  radii = np.array(radii)
924 
925  distances = cdist(coords, coords)
926  distances = (distances - radii).T - radii
927 
928  distances = np.where(distances <= 20.0, 1.0, 0)
929  if self.contactmap is None:
930  self.contactmap = np.zeros((len(coords), len(coords)))
931  self.contactmap += distances
932 
933  for prot in prots: IMP.atom.destroy(prot)
934 
935  def set_crosslinks(
936  self, data_file, search_label='ISDCrossLinkMS_Distance_',
937  mapping=None,
938  filter_label=None,
939  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
940  external_csv_data_file=None,
941  external_csv_data_file_unique_id_key="Unique ID"):
942 
943  # example key ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None
944  # mapping is a dictionary that maps standard keywords to entry positions in the key string
945  # confidence class is a filter that
946  # external datafile is a datafile that contains further information on the crosslinks
947  # it will use the unique id to create the dictionary keys
948 
949  po = IMP.pmi.output.ProcessOutput(data_file)
950  keys = po.get_keys()
951 
952  xl_keys = [k for k in keys if search_label in k]
953 
954  if filter_rmf_file_names is not None:
955  rmf_file_key="local_rmf_file_name"
956  fs = po.get_fields(xl_keys+[rmf_file_key])
957  else:
958  fs = po.get_fields(xl_keys)
959 
960  # this dictionary stores the occurency of given crosslinks
961  self.cross_link_frequency = {}
962 
963  # this dictionary stores the series of distances for given crosslinked
964  # residues
965  self.cross_link_distances = {}
966 
967  # this dictionary stores the series of distances for given crosslinked
968  # residues
969  self.cross_link_distances_unique = {}
970 
971  if not external_csv_data_file is None:
972  # this dictionary stores the further information on crosslinks
973  # labeled by unique ID
974  self.external_csv_data = {}
975  xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
976 
977  for xl in xldb:
978  self.external_csv_data[
979  xl[external_csv_data_file_unique_id_key]] = xl
980 
981  # this list keeps track the tuple of cross-links and sample
982  # so that we don't count twice the same crosslinked residues in the
983  # same sample
984  cross_link_frequency_list = []
985 
986  self.unique_cross_link_list = []
987 
988  for key in xl_keys:
989  print key
990  keysplit = key.replace(
991  "_",
992  " ").replace(
993  "-",
994  " ").replace(
995  ":",
996  " ").split(
997  )
998 
999  if filter_label!=None:
1000  if filter_label not in keysplit: continue
1001 
1002  if mapping is None:
1003  r1 = int(keysplit[5])
1004  c1 = keysplit[6]
1005  r2 = int(keysplit[7])
1006  c2 = keysplit[8]
1007  try:
1008  confidence = keysplit[12]
1009  except:
1010  confidence = '0.0'
1011  try:
1012  unique_identifier = keysplit[3]
1013  except:
1014  unique_identifier = '0'
1015  else:
1016  r1 = int(keysplit[mapping["Residue1"]])
1017  c1 = keysplit[mapping["Protein1"]]
1018  r2 = int(keysplit[mapping["Residue2"]])
1019  c2 = keysplit[mapping["Protein2"]]
1020  try:
1021  confidence = keysplit[mapping["Confidence"]]
1022  except:
1023  confidence = '0.0'
1024  try:
1025  unique_identifier = keysplit[mapping["Unique Identifier"]]
1026  except:
1027  unique_identifier = '0'
1028 
1029  self.crosslinkedprots.add(c1)
1030  self.crosslinkedprots.add(c2)
1031 
1032  # construct the list of distances corresponding to the input rmf
1033  # files
1034 
1035  dists=[]
1036  if filter_rmf_file_names is not None:
1037  for n,d in enumerate(fs[key]):
1038  if fs[rmf_file_key][n] in filter_rmf_file_names:
1039  dists.append(float(d))
1040  else:
1041  dists=[float(f) for f in fs[key]]
1042 
1043  # check if the input confidence class corresponds to the
1044  # one of the cross-link
1045 
1046  mdist = self.median(dists)
1047 
1048  stdv = np.std(np.array(dists))
1049  if self.mindist > mdist:
1050  self.mindist = mdist
1051  if self.maxdist < mdist:
1052  self.maxdist = mdist
1053 
1054  # calculate the frequency of unique crosslinks within the same
1055  # sample
1056  if not self.external_csv_data is None:
1057  sample = self.external_csv_data[unique_identifier]["Sample"]
1058  else:
1059  sample = "None"
1060 
1061  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1062  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1063  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1064  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1065  else:
1066  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1067  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1068  cross_link_frequency_list.append((r1, c1, r2, c2))
1069  cross_link_frequency_list.append((r2, c2, r1, c1))
1070  self.unique_cross_link_list.append(
1071  (r1, c1, r2, c2,mdist))
1072 
1073  if (r1, c1, r2, c2) not in self.cross_link_distances:
1074  self.cross_link_distances[(
1075  r1,
1076  c1,
1077  r2,
1078  c2,
1079  mdist,
1080  confidence)] = dists
1081  self.cross_link_distances[(
1082  r2,
1083  c2,
1084  r1,
1085  c1,
1086  mdist,
1087  confidence)] = dists
1088  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1089  else:
1090  self.cross_link_distances[(
1091  r2,
1092  c2,
1093  r1,
1094  c1,
1095  mdist,
1096  confidence)] += dists
1097  self.cross_link_distances[(
1098  r1,
1099  c1,
1100  r2,
1101  c2,
1102  mdist,
1103  confidence)] += dists
1104 
1105  self.crosslinks.append(
1106  (r1,
1107  c1,
1108  r2,
1109  c2,
1110  mdist,
1111  stdv,
1112  confidence,
1113  unique_identifier,
1114  'original'))
1115  self.crosslinks.append(
1116  (r2,
1117  c2,
1118  r1,
1119  c1,
1120  mdist,
1121  stdv,
1122  confidence,
1123  unique_identifier,
1124  'reversed'))
1125 
1126  self.cross_link_frequency_inverted = {}
1127  for xl in self.unique_cross_link_list:
1128  (r1, c1, r2, c2, mdist) = xl
1129  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1130  if frequency not in self.cross_link_frequency_inverted:
1131  self.cross_link_frequency_inverted[
1132  frequency] = [(r1, c1, r2, c2)]
1133  else:
1134  self.cross_link_frequency_inverted[
1135  frequency].append((r1, c1, r2, c2))
1136 
1137  # -------------
1138 
1139  def median(self, mylist):
1140  sorts = sorted(mylist)
1141  length = len(sorts)
1142  print length
1143  if length == 1:
1144  return mylist[0]
1145  if not length % 2:
1146  return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1147  return sorts[length / 2]
1148 
1149  def set_threshold(self,threshold):
1150  self.threshold=threshold
1151 
1152  def set_tolerance(self,tolerance):
1153  self.tolerance=tolerance
1154 
1155  def colormap(self, dist):
1156  if dist < self.threshold - self.tolerance:
1157  return "Green"
1158  elif dist >= self.threshold + self.tolerance:
1159  return "Orange"
1160  else:
1161  return "Red"
1162 
1163  def write_cross_link_database(self, filename, format='csv'):
1164  import csv
1165 
1166  fieldnames = [
1167  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1168  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1169 
1170  if not self.external_csv_data is None:
1171  keys = self.external_csv_data.keys()
1172  innerkeys = self.external_csv_data[keys[0]].keys()
1173  innerkeys.sort()
1174  fieldnames += innerkeys
1175 
1176  dw = csv.DictWriter(
1177  open(filename,
1178  "w"),
1179  delimiter=',',
1180  fieldnames=fieldnames)
1181  dw.writeheader()
1182  for xl in self.crosslinks:
1183  (r1, c1, r2, c2, mdist, stdv, confidence,
1184  unique_identifier, descriptor) = xl
1185  if descriptor == 'original':
1186  outdict = {}
1187  outdict["Unique ID"] = unique_identifier
1188  outdict["Protein1"] = c1
1189  outdict["Protein2"] = c2
1190  outdict["Residue1"] = r1
1191  outdict["Residue2"] = r2
1192  outdict["Median Distance"] = mdist
1193  outdict["Standard Deviation"] = stdv
1194  outdict["Confidence"] = confidence
1195  outdict["Frequency"] = self.cross_link_frequency[
1196  (r1, c1, r2, c2)]
1197  if c1 == c2:
1198  arrangement = "Intra"
1199  else:
1200  arrangement = "Inter"
1201  outdict["Arrangement"] = arrangement
1202  if not self.external_csv_data is None:
1203  outdict.update(self.external_csv_data[unique_identifier])
1204 
1205  dw.writerow(outdict)
1206 
1207  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1208  no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
1209  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
1210  gap_between_components=0,
1211  rename_protein_map=None):
1212  # layout can be:
1213  # "lowerdiagonal" print only the lower diagonal plot
1214  # "upperdiagonal" print only the upper diagonal plot
1215  # "whole" print all
1216  # crosslinkedonly: plot only components that have crosslinks
1217  # no_dist_info: if True will plot only the cross-links as grey spots
1218  # filter = tuple the tuple contains a keyword to be search in the database
1219  # a relationship ">","==","<"
1220  # and a value
1221  # example ("ID_Score",">",40)
1222  # scale_symbol_size rescale the symbol for the crosslink
1223  # rename_protein_map is a dictionary to rename proteins
1224 
1225  import matplotlib.pyplot as plt
1226  import matplotlib.cm as cm
1227 
1228  fig = plt.figure(figsize=(10, 10))
1229  ax = fig.add_subplot(111)
1230 
1231  ax.set_xticks([])
1232  ax.set_yticks([])
1233 
1234  # set the list of proteins on the x axis
1235  if prot_listx is None:
1236  if crosslinkedonly:
1237  prot_listx = list(self.crosslinkedprots)
1238  else:
1239  prot_listx = self.prot_length_dict.keys()
1240  prot_listx.sort()
1241 
1242  nresx = gap_between_components + \
1243  sum([self.prot_length_dict[name]
1244  + gap_between_components for name in prot_listx])
1245 
1246  # set the list of proteins on the y axis
1247 
1248  if prot_listy is None:
1249  if crosslinkedonly:
1250  prot_listy = list(self.crosslinkedprots)
1251  else:
1252  prot_listy = self.prot_length_dict.keys()
1253  prot_listy.sort()
1254 
1255  nresy = gap_between_components + \
1256  sum([self.prot_length_dict[name]
1257  + gap_between_components for name in prot_listy])
1258 
1259  # this is the residue offset for each protein
1260  resoffsetx = {}
1261  resendx = {}
1262  res = gap_between_components
1263  for prot in prot_listx:
1264  resoffsetx[prot] = res
1265  res += self.prot_length_dict[prot]
1266  resendx[prot] = res
1267  res += gap_between_components
1268 
1269  resoffsety = {}
1270  resendy = {}
1271  res = gap_between_components
1272  for prot in prot_listy:
1273  resoffsety[prot] = res
1274  res += self.prot_length_dict[prot]
1275  resendy[prot] = res
1276  res += gap_between_components
1277 
1278  resoffsetdiagonal = {}
1279  res = gap_between_components
1280  for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1281  resoffsetdiagonal[prot] = res
1282  res += self.prot_length_dict[prot]
1283  res += gap_between_components
1284 
1285  # plot protein boundaries
1286 
1287  xticks = []
1288  xlabels = []
1289  for n, prot in enumerate(prot_listx):
1290  res = resoffsetx[prot]
1291  end = resendx[prot]
1292  for proty in prot_listy:
1293  resy = resoffsety[proty]
1294  endy = resendy[proty]
1295  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
1296  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
1297  xticks.append((float(res) + float(end)) / 2)
1298  if rename_protein_map is not None:
1299  if prot in rename_protein_map:
1300  prot=rename_protein_map[prot]
1301  xlabels.append(prot)
1302 
1303  yticks = []
1304  ylabels = []
1305  for n, prot in enumerate(prot_listy):
1306  res = resoffsety[prot]
1307  end = resendy[prot]
1308  for protx in prot_listx:
1309  resx = resoffsetx[protx]
1310  endx = resendx[protx]
1311  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
1312  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
1313  yticks.append((float(res) + float(end)) / 2)
1314  if rename_protein_map is not None:
1315  if prot in rename_protein_map:
1316  prot=rename_protein_map[prot]
1317  ylabels.append(prot)
1318 
1319  # plot the contact map
1320  print prot_listx, prot_listy
1321 
1322  if not self.contactmap is None:
1323  import matplotlib.cm as cm
1324  tmp_array = np.zeros((nresx, nresy))
1325 
1326  for px in prot_listx:
1327  print px
1328  for py in prot_listy:
1329  print py
1330  resx = resoffsety[px]
1331  lengx = resendx[px] - 1
1332  resy = resoffsety[py]
1333  lengy = resendy[py] - 1
1334  indexes_x = self.index_dictionary[px]
1335  minx = min(indexes_x)
1336  maxx = max(indexes_x)
1337  indexes_y = self.index_dictionary[py]
1338  miny = min(indexes_y)
1339  maxy = max(indexes_y)
1340 
1341  print px, py, minx, maxx, miny, maxy
1342 
1343  try:
1344  tmp_array[
1345  resx:lengx,
1346  resy:lengy] = self.contactmap[
1347  minx:maxx,
1348  miny:maxy]
1349  except:
1350  continue
1351 
1352 
1353  ax.imshow(tmp_array,
1354  cmap=cm.binary,
1355  origin='lower',
1356  interpolation='nearest')
1357 
1358  ax.set_xticks(xticks)
1359  ax.set_xticklabels(xlabels, rotation=90)
1360  ax.set_yticks(yticks)
1361  ax.set_yticklabels(ylabels)
1362  ax.set_xlim(0,nresx)
1363  ax.set_ylim(0,nresy)
1364 
1365 
1366  # set the crosslinks
1367 
1368  already_added_xls = []
1369 
1370  for xl in self.crosslinks:
1371 
1372  (r1, c1, r2, c2, mdist, stdv, confidence,
1373  unique_identifier, descriptor) = xl
1374 
1375  if confidence_classes is not None:
1376  if confidence not in confidence_classes:
1377  continue
1378 
1379  try:
1380  pos1 = r1 + resoffsetx[c1]
1381  except:
1382  continue
1383  try:
1384  pos2 = r2 + resoffsety[c2]
1385  except:
1386  continue
1387 
1388  if not filter is None:
1389  xldb = self.external_csv_data[unique_identifier]
1390  xldb.update({"Distance": mdist})
1391  xldb.update({"Distance_stdv": stdv})
1392 
1393  if filter[1] == ">":
1394  if float(xldb[filter[0]]) <= float(filter[2]):
1395  continue
1396 
1397  if filter[1] == "<":
1398  if float(xldb[filter[0]]) >= float(filter[2]):
1399  continue
1400 
1401  if filter[1] == "==":
1402  if float(xldb[filter[0]]) != float(filter[2]):
1403  continue
1404 
1405  # all that below is used for plotting the diagonal
1406  # when you have a rectangolar plots
1407 
1408  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
1409  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
1410 
1411  if layout == 'lowerdiagonal':
1412  if pos_for_diagonal1 <= pos_for_diagonal2:
1413  continue
1414  if layout == 'upperdiagonal':
1415  if pos_for_diagonal1 >= pos_for_diagonal2:
1416  continue
1417 
1418  already_added_xls.append((r1, c1, r2, c2))
1419 
1420  if not no_confidence_info:
1421  if confidence == '0.01':
1422  markersize = 14 * scale_symbol_size
1423  elif confidence == '0.05':
1424  markersize = 9 * scale_symbol_size
1425  elif confidence == '0.1':
1426  markersize = 6 * scale_symbol_size
1427  else:
1428  markersize = 15 * scale_symbol_size
1429  else:
1430  markersize = 5 * scale_symbol_size
1431 
1432  if not no_dist_info:
1433  color = self.colormap(mdist)
1434  else:
1435  color = "gray"
1436 
1437  ax.plot(
1438  [pos1],
1439  [pos2],
1440  'o',
1441  c=color,
1442  alpha=alphablend,
1443  markersize=markersize)
1444 
1445 
1446 
1447  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
1448 
1449  [i.set_linewidth(2.0) for i in ax.spines.itervalues()]
1450 
1451  #plt.tight_layout()
1452 
1453  if filename:
1454  plt.savefig(filename + ".pdf", dpi=300, transparent="False")
1455 
1456 
1457  plt.show()
1458 
1459  def get_frequency_statistics(self, prot_list,
1460  prot_list2=None):
1461 
1462  violated_histogram = {}
1463  satisfied_histogram = {}
1464  unique_cross_links = []
1465 
1466  for xl in self.unique_cross_link_list:
1467  (r1, c1, r2, c2, mdist) = xl
1468 
1469  # here we filter by the protein
1470  if prot_list2 is None:
1471  if not c1 in prot_list:
1472  continue
1473  if not c2 in prot_list:
1474  continue
1475  else:
1476  if c1 in prot_list and c2 in prot_list2:
1477  pass
1478  elif c1 in prot_list2 and c2 in prot_list:
1479  pass
1480  else:
1481  continue
1482 
1483  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1484 
1485  if (r1, c1, r2, c2) not in unique_cross_links:
1486  if mdist > 35.0:
1487  if frequency not in violated_histogram:
1488  violated_histogram[frequency] = 1
1489  else:
1490  violated_histogram[frequency] += 1
1491  else:
1492  if frequency not in satisfied_histogram:
1493  satisfied_histogram[frequency] = 1
1494  else:
1495  satisfied_histogram[frequency] += 1
1496  unique_cross_links.append((r1, c1, r2, c2))
1497  unique_cross_links.append((r2, c2, r1, c1))
1498 
1499  print "# satisfied"
1500 
1501  total_number_of_crosslinks=0
1502 
1503  for i in satisfied_histogram:
1504  # if i in violated_histogram:
1505  # print i, satisfied_histogram[i]+violated_histogram[i]
1506  # else:
1507  if i in violated_histogram:
1508  print i, violated_histogram[i]+satisfied_histogram[i]
1509  else:
1510  print i, satisfied_histogram[i]
1511  total_number_of_crosslinks+=i*satisfied_histogram[i]
1512 
1513  print "# violated"
1514 
1515  for i in violated_histogram:
1516  print i, violated_histogram[i]
1517  total_number_of_crosslinks+=i*violated_histogram[i]
1518 
1519  print total_number_of_crosslinks
1520 
1521 
1522 # ------------
1523  def print_cross_link_binary_symbols(self, prot_list,
1524  prot_list2=None):
1525  tmp_matrix = []
1526  confidence_list = []
1527  for xl in self.crosslinks:
1528  (r1, c1, r2, c2, mdist, stdv, confidence,
1529  unique_identifier, descriptor) = xl
1530 
1531  if prot_list2 is None:
1532  if not c1 in prot_list:
1533  continue
1534  if not c2 in prot_list:
1535  continue
1536  else:
1537  if c1 in prot_list and c2 in prot_list2:
1538  pass
1539  elif c1 in prot_list2 and c2 in prot_list:
1540  pass
1541  else:
1542  continue
1543 
1544  if descriptor != "original":
1545  continue
1546 
1547  confidence_list.append(confidence)
1548 
1549  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
1550  tmp_dist_binary = []
1551  for d in dists:
1552  if d < 35:
1553  tmp_dist_binary.append(1)
1554  else:
1555  tmp_dist_binary.append(0)
1556  tmp_matrix.append(tmp_dist_binary)
1557 
1558  matrix = zip(*tmp_matrix)
1559 
1560  satisfied_high_sum = 0
1561  satisfied_mid_sum = 0
1562  satisfied_low_sum = 0
1563  total_satisfied_sum = 0
1564  for k, m in enumerate(matrix):
1565  satisfied_high = 0
1566  total_high = 0
1567  satisfied_mid = 0
1568  total_mid = 0
1569  satisfied_low = 0
1570  total_low = 0
1571  total_satisfied = 0
1572  total = 0
1573  for n, b in enumerate(m):
1574  if confidence_list[n] == "0.01":
1575  total_high += 1
1576  if b == 1:
1577  satisfied_high += 1
1578  satisfied_high_sum += 1
1579  elif confidence_list[n] == "0.05":
1580  total_mid += 1
1581  if b == 1:
1582  satisfied_mid += 1
1583  satisfied_mid_sum += 1
1584  elif confidence_list[n] == "0.1":
1585  total_low += 1
1586  if b == 1:
1587  satisfied_low += 1
1588  satisfied_low_sum += 1
1589  if b == 1:
1590  total_satisfied += 1
1591  total_satisfied_sum += 1
1592  total += 1
1593  print k, satisfied_high, total_high
1594  print k, satisfied_mid, total_mid
1595  print k, satisfied_low, total_low
1596  print k, total_satisfied, total
1597  print float(satisfied_high_sum) / len(matrix)
1598  print float(satisfied_mid_sum) / len(matrix)
1599  print float(satisfied_low_sum) / len(matrix)
1600 # ------------
1601 
1602  def get_unique_crosslinks_statistics(self, prot_list,
1603  prot_list2=None):
1604 
1605  print prot_list
1606  print prot_list2
1607  satisfied_high = 0
1608  total_high = 0
1609  satisfied_mid = 0
1610  total_mid = 0
1611  satisfied_low = 0
1612  total_low = 0
1613  total = 0
1614  tmp_matrix = []
1615  satisfied_string = []
1616  for xl in self.crosslinks:
1617  (r1, c1, r2, c2, mdist, stdv, confidence,
1618  unique_identifier, descriptor) = xl
1619 
1620  if prot_list2 is None:
1621  if not c1 in prot_list:
1622  continue
1623  if not c2 in prot_list:
1624  continue
1625  else:
1626  if c1 in prot_list and c2 in prot_list2:
1627  pass
1628  elif c1 in prot_list2 and c2 in prot_list:
1629  pass
1630  else:
1631  continue
1632 
1633  if descriptor != "original":
1634  continue
1635 
1636  total += 1
1637  if confidence == "0.01":
1638  total_high += 1
1639  if mdist <= 35:
1640  satisfied_high += 1
1641  if confidence == "0.05":
1642  total_mid += 1
1643  if mdist <= 35:
1644  satisfied_mid += 1
1645  if confidence == "0.1":
1646  total_low += 1
1647  if mdist <= 35:
1648  satisfied_low += 1
1649  if mdist <= 35:
1650  satisfied_string.append(1)
1651  else:
1652  satisfied_string.append(0)
1653 
1654  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
1655  tmp_dist_binary = []
1656  for d in dists:
1657  if d < 35:
1658  tmp_dist_binary.append(1)
1659  else:
1660  tmp_dist_binary.append(0)
1661  tmp_matrix.append(tmp_dist_binary)
1662 
1663  print "unique satisfied_high/total_high", satisfied_high, "/", total_high
1664  print "unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid
1665  print "unique satisfied_low/total_low", satisfied_low, "/", total_low
1666  print "total", total
1667 
1668  matrix = zip(*tmp_matrix)
1669  satisfied_models = 0
1670  satstr = ""
1671  for b in satisfied_string:
1672  if b == 0:
1673  satstr += "-"
1674  if b == 1:
1675  satstr += "*"
1676 
1677  for m in matrix:
1678  all_satisfied = True
1679  string = ""
1680  for n, b in enumerate(m):
1681  if b == 0:
1682  string += "0"
1683  if b == 1:
1684  string += "1"
1685  if b == 1 and satisfied_string[n] == 1:
1686  pass
1687  elif b == 1 and satisfied_string[n] == 0:
1688  pass
1689  elif b == 0 and satisfied_string[n] == 0:
1690  pass
1691  elif b == 0 and satisfied_string[n] == 1:
1692  all_satisfied = False
1693  if all_satisfied:
1694  satisfied_models += 1
1695  print string
1696  print satstr, all_satisfied
1697  print "models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix)
1698 
1699  def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
1700  prot_list2=None):
1701 
1702  import pylab as pl
1703 
1704  tmp_matrix = []
1705  for kw in self.cross_link_distances_unique:
1706  (r1, c1, r2, c2) = kw
1707  dists = self.cross_link_distances_unique[kw]
1708 
1709  if prot_list2 is None:
1710  if not c1 in prot_list:
1711  continue
1712  if not c2 in prot_list:
1713  continue
1714  else:
1715  if c1 in prot_list and c2 in prot_list2:
1716  pass
1717  elif c1 in prot_list2 and c2 in prot_list:
1718  pass
1719  else:
1720  continue
1721  # append the sum of dists to order by that in the matrix plot
1722  dists.append(sum(dists))
1723  tmp_matrix.append(dists)
1724 
1725  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
1726 
1727  # print len(tmp_matrix), len(tmp_matrix[0])-1
1728  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
1729 
1730  for i in range(len(tmp_matrix)):
1731  for k in range(len(tmp_matrix[i]) - 1):
1732  matrix[i][k] = tmp_matrix[i][k]
1733 
1734  print matrix
1735 
1736  fig = pl.figure()
1737  ax = fig.add_subplot(211)
1738 
1739  cax = ax.imshow(matrix, interpolation='nearest')
1740  # ax.set_yticks(range(len(self.model_list_names)))
1741  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
1742  fig.colorbar(cax)
1743  pl.savefig(figurename, dpi=300)
1744  pl.show()
1745 
1746  def plot_bars(
1747  self,
1748  filename,
1749  prots1,
1750  prots2,
1751  nxl_per_row=20,
1752  arrangement="inter",
1753  confidence_input="None"):
1754 
1755  data = []
1756  for xl in self.cross_link_distances:
1757  (r1, c1, r2, c2, mdist, confidence) = xl
1758  if c1 in prots1 and c2 in prots2:
1759  if arrangement == "inter" and c1 == c2:
1760  continue
1761  if arrangement == "intra" and c1 != c2:
1762  continue
1763  if confidence_input == confidence:
1764  label = str(c1) + ":" + str(r1) + \
1765  "-" + str(c2) + ":" + str(r2)
1766  values = self.cross_link_distances[xl]
1767  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1768  data.append((label, values, mdist, frequency))
1769 
1770  sort_by_dist = sorted(data, key=lambda tup: tup[2])
1771  sort_by_dist = zip(*sort_by_dist)
1772  values = sort_by_dist[1]
1773  positions = range(len(values))
1774  labels = sort_by_dist[0]
1775  frequencies = map(float, sort_by_dist[3])
1776  frequencies = [f * 10.0 for f in frequencies]
1777 
1778  nchunks = int(float(len(values)) / nxl_per_row)
1779  values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
1780  positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
1781  positions,
1782  nchunks)
1783  frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
1784  frequencies,
1785  nchunks)
1786  labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
1787 
1788  for n, v in enumerate(values_chunks):
1789  p = positions_chunks[n]
1790  f = frequencies_chunks[n]
1791  l = labels_chunks[n]
1793  filename + "." + str(n), v, p, f,
1794  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
1795 
1796  def crosslink_distance_histogram(self, filename,
1797  prot_list=None,
1798  prot_list2=None,
1799  confidence_classes=None,
1800  bins=40,
1801  color='#66CCCC',
1802  yplotrange=[0, 1],
1803  format="png",
1804  normalized=False):
1805  if prot_list is None:
1806  prot_list = self.prot_length_dict.keys()
1807 
1808  distances = []
1809  for xl in self.crosslinks:
1810  (r1, c1, r2, c2, mdist, stdv, confidence,
1811  unique_identifier, descriptor) = xl
1812 
1813  if not confidence_classes is None:
1814  if confidence not in confidence_classes:
1815  continue
1816 
1817  if prot_list2 is None:
1818  if not c1 in prot_list:
1819  continue
1820  if not c2 in prot_list:
1821  continue
1822  else:
1823  if c1 in prot_list and c2 in prot_list2:
1824  pass
1825  elif c1 in prot_list2 and c2 in prot_list:
1826  pass
1827  else:
1828  continue
1829 
1830  distances.append(mdist)
1831 
1833  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
1834  bins=bins, color=color,
1835  format=format,
1836  reference_xline=35.0,
1837  yplotrange=yplotrange, normalized=normalized)
1838 
1839  def scatter_plot_xl_features(self, filename,
1840  feature1=None,
1841  feature2=None,
1842  prot_list=None,
1843  prot_list2=None,
1844  yplotrange=None,
1845  reference_ylines=None,
1846  distance_color=True,
1847  format="png"):
1848  import matplotlib.pyplot as plt
1849  import matplotlib.cm as cm
1850 
1851  fig = plt.figure(figsize=(10, 10))
1852  ax = fig.add_subplot(111)
1853 
1854  for xl in self.crosslinks:
1855  (r1, c1, r2, c2, mdist, stdv, confidence,
1856  unique_identifier, arrangement) = xl
1857 
1858  if prot_list2 is None:
1859  if not c1 in prot_list:
1860  continue
1861  if not c2 in prot_list:
1862  continue
1863  else:
1864  if c1 in prot_list and c2 in prot_list2:
1865  pass
1866  elif c1 in prot_list2 and c2 in prot_list:
1867  pass
1868  else:
1869  continue
1870 
1871  xldb = self.external_csv_data[unique_identifier]
1872  xldb.update({"Distance": mdist})
1873  xldb.update({"Distance_stdv": stdv})
1874 
1875  xvalue = float(xldb[feature1])
1876  yvalue = float(xldb[feature2])
1877 
1878  if distance_color:
1879  color = self.colormap(mdist)
1880  else:
1881  color = "gray"
1882 
1883  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
1884 
1885  if not yplotrange is None:
1886  ax.set_ylim(yplotrange)
1887  if not reference_ylines is None:
1888  for rl in reference_ylines:
1889  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
1890 
1891  if filename:
1892  plt.savefig(filename + "." + format, dpi=150, transparent="False")
1893 
1894  plt.show()
1895 
1896 #
1897 # these are post production function analysis
1898 #
1899 
1900 class Precision(object):
1901  def __init__(self,model,resolution='one',selection_dictionary=None):
1902 
1903  ''' selection_dictionary is a dictionary where we store coordinates
1904  selection_dictionary = {"Selection_name_1":selection_tuple1,
1905  "Selection_name_2":selection_tuple2}'''
1906 
1907 
1908  self.styles=['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q','pairwise_drms_k','pairwise_rmsd','drmsd_from_center']
1909  self.style='pairwise_drmsd_k'
1910  self.structures_dictionary={}
1911  self.reference_structures_dictionary={}
1912  self.prots=[]
1913  self.protein_names=None
1914  self.len_particles_resolution_one=None
1915  self.model=model
1916  self.rmf_names_frames={}
1917  self.reference_rmf_names_frames=None
1918  self.reference_structure=None
1919  self.reference_prot=None
1920  self.selection_dictionary=selection_dictionary
1921  self.threshold=40.0
1922 
1923  self.residue_particle_index_map=None
1924 
1925  if resolution in ["one", "ten"]:
1926  self.resolution=resolution
1927  else:
1928  print "no such resolution"; exit()
1929 
1930 
1931 
1932  def get_structure(self,rmf_frame_index,rmf_name):
1933 
1934  rh= RMF.open_rmf_file_read_only(rmf_name)
1935  prots=IMP.rmf.create_hierarchies(rh, self.model)
1936  IMP.rmf.load_frame(rh, rmf_frame_index)
1937  print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
1938  del rh
1939 
1940  if self.resolution=='one':
1941  particle_dict=get_particles_at_resolution_one(prots[0])
1942  elif self.resolution=='ten':
1943  particle_dict=get_particles_at_resolution_ten(prots[0])
1944 
1945  protein_names=particle_dict.keys()
1946  particles_resolution_one=[]
1947  for k in particle_dict:
1948  particles_resolution_one+=(particle_dict[k])
1949 
1950  if self.protein_names==None:
1951  self.protein_names=protein_names
1952  else:
1953  if self.protein_names!=protein_names:
1954  print "Error: the protein names of the new coordinate set is not compatible with the previous one"
1955  exit()
1956 
1957  if self.len_particles_resolution_one==None:
1958  self.len_particles_resolution_one=len(particles_resolution_one)
1959  else:
1960  if self.len_particles_resolution_one!=len(particles_resolution_one):
1961  print "Error: the new coordinate set is not compatible with the previous one"
1962  exit()
1963 
1964  return particles_resolution_one, prots
1965 
1966 
1967  def add_structures(self,rmf_name_frame_tuples,structure_set_name,is_mpi=False):
1968  '''this function helps reading a list of rmfs, the input is a list of
1969  tuples, containing the file name and the frame number'''
1970 
1971  if is_mpi:
1972  from mpi4py import MPI
1973  comm = MPI.COMM_WORLD
1974  rank = comm.Get_rank()
1975  number_of_processes = comm.size
1976  else:
1977  number_of_processes=1
1978  rank=0
1979 
1980  my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(rmf_name_frame_tuples,number_of_processes)[rank]
1981 
1982 
1983  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
1984  rmf_name=tup[0]
1985  rmf_frame_index=tup[1]
1986  # the first frame stores the map between residues and particles
1987  self.add_structure(rmf_name,rmf_frame_index,structure_set_name)
1988  try:
1989  (a,b)=self.get_structure(rmf_frame_index,rmf_name)
1990  except:
1991  print "something wrong with the rmf"
1992  continue
1993  if self.residue_particle_index_map is None:
1994 
1995 
1996  try:
1997  (particles_resolution_one, prots)=self.get_structure(rmf_frame_index,rmf_name)
1998  except:
1999  print "something wrong with the rmf"
2000  continue
2001 
2002  self.residue_particle_index_map={}
2003 
2004  for prot_name in self.protein_names:
2005 
2006  self.residue_particle_index_map[prot_name]=self.get_residue_particle_index_map(prot_name,particles_resolution_one,prots[0])
2007 
2008 
2009  if number_of_processes > 1:
2010  # synchronize the internal self.selection_dictionary data structure
2011  self.rmf_names_frames=IMP.pmi.tools.scatter_and_gather(self.rmf_names_frames)
2012 
2013  if rank != 0:
2014  comm.send(self.structures_dictionary, dest=0, tag=11)
2015 
2016  elif rank == 0:
2017  for i in range(1, number_of_processes):
2018  data_tmp = comm.recv(source=i, tag=11)
2019 
2020  for key in self.structures_dictionary:
2021 
2022  self.structures_dictionary[key]+=data_tmp[key]
2023 
2024  for i in range(1, number_of_processes):
2025  comm.send(self.structures_dictionary, dest=i, tag=11)
2026 
2027  if rank != 0:
2028  self.structures_dictionary = comm.recv(source=0, tag=11)
2029 
2030 
2031 
2032 
2033  def add_structure(self,rmf_name,rmf_frame_index,structure_set_name):
2034  '''
2035  @structure_set_name string specify a name for the structure set. This is used for the precision and
2036  cross-precision
2037  '''
2038  if structure_set_name in self.structures_dictionary:
2039  cdict=self.structures_dictionary[structure_set_name]
2040  rmflist=self.rmf_names_frames[structure_set_name]
2041  else:
2042  self.structures_dictionary[structure_set_name]={}
2043  self.rmf_names_frames[structure_set_name]=[]
2044  cdict=self.structures_dictionary[structure_set_name]
2045  rmflist=self.rmf_names_frames[structure_set_name]
2046 
2047  try:
2048  (particles_resolution_one, prots)=self.get_structure(rmf_frame_index,rmf_name)
2049  except:
2050  print "something wrong with the rmf"
2051  return 0
2052 
2053  self.selection_dictionary.update({"All":self.protein_names})
2054 
2055  for selection_name in self.selection_dictionary:
2056  selection_tuple=self.selection_dictionary[selection_name]
2057  coords=self.select_coordinates(selection_tuple,particles_resolution_one,prots[0])
2058 
2059  if selection_name not in cdict:
2060  cdict[selection_name]=[coords]
2061  else:
2062  cdict[selection_name].append(coords)
2063 
2064  rmflist.append((rmf_name,rmf_frame_index))
2065 
2066  for prot in prots: IMP.atom.destroy(prot)
2067 
2068 
2069 
2070  def get_residue_particle_index_map(self,prot_name,structure,hier):
2071  residue_particle_index_map=[]
2072  s=IMP.atom.Selection(hier,molecules=[prot_name])
2073  all_selected_particles=s.get_selected_particles()
2074  intersection=list(set(all_selected_particles) & set(structure))
2075  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
2076  for p in sorted_intersection:
2077  residue_particle_index_map.append(IMP.pmi.tools.get_residue_indexes(p))
2078  return residue_particle_index_map
2079 
2080 
2081  def select_coordinates(self,tuple_selections,structure,prot):
2082  selected_coordinates=[]
2083  for t in tuple_selections:
2084  if type(t)==tuple and len(t)==3:
2085  s=IMP.atom.Selection(prot,molecules=[t[0]],residue_indexes=range(t[1],t[2]+1))
2086  all_selected_particles=s.get_selected_particles()
2087  intersection=list(set(all_selected_particles) & set(structure))
2088  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
2089  cc=map(lambda p: tuple(IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
2090  selected_coordinates+=cc
2091 
2092  elif type(t)==str:
2093  s=IMP.atom.Selection(prot,molecules=[t])
2094  all_selected_particles=s.get_selected_particles()
2095  intersection=list(set(all_selected_particles) & set(structure))
2096  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
2097  cc=map(lambda p: tuple(IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
2098  selected_coordinates+=cc
2099  else:
2100  print "Selection error"
2101  exit()
2102  return selected_coordinates
2103 
2104  def set_threshold(self,threshold):
2105  self.threshold=threshold
2106 
2107 
2108  def get_distance(self,structure_set_name1,structure_set_name2,selection_name,index1,index2):
2109  c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
2110  c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
2111 
2112  coordinates1=map(lambda c: IMP.algebra.Vector3D(c), c1)
2113  coordinates2=map(lambda c: IMP.algebra.Vector3D(c), c2)
2114 
2115  if self.style=='pairwise_drmsd_k':
2116  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
2117  if self.style=='pairwise_drms_k':
2118  distance=IMP.atom.get_drms(coordinates1,coordinates2)
2119  if self.style=='pairwise_drmsd_Q':
2120  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
2121 
2122  if self.style=='pairwise_rmsd':
2123  distance=IMP.atom.get_rmsd(coordinates1,coordinates2,IMP.algebra.get_identity_transformation_3d())
2124 
2125  return distance
2126 
2127  def get_particle_distances(self,structure_set_name1,structure_set_name2,
2128  selection_name,index1,index2):
2129  import numpy as np
2130  c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
2131  c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
2132 
2133  coordinates1=map(lambda c: IMP.algebra.Vector3D(c), c1)
2134  coordinates2=map(lambda c: IMP.algebra.Vector3D(c), c2)
2135 
2136  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
2137 
2138  return distances
2139 
2140  def get_precision(self,outfile,structure_set_name1,structure_set_name2,
2141  is_mpi=False,skip=None,selection_keywords=None):
2142 
2143  '''
2144  when the structure_set_name1 is different from the structure_set_name2, you get the
2145  cross-precision
2146  structure_set_name1 string name of the first structure set
2147  structure_set_name2 string name of the second structure set
2148  skip Int analyse every skip structure for the distance matrix calculation
2149  outfile str the output file name
2150  is_mpi bool speed up calculation using openmpi
2151  skip int skip every n frames
2152 
2153  '''
2154 
2155  if is_mpi:
2156  from mpi4py import MPI
2157  comm = MPI.COMM_WORLD
2158  rank = comm.Get_rank()
2159  number_of_processes = comm.size
2160  else:
2161  number_of_processes=1
2162  rank=0
2163 
2164  of=open(outfile,"w")
2165 
2166  if selection_keywords is None:
2167  sel_keys=self.selection_dictionary.keys()
2168  else:
2169  sel_keys=selection_keywords
2170  centroid_index=0
2171  for selection_name in sel_keys:
2172 
2173  number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
2174  number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
2175 
2176  distances={}
2177 
2178  structure_pointers_1=range(number_of_structures_1)
2179  structure_pointers_2=range(number_of_structures_2)
2180 
2181  if skip is not None:
2182  structure_pointers_1=structure_pointers_1[0::skip]
2183  structure_pointers_2=structure_pointers_2[0::skip]
2184 
2185  pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
2186 
2187  if len(pair_combination_list)==0:
2188  print "get_precision> no structure selected. Check the skip parameter."
2189  exit()
2190 
2191  my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(pair_combination_list,number_of_processes)[rank]
2192  my_length=len(my_pair_combination_list)
2193 
2194  for n,pair in enumerate(my_pair_combination_list):
2195  progression=int(float(n)/my_length*100.0)
2196  #print rank,progression,len(pair_combination_list),my_length,n
2197  distances[pair]=self.get_distance(structure_set_name1,structure_set_name2,
2198  selection_name,pair[0],pair[1])
2199 
2200  if number_of_processes > 1:
2201  distances = IMP.pmi.tools.scatter_and_gather(distances)
2202 
2203  if rank == 0:
2204 
2205  if structure_set_name1==structure_set_name2:
2206  structure_pointers=structure_pointers_1
2207  number_of_structures=number_of_structures_1
2208 
2209  # calculate the distance from the first centroid
2210  # and determine the centroid
2211 
2212  distance=0.0
2213  distances_to_structure={}
2214  distances_to_structure_normalization={}
2215 
2216  for n in structure_pointers:
2217  distances_to_structure[n]=0.0
2218  distances_to_structure_normalization[n]=0
2219 
2220  for k in distances:
2221  distance+=distances[k]
2222  distances_to_structure[k[0]]+=distances[k]
2223  distances_to_structure[k[1]]+=distances[k]
2224  distances_to_structure_normalization[k[0]]+=1
2225  distances_to_structure_normalization[k[1]]+=1
2226 
2227  for n in structure_pointers:
2228  distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
2229 
2230  #for n,d in enumerate(distances_to_structure):
2231  # print self.rmf_names_frames[n],d
2232  min_distance=min([distances_to_structure[n] for n in distances_to_structure])
2233  centroid_index=[k for k, v in distances_to_structure.iteritems() if v == min_distance][0]
2234  centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
2235 
2236  centroid_distance=0.0
2237  for n in range(number_of_structures):
2238  centroid_distance+=self.get_distance(structure_set_name1,structure_set_name1,
2239  selection_name,centroid_index,n)
2240 
2241 
2242  #pairwise_distance=distance/len(distances.keys())
2243  centroid_distance/=number_of_structures
2244  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
2245  of.write(str(selection_name)+" "+structure_set_name1+
2246  " average centroid distance "+str(centroid_distance)+"\n")
2247  of.write(str(selection_name)+" "+structure_set_name1+
2248  " centroid index "+str(centroid_index)+"\n")
2249  of.write(str(selection_name)+" "+structure_set_name1+
2250  " centroid rmf name "+str(centroid_rmf_name)+"\n")
2251 
2252  average_pairwise_distances=sum(distances.values())/len(distances.values())
2253  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
2254  " average pairwise distance "+str(average_pairwise_distances)+"\n")
2255 
2256 
2257  of.close()
2258  return centroid_index
2259 
2260  def get_rmsf(self,structure_set_name,outdir="./",
2261  is_mpi=False,skip=None,
2262  make_plot=False,set_plot_yaxis_range=None):
2263 
2264  outfile=outdir+"/rmsf.dat"
2265  # get the centroid structure for the whole complex
2266  centroid_index=self.get_precision(outfile,structure_set_name,structure_set_name,
2267  is_mpi=is_mpi,skip=skip,selection_keywords=["All"])
2268 
2269  for p in self.protein_names:
2270  outfile=outdir+"/rmsf."+p+".dat"
2271  of=open(outfile,"w")
2272  selection_name=p
2273  self.selection_dictionary.update({selection_name:[p]})
2274  number_of_structures=len(self.structures_dictionary[structure_set_name][selection_name])
2275 
2276  rpim=self.residue_particle_index_map[p]
2277 
2278  residue_distances={}
2279  residue_nblock={}
2280  for index in range(number_of_structures):
2281 
2282 
2283  distances=self.get_particle_distances(structure_set_name,
2284  structure_set_name,
2285  selection_name,
2286  centroid_index,index)
2287  for nblock,block in enumerate(rpim):
2288 
2289  for residue_number in block:
2290  residue_nblock[residue_number]=nblock
2291  if residue_number not in residue_distances:
2292  residue_distances[residue_number]=[distances[nblock]]
2293  else:
2294  residue_distances[residue_number].append(distances[nblock])
2295 
2296  residues=[]
2297  rmsfs=[]
2298  for rn in residue_distances:
2299  residues.append(rn)
2300  rmsf=np.std(residue_distances[rn])
2301  rmsfs.append(rmsf)
2302  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
2303 
2304  IMP.pmi.output.plot_xy_data(residues,rmsfs,title=outdir+"/rmsf."+p,display=False,
2305  set_plot_yaxis_range=None)
2306  of.close()
2307 
2308 
2309 
2310 
2311  def set_reference_structure(self,rmf_name,rmf_frame_index):
2312  (particles_resolution_one, prot)=self.get_structure(rmf_frame_index,rmf_name)
2313  self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
2314 
2315 
2316  for selection_name in self.selection_dictionary:
2317  selection_tuple=self.selection_dictionary[selection_name]
2318  coords=self.select_coordinates(selection_tuple,
2319  particles_resolution_one,prot)
2320  self.reference_structures_dictionary[selection_name]=coords
2321 
2322 
2323  def get_average_distance_wrt_reference_structure(self,structure_set_name,tuple_selections=None):
2324 
2325  for selection_name in self.selection_dictionary:
2326  reference_coordinates=self.reference_structures_dictionary[selection_name]
2327 
2328  distances=[]
2329 
2330  for sc in self.structures_dictionary[structure_set_name][selection_name]:
2331 
2332  for rc in self.reference_structures_dictionary[selection_name]:
2333 
2334  coordinates1=map(lambda c: IMP.algebra.Vector3D(c), sc)
2335  coordinates2=map(lambda c: IMP.algebra.Vector3D(c), rc)
2336 
2337  if self.style=='pairwise_drmsd_k':
2338  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
2339  if self.style=='pairwise_drms_k':
2340  distance=IMP.atom.get_drms(coordinates1,coordinates2)
2341  if self.style=='pairwise_drmsd_Q':
2342  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
2343 
2344  distances.append(distance)
2345 
2346 
2347  print selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances)
2348 
2349 
2350  def get_coordinates(self):
2351  pass
2352 
2353  def set_precision_style(self, style):
2354  if style in self.styles:
2355  self.style=style
2356  else:
2357  print "No such style"; exit()
2358 
2359 
2360 
2361 def get_hier_from_rmf(model, frame_number, rmf_file):
2362  # I have to deprecate this function
2363  print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
2364 
2365  # load the frame
2366  rh = RMF.open_rmf_file_read_only(rmf_file)
2367 
2368  try:
2369  prots = IMP.rmf.create_hierarchies(rh, model)
2370  except:
2371  print "Unable to open rmf file %s" % (rmf_file)
2372  prot = None
2373  return prot
2374  #IMP.rmf.link_hierarchies(rh, prots)
2375  prot = prots[0]
2376  try:
2377  IMP.rmf.load_frame(rh, frame_number)
2378  except:
2379  print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
2380  prot = None
2381  model.update()
2382  del rh
2383  return prot
2384 
2385 def get_hier_and_restraints_from_rmf(model, frame_number, rmf_file):
2386  # I have to deprecate this function
2387  print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
2388 
2389  # load the frame
2390  rh = RMF.open_rmf_file_read_only(rmf_file)
2391 
2392  try:
2393  prots = IMP.rmf.create_hierarchies(rh, model)
2394  rs = IMP.rmf.create_restraints(rh, model)
2395  except:
2396  print "Unable to open rmf file %s" % (rmf_file)
2397  prot = None
2398  rs = None
2399  return prot,rs
2400  #IMP.rmf.link_hierarchies(rh, prots)
2401  prot = prots[0]
2402  try:
2403  IMP.rmf.load_frame(rh, frame_number)
2404  except:
2405  print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
2406  prot = None
2407  model.update()
2408  del rh
2409  return prot,rs
2410 
2411 def get_hiers_from_rmf(model, frame_number, rmf_file):
2412  print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
2413 
2414  # load the frame
2415  rh = RMF.open_rmf_file_read_only(rmf_file)
2416 
2417  try:
2418  prots = IMP.rmf.create_hierarchies(rh, model)
2419  except:
2420  print "Unable to open rmf file %s" % (rmf_file)
2421  prot = None
2422  return prot
2423  #IMP.rmf.link_hierarchies(rh, prots)
2424  try:
2425  IMP.rmf.load_frame(rh, frame_number)
2426  except:
2427  print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
2428  prots = None
2429  model.update()
2430  del rh
2431  return prots
2432 
2433 
2435  """
2436  Get particles at res 1, or any beads, based on the name (no Representation needed)
2437  It is mainly used when the hierarchy is read from an rmf file
2438  Returns a dictionary of component names and their particles
2439  """
2440  particle_dict = {}
2441  allparticles = []
2442  for c in prot.get_children():
2443  name = c.get_name()
2444  particle_dict[name] = IMP.atom.get_leaves(c)
2445  for s in c.get_children():
2446  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
2447  allparticles += IMP.atom.get_leaves(s)
2448  if "Beads" in s.get_name():
2449  allparticles += IMP.atom.get_leaves(s)
2450 
2451  particle_align = []
2452  for name in particle_dict:
2453  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
2454  list(set(particle_dict[name]) & set(allparticles)))
2455  return particle_dict
2456 
2458  """
2459  Get particles at res 10, or any beads, based on the name (no Representation needed)
2460  It is mainly used when the hierarchy is read from an rmf file
2461  Returns a dictionary of component names and their particles
2462  """
2463  particle_dict = {}
2464  allparticles = []
2465  for c in prot.get_children():
2466  name = c.get_name()
2467  particle_dict[name] = IMP.atom.get_leaves(c)
2468  for s in c.get_children():
2469  if "_Res:10" in s.get_name():
2470  allparticles += IMP.atom.get_leaves(s)
2471  if "Beads" in s.get_name():
2472  allparticles += IMP.atom.get_leaves(s)
2473  particle_align = []
2474  for name in particle_dict:
2475  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
2476  list(set(particle_dict[name]) & set(allparticles)))
2477  return particle_dict
2478 
2479 
2480 
2481 def select_by_tuple(first_res_last_res_name_tuple):
2482  first_res = first_res_last_res_hier_tuple[0]
2483  last_res = first_res_last_res_hier_tuple[1]
2484  name = first_res_last_res_hier_tuple[2]
Simple 3D transformation class.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
Definition: output.py:591
def plot_field_histogram
This function is plotting a list of histograms from a value list.
Definition: output.py:750
def plot_fields_box_plots
This function plots time series as boxplots fields is a list of time series, positions are the x-valu...
Definition: output.py:813
kernel::Restraints create_restraints(RMF::FileConstHandle fh, kernel::Model *m)
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Definition: atom/distance.h:88
Miscellaneous utilities.
Definition: tools.py:1
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
void load_frame(RMF::FileConstHandle file, unsigned int frame)
Definition: frames.h:27
log
Definition: log.py:1
double get_rmsd(const core::XYZs &s0, const core::XYZs &s1, const IMP::algebra::Transformation3D &tr_for_second)
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
Class for sampling a density map from particles.
double get_distance(const Plane3D &pln, const Vector3D &p)
Return the distance between a plane and a point in 3D.
Definition: Plane3D.h:71
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
This class performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:23
def add_subunits_density
Add a frame to the densities.
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:970
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name (no Representation needed) It is mainly used ...
def fill
fill stores coordinates of a model into a dictionary all_coords, containing coordinates for all model...
Analysis tools.
Definition: pmi/Analysis.py:1
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
3D rotation class.
Definition: Rotation3D.h:46
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Definition: DensityMap.h:461
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
Definition: output.py:1
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
VectorD< 3 > Vector3D
Definition: VectorD.h:395
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name (no Representation needed) It is mainly used...
Python classes to represent, score, sample and analyze models.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Definition: Selection.h:62
A class to compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
This "overloaded" function retrieves the residue indexes for each particle which is an instance of Fr...
Definition: tools.py:929
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27