IMP  2.3.1
The Integrative Modeling Platform
pmi/Analysis.py
1 #!/usr/bin/env python
2 
3 """@namespace IMP.pmi.analysis
4  Tools for clustering and cluster analysis
5 """
6 import IMP
7 import IMP.algebra
8 import IMP.em
9 import IMP.pmi
10 import IMP.pmi.tools
11 import IMP.pmi.output
12 import IMP.rmf
13 import RMF
14 import IMP.pmi.analysis
15 from operator import itemgetter
16 from copy import deepcopy
17 from math import log,sqrt
18 import itertools
19 import numpy as np
20 
21 
22 class Alignment(object):
23  """Performs alignment and RMSD calculation for two sets of coordinates
24  @param query {'p1':coords(L,3), 'p2':coords(L,3)}
25  @param template {'p1':coords(L,3), 'p2':coords(L,3)}
26 
27  The class also takes into accout non-equal stoichiometry of the proteins. If this
28  is the case, the protein names of protein in multiple copies should be delivered
29  in the following form: nameA..1, nameA..2 (note two dots).
30  """
31 
32  def __init__(self, template, query, weights=None):
33 
34  self.query = query
35  self.template = template
36  self.weights=weights
37 
38  if len(self.query.keys()) != len(self.template.keys()):
39  raise ValueError('''the number of proteins
40  in template and query does not match!''')
41 
42  def permute(self):
43 
44  self.proteins = sorted(self.query.keys())
45  prots_uniq = [i.split('..')[0] for i in self.proteins]
46  P = {}
47  for p in prots_uniq:
48  np = prots_uniq.count(p)
49  copies = [i for i in self.proteins if i.split('..')[0] == p]
50  prmts = list(itertools.permutations(copies, len(copies)))
51  P[p] = prmts
52  self.P = P
53  self.Product = list(itertools.product(*P.values()))
54 
55  def get_rmsd(self):
56 
57  self.permute()
58 
59  template_xyz = []
60  weights = []
61  torder = sum([list(i) for i in self.Product[0]], [])
62  for t in torder:
63  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
64  if self.weights is not None:
65  weights += [i for i in self.weights[t]]
66  #template_xyz = np.array(template_xyz)
67 
68  self.rmsd = 10000000000.
69  for comb in self.Product:
70 
71 
72 
73  order = sum([list(i) for i in comb], [])
74  query_xyz = []
75  for p in order:
76  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
77  #query_xyz = np.array(query_xyz)
78  #if len(template_xyz) != len(query_xyz):
79  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
80  # in template and query does not match!'''
81  # exit()
82 
83  if self.weights is not None:
84  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
85  else:
86  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
87  #dist = sqrt(
88  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
89  if dist < self.rmsd:
90  self.rmsd = dist
91  return self.rmsd
92 
93  def align(self):
94  from scipy.spatial.distance import cdist
95 
96  self.permute()
97 
98  template_xyz = []
99  torder = sum([list(i) for i in self.Product[0]], [])
100  for t in torder:
101  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
102  #template_xyz = np.array(template_xyz)
103 
104  self.rmsd, Transformation = 10000000000., ''
105  for comb in self.Product:
106  order = sum([list(i) for i in comb], [])
107  query_xyz = []
108  for p in order:
109  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
110  #query_xyz = np.array(query_xyz)
111 
112  if len(template_xyz) != len(query_xyz):
113  raise ValueError('''the number of coordinates
114  in template and query does not match!''')
115 
117  query_xyz,
118  template_xyz)
119  query_xyz_tr = [transformation.get_transformed(n)
120  for n in query_xyz]
121 
122  dist = sqrt(
123  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
124  if dist < self.rmsd:
125  self.rmsd = dist
126  Transformation = transformation
127 
128  return (self.rmsd, Transformation)
129 
130 
131 # TEST for the alignment ###
132 """
133 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
134  'a..2':np.array([np.array([1.,1.,])]),
135  'a..3':np.array([np.array([-2.,1.])]),
136  'b':np.array([np.array([0.,-1.])]),
137  'c..1':np.array([np.array([-1.,-1.])]),
138  'c..2':np.array([np.array([1.,-1.])]),
139  'd':np.array([np.array([0.,0.])]),
140  'e':np.array([np.array([0.,1.])])}
141 
142 Ali = Alignment(Proteins, Proteins)
143 Ali.permute()
144 if Ali.get_rmsd() == 0.0: print 'successful test!'
145 else: print 'ERROR!'; exit()
146 """
147 
148 
149 # ----------------------------------
150 class Violations(object):
151 
152  def __init__(self, filename):
153 
154  self.violation_thresholds = {}
155  self.violation_counts = {}
156 
157  data = open(filename)
158  D = data.readlines()
159  data.close()
160 
161  for d in D:
162  d = d.strip().split()
163  self.violation_thresholds[d[0]] = float(d[1])
164 
165  def get_number_violated_restraints(self, rsts_dict):
166  num_violated = 0
167  for rst in self.violation_thresholds:
168  if rst not in rsts_dict:
169  continue # print rst;
170  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
171  num_violated += 1
172  if rst not in self.violation_counts:
173  self.violation_counts[rst] = 1
174  else:
175  self.violation_counts[rst] += 1
176  return num_violated
177 
178 
179 # ----------------------------------
180 class Clustering(object):
181  """A class to cluster structures.
182  Uses scipy's cdist function to compute distance matrices
183  And sklearn's kmeans clustering module.
184  @param rmsd_weights Flat list of weights for each particle (if they're coarse)
185  """
186  def __init__(self,rmsd_weights=None):
187  try:
188  from mpi4py import MPI
189  self.comm = MPI.COMM_WORLD
190  self.rank = self.comm.Get_rank()
191  self.number_of_processes = self.comm.size
192  except ImportError:
193  self.number_of_processes = 1
194  self.rank = 0
195  self.all_coords = {}
196  self.structure_cluster_ids = None
197  self.tmpl_coords = None
198  self.rmsd_weights=rmsd_weights
199 
200  def set_template(self, part_coords):
201 
202  self.tmpl_coords = part_coords
203 
204  def fill(self, frame, Coords):
205  """
206  fill stores coordinates of a model into a dictionary all_coords,
207  containing coordinates for all models.
208  """
209 
210  self.all_coords[frame] = Coords
211 
212  def dist_matrix(self):
213 
214  self.model_list_names = self.all_coords.keys()
215  self.model_indexes = range(len(self.model_list_names))
216  self.model_indexes_dict = dict(
217  zip(self.model_list_names, self.model_indexes))
218  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
219 
220  my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
221  model_indexes_unique_pairs,
222  self.number_of_processes)[self.rank]
223 
224  print "process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs)))
225 
226  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
227  self.tmpl_coords,
228  my_model_indexes_unique_pairs)
229 
230  if self.number_of_processes > 1:
231  raw_distance_dict = IMP.pmi.tools.scatter_and_gather(
232  raw_distance_dict)
233  pickable_transformations = self.get_pickable_transformation_distance_dict(
234  )
235  pickable_transformations = IMP.pmi.tools.scatter_and_gather(
236  pickable_transformations)
237  self.set_transformation_distance_dict_from_pickable(
238  pickable_transformations)
239 
240  self.raw_distance_matrix = np.zeros(
241  (len(self.model_list_names), len(self.model_list_names)))
242  for item in raw_distance_dict:
243  (f1, f2) = item
244  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
245  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
246 
247  def get_dist_matrix(self):
248  return self.raw_distance_matrix
249 
250  def do_cluster(self, number_of_clusters,seed=None):
251  """Run K-means clustering
252  @param number_of_clusters Num means
253  @param seed the random seed
254  """
255  from sklearn.cluster import KMeans
256  if seed is not None:
257  np.random.seed(seed)
258  try:
259  # check whether we have the right version of sklearn
260  kmeans = KMeans(n_clusters=number_of_clusters)
261  except TypeError:
262  # sklearn older than 0.12
263  kmeans = KMeans(k=number_of_clusters)
264  kmeans.fit_predict(self.raw_distance_matrix)
265 
266  self.structure_cluster_ids = kmeans.labels_
267 
268  def get_pickable_transformation_distance_dict(self):
269  pickable_transformations = {}
270  for label in self.transformation_distance_dict:
271  tr = self.transformation_distance_dict[label]
272  trans = tuple(tr.get_translation())
273  rot = tuple(tr.get_rotation().get_quaternion())
274  pickable_transformations[label] = (rot, trans)
275  return pickable_transformations
276 
277  def set_transformation_distance_dict_from_pickable(
278  self,
279  pickable_transformations):
280  self.transformation_distance_dict = {}
281  for label in pickable_transformations:
282  tr = pickable_transformations[label]
283  trans = IMP.algebra.Vector3D(tr[1])
284  rot = IMP.algebra.Rotation3D(tr[0])
285  self.transformation_distance_dict[
286  label] = IMP.algebra.Transformation3D(rot, trans)
287 
288  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
289  import pickle
290  outf = open(file_name + ".data", 'w')
291 
292  # to pickle the transformation dictionary
293  # you have to save the arrays correposnding to
294  # the transformations
295 
296  pickable_transformations = self.get_pickable_transformation_distance_dict(
297  )
298  pickle.dump(
299  (self.structure_cluster_ids,
300  self.model_list_names,
301  pickable_transformations),
302  outf)
303 
304  np.save(file_name + ".npy", self.raw_distance_matrix)
305 
306  def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
307  import pickle
308 
309  inputf = open(file_name + ".data", 'r')
310  (self.structure_cluster_ids, self.model_list_names,
311  pickable_transformations) = pickle.load(inputf)
312  inputf.close()
313 
314  self.raw_distance_matrix = np.load(file_name + ".npy")
315 
316  self.set_transformation_distance_dict_from_pickable(
317  pickable_transformations)
318  self.model_indexes = range(len(self.model_list_names))
319  self.model_indexes_dict = dict(
320  zip(self.model_list_names, self.model_indexes))
321 
322  def plot_matrix(self, figurename="clustermatrix.pdf"):
323  import pylab as pl
324  from scipy.cluster import hierarchy as hrc
325 
326  fig = pl.figure(figsize=(10,8))
327  ax = fig.add_subplot(212)
328  dendrogram = hrc.dendrogram(
329  hrc.linkage(self.raw_distance_matrix),
330  color_threshold=7,
331  no_labels=True)
332  leaves_order = dendrogram['leaves']
333  ax.set_xlabel('Model')
334  ax.set_ylabel('RMSD [Angstroms]')
335 
336  ax2 = fig.add_subplot(221)
337  cax = ax2.imshow(
338  self.raw_distance_matrix[leaves_order,
339  :][:,
340  leaves_order],
341  interpolation='nearest')
342  cb = fig.colorbar(cax)
343  cb.set_label('RMSD [Angstroms]')
344  ax2.set_xlabel('Model')
345  ax2.set_ylabel('Model')
346 
347  pl.savefig(figurename, dpi=300)
348  pl.close(fig)
349 
350  def get_model_index_from_name(self, name):
351  return self.model_indexes_dict[name]
352 
353  def get_cluster_labels(self):
354  # this list
355  return list(set(self.structure_cluster_ids))
356 
357  def get_number_of_clusters(self):
358  return len(self.get_cluster_labels())
359 
360  def get_cluster_label_indexes(self, label):
361  return (
362  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
363  )
364 
365  def get_cluster_label_names(self, label):
366  return (
367  [self.model_list_names[i]
368  for i in self.get_cluster_label_indexes(label)]
369  )
370 
371  def get_cluster_label_average_rmsd(self, label):
372 
373  indexes = self.get_cluster_label_indexes(label)
374 
375  if len(indexes) > 1:
376  sub_distance_matrix = self.raw_distance_matrix[
377  indexes, :][:, indexes]
378  average_rmsd = np.sum(sub_distance_matrix) / \
379  (len(sub_distance_matrix)
380  ** 2 - len(sub_distance_matrix))
381  else:
382  average_rmsd = 0.0
383  return average_rmsd
384 
385  def get_cluster_label_size(self, label):
386  return len(self.get_cluster_label_indexes(label))
387 
388  def get_transformation_to_first_member(
389  self,
390  cluster_label,
391  structure_index):
392  reference = self.get_cluster_label_indexes(cluster_label)[0]
393  return self.transformation_distance_dict[(reference, structure_index)]
394 
395  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
396 
397  model_list_names = all_coords.keys()
398  rmsd_protein_names = all_coords[model_list_names[0]].keys()
399  raw_distance_dict = {}
400  transformation_distance_dict = {}
401  if template_coords is None:
402  do_alignment = False
403  else:
404  do_alignment = True
405  alignment_template_protein_names = template_coords.keys()
406 
407  for (f1, f2) in list_of_pairs:
408 
409  if not do_alignment:
410  # here we only get the rmsd,
411  # we need that for instance when you want to cluster conformations
412  # globally, eg the EM map is a reference
414 
415  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
416  for pr in rmsd_protein_names])
417  coords_f2 = {}
418  for pr in rmsd_protein_names:
419  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
420 
421  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
422  rmsd = Ali.get_rmsd()
423 
424  elif do_alignment:
425  # here we actually align the conformations first
426  # and than calculate the rmsd. We need that when the
427  # protein(s) is the reference
428  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
429  for pr in alignment_template_protein_names])
430  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
431  for pr in alignment_template_protein_names])
432 
433  Ali = Alignment(coords_f1, coords_f2)
434  template_rmsd, transformation = Ali.align()
435 
436  # here we calculate the rmsd
437  # we will align two models based n the nuber of subunits provided
438  # and transform coordinates of model 2 to model 1
439  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
440  for pr in rmsd_protein_names])
441  coords_f2 = {}
442  for pr in rmsd_protein_names:
443  coords_f2[pr] = [transformation.get_transformed(
444  i) for i in all_coords[model_list_names[f2]][pr]]
445 
446  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
447  rmsd = Ali.get_rmsd()
448 
449  raw_distance_dict[(f1, f2)] = rmsd
450  raw_distance_dict[(f2, f1)] = rmsd
451  transformation_distance_dict[(f1, f2)] = transformation
452  transformation_distance_dict[(f2, f1)] = transformation
453 
454  return raw_distance_dict, transformation_distance_dict
455 
456 
457 class Precision(object):
458  """ A class to evaluate the precision of an ensemble.
459  Also can evaluate the cross-precision of multiple ensembles.
460  Supports MPI for coordinate reading.
461  Recommended procedure:
462  -# initialize object and pass the selection for evaluating precision
463  -# call add_structures() to read in the data (specify group name)"
464  -# call get_precision() to evaluate inter/intra precision
465  -# call get_rmsf() to evaluate within-group fluctuations
466  @param model The IMP Model
467  @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is part of the hier name)
468  @param selection_dictionary Dictionary where keys are names for selections
469  and values are selection tuples for scoring precision.
470  "All" is automatically made as well
471  """
472  def __init__(self,model,
473  resolution=1,
474  selection_dictionary={}):
475  try:
476  from mpi4py import MPI
477  self.comm = MPI.COMM_WORLD
478  self.rank = self.comm.Get_rank()
479  self.number_of_processes = self.comm.size
480  except ImportError:
481  self.number_of_processes=1
482  self.rank=0
483 
484  self.styles=['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q',
485  'pairwise_drms_k','pairwise_rmsd','drmsd_from_center']
486  self.style='pairwise_drmsd_k'
487  self.structures_dictionary={}
488  self.reference_structures_dictionary={}
489  self.prots=[]
490  self.protein_names=None
491  self.len_particles_resolution_one=None
492  self.model=model
493  self.rmf_names_frames={}
494  self.reference_rmf_names_frames=None
495  self.reference_structure=None
496  self.reference_prot=None
497  self.selection_dictionary=selection_dictionary
498  self.threshold=40.0
499  self.residue_particle_index_map=None
500  if resolution in [1,10]:
501  self.resolution=resolution
502  else:
503  raise KeyError("no such resolution")
504 
505  def _get_structure(self,rmf_frame_index,rmf_name):
506  """Read an RMF file and return the particles"""
507  rh= RMF.open_rmf_file_read_only(rmf_name)
508  prots=IMP.rmf.create_hierarchies(rh, self.model)
509  IMP.rmf.load_frame(rh, rmf_frame_index)
510  print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
511  del rh
512 
513  if self.resolution==1:
514  particle_dict = get_particles_at_resolution_one(prots[0])
515  elif self.resolution==10:
516  particle_dict = get_particles_at_resolution_ten(prots[0])
517 
518  protein_names=particle_dict.keys()
519  particles_resolution_one=[]
520  for k in particle_dict:
521  particles_resolution_one+=(particle_dict[k])
522 
523  if self.protein_names==None:
524  self.protein_names=protein_names
525  else:
526  if self.protein_names!=protein_names:
527  print "Error: the protein names of the new coordinate set is not compatible with the previous one"
528 
529  if self.len_particles_resolution_one==None:
530  self.len_particles_resolution_one=len(particles_resolution_one)
531  else:
532  if self.len_particles_resolution_one!=len(particles_resolution_one):
533  raise ValueError("the new coordinate set is not compatible with the previous one")
534 
535  return particles_resolution_one,prots
536 
537  def add_structure(self,
538  rmf_name,
539  rmf_frame_index,
540  structure_set_name,
541  setup_index_map=False):
542  """ Read a structure into the ensemble and store (as coordinates).
543  @param rmf_name The name of the RMF file
544  @param rmf_frame_index The frame to read
545  @param structure_set_name Name for the set that includes this structure
546  (e.g. "cluster 1")
547  """
548 
549  # decide where to put this structure
550  if structure_set_name in self.structures_dictionary:
551  cdict=self.structures_dictionary[structure_set_name]
552  rmflist=self.rmf_names_frames[structure_set_name]
553  else:
554  self.structures_dictionary[structure_set_name]={}
555  self.rmf_names_frames[structure_set_name]=[]
556  cdict=self.structures_dictionary[structure_set_name]
557  rmflist=self.rmf_names_frames[structure_set_name]
558 
559  # read the particles
560  try:
561  (particles_resolution_one, prots)=self._get_structure(rmf_frame_index,rmf_name)
562  except:
563  print "something wrong with the rmf"
564  return 0
565 
566  self.selection_dictionary.update({"All":self.protein_names})
567 
568  for selection_name in self.selection_dictionary:
569  selection_tuple=self.selection_dictionary[selection_name]
570  coords=self._select_coordinates(selection_tuple,particles_resolution_one,prots[0])
571  if selection_name not in cdict:
572  cdict[selection_name]=[coords]
573  else:
574  cdict[selection_name].append(coords)
575 
576  rmflist.append((rmf_name,rmf_frame_index))
577 
578  # if requested, set up a dictionary to help find residue indexes
579  if setup_index_map:
580  self.residue_particle_index_map={}
581  for prot_name in self.protein_names:
582  self.residue_particle_index_map[prot_name] = \
583  self._get_residue_particle_index_map(
584  prot_name,
585  particles_resolution_one,prots[0])
586  for prot in prots:
587  IMP.atom.destroy(prot)
588 
589  def add_structures(self,
590  rmf_name_frame_tuples,
591  structure_set_name):
592  """Read a list of RMFs, supports parallel
593  @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
594  @param structure_set_name Name this set of structures (e.g. "cluster.1")
595  """
596 
597  # split up the requested list to read in parallel
598  my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
599  rmf_name_frame_tuples,self.number_of_processes)[self.rank]
600  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
601  rmf_name=tup[0]
602  rmf_frame_index=tup[1]
603  # the first frame stores the map between residues and particles
604  if self.residue_particle_index_map is None:
605  setup_index_map=True
606  else:
607  setup_index_map=False
608  self.add_structure(rmf_name,
609  rmf_frame_index,
610  structure_set_name,
611  setup_index_map)
612 
613  # synchronize the structures
614  if self.number_of_processes > 1:
615  self.rmf_names_frames=IMP.pmi.tools.scatter_and_gather(self.rmf_names_frames)
616  if self.rank != 0:
617  self.comm.send(self.structures_dictionary, dest=0, tag=11)
618  elif self.rank == 0:
619  for i in range(1, self.number_of_processes):
620  data_tmp = self.comm.recv(source=i, tag=11)
621  for key in self.structures_dictionary:
622  self.structures_dictionary[key].update(data_tmp[key])
623  for i in range(1, self.number_of_processes):
624  self.comm.send(self.structures_dictionary, dest=i, tag=11)
625  if self.rank != 0:
626  self.structures_dictionary = self.comm.recv(source=0, tag=11)
627 
628  def _get_residue_particle_index_map(self,prot_name,structure,hier):
629  residue_particle_index_map=[]
630  s=IMP.atom.Selection(hier,molecules=[prot_name])
631  all_selected_particles=s.get_selected_particles()
632  intersection=list(set(all_selected_particles) & set(structure))
633  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
634  for p in sorted_intersection:
635  residue_particle_index_map.append(IMP.pmi.tools.get_residue_indexes(p))
636  return residue_particle_index_map
637 
638 
639  def _select_coordinates(self,tuple_selections,structure,prot):
640  selected_coordinates=[]
641  for t in tuple_selections:
642  if type(t)==tuple and len(t)==3:
643  s=IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1))
644  all_selected_particles=s.get_selected_particles()
645  intersection=list(set(all_selected_particles) & set(structure))
646  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
647  cc=map(lambda p: tuple(IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
648  selected_coordinates+=cc
649 
650  elif type(t)==str:
651  s=IMP.atom.Selection(prot,molecules=[t])
652  all_selected_particles=s.get_selected_particles()
653  intersection=list(set(all_selected_particles) & set(structure))
654  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
655  cc=map(lambda p: tuple(IMP.core.XYZ(p).get_coordinates()), sorted_intersection)
656  selected_coordinates+=cc
657  else:
658  raise ValueError("Selection error")
659  return selected_coordinates
660 
661  def set_threshold(self,threshold):
662  self.threshold=threshold
663 
664  def _get_distance(self,
665  structure_set_name1,
666  structure_set_name2,
667  selection_name,
668  index1,
669  index2):
670  """ Compute distance between structures with various metrics """
671  c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
672  c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
673 
674  coordinates1=map(lambda c: IMP.algebra.Vector3D(c), c1)
675  coordinates2=map(lambda c: IMP.algebra.Vector3D(c), c2)
676 
677  if self.style=='pairwise_drmsd_k':
678  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
679  if self.style=='pairwise_drms_k':
680  distance=IMP.atom.get_drms(coordinates1,coordinates2)
681  if self.style=='pairwise_drmsd_Q':
682  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
683 
684  if self.style=='pairwise_rmsd':
685  distance=IMP.atom.get_rmsd(coordinates1,coordinates2,IMP.algebra.get_identity_transformation_3d())
686  return distance
687 
688  def _get_particle_distances(self,structure_set_name1,structure_set_name2,
689  selection_name,index1,index2):
690  import numpy as np
691  c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
692  c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
693 
694  coordinates1=map(lambda c: IMP.algebra.Vector3D(c), c1)
695  coordinates2=map(lambda c: IMP.algebra.Vector3D(c), c2)
696 
697  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
698 
699  return distances
700 
701  def get_precision(self,
702  structure_set_name1,
703  structure_set_name2,
704  outfile=None,
705  skip=1,
706  selection_keywords=None):
707  """ Evaluate the precision of two named structure groups. Supports MPI.
708  When the structure_set_name1 is different from the structure_set_name2,
709  this evaluates the cross-precision (average pairwise distances).
710  @param outfile Name of the precision output file
711  @param structure_set_name1 string name of the first structure set
712  @param structure_set_name2 string name of the second structure set
713  @param skip analyze every (skip) structure for the distance matrix calculation
714  @param selection_keywords Specify the selection name you want to calculate on.
715  By default this is computed for everything you provided in the constructor,
716  plus all the subunits together.
717  """
718  if selection_keywords is None:
719  sel_keys=self.selection_dictionary.keys()
720  else:
721  for k in selection_keywords:
722  if k not in self.selection_dictionary:
723  raise KeyError("you are trying to find named selection " \
724  + k + " which was not requested in the constructor")
725  sel_keys=selection_keywords
726 
727  if outfile is not None:
728  of=open(outfile,"w")
729  centroid_index=0
730  for selection_name in sel_keys:
731  number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
732  number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
733 
734  distances={}
735  structure_pointers_1=range(0,number_of_structures_1,skip)
736  structure_pointers_2=range(0,number_of_structures_2,skip)
737 
738  pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
739 
740  if len(pair_combination_list)==0:
741  raise ValueError("no structure selected. Check the skip parameter.")
742 
743  my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(
744  pair_combination_list,self.number_of_processes)[self.rank]
745  my_length=len(my_pair_combination_list)
746  for n,pair in enumerate(my_pair_combination_list):
747 
748  progression=int(float(n)/my_length*100.0)
749  distances[pair]=self._get_distance(structure_set_name1,structure_set_name2,
750  selection_name,pair[0],pair[1])
751 
752  if self.number_of_processes > 1:
753  distances = IMP.pmi.tools.scatter_and_gather(distances)
754  if self.rank == 0:
755  if structure_set_name1==structure_set_name2:
756  structure_pointers=structure_pointers_1
757  number_of_structures=number_of_structures_1
758 
759  # calculate the distance from the first centroid
760  # and determine the centroid
761 
762  distance=0.0
763  distances_to_structure={}
764  distances_to_structure_normalization={}
765 
766  for n in structure_pointers:
767  distances_to_structure[n]=0.0
768  distances_to_structure_normalization[n]=0
769 
770  for k in distances:
771  distance+=distances[k]
772  distances_to_structure[k[0]]+=distances[k]
773  distances_to_structure[k[1]]+=distances[k]
774  distances_to_structure_normalization[k[0]]+=1
775  distances_to_structure_normalization[k[1]]+=1
776 
777  for n in structure_pointers:
778  distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
779 
780  min_distance=min([distances_to_structure[n] for n in distances_to_structure])
781  centroid_index=[k for k, v in distances_to_structure.iteritems() if v == min_distance][0]
782  centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
783 
784  centroid_distance=0.0
785  for n in range(number_of_structures):
786  centroid_distance+=self._get_distance(structure_set_name1,structure_set_name1,
787  selection_name,centroid_index,n)
788 
789  #pairwise_distance=distance/len(distances.keys())
790  centroid_distance/=number_of_structures
791  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
792  if outfile is not None:
793  of.write(str(selection_name)+" "+structure_set_name1+
794  " average centroid distance "+str(centroid_distance)+"\n")
795  of.write(str(selection_name)+" "+structure_set_name1+
796  " centroid index "+str(centroid_index)+"\n")
797  of.write(str(selection_name)+" "+structure_set_name1+
798  " centroid rmf name "+str(centroid_rmf_name)+"\n")
799 
800  average_pairwise_distances=sum(distances.values())/len(distances.values())
801  if outfile is not None:
802  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
803  " average pairwise distance "+str(average_pairwise_distances)+"\n")
804  if outfile is not None:
805  of.close()
806  return centroid_index
807 
808  def get_rmsf(self,
809  structure_set_name,
810  outdir="./",
811  skip=1,
812  set_plot_yaxis_range=None):
813  """ Calculate the residue mean square fluctuations (RMSF).
814  Automatically outputs as data file and pdf
815  @param structure_set_name Which structure set to calculate RMSF for
816  @param outdir Where to write the files
817  @param skip Skip this number of structures
818  @param set_plot_yaxis_range In case you need to change the plot
819  """
820  # get the centroid structure for the whole complex
821  centroid_index=self.get_precision(
822  structure_set_name,
823  structure_set_name,
824  outfile=None,
825  skip=skip)
826  if self.rank==0:
827  for sel_name in self.protein_names:
828  self.selection_dictionary.update({sel_name:[sel_name]})
829  try:
830  number_of_structures=len(self.structures_dictionary[structure_set_name][sel_name])
831  except KeyError:
832  # that protein was not included in the selection
833  continue
834  rpim=self.residue_particle_index_map[sel_name]
835  outfile=outdir+"/rmsf."+sel_name+".dat"
836  of=open(outfile,"w")
837  residue_distances={}
838  residue_nblock={}
839  for index in range(number_of_structures):
840  distances=self._get_particle_distances(structure_set_name,
841  structure_set_name,
842  sel_name,
843  centroid_index,index)
844  for nblock,block in enumerate(rpim):
845  for residue_number in block:
846  residue_nblock[residue_number]=nblock
847  if residue_number not in residue_distances:
848  residue_distances[residue_number]=[distances[nblock]]
849  else:
850  residue_distances[residue_number].append(distances[nblock])
851 
852  residues=[]
853  rmsfs=[]
854  for rn in residue_distances:
855  residues.append(rn)
856  rmsf=np.std(residue_distances[rn])
857  rmsfs.append(rmsf)
858  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
859 
860  IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
861  out_fn=outdir+"/rmsf."+sel_name,display=False,
862  set_plot_yaxis_range=set_plot_yaxis_range,
863  xlabel='Residue Number',ylabel='Standard error')
864  of.close()
865 
866 
867  def set_reference_structure(self,rmf_name,rmf_frame_index):
868  """Read in a structure used for reference computation.
869  Needed before calling get_average_distance_wrt_reference_structure()
870  @param rmf_name The RMF file to read the reference
871  @param rmf_frame_index The index in that file
872  """
873  (particles_resolution_one, prot)=self._get_structure(rmf_frame_index,rmf_name)
874  self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
875 
876 
877  for selection_name in self.selection_dictionary:
878  selection_tuple=self.selection_dictionary[selection_name]
879  coords=self._select_coordinates(selection_tuple,
880  particles_resolution_one,prot)
881  self.reference_structures_dictionary[selection_name]=coords
882 
883 
884  def get_average_distance_wrt_reference_structure(self,structure_set_name):
885  """Compare the structure set to the reference structure.
886  @param structure_set_name The structure set to compute this on
887  \note First call set_reference_structure()
888  """
889  if self.reference_structures_dictionary=={}:
890  print "Cannot compute until you set a reference structure"
891  return
892  for selection_name in self.selection_dictionary:
893  reference_coordinates=self.reference_structures_dictionary[selection_name]
894  coordinates2=map(lambda c: IMP.algebra.Vector3D(c), reference_coordinates)
895  distances=[]
896 
897  for sc in self.structures_dictionary[structure_set_name][selection_name]:
898  coordinates1=map(lambda c: IMP.algebra.Vector3D(c), sc)
899  if self.style=='pairwise_drmsd_k':
900  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
901  if self.style=='pairwise_drms_k':
902  distance=IMP.atom.get_drms(coordinates1,coordinates2)
903  if self.style=='pairwise_drmsd_Q':
904  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
905  if self.style=='pairwise_rmsd':
906  distance=IMP.atom.get_rmsd(coordinates1,coordinates2,
908  distances.append(distance)
909 
910  print selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances)
911 
912  def get_coordinates(self):
913  pass
914 
915  def set_precision_style(self, style):
916  if style in self.styles:
917  self.style=style
918  else:
919  raise ValueError("No such style")
920 
921 
922 class GetModelDensity(object):
923  """A class to compute mean density maps from structures
924  Keeps a dictionary of density maps,
925  keys are in the custom ranges. When you call add_subunits_density, it adds
926  particle coordinates to the existing density maps.
927  @param custom_ranges Required. It's a dictionary, keys are the density component names,
928  values are selection tuples
929  e.g. {'kin28':[['kin28',1,-1]],
930  'density_name_1' :[('ccl1')],
931  'density_name_2' :[(1,142,'tfb3d1'),(143,700,'tfb3d2')],
932  @param representation PMI representation, for doing selections.
933  not needed if you only pass hierarchies
934  @param voxel The voxel size for the output map (lower is slower)
935  """
936 
937  def __init__(self, custom_ranges, representation=None, voxel=5.0):
938 
939  self.representation = representation
940  self.voxel = voxel
941  self.densities = {}
942  self.count_models = 0.0
943  self.custom_ranges = custom_ranges
944 
945  def add_subunits_density(self, hierarchy=None):
946  """Add a frame to the densities.
947  @param hierarchy Optionally read the hierarchy from somewhere.
948  If not passed, will just read the representation.
949  """
950  self.count_models += 1.0
951 
952  if hierarchy:
953  part_dict = get_particles_at_resolution_one(hierarchy)
954  all_particles_by_resolution = []
955  for name in part_dict:
956  all_particles_by_resolution += part_dict[name]
957 
958 
959  for density_name in self.custom_ranges:
960  parts = []
961  if hierarchy:
962  all_particles_by_segments = []
963 
964  for seg in self.custom_ranges[density_name]:
965  if not hierarchy:
966  parts += IMP.tools.select_by_tuple(self.representation,
967  seg, resolution=1, name_is_ambiguous=False)
968  else:
969  if type(seg) == str:
970  s = IMP.atom.Selection(hierarchy,molecule=seg)
971  elif type(seg) == tuple:
972  s = IMP.atom.Selection(
973  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
974  else:
975  raise Exception('could not understand selection tuple '+str(seg))
976 
977  all_particles_by_segments += s.get_selected_particles()
978 
979  if hierarchy:
980  parts = list(
981  set(all_particles_by_segments) & set(all_particles_by_resolution))
982 
983  self._create_density_from_particles(parts, density_name)
984 
985  def normalize_density(self):
986  pass
987 
988  def _create_density_from_particles(self, ps, name,
989  resolution=1,
990  kernel_type='GAUSSIAN'):
991  '''Internal function for adding to densities.
992  pass XYZR particles with mass and create a density from them.
993  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
994  kd = {
995  'GAUSSIAN': IMP.em.GAUSSIAN,
996  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
997  'SPHERE': IMP.em.SPHERE}
998 
999  dmap = IMP.em.SampledDensityMap(ps, resolution, self.voxel)
1000  dmap.calcRMS()
1001  if name not in self.densities:
1002  self.densities[name] = dmap
1003  else:
1004  bbox1 = IMP.em.get_bounding_box(self.densities[name])
1005  bbox2 = IMP.em.get_bounding_box(dmap)
1006  bbox1 += bbox2
1007  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
1008  dmap3.add(dmap)
1009  dmap3.add(self.densities[name])
1010  self.densities[name] = dmap3
1011 
1012  def get_density_keys(self):
1013  return self.densities.keys()
1014 
1015  def get_density(self,name):
1016  """Get the current density for some component name"""
1017  if name not in self.densities:
1018  return None
1019  else:
1020  return self.densities[name]
1021 
1022  def write_mrc(self, path="./"):
1023  for density_name in self.densities:
1024  self.densities[density_name].multiply(1. / self.count_models)
1026  self.densities[density_name],
1027  path + "/" + density_name + ".mrc",
1029 
1030 
1031 class GetContactMap(object):
1032 
1033  def __init__(self, distance=15.):
1034  self.distance = distance
1035  self.contactmap = ''
1036  self.namelist = []
1037  self.xlinks = 0
1038  self.XL = {}
1039  self.expanded = {}
1040  self.resmap = {}
1041 
1042  def set_prot(self, prot):
1043  from scipy.spatial.distance import cdist
1044  self.prot = prot
1045  self.protnames = []
1046  coords = []
1047  radii = []
1048  namelist = []
1049 
1050  particles_dictionary = get_particles_at_resolution_one(self.prot)
1051 
1052  for name in particles_dictionary:
1053  residue_indexes = []
1054  for p in particles_dictionary[name]:
1055  print p.get_name()
1056  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1057  #residue_indexes.add( )
1058 
1059  if len(residue_indexes) != 0:
1060  self.protnames.append(name)
1061  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1062  d = IMP.core.XYZR(p)
1063  new_name = name + ":" + str(res)
1064  if name not in self.resmap:
1065  self.resmap[name] = {}
1066  if res not in self.resmap:
1067  self.resmap[name][res] = {}
1068 
1069  self.resmap[name][res] = new_name
1070  namelist.append(new_name)
1071 
1072  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1073  coords.append(crd)
1074  radii.append(d.get_radius())
1075 
1076  coords = np.array(coords)
1077  radii = np.array(radii)
1078 
1079  if len(self.namelist) == 0:
1080  self.namelist = namelist
1081  self.contactmap = np.zeros((len(coords), len(coords)))
1082 
1083  distances = cdist(coords, coords)
1084  distances = (distances - radii).T - radii
1085  distances = distances <= self.distance
1086 
1087  print coords
1088  print radii
1089  print distances
1090 
1091  self.contactmap += distances
1092 
1093  def get_subunit_coords(self, frame, align=0):
1094  from scipy.spatial.distance import cdist
1095  coords = []
1096  radii = []
1097  namelist = []
1098  test, testr = [], []
1099  for part in self.prot.get_children():
1100  SortedSegments = []
1101  print part
1102  for chl in part.get_children():
1103  start = IMP.atom.get_leaves(chl)[0]
1104  end = IMP.atom.get_leaves(chl)[-1]
1105 
1106  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
1107  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
1108  SortedSegments.append((chl, startres))
1109  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1110 
1111  for sgmnt in SortedSegments:
1112  for leaf in IMP.atom.get_leaves(sgmnt[0]):
1113  p = IMP.core.XYZR(leaf)
1114  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1115 
1116  coords.append(crd)
1117  radii.append(p.get_radius())
1118 
1119  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
1120  '_' + \
1121  str(IMP.atom.Fragment(leaf)
1122  .get_residue_indexes()[0])
1123  namelist.append(new_name)
1124  self.expanded[new_name] = len(
1125  IMP.atom.Fragment(leaf).get_residue_indexes())
1126  if part.get_name() not in self.resmap:
1127  self.resmap[part.get_name()] = {}
1128  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
1129  self.resmap[part.get_name()][res] = new_name
1130 
1131  coords = np.array(coords)
1132  radii = np.array(radii)
1133  if len(self.namelist) == 0:
1134  self.namelist = namelist
1135  self.contactmap = np.zeros((len(coords), len(coords)))
1136  distances = cdist(coords, coords)
1137  distances = (distances - radii).T - radii
1138  distances = distances <= self.distance
1139  self.contactmap += distances
1140 
1141  def add_xlinks(
1142  self,
1143  filname,
1144  identification_string='ISDCrossLinkMS_Distance_'):
1145  # 'ISDCrossLinkMS_Distance_interrb_6629-State:0-20:RPS30_218:eIF3j-1-1-0.1_None'
1146  self.xlinks = 1
1147  data = open(filname)
1148  D = data.readlines()
1149  data.close()
1150 
1151  for d in D:
1152  if identification_string in d:
1153  d = d.replace(
1154  "_",
1155  " ").replace("-",
1156  " ").replace(":",
1157  " ").split()
1158 
1159  t1, t2 = (d[0], d[1]), (d[1], d[0])
1160  if t1 not in self.XL:
1161  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1162  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1163  else:
1164  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1165  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1166 
1167  def dist_matrix(self, skip_cmap=0, skip_xl=1):
1168  K = self.namelist
1169  M = self.contactmap
1170  C, R = [], []
1171  L = sum(self.expanded.values())
1172  proteins = self.protnames
1173 
1174  # exp new
1175  if skip_cmap == 0:
1176  Matrices = {}
1177  proteins = [p.get_name() for p in self.prot.get_children()]
1178  missing = []
1179  for p1 in xrange(len(proteins)):
1180  for p2 in xrange(p1, len(proteins)):
1181  pl1, pl2 = max(
1182  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1183  pn1, pn2 = proteins[p1], proteins[p2]
1184  mtr = np.zeros((pl1 + 1, pl2 + 1))
1185  print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
1186  for i1 in xrange(1, pl1 + 1):
1187  for i2 in xrange(1, pl2 + 1):
1188  try:
1189  r1 = K.index(self.resmap[pn1][i1])
1190  r2 = K.index(self.resmap[pn2][i2])
1191  r = M[r1, r2]
1192  mtr[i1 - 1, i2 - 1] = r
1193  except KeyError:
1194  missing.append((pn1, pn2, i1, i2))
1195  pass
1196  Matrices[(pn1, pn2)] = mtr
1197 
1198  # add cross-links
1199  if skip_xl == 0:
1200  if self.XL == {}:
1201  raise ValueError("cross-links were not provided, use add_xlinks function!")
1202  Matrices_xl = {}
1203  missing_xl = []
1204  for p1 in xrange(len(proteins)):
1205  for p2 in xrange(p1, len(proteins)):
1206  pl1, pl2 = max(
1207  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1208  pn1, pn2 = proteins[p1], proteins[p2]
1209  mtr = np.zeros((pl1 + 1, pl2 + 1))
1210  flg = 0
1211  try:
1212  xls = self.XL[(pn1, pn2)]
1213  except KeyError:
1214  try:
1215  xls = self.XL[(pn2, pn1)]
1216  flg = 1
1217  except KeyError:
1218  flg = 2
1219  if flg == 0:
1220  print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
1221  for xl1, xl2 in xls:
1222  if xl1 > pl1:
1223  print 'X' * 10, xl1, xl2
1224  xl1 = pl1
1225  if xl2 > pl2:
1226  print 'X' * 10, xl1, xl2
1227  xl2 = pl2
1228  mtr[xl1 - 1, xl2 - 1] = 100
1229  elif flg == 1:
1230  print 'Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2
1231  for xl1, xl2 in xls:
1232  if xl1 > pl1:
1233  print 'X' * 10, xl1, xl2
1234  xl1 = pl1
1235  if xl2 > pl2:
1236  print 'X' * 10, xl1, xl2
1237  xl2 = pl2
1238  mtr[xl2 - 1, xl1 - 1] = 100
1239  else:
1240  raise RuntimeError('WTF!')
1241  Matrices_xl[(pn1, pn2)] = mtr
1242 
1243  # expand the matrix to individual residues
1244  #NewM = []
1245  # for x1 in xrange(len(K)):
1246  # lst = []
1247  # for x2 in xrange(len(K)):
1248  # lst += [M[x1,x2]]*self.expanded[K[x2]]
1249  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
1250  #NewM = np.array(NewM)
1251 
1252  # make list of protein names and create coordinate lists
1253  C = proteins
1254  # W is the component length list,
1255  # R is the contiguous coordinates list
1256  W, R = [], []
1257  for i, c in enumerate(C):
1258  cl = max(self.resmap[c].keys())
1259  W.append(cl)
1260  if i == 0:
1261  R.append(cl)
1262  else:
1263  R.append(R[-1] + cl)
1264 
1265  # start plotting
1266  import matplotlib.pyplot as plt
1267  import matplotlib.gridspec as gridspec
1268  import scipy.sparse as sparse
1269 
1270  f = plt.figure()
1271  gs = gridspec.GridSpec(len(W), len(W),
1272  width_ratios=W,
1273  height_ratios=W)
1274 
1275  cnt = 0
1276  for x1, r1 in enumerate(R):
1277  if x1 == 0:
1278  s1 = 0
1279  else:
1280  s1 = R[x1 - 1]
1281  for x2, r2 in enumerate(R):
1282  if x2 == 0:
1283  s2 = 0
1284  else:
1285  s2 = R[x2 - 1]
1286 
1287  ax = plt.subplot(gs[cnt])
1288  if skip_cmap == 0:
1289  try:
1290  mtr = Matrices[(C[x1], C[x2])]
1291  except KeyError:
1292  mtr = Matrices[(C[x2], C[x1])].T
1293  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
1294  cax = ax.imshow(
1295  log(mtr),
1296  interpolation='nearest',
1297  vmin=0.,
1298  vmax=log(NewM.max()))
1299  ax.set_xticks([])
1300  ax.set_yticks([])
1301  if skip_xl == 0:
1302  try:
1303  mtr = Matrices_xl[(C[x1], C[x2])]
1304  except KeyError:
1305  mtr = Matrices_xl[(C[x2], C[x1])].T
1306  cax = ax.spy(
1307  sparse.csr_matrix(mtr),
1308  markersize=10,
1309  color='white',
1310  linewidth=100,
1311  alpha=0.5)
1312  ax.set_xticks([])
1313  ax.set_yticks([])
1314 
1315  cnt += 1
1316  if x2 == 0:
1317  ax.set_ylabel(C[x1], rotation=90)
1318  plt.show()
1319 
1320 
1321 # ------------------------------------------------------------------
1322 # a few random tools
1323 
1324 def get_hier_from_rmf(model, frame_number, rmf_file):
1325  # I have to deprecate this function
1326  print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
1327 
1328  # load the frame
1329  rh = RMF.open_rmf_file_read_only(rmf_file)
1330 
1331  try:
1332  prots = IMP.rmf.create_hierarchies(rh, model)
1333  except:
1334  print "Unable to open rmf file %s" % (rmf_file)
1335  prot = None
1336  return prot
1337  #IMP.rmf.link_hierarchies(rh, prots)
1338  prot = prots[0]
1339  try:
1340  IMP.rmf.load_frame(rh, frame_number)
1341  except:
1342  print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
1343  prot = None
1344  model.update()
1345  del rh
1346  return prot
1347 
1348 def get_hier_and_restraints_from_rmf(model, frame_number, rmf_file):
1349  # I have to deprecate this function
1350  print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
1351 
1352  # load the frame
1353  rh = RMF.open_rmf_file_read_only(rmf_file)
1354 
1355  try:
1356  prots = IMP.rmf.create_hierarchies(rh, model)
1357  rs = IMP.rmf.create_restraints(rh, model)
1358  except:
1359  print "Unable to open rmf file %s" % (rmf_file)
1360  prot = None
1361  rs = None
1362  return prot,rs
1363  #IMP.rmf.link_hierarchies(rh, prots)
1364  prot = prots[0]
1365  try:
1366  IMP.rmf.load_frame(rh, frame_number)
1367  except:
1368  print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
1369  prot = None
1370  model.update()
1371  del rh
1372  return prot,rs
1373 
1374 def get_hiers_from_rmf(model, frame_number, rmf_file):
1375  print "getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file)
1376 
1377  # load the frame
1378  rh = RMF.open_rmf_file_read_only(rmf_file)
1379 
1380  try:
1381  prots = IMP.rmf.create_hierarchies(rh, model)
1382  except:
1383  print "Unable to open rmf file %s" % (rmf_file)
1384  prot = None
1385  return prot
1386  #IMP.rmf.link_hierarchies(rh, prots)
1387  try:
1388  IMP.rmf.load_frame(rh, frame_number)
1389  except:
1390  print "Unable to open frame %i of file %s" % (frame_number, rmf_file)
1391  prots = None
1392  model.update()
1393  del rh
1394  return prots
1395 
1396 
1398  """
1399  Get particles at res 1, or any beads, based on the name (no Representation needed)
1400  It is mainly used when the hierarchy is read from an rmf file
1401  Returns a dictionary of component names and their particles
1402  """
1403  particle_dict = {}
1404  allparticles = []
1405  for c in prot.get_children():
1406  name = c.get_name()
1407  particle_dict[name] = IMP.atom.get_leaves(c)
1408  for s in c.get_children():
1409  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
1410  allparticles += IMP.atom.get_leaves(s)
1411  if "Beads" in s.get_name():
1412  allparticles += IMP.atom.get_leaves(s)
1413 
1414  particle_align = []
1415  for name in particle_dict:
1416  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1417  list(set(particle_dict[name]) & set(allparticles)))
1418  return particle_dict
1419 
1421  """
1422  Get particles at res 10, or any beads, based on the name (no Representation needed)
1423  It is mainly used when the hierarchy is read from an rmf file
1424  Returns a dictionary of component names and their particles
1425  """
1426  particle_dict = {}
1427  allparticles = []
1428  for c in prot.get_children():
1429  name = c.get_name()
1430  particle_dict[name] = IMP.atom.get_leaves(c)
1431  for s in c.get_children():
1432  if "_Res:10" in s.get_name():
1433  allparticles += IMP.atom.get_leaves(s)
1434  if "Beads" in s.get_name():
1435  allparticles += IMP.atom.get_leaves(s)
1436  particle_align = []
1437  for name in particle_dict:
1438  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1439  list(set(particle_dict[name]) & set(allparticles)))
1440  return particle_dict
1441 
1442 
1443 
1444 def select_by_tuple(first_res_last_res_name_tuple):
1445  first_res = first_res_last_res_hier_tuple[0]
1446  last_res = first_res_last_res_hier_tuple[1]
1447  name = first_res_last_res_hier_tuple[2]
1448 
1449 class CrossLinkTable(object):
1450  """Visualization of crosslinks"""
1451  def __init__(self):
1452  self.crosslinks = []
1453  self.external_csv_data = None
1454  self.crosslinkedprots = set()
1455  self.mindist = +10000000.0
1456  self.maxdist = -10000000.0
1457  self.contactmap = None
1458 
1459  def set_hierarchy(self, prot):
1460  self.prot_length_dict = {}
1461  self.model=prot.get_model()
1462 
1463  for i in prot.get_children():
1464  name = i.get_name()
1465  residue_indexes = []
1466  for p in IMP.atom.get_leaves(i):
1467  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1468 
1469  if len(residue_indexes) != 0:
1470  self.prot_length_dict[name] = max(residue_indexes)
1471 
1472  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1473  from scipy.spatial.distance import cdist
1474 
1475  rh= RMF.open_rmf_file_read_only(rmf_name)
1476  prots=IMP.rmf.create_hierarchies(rh, self.model)
1477  IMP.rmf.load_frame(rh, rmf_frame_index)
1478  print "getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name)
1479  del rh
1480 
1481 
1482  coords = []
1483  radii = []
1484  namelist = []
1485 
1486  particles_dictionary = get_particles_at_resolution_one(prots[0])
1487 
1488  resindex = 0
1489  self.index_dictionary = {}
1490 
1491  for name in particles_dictionary:
1492  residue_indexes = []
1493  for p in particles_dictionary[name]:
1494  print p.get_name()
1495  residue_indexes = IMP.pmi.tools.get_residue_indexes(p)
1496  #residue_indexes.add( )
1497 
1498  if len(residue_indexes) != 0:
1499 
1500  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1501  d = IMP.core.XYZR(p)
1502 
1503  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1504  coords.append(crd)
1505  radii.append(d.get_radius())
1506  if name not in self.index_dictionary:
1507  self.index_dictionary[name] = [resindex]
1508  else:
1509  self.index_dictionary[name].append(resindex)
1510  resindex += 1
1511 
1512  coords = np.array(coords)
1513  radii = np.array(radii)
1514 
1515  distances = cdist(coords, coords)
1516  distances = (distances - radii).T - radii
1517 
1518  distances = np.where(distances <= 20.0, 1.0, 0)
1519  if self.contactmap is None:
1520  self.contactmap = np.zeros((len(coords), len(coords)))
1521  self.contactmap += distances
1522 
1523  for prot in prots: IMP.atom.destroy(prot)
1524 
1525  def set_crosslinks(
1526  self, data_file, search_label='ISDCrossLinkMS_Distance_',
1527  mapping=None,
1528  filter_label=None,
1529  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
1530  external_csv_data_file=None,
1531  external_csv_data_file_unique_id_key="Unique ID"):
1532 
1533  # example key ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None
1534  # mapping is a dictionary that maps standard keywords to entry positions in the key string
1535  # confidence class is a filter that
1536  # external datafile is a datafile that contains further information on the crosslinks
1537  # it will use the unique id to create the dictionary keys
1538 
1539  po = IMP.pmi.output.ProcessOutput(data_file)
1540  keys = po.get_keys()
1541 
1542  xl_keys = [k for k in keys if search_label in k]
1543 
1544  if filter_rmf_file_names is not None:
1545  rmf_file_key="local_rmf_file_name"
1546  fs = po.get_fields(xl_keys+[rmf_file_key])
1547  else:
1548  fs = po.get_fields(xl_keys)
1549 
1550  # this dictionary stores the occurency of given crosslinks
1551  self.cross_link_frequency = {}
1552 
1553  # this dictionary stores the series of distances for given crosslinked
1554  # residues
1555  self.cross_link_distances = {}
1556 
1557  # this dictionary stores the series of distances for given crosslinked
1558  # residues
1559  self.cross_link_distances_unique = {}
1560 
1561  if not external_csv_data_file is None:
1562  # this dictionary stores the further information on crosslinks
1563  # labeled by unique ID
1564  self.external_csv_data = {}
1565  xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1566 
1567  for xl in xldb:
1568  self.external_csv_data[
1569  xl[external_csv_data_file_unique_id_key]] = xl
1570 
1571  # this list keeps track the tuple of cross-links and sample
1572  # so that we don't count twice the same crosslinked residues in the
1573  # same sample
1574  cross_link_frequency_list = []
1575 
1576  self.unique_cross_link_list = []
1577 
1578  for key in xl_keys:
1579  print key
1580  keysplit = key.replace(
1581  "_",
1582  " ").replace(
1583  "-",
1584  " ").replace(
1585  ":",
1586  " ").split(
1587  )
1588 
1589  if filter_label!=None:
1590  if filter_label not in keysplit: continue
1591 
1592  if mapping is None:
1593  r1 = int(keysplit[5])
1594  c1 = keysplit[6]
1595  r2 = int(keysplit[7])
1596  c2 = keysplit[8]
1597  try:
1598  confidence = keysplit[12]
1599  except:
1600  confidence = '0.0'
1601  try:
1602  unique_identifier = keysplit[3]
1603  except:
1604  unique_identifier = '0'
1605  else:
1606  r1 = int(keysplit[mapping["Residue1"]])
1607  c1 = keysplit[mapping["Protein1"]]
1608  r2 = int(keysplit[mapping["Residue2"]])
1609  c2 = keysplit[mapping["Protein2"]]
1610  try:
1611  confidence = keysplit[mapping["Confidence"]]
1612  except:
1613  confidence = '0.0'
1614  try:
1615  unique_identifier = keysplit[mapping["Unique Identifier"]]
1616  except:
1617  unique_identifier = '0'
1618 
1619  self.crosslinkedprots.add(c1)
1620  self.crosslinkedprots.add(c2)
1621 
1622  # construct the list of distances corresponding to the input rmf
1623  # files
1624 
1625  dists=[]
1626  if filter_rmf_file_names is not None:
1627  for n,d in enumerate(fs[key]):
1628  if fs[rmf_file_key][n] in filter_rmf_file_names:
1629  dists.append(float(d))
1630  else:
1631  dists=[float(f) for f in fs[key]]
1632 
1633  # check if the input confidence class corresponds to the
1634  # one of the cross-link
1635 
1636  mdist = self.median(dists)
1637 
1638  stdv = np.std(np.array(dists))
1639  if self.mindist > mdist:
1640  self.mindist = mdist
1641  if self.maxdist < mdist:
1642  self.maxdist = mdist
1643 
1644  # calculate the frequency of unique crosslinks within the same
1645  # sample
1646  if not self.external_csv_data is None:
1647  sample = self.external_csv_data[unique_identifier]["Sample"]
1648  else:
1649  sample = "None"
1650 
1651  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1652  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1653  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1654  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1655  else:
1656  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1657  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1658  cross_link_frequency_list.append((r1, c1, r2, c2))
1659  cross_link_frequency_list.append((r2, c2, r1, c1))
1660  self.unique_cross_link_list.append(
1661  (r1, c1, r2, c2,mdist))
1662 
1663  if (r1, c1, r2, c2) not in self.cross_link_distances:
1664  self.cross_link_distances[(
1665  r1,
1666  c1,
1667  r2,
1668  c2,
1669  mdist,
1670  confidence)] = dists
1671  self.cross_link_distances[(
1672  r2,
1673  c2,
1674  r1,
1675  c1,
1676  mdist,
1677  confidence)] = dists
1678  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1679  else:
1680  self.cross_link_distances[(
1681  r2,
1682  c2,
1683  r1,
1684  c1,
1685  mdist,
1686  confidence)] += dists
1687  self.cross_link_distances[(
1688  r1,
1689  c1,
1690  r2,
1691  c2,
1692  mdist,
1693  confidence)] += dists
1694 
1695  self.crosslinks.append(
1696  (r1,
1697  c1,
1698  r2,
1699  c2,
1700  mdist,
1701  stdv,
1702  confidence,
1703  unique_identifier,
1704  'original'))
1705  self.crosslinks.append(
1706  (r2,
1707  c2,
1708  r1,
1709  c1,
1710  mdist,
1711  stdv,
1712  confidence,
1713  unique_identifier,
1714  'reversed'))
1715 
1716  self.cross_link_frequency_inverted = {}
1717  for xl in self.unique_cross_link_list:
1718  (r1, c1, r2, c2, mdist) = xl
1719  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1720  if frequency not in self.cross_link_frequency_inverted:
1721  self.cross_link_frequency_inverted[
1722  frequency] = [(r1, c1, r2, c2)]
1723  else:
1724  self.cross_link_frequency_inverted[
1725  frequency].append((r1, c1, r2, c2))
1726 
1727  # -------------
1728 
1729  def median(self, mylist):
1730  sorts = sorted(mylist)
1731  length = len(sorts)
1732  print length
1733  if length == 1:
1734  return mylist[0]
1735  if not length % 2:
1736  return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1737  return sorts[length / 2]
1738 
1739  def set_threshold(self,threshold):
1740  self.threshold=threshold
1741 
1742  def set_tolerance(self,tolerance):
1743  self.tolerance=tolerance
1744 
1745  def colormap(self, dist):
1746  if dist < self.threshold - self.tolerance:
1747  return "Green"
1748  elif dist >= self.threshold + self.tolerance:
1749  return "Orange"
1750  else:
1751  return "Red"
1752 
1753  def write_cross_link_database(self, filename, format='csv'):
1754  import csv
1755 
1756  fieldnames = [
1757  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1758  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1759 
1760  if not self.external_csv_data is None:
1761  keys = self.external_csv_data.keys()
1762  innerkeys = self.external_csv_data[keys[0]].keys()
1763  innerkeys.sort()
1764  fieldnames += innerkeys
1765 
1766  dw = csv.DictWriter(
1767  open(filename,
1768  "w"),
1769  delimiter=',',
1770  fieldnames=fieldnames)
1771  dw.writeheader()
1772  for xl in self.crosslinks:
1773  (r1, c1, r2, c2, mdist, stdv, confidence,
1774  unique_identifier, descriptor) = xl
1775  if descriptor == 'original':
1776  outdict = {}
1777  outdict["Unique ID"] = unique_identifier
1778  outdict["Protein1"] = c1
1779  outdict["Protein2"] = c2
1780  outdict["Residue1"] = r1
1781  outdict["Residue2"] = r2
1782  outdict["Median Distance"] = mdist
1783  outdict["Standard Deviation"] = stdv
1784  outdict["Confidence"] = confidence
1785  outdict["Frequency"] = self.cross_link_frequency[
1786  (r1, c1, r2, c2)]
1787  if c1 == c2:
1788  arrangement = "Intra"
1789  else:
1790  arrangement = "Inter"
1791  outdict["Arrangement"] = arrangement
1792  if not self.external_csv_data is None:
1793  outdict.update(self.external_csv_data[unique_identifier])
1794 
1795  dw.writerow(outdict)
1796 
1797  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1798  no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
1799  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
1800  gap_between_components=0,
1801  rename_protein_map=None):
1802  # layout can be:
1803  # "lowerdiagonal" print only the lower diagonal plot
1804  # "upperdiagonal" print only the upper diagonal plot
1805  # "whole" print all
1806  # crosslinkedonly: plot only components that have crosslinks
1807  # no_dist_info: if True will plot only the cross-links as grey spots
1808  # filter = tuple the tuple contains a keyword to be search in the database
1809  # a relationship ">","==","<"
1810  # and a value
1811  # example ("ID_Score",">",40)
1812  # scale_symbol_size rescale the symbol for the crosslink
1813  # rename_protein_map is a dictionary to rename proteins
1814 
1815  import matplotlib.pyplot as plt
1816  import matplotlib.cm as cm
1817 
1818  fig = plt.figure(figsize=(10, 10))
1819  ax = fig.add_subplot(111)
1820 
1821  ax.set_xticks([])
1822  ax.set_yticks([])
1823 
1824  # set the list of proteins on the x axis
1825  if prot_listx is None:
1826  if crosslinkedonly:
1827  prot_listx = list(self.crosslinkedprots)
1828  else:
1829  prot_listx = self.prot_length_dict.keys()
1830  prot_listx.sort()
1831 
1832  nresx = gap_between_components + \
1833  sum([self.prot_length_dict[name]
1834  + gap_between_components for name in prot_listx])
1835 
1836  # set the list of proteins on the y axis
1837 
1838  if prot_listy is None:
1839  if crosslinkedonly:
1840  prot_listy = list(self.crosslinkedprots)
1841  else:
1842  prot_listy = self.prot_length_dict.keys()
1843  prot_listy.sort()
1844 
1845  nresy = gap_between_components + \
1846  sum([self.prot_length_dict[name]
1847  + gap_between_components for name in prot_listy])
1848 
1849  # this is the residue offset for each protein
1850  resoffsetx = {}
1851  resendx = {}
1852  res = gap_between_components
1853  for prot in prot_listx:
1854  resoffsetx[prot] = res
1855  res += self.prot_length_dict[prot]
1856  resendx[prot] = res
1857  res += gap_between_components
1858 
1859  resoffsety = {}
1860  resendy = {}
1861  res = gap_between_components
1862  for prot in prot_listy:
1863  resoffsety[prot] = res
1864  res += self.prot_length_dict[prot]
1865  resendy[prot] = res
1866  res += gap_between_components
1867 
1868  resoffsetdiagonal = {}
1869  res = gap_between_components
1870  for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1871  resoffsetdiagonal[prot] = res
1872  res += self.prot_length_dict[prot]
1873  res += gap_between_components
1874 
1875  # plot protein boundaries
1876 
1877  xticks = []
1878  xlabels = []
1879  for n, prot in enumerate(prot_listx):
1880  res = resoffsetx[prot]
1881  end = resendx[prot]
1882  for proty in prot_listy:
1883  resy = resoffsety[proty]
1884  endy = resendy[proty]
1885  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
1886  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
1887  xticks.append((float(res) + float(end)) / 2)
1888  if rename_protein_map is not None:
1889  if prot in rename_protein_map:
1890  prot=rename_protein_map[prot]
1891  xlabels.append(prot)
1892 
1893  yticks = []
1894  ylabels = []
1895  for n, prot in enumerate(prot_listy):
1896  res = resoffsety[prot]
1897  end = resendy[prot]
1898  for protx in prot_listx:
1899  resx = resoffsetx[protx]
1900  endx = resendx[protx]
1901  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
1902  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
1903  yticks.append((float(res) + float(end)) / 2)
1904  if rename_protein_map is not None:
1905  if prot in rename_protein_map:
1906  prot=rename_protein_map[prot]
1907  ylabels.append(prot)
1908 
1909  # plot the contact map
1910  print prot_listx, prot_listy
1911 
1912  if not self.contactmap is None:
1913  import matplotlib.cm as cm
1914  tmp_array = np.zeros((nresx, nresy))
1915 
1916  for px in prot_listx:
1917  print px
1918  for py in prot_listy:
1919  print py
1920  resx = resoffsety[px]
1921  lengx = resendx[px] - 1
1922  resy = resoffsety[py]
1923  lengy = resendy[py] - 1
1924  indexes_x = self.index_dictionary[px]
1925  minx = min(indexes_x)
1926  maxx = max(indexes_x)
1927  indexes_y = self.index_dictionary[py]
1928  miny = min(indexes_y)
1929  maxy = max(indexes_y)
1930 
1931  print px, py, minx, maxx, miny, maxy
1932 
1933  try:
1934  tmp_array[
1935  resx:lengx,
1936  resy:lengy] = self.contactmap[
1937  minx:maxx,
1938  miny:maxy]
1939  except:
1940  continue
1941 
1942 
1943  ax.imshow(tmp_array,
1944  cmap=cm.binary,
1945  origin='lower',
1946  interpolation='nearest')
1947 
1948  ax.set_xticks(xticks)
1949  ax.set_xticklabels(xlabels, rotation=90)
1950  ax.set_yticks(yticks)
1951  ax.set_yticklabels(ylabels)
1952  ax.set_xlim(0,nresx)
1953  ax.set_ylim(0,nresy)
1954 
1955 
1956  # set the crosslinks
1957 
1958  already_added_xls = []
1959 
1960  for xl in self.crosslinks:
1961 
1962  (r1, c1, r2, c2, mdist, stdv, confidence,
1963  unique_identifier, descriptor) = xl
1964 
1965  if confidence_classes is not None:
1966  if confidence not in confidence_classes:
1967  continue
1968 
1969  try:
1970  pos1 = r1 + resoffsetx[c1]
1971  except:
1972  continue
1973  try:
1974  pos2 = r2 + resoffsety[c2]
1975  except:
1976  continue
1977 
1978  if not filter is None:
1979  xldb = self.external_csv_data[unique_identifier]
1980  xldb.update({"Distance": mdist})
1981  xldb.update({"Distance_stdv": stdv})
1982 
1983  if filter[1] == ">":
1984  if float(xldb[filter[0]]) <= float(filter[2]):
1985  continue
1986 
1987  if filter[1] == "<":
1988  if float(xldb[filter[0]]) >= float(filter[2]):
1989  continue
1990 
1991  if filter[1] == "==":
1992  if float(xldb[filter[0]]) != float(filter[2]):
1993  continue
1994 
1995  # all that below is used for plotting the diagonal
1996  # when you have a rectangolar plots
1997 
1998  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
1999  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2000 
2001  if layout == 'lowerdiagonal':
2002  if pos_for_diagonal1 <= pos_for_diagonal2:
2003  continue
2004  if layout == 'upperdiagonal':
2005  if pos_for_diagonal1 >= pos_for_diagonal2:
2006  continue
2007 
2008  already_added_xls.append((r1, c1, r2, c2))
2009 
2010  if not no_confidence_info:
2011  if confidence == '0.01':
2012  markersize = 14 * scale_symbol_size
2013  elif confidence == '0.05':
2014  markersize = 9 * scale_symbol_size
2015  elif confidence == '0.1':
2016  markersize = 6 * scale_symbol_size
2017  else:
2018  markersize = 15 * scale_symbol_size
2019  else:
2020  markersize = 5 * scale_symbol_size
2021 
2022  if not no_dist_info:
2023  color = self.colormap(mdist)
2024  else:
2025  color = "gray"
2026 
2027  ax.plot(
2028  [pos1],
2029  [pos2],
2030  'o',
2031  c=color,
2032  alpha=alphablend,
2033  markersize=markersize)
2034 
2035 
2036 
2037  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2038 
2039  [i.set_linewidth(2.0) for i in ax.spines.itervalues()]
2040 
2041  #plt.tight_layout()
2042 
2043  if filename:
2044  plt.savefig(filename + ".pdf", dpi=300, transparent="False")
2045 
2046 
2047  plt.show()
2048 
2049  def get_frequency_statistics(self, prot_list,
2050  prot_list2=None):
2051 
2052  violated_histogram = {}
2053  satisfied_histogram = {}
2054  unique_cross_links = []
2055 
2056  for xl in self.unique_cross_link_list:
2057  (r1, c1, r2, c2, mdist) = xl
2058 
2059  # here we filter by the protein
2060  if prot_list2 is None:
2061  if not c1 in prot_list:
2062  continue
2063  if not c2 in prot_list:
2064  continue
2065  else:
2066  if c1 in prot_list and c2 in prot_list2:
2067  pass
2068  elif c1 in prot_list2 and c2 in prot_list:
2069  pass
2070  else:
2071  continue
2072 
2073  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2074 
2075  if (r1, c1, r2, c2) not in unique_cross_links:
2076  if mdist > 35.0:
2077  if frequency not in violated_histogram:
2078  violated_histogram[frequency] = 1
2079  else:
2080  violated_histogram[frequency] += 1
2081  else:
2082  if frequency not in satisfied_histogram:
2083  satisfied_histogram[frequency] = 1
2084  else:
2085  satisfied_histogram[frequency] += 1
2086  unique_cross_links.append((r1, c1, r2, c2))
2087  unique_cross_links.append((r2, c2, r1, c1))
2088 
2089  print "# satisfied"
2090 
2091  total_number_of_crosslinks=0
2092 
2093  for i in satisfied_histogram:
2094  # if i in violated_histogram:
2095  # print i, satisfied_histogram[i]+violated_histogram[i]
2096  # else:
2097  if i in violated_histogram:
2098  print i, violated_histogram[i]+satisfied_histogram[i]
2099  else:
2100  print i, satisfied_histogram[i]
2101  total_number_of_crosslinks+=i*satisfied_histogram[i]
2102 
2103  print "# violated"
2104 
2105  for i in violated_histogram:
2106  print i, violated_histogram[i]
2107  total_number_of_crosslinks+=i*violated_histogram[i]
2108 
2109  print total_number_of_crosslinks
2110 
2111 
2112 # ------------
2113  def print_cross_link_binary_symbols(self, prot_list,
2114  prot_list2=None):
2115  tmp_matrix = []
2116  confidence_list = []
2117  for xl in self.crosslinks:
2118  (r1, c1, r2, c2, mdist, stdv, confidence,
2119  unique_identifier, descriptor) = xl
2120 
2121  if prot_list2 is None:
2122  if not c1 in prot_list:
2123  continue
2124  if not c2 in prot_list:
2125  continue
2126  else:
2127  if c1 in prot_list and c2 in prot_list2:
2128  pass
2129  elif c1 in prot_list2 and c2 in prot_list:
2130  pass
2131  else:
2132  continue
2133 
2134  if descriptor != "original":
2135  continue
2136 
2137  confidence_list.append(confidence)
2138 
2139  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2140  tmp_dist_binary = []
2141  for d in dists:
2142  if d < 35:
2143  tmp_dist_binary.append(1)
2144  else:
2145  tmp_dist_binary.append(0)
2146  tmp_matrix.append(tmp_dist_binary)
2147 
2148  matrix = zip(*tmp_matrix)
2149 
2150  satisfied_high_sum = 0
2151  satisfied_mid_sum = 0
2152  satisfied_low_sum = 0
2153  total_satisfied_sum = 0
2154  for k, m in enumerate(matrix):
2155  satisfied_high = 0
2156  total_high = 0
2157  satisfied_mid = 0
2158  total_mid = 0
2159  satisfied_low = 0
2160  total_low = 0
2161  total_satisfied = 0
2162  total = 0
2163  for n, b in enumerate(m):
2164  if confidence_list[n] == "0.01":
2165  total_high += 1
2166  if b == 1:
2167  satisfied_high += 1
2168  satisfied_high_sum += 1
2169  elif confidence_list[n] == "0.05":
2170  total_mid += 1
2171  if b == 1:
2172  satisfied_mid += 1
2173  satisfied_mid_sum += 1
2174  elif confidence_list[n] == "0.1":
2175  total_low += 1
2176  if b == 1:
2177  satisfied_low += 1
2178  satisfied_low_sum += 1
2179  if b == 1:
2180  total_satisfied += 1
2181  total_satisfied_sum += 1
2182  total += 1
2183  print k, satisfied_high, total_high
2184  print k, satisfied_mid, total_mid
2185  print k, satisfied_low, total_low
2186  print k, total_satisfied, total
2187  print float(satisfied_high_sum) / len(matrix)
2188  print float(satisfied_mid_sum) / len(matrix)
2189  print float(satisfied_low_sum) / len(matrix)
2190 # ------------
2191 
2192  def get_unique_crosslinks_statistics(self, prot_list,
2193  prot_list2=None):
2194 
2195  print prot_list
2196  print prot_list2
2197  satisfied_high = 0
2198  total_high = 0
2199  satisfied_mid = 0
2200  total_mid = 0
2201  satisfied_low = 0
2202  total_low = 0
2203  total = 0
2204  tmp_matrix = []
2205  satisfied_string = []
2206  for xl in self.crosslinks:
2207  (r1, c1, r2, c2, mdist, stdv, confidence,
2208  unique_identifier, descriptor) = xl
2209 
2210  if prot_list2 is None:
2211  if not c1 in prot_list:
2212  continue
2213  if not c2 in prot_list:
2214  continue
2215  else:
2216  if c1 in prot_list and c2 in prot_list2:
2217  pass
2218  elif c1 in prot_list2 and c2 in prot_list:
2219  pass
2220  else:
2221  continue
2222 
2223  if descriptor != "original":
2224  continue
2225 
2226  total += 1
2227  if confidence == "0.01":
2228  total_high += 1
2229  if mdist <= 35:
2230  satisfied_high += 1
2231  if confidence == "0.05":
2232  total_mid += 1
2233  if mdist <= 35:
2234  satisfied_mid += 1
2235  if confidence == "0.1":
2236  total_low += 1
2237  if mdist <= 35:
2238  satisfied_low += 1
2239  if mdist <= 35:
2240  satisfied_string.append(1)
2241  else:
2242  satisfied_string.append(0)
2243 
2244  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2245  tmp_dist_binary = []
2246  for d in dists:
2247  if d < 35:
2248  tmp_dist_binary.append(1)
2249  else:
2250  tmp_dist_binary.append(0)
2251  tmp_matrix.append(tmp_dist_binary)
2252 
2253  print "unique satisfied_high/total_high", satisfied_high, "/", total_high
2254  print "unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid
2255  print "unique satisfied_low/total_low", satisfied_low, "/", total_low
2256  print "total", total
2257 
2258  matrix = zip(*tmp_matrix)
2259  satisfied_models = 0
2260  satstr = ""
2261  for b in satisfied_string:
2262  if b == 0:
2263  satstr += "-"
2264  if b == 1:
2265  satstr += "*"
2266 
2267  for m in matrix:
2268  all_satisfied = True
2269  string = ""
2270  for n, b in enumerate(m):
2271  if b == 0:
2272  string += "0"
2273  if b == 1:
2274  string += "1"
2275  if b == 1 and satisfied_string[n] == 1:
2276  pass
2277  elif b == 1 and satisfied_string[n] == 0:
2278  pass
2279  elif b == 0 and satisfied_string[n] == 0:
2280  pass
2281  elif b == 0 and satisfied_string[n] == 1:
2282  all_satisfied = False
2283  if all_satisfied:
2284  satisfied_models += 1
2285  print string
2286  print satstr, all_satisfied
2287  print "models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix)
2288 
2289  def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2290  prot_list2=None):
2291 
2292  import pylab as pl
2293 
2294  tmp_matrix = []
2295  for kw in self.cross_link_distances_unique:
2296  (r1, c1, r2, c2) = kw
2297  dists = self.cross_link_distances_unique[kw]
2298 
2299  if prot_list2 is None:
2300  if not c1 in prot_list:
2301  continue
2302  if not c2 in prot_list:
2303  continue
2304  else:
2305  if c1 in prot_list and c2 in prot_list2:
2306  pass
2307  elif c1 in prot_list2 and c2 in prot_list:
2308  pass
2309  else:
2310  continue
2311  # append the sum of dists to order by that in the matrix plot
2312  dists.append(sum(dists))
2313  tmp_matrix.append(dists)
2314 
2315  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2316 
2317  # print len(tmp_matrix), len(tmp_matrix[0])-1
2318  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2319 
2320  for i in range(len(tmp_matrix)):
2321  for k in range(len(tmp_matrix[i]) - 1):
2322  matrix[i][k] = tmp_matrix[i][k]
2323 
2324  print matrix
2325 
2326  fig = pl.figure()
2327  ax = fig.add_subplot(211)
2328 
2329  cax = ax.imshow(matrix, interpolation='nearest')
2330  # ax.set_yticks(range(len(self.model_list_names)))
2331  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
2332  fig.colorbar(cax)
2333  pl.savefig(figurename, dpi=300)
2334  pl.show()
2335 
2336  def plot_bars(
2337  self,
2338  filename,
2339  prots1,
2340  prots2,
2341  nxl_per_row=20,
2342  arrangement="inter",
2343  confidence_input="None"):
2344 
2345  data = []
2346  for xl in self.cross_link_distances:
2347  (r1, c1, r2, c2, mdist, confidence) = xl
2348  if c1 in prots1 and c2 in prots2:
2349  if arrangement == "inter" and c1 == c2:
2350  continue
2351  if arrangement == "intra" and c1 != c2:
2352  continue
2353  if confidence_input == confidence:
2354  label = str(c1) + ":" + str(r1) + \
2355  "-" + str(c2) + ":" + str(r2)
2356  values = self.cross_link_distances[xl]
2357  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2358  data.append((label, values, mdist, frequency))
2359 
2360  sort_by_dist = sorted(data, key=lambda tup: tup[2])
2361  sort_by_dist = zip(*sort_by_dist)
2362  values = sort_by_dist[1]
2363  positions = range(len(values))
2364  labels = sort_by_dist[0]
2365  frequencies = map(float, sort_by_dist[3])
2366  frequencies = [f * 10.0 for f in frequencies]
2367 
2368  nchunks = int(float(len(values)) / nxl_per_row)
2369  values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2370  positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2371  positions,
2372  nchunks)
2373  frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2374  frequencies,
2375  nchunks)
2376  labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2377 
2378  for n, v in enumerate(values_chunks):
2379  p = positions_chunks[n]
2380  f = frequencies_chunks[n]
2381  l = labels_chunks[n]
2383  filename + "." + str(n), v, p, f,
2384  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
2385 
2386  def crosslink_distance_histogram(self, filename,
2387  prot_list=None,
2388  prot_list2=None,
2389  confidence_classes=None,
2390  bins=40,
2391  color='#66CCCC',
2392  yplotrange=[0, 1],
2393  format="png",
2394  normalized=False):
2395  if prot_list is None:
2396  prot_list = self.prot_length_dict.keys()
2397 
2398  distances = []
2399  for xl in self.crosslinks:
2400  (r1, c1, r2, c2, mdist, stdv, confidence,
2401  unique_identifier, descriptor) = xl
2402 
2403  if not confidence_classes is None:
2404  if confidence not in confidence_classes:
2405  continue
2406 
2407  if prot_list2 is None:
2408  if not c1 in prot_list:
2409  continue
2410  if not c2 in prot_list:
2411  continue
2412  else:
2413  if c1 in prot_list and c2 in prot_list2:
2414  pass
2415  elif c1 in prot_list2 and c2 in prot_list:
2416  pass
2417  else:
2418  continue
2419 
2420  distances.append(mdist)
2421 
2423  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
2424  bins=bins, color=color,
2425  format=format,
2426  reference_xline=35.0,
2427  yplotrange=yplotrange, normalized=normalized)
2428 
2429  def scatter_plot_xl_features(self, filename,
2430  feature1=None,
2431  feature2=None,
2432  prot_list=None,
2433  prot_list2=None,
2434  yplotrange=None,
2435  reference_ylines=None,
2436  distance_color=True,
2437  format="png"):
2438  import matplotlib.pyplot as plt
2439  import matplotlib.cm as cm
2440 
2441  fig = plt.figure(figsize=(10, 10))
2442  ax = fig.add_subplot(111)
2443 
2444  for xl in self.crosslinks:
2445  (r1, c1, r2, c2, mdist, stdv, confidence,
2446  unique_identifier, arrangement) = xl
2447 
2448  if prot_list2 is None:
2449  if not c1 in prot_list:
2450  continue
2451  if not c2 in prot_list:
2452  continue
2453  else:
2454  if c1 in prot_list and c2 in prot_list2:
2455  pass
2456  elif c1 in prot_list2 and c2 in prot_list:
2457  pass
2458  else:
2459  continue
2460 
2461  xldb = self.external_csv_data[unique_identifier]
2462  xldb.update({"Distance": mdist})
2463  xldb.update({"Distance_stdv": stdv})
2464 
2465  xvalue = float(xldb[feature1])
2466  yvalue = float(xldb[feature2])
2467 
2468  if distance_color:
2469  color = self.colormap(mdist)
2470  else:
2471  color = "gray"
2472 
2473  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
2474 
2475  if not yplotrange is None:
2476  ax.set_ylim(yplotrange)
2477  if not reference_ylines is None:
2478  for rl in reference_ylines:
2479  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
2480 
2481  if filename:
2482  plt.savefig(filename + "." + format, dpi=150, transparent="False")
2483 
2484  plt.show()
Simple 3D transformation class.
Visualization of crosslinks.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
Definition: output.py:588
def plot_field_histogram
This function is plotting a list of histograms from a value list.
Definition: output.py:747
def plot_fields_box_plots
This function plots time series as boxplots fields is a list of time series, positions are the x-valu...
Definition: output.py:812
kernel::Restraints create_restraints(RMF::FileConstHandle fh, kernel::Model *m)
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Definition: atom/distance.h:88
Miscellaneous utilities.
Definition: tools.py:1
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
A class to evaluate the precision of an ensemble.
void load_frame(RMF::FileConstHandle file, unsigned int frame)
Definition: frames.h:27
log
Definition: log.py:1
double get_rmsd(const core::XYZs &s0, const core::XYZs &s1, const IMP::algebra::Transformation3D &tr_for_second)
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
Class for sampling a density map from particles.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:22
def add_subunits_density
Add a frame to the densities.
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:961
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name (no Representation needed) It is mainly used ...
def fill
fill stores coordinates of a model into a dictionary all_coords, containing coordinates for all model...
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
3D rotation class.
Definition: Rotation3D.h:46
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Definition: DensityMap.h:461
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
Definition: output.py:1
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
def get_precision
Evaluate the precision of two named structure groups.
VectorD< 3 > Vector3D
Definition: VectorD.h:395
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name (no Representation needed) It is mainly used...
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Definition: Selection.h:62
A class to compute mean density maps from structures Keeps a dictionary of density maps...
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
This "overloaded" function retrieves the residue indexes for each particle which is an instance of Fr...
Definition: tools.py:920
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27