IMP  2.4.0
The Integrative Modeling Platform
pmi/Analysis.py
1 #!/usr/bin/env python
2 
3 """@namespace IMP.pmi.analysis
4  Tools for clustering and cluster analysis
5 """
6 from __future__ import print_function
7 import IMP
8 import IMP.algebra
9 import IMP.em
10 import IMP.pmi
11 import IMP.pmi.tools
12 import IMP.pmi.output
13 import IMP.rmf
14 import RMF
15 import IMP.pmi.analysis
16 from operator import itemgetter
17 from copy import deepcopy
18 from math import log,sqrt
19 import itertools
20 import numpy as np
21 
22 
23 class Alignment(object):
24  """Performs alignment and RMSD calculation for two sets of coordinates
25 
26  The class also takes into account non-equal stoichiometry of the proteins.
27  If this is the case, the protein names of proteins in multiple copies
28  should be specified in the following form:
29  nameA..1, nameA..2 (note two dots).
30  """
31 
32  def __init__(self, template, query, weights=None):
33  """Constructor.
34  @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35  @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36  """
37  self.query = query
38  self.template = template
39  self.weights=weights
40 
41  if len(self.query.keys()) != len(self.template.keys()):
42  raise ValueError('''the number of proteins
43  in template and query does not match!''')
44 
45  def permute(self):
46 
47  self.proteins = sorted(self.query.keys())
48  prots_uniq = [i.split('..')[0] for i in self.proteins]
49  P = {}
50  for p in prots_uniq:
51  np = prots_uniq.count(p)
52  copies = [i for i in self.proteins if i.split('..')[0] == p]
53  prmts = list(itertools.permutations(copies, len(copies)))
54  P[p] = prmts
55  self.P = P
56  self.Product = list(itertools.product(*P.values()))
57 
58  def get_rmsd(self):
59 
60  self.permute()
61 
62  template_xyz = []
63  weights = []
64  torder = sum([list(i) for i in self.Product[0]], [])
65  for t in torder:
66  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
67  if self.weights is not None:
68  weights += [i for i in self.weights[t]]
69  #template_xyz = np.array(template_xyz)
70 
71  self.rmsd = 10000000000.
72  for comb in self.Product:
73 
74 
75 
76  order = sum([list(i) for i in comb], [])
77  query_xyz = []
78  for p in order:
79  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
80  #query_xyz = np.array(query_xyz)
81  #if len(template_xyz) != len(query_xyz):
82  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
83  # in template and query does not match!'''
84  # exit()
85 
86  if self.weights is not None:
87  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
88  else:
89  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
90  #dist = sqrt(
91  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
92  if dist < self.rmsd:
93  self.rmsd = dist
94  return self.rmsd
95 
96  def align(self):
97  from scipy.spatial.distance import cdist
98 
99  self.permute()
100 
101  template_xyz = []
102  torder = sum([list(i) for i in self.Product[0]], [])
103  for t in torder:
104  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
105  #template_xyz = np.array(template_xyz)
106 
107  self.rmsd, Transformation = 10000000000., ''
108  for comb in self.Product:
109  order = sum([list(i) for i in comb], [])
110  query_xyz = []
111  for p in order:
112  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
113  #query_xyz = np.array(query_xyz)
114 
115  if len(template_xyz) != len(query_xyz):
116  raise ValueError('''the number of coordinates
117  in template and query does not match!''')
118 
120  query_xyz,
121  template_xyz)
122  query_xyz_tr = [transformation.get_transformed(n)
123  for n in query_xyz]
124 
125  dist = sqrt(
126  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
127  if dist < self.rmsd:
128  self.rmsd = dist
129  Transformation = transformation
130 
131  return (self.rmsd, Transformation)
132 
133 
134 # TEST for the alignment ###
135 """
136 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
137  'a..2':np.array([np.array([1.,1.,])]),
138  'a..3':np.array([np.array([-2.,1.])]),
139  'b':np.array([np.array([0.,-1.])]),
140  'c..1':np.array([np.array([-1.,-1.])]),
141  'c..2':np.array([np.array([1.,-1.])]),
142  'd':np.array([np.array([0.,0.])]),
143  'e':np.array([np.array([0.,1.])])}
144 
145 Ali = Alignment(Proteins, Proteins)
146 Ali.permute()
147 if Ali.get_rmsd() == 0.0: print 'successful test!'
148 else: print 'ERROR!'; exit()
149 """
150 
151 
152 # ----------------------------------
153 class Violations(object):
154 
155  def __init__(self, filename):
156 
157  self.violation_thresholds = {}
158  self.violation_counts = {}
159 
160  data = open(filename)
161  D = data.readlines()
162  data.close()
163 
164  for d in D:
165  d = d.strip().split()
166  self.violation_thresholds[d[0]] = float(d[1])
167 
168  def get_number_violated_restraints(self, rsts_dict):
169  num_violated = 0
170  for rst in self.violation_thresholds:
171  if rst not in rsts_dict:
172  continue # print rst;
173  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
174  num_violated += 1
175  if rst not in self.violation_counts:
176  self.violation_counts[rst] = 1
177  else:
178  self.violation_counts[rst] += 1
179  return num_violated
180 
181 
182 # ----------------------------------
183 class Clustering(object):
184  """A class to cluster structures.
185  Uses scipy's cdist function to compute distance matrices
186  and sklearn's kmeans clustering module.
187  """
188  def __init__(self,rmsd_weights=None):
189  """Constructor.
190  @param rmsd_weights Flat list of weights for each particle
191  (if they're coarse)
192  """
193  try:
194  from mpi4py import MPI
195  self.comm = MPI.COMM_WORLD
196  self.rank = self.comm.Get_rank()
197  self.number_of_processes = self.comm.size
198  except ImportError:
199  self.number_of_processes = 1
200  self.rank = 0
201  self.all_coords = {}
202  self.structure_cluster_ids = None
203  self.tmpl_coords = None
204  self.rmsd_weights=rmsd_weights
205 
206  def set_template(self, part_coords):
207 
208  self.tmpl_coords = part_coords
209 
210  def fill(self, frame, Coords):
211  """Add coordinates for a single model."""
212 
213  self.all_coords[frame] = Coords
214 
215  def dist_matrix(self):
216 
217  self.model_list_names = list(self.all_coords.keys())
218  self.model_indexes = list(range(len(self.model_list_names)))
219  self.model_indexes_dict = dict(
220  list(zip(self.model_list_names, self.model_indexes)))
221  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
222 
223  my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
224  model_indexes_unique_pairs,
225  self.number_of_processes)[self.rank]
226 
227  print("process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
228 
229  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
230  self.tmpl_coords,
231  my_model_indexes_unique_pairs)
232 
233  if self.number_of_processes > 1:
234  raw_distance_dict = IMP.pmi.tools.scatter_and_gather(
235  raw_distance_dict)
236  pickable_transformations = self.get_pickable_transformation_distance_dict(
237  )
238  pickable_transformations = IMP.pmi.tools.scatter_and_gather(
239  pickable_transformations)
240  self.set_transformation_distance_dict_from_pickable(
241  pickable_transformations)
242 
243  self.raw_distance_matrix = np.zeros(
244  (len(self.model_list_names), len(self.model_list_names)))
245  for item in raw_distance_dict:
246  (f1, f2) = item
247  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
248  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
249 
250  def get_dist_matrix(self):
251  return self.raw_distance_matrix
252 
253  def do_cluster(self, number_of_clusters,seed=None):
254  """Run K-means clustering
255  @param number_of_clusters Num means
256  @param seed the random seed
257  """
258  from sklearn.cluster import KMeans
259  if seed is not None:
260  np.random.seed(seed)
261  try:
262  # check whether we have the right version of sklearn
263  kmeans = KMeans(n_clusters=number_of_clusters)
264  except TypeError:
265  # sklearn older than 0.12
266  kmeans = KMeans(k=number_of_clusters)
267  kmeans.fit_predict(self.raw_distance_matrix)
268 
269  self.structure_cluster_ids = kmeans.labels_
270 
271  def get_pickable_transformation_distance_dict(self):
272  pickable_transformations = {}
273  for label in self.transformation_distance_dict:
274  tr = self.transformation_distance_dict[label]
275  trans = tuple(tr.get_translation())
276  rot = tuple(tr.get_rotation().get_quaternion())
277  pickable_transformations[label] = (rot, trans)
278  return pickable_transformations
279 
280  def set_transformation_distance_dict_from_pickable(
281  self,
282  pickable_transformations):
283  self.transformation_distance_dict = {}
284  for label in pickable_transformations:
285  tr = pickable_transformations[label]
286  trans = IMP.algebra.Vector3D(tr[1])
287  rot = IMP.algebra.Rotation3D(tr[0])
288  self.transformation_distance_dict[
289  label] = IMP.algebra.Transformation3D(rot, trans)
290 
291  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
292  import pickle
293  outf = open(file_name + ".data", 'wb')
294 
295  # to pickle the transformation dictionary
296  # you have to save the arrays correposnding to
297  # the transformations
298 
299  pickable_transformations = self.get_pickable_transformation_distance_dict(
300  )
301  pickle.dump(
302  (self.structure_cluster_ids,
303  self.model_list_names,
304  pickable_transformations),
305  outf)
306 
307  np.save(file_name + ".npy", self.raw_distance_matrix)
308 
309  def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
310  import pickle
311 
312  inputf = open(file_name + ".data", 'rb')
313  (self.structure_cluster_ids, self.model_list_names,
314  pickable_transformations) = pickle.load(inputf)
315  inputf.close()
316 
317  self.raw_distance_matrix = np.load(file_name + ".npy")
318 
319  self.set_transformation_distance_dict_from_pickable(
320  pickable_transformations)
321  self.model_indexes = list(range(len(self.model_list_names)))
322  self.model_indexes_dict = dict(
323  list(zip(self.model_list_names, self.model_indexes)))
324 
325  def plot_matrix(self, figurename="clustermatrix.pdf"):
326  import pylab as pl
327  from scipy.cluster import hierarchy as hrc
328 
329  fig = pl.figure(figsize=(10,8))
330  ax = fig.add_subplot(212)
331  dendrogram = hrc.dendrogram(
332  hrc.linkage(self.raw_distance_matrix),
333  color_threshold=7,
334  no_labels=True)
335  leaves_order = dendrogram['leaves']
336  ax.set_xlabel('Model')
337  ax.set_ylabel('RMSD [Angstroms]')
338 
339  ax2 = fig.add_subplot(221)
340  cax = ax2.imshow(
341  self.raw_distance_matrix[leaves_order,
342  :][:,
343  leaves_order],
344  interpolation='nearest')
345  cb = fig.colorbar(cax)
346  cb.set_label('RMSD [Angstroms]')
347  ax2.set_xlabel('Model')
348  ax2.set_ylabel('Model')
349 
350  pl.savefig(figurename, dpi=300)
351  pl.close(fig)
352 
353  def get_model_index_from_name(self, name):
354  return self.model_indexes_dict[name]
355 
356  def get_cluster_labels(self):
357  # this list
358  return list(set(self.structure_cluster_ids))
359 
360  def get_number_of_clusters(self):
361  return len(self.get_cluster_labels())
362 
363  def get_cluster_label_indexes(self, label):
364  return (
365  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
366  )
367 
368  def get_cluster_label_names(self, label):
369  return (
370  [self.model_list_names[i]
371  for i in self.get_cluster_label_indexes(label)]
372  )
373 
374  def get_cluster_label_average_rmsd(self, label):
375 
376  indexes = self.get_cluster_label_indexes(label)
377 
378  if len(indexes) > 1:
379  sub_distance_matrix = self.raw_distance_matrix[
380  indexes, :][:, indexes]
381  average_rmsd = np.sum(sub_distance_matrix) / \
382  (len(sub_distance_matrix)
383  ** 2 - len(sub_distance_matrix))
384  else:
385  average_rmsd = 0.0
386  return average_rmsd
387 
388  def get_cluster_label_size(self, label):
389  return len(self.get_cluster_label_indexes(label))
390 
391  def get_transformation_to_first_member(
392  self,
393  cluster_label,
394  structure_index):
395  reference = self.get_cluster_label_indexes(cluster_label)[0]
396  return self.transformation_distance_dict[(reference, structure_index)]
397 
398  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
399 
400  model_list_names = list(all_coords.keys())
401  rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
402  raw_distance_dict = {}
403  transformation_distance_dict = {}
404  if template_coords is None:
405  do_alignment = False
406  else:
407  do_alignment = True
408  alignment_template_protein_names = list(template_coords.keys())
409 
410  for (f1, f2) in list_of_pairs:
411 
412  if not do_alignment:
413  # here we only get the rmsd,
414  # we need that for instance when you want to cluster conformations
415  # globally, eg the EM map is a reference
417 
418  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
419  for pr in rmsd_protein_names])
420  coords_f2 = {}
421  for pr in rmsd_protein_names:
422  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
423 
424  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
425  rmsd = Ali.get_rmsd()
426 
427  elif do_alignment:
428  # here we actually align the conformations first
429  # and than calculate the rmsd. We need that when the
430  # protein(s) is the reference
431  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432  for pr in alignment_template_protein_names])
433  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
434  for pr in alignment_template_protein_names])
435 
436  Ali = Alignment(coords_f1, coords_f2)
437  template_rmsd, transformation = Ali.align()
438 
439  # here we calculate the rmsd
440  # we will align two models based n the nuber of subunits provided
441  # and transform coordinates of model 2 to model 1
442  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
443  for pr in rmsd_protein_names])
444  coords_f2 = {}
445  for pr in rmsd_protein_names:
446  coords_f2[pr] = [transformation.get_transformed(
447  i) for i in all_coords[model_list_names[f2]][pr]]
448 
449  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
450  rmsd = Ali.get_rmsd()
451 
452  raw_distance_dict[(f1, f2)] = rmsd
453  raw_distance_dict[(f2, f1)] = rmsd
454  transformation_distance_dict[(f1, f2)] = transformation
455  transformation_distance_dict[(f2, f1)] = transformation
456 
457  return raw_distance_dict, transformation_distance_dict
458 
459 
460 class Precision(object):
461  """A class to evaluate the precision of an ensemble.
462 
463  Also can evaluate the cross-precision of multiple ensembles.
464  Supports MPI for coordinate reading.
465  Recommended procedure:
466  -# initialize object and pass the selection for evaluating precision
467  -# call add_structures() to read in the data (specify group name)
468  -# call get_precision() to evaluate inter/intra precision
469  -# call get_rmsf() to evaluate within-group fluctuations
470  """
471  def __init__(self,model,
472  resolution=1,
473  selection_dictionary={}):
474  """Constructor.
475  @param model The IMP Model
476  @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
477  part of the hier name)
478  @param selection_dictionary Dictionary where keys are names for
479  selections and values are selection tuples for scoring
480  precision. "All" is automatically made as well
481  """
482  try:
483  from mpi4py import MPI
484  self.comm = MPI.COMM_WORLD
485  self.rank = self.comm.Get_rank()
486  self.number_of_processes = self.comm.size
487  except ImportError:
488  self.number_of_processes=1
489  self.rank=0
490 
491  self.styles=['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q',
492  'pairwise_drms_k','pairwise_rmsd','drmsd_from_center']
493  self.style='pairwise_drmsd_k'
494  self.structures_dictionary={}
495  self.reference_structures_dictionary={}
496  self.prots=[]
497  self.protein_names=None
498  self.len_particles_resolution_one=None
499  self.model=model
500  self.rmf_names_frames={}
501  self.reference_rmf_names_frames=None
502  self.reference_structure=None
503  self.reference_prot=None
504  self.selection_dictionary=selection_dictionary
505  self.threshold=40.0
506  self.residue_particle_index_map=None
507  if resolution in [1,10]:
508  self.resolution=resolution
509  else:
510  raise KeyError("no such resolution")
511 
512  def _get_structure(self,rmf_frame_index,rmf_name):
513  """Read an RMF file and return the particles"""
514  rh= RMF.open_rmf_file_read_only(rmf_name)
515  prots=IMP.rmf.create_hierarchies(rh, self.model)
516  IMP.rmf.load_frame(rh, rmf_frame_index)
517  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
518  del rh
519 
520  if self.resolution==1:
521  particle_dict = get_particles_at_resolution_one(prots[0])
522  elif self.resolution==10:
523  particle_dict = get_particles_at_resolution_ten(prots[0])
524 
525  protein_names=list(particle_dict.keys())
526  particles_resolution_one=[]
527  for k in particle_dict:
528  particles_resolution_one+=(particle_dict[k])
529 
530  if self.protein_names==None:
531  self.protein_names=protein_names
532  else:
533  if self.protein_names!=protein_names:
534  print("Error: the protein names of the new coordinate set is not compatible with the previous one")
535 
536  if self.len_particles_resolution_one==None:
537  self.len_particles_resolution_one=len(particles_resolution_one)
538  else:
539  if self.len_particles_resolution_one!=len(particles_resolution_one):
540  raise ValueError("the new coordinate set is not compatible with the previous one")
541 
542  return particles_resolution_one,prots
543 
544  def add_structure(self,
545  rmf_name,
546  rmf_frame_index,
547  structure_set_name,
548  setup_index_map=False):
549  """ Read a structure into the ensemble and store (as coordinates).
550  @param rmf_name The name of the RMF file
551  @param rmf_frame_index The frame to read
552  @param structure_set_name Name for the set that includes this structure
553  (e.g. "cluster 1")
554  """
555 
556  # decide where to put this structure
557  if structure_set_name in self.structures_dictionary:
558  cdict=self.structures_dictionary[structure_set_name]
559  rmflist=self.rmf_names_frames[structure_set_name]
560  else:
561  self.structures_dictionary[structure_set_name]={}
562  self.rmf_names_frames[structure_set_name]=[]
563  cdict=self.structures_dictionary[structure_set_name]
564  rmflist=self.rmf_names_frames[structure_set_name]
565 
566  # read the particles
567  try:
568  (particles_resolution_one, prots)=self._get_structure(rmf_frame_index,rmf_name)
569  except:
570  print("something wrong with the rmf")
571  return 0
572 
573  self.selection_dictionary.update({"All":self.protein_names})
574 
575  for selection_name in self.selection_dictionary:
576  selection_tuple=self.selection_dictionary[selection_name]
577  coords=self._select_coordinates(selection_tuple,particles_resolution_one,prots[0])
578  if selection_name not in cdict:
579  cdict[selection_name]=[coords]
580  else:
581  cdict[selection_name].append(coords)
582 
583  rmflist.append((rmf_name,rmf_frame_index))
584 
585  # if requested, set up a dictionary to help find residue indexes
586  if setup_index_map:
587  self.residue_particle_index_map={}
588  for prot_name in self.protein_names:
589  self.residue_particle_index_map[prot_name] = \
590  self._get_residue_particle_index_map(
591  prot_name,
592  particles_resolution_one,prots[0])
593  for prot in prots:
594  IMP.atom.destroy(prot)
595 
596  def add_structures(self,
597  rmf_name_frame_tuples,
598  structure_set_name):
599  """Read a list of RMFs, supports parallel
600  @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
601  @param structure_set_name Name this set of structures (e.g. "cluster.1")
602  """
603 
604  # split up the requested list to read in parallel
605  my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
606  rmf_name_frame_tuples,self.number_of_processes)[self.rank]
607  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
608  rmf_name=tup[0]
609  rmf_frame_index=tup[1]
610  # the first frame stores the map between residues and particles
611  if self.residue_particle_index_map is None:
612  setup_index_map=True
613  else:
614  setup_index_map=False
615  self.add_structure(rmf_name,
616  rmf_frame_index,
617  structure_set_name,
618  setup_index_map)
619 
620  # synchronize the structures
621  if self.number_of_processes > 1:
622  self.rmf_names_frames=IMP.pmi.tools.scatter_and_gather(self.rmf_names_frames)
623  if self.rank != 0:
624  self.comm.send(self.structures_dictionary, dest=0, tag=11)
625  elif self.rank == 0:
626  for i in range(1, self.number_of_processes):
627  data_tmp = self.comm.recv(source=i, tag=11)
628  for key in self.structures_dictionary:
629  self.structures_dictionary[key].update(data_tmp[key])
630  for i in range(1, self.number_of_processes):
631  self.comm.send(self.structures_dictionary, dest=i, tag=11)
632  if self.rank != 0:
633  self.structures_dictionary = self.comm.recv(source=0, tag=11)
634 
635  def _get_residue_particle_index_map(self,prot_name,structure,hier):
636  residue_particle_index_map=[]
637  s=IMP.atom.Selection(hier,molecules=[prot_name])
638  all_selected_particles=s.get_selected_particles()
639  intersection=list(set(all_selected_particles) & set(structure))
640  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
641  for p in sorted_intersection:
642  residue_particle_index_map.append(IMP.pmi.tools.get_residue_indexes(p))
643  return residue_particle_index_map
644 
645 
646  def _select_coordinates(self,tuple_selections,structure,prot):
647  selected_coordinates=[]
648  for t in tuple_selections:
649  if type(t)==tuple and len(t)==3:
650  s=IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1))
651  all_selected_particles=s.get_selected_particles()
652  intersection=list(set(all_selected_particles) & set(structure))
653  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
654  cc=[tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
655  selected_coordinates+=cc
656 
657  elif type(t)==str:
658  s=IMP.atom.Selection(prot,molecules=[t])
659  all_selected_particles=s.get_selected_particles()
660  intersection=list(set(all_selected_particles) & set(structure))
661  sorted_intersection=IMP.pmi.tools.sort_by_residues(intersection)
662  cc=[tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
663  selected_coordinates+=cc
664  else:
665  raise ValueError("Selection error")
666  return selected_coordinates
667 
668  def set_threshold(self,threshold):
669  self.threshold=threshold
670 
671  def _get_distance(self,
672  structure_set_name1,
673  structure_set_name2,
674  selection_name,
675  index1,
676  index2):
677  """ Compute distance between structures with various metrics """
678  c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
679  c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
680 
681  coordinates1=[IMP.algebra.Vector3D(c) for c in c1]
682  coordinates2=[IMP.algebra.Vector3D(c) for c in c2]
683 
684  if self.style=='pairwise_drmsd_k':
685  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
686  if self.style=='pairwise_drms_k':
687  distance=IMP.atom.get_drms(coordinates1,coordinates2)
688  if self.style=='pairwise_drmsd_Q':
689  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
690 
691  if self.style=='pairwise_rmsd':
692  distance=IMP.atom.get_rmsd(coordinates1,coordinates2,IMP.algebra.get_identity_transformation_3d())
693  return distance
694 
695  def _get_particle_distances(self,structure_set_name1,structure_set_name2,
696  selection_name,index1,index2):
697  import numpy as np
698  c1=self.structures_dictionary[structure_set_name1][selection_name][index1]
699  c2=self.structures_dictionary[structure_set_name2][selection_name][index2]
700 
701  coordinates1=[IMP.algebra.Vector3D(c) for c in c1]
702  coordinates2=[IMP.algebra.Vector3D(c) for c in c2]
703 
704  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
705 
706  return distances
707 
708  def get_precision(self,
709  structure_set_name1,
710  structure_set_name2,
711  outfile=None,
712  skip=1,
713  selection_keywords=None):
714  """ Evaluate the precision of two named structure groups. Supports MPI.
715  When the structure_set_name1 is different from the structure_set_name2,
716  this evaluates the cross-precision (average pairwise distances).
717  @param outfile Name of the precision output file
718  @param structure_set_name1 string name of the first structure set
719  @param structure_set_name2 string name of the second structure set
720  @param skip analyze every (skip) structure for the distance matrix calculation
721  @param selection_keywords Specify the selection name you want to calculate on.
722  By default this is computed for everything you provided in the constructor,
723  plus all the subunits together.
724  """
725  if selection_keywords is None:
726  sel_keys=list(self.selection_dictionary.keys())
727  else:
728  for k in selection_keywords:
729  if k not in self.selection_dictionary:
730  raise KeyError("you are trying to find named selection " \
731  + k + " which was not requested in the constructor")
732  sel_keys=selection_keywords
733 
734  if outfile is not None:
735  of=open(outfile,"w")
736  centroid_index=0
737  for selection_name in sel_keys:
738  number_of_structures_1=len(self.structures_dictionary[structure_set_name1][selection_name])
739  number_of_structures_2=len(self.structures_dictionary[structure_set_name2][selection_name])
740 
741  distances={}
742  structure_pointers_1=list(range(0,number_of_structures_1,skip))
743  structure_pointers_2=list(range(0,number_of_structures_2,skip))
744 
745  pair_combination_list=list(itertools.product(structure_pointers_1,structure_pointers_2))
746 
747  if len(pair_combination_list)==0:
748  raise ValueError("no structure selected. Check the skip parameter.")
749 
750  my_pair_combination_list=IMP.pmi.tools.chunk_list_into_segments(
751  pair_combination_list,self.number_of_processes)[self.rank]
752  my_length=len(my_pair_combination_list)
753  for n,pair in enumerate(my_pair_combination_list):
754 
755  progression=int(float(n)/my_length*100.0)
756  distances[pair]=self._get_distance(structure_set_name1,structure_set_name2,
757  selection_name,pair[0],pair[1])
758 
759  if self.number_of_processes > 1:
760  distances = IMP.pmi.tools.scatter_and_gather(distances)
761  if self.rank == 0:
762  if structure_set_name1==structure_set_name2:
763  structure_pointers=structure_pointers_1
764  number_of_structures=number_of_structures_1
765 
766  # calculate the distance from the first centroid
767  # and determine the centroid
768 
769  distance=0.0
770  distances_to_structure={}
771  distances_to_structure_normalization={}
772 
773  for n in structure_pointers:
774  distances_to_structure[n]=0.0
775  distances_to_structure_normalization[n]=0
776 
777  for k in distances:
778  distance+=distances[k]
779  distances_to_structure[k[0]]+=distances[k]
780  distances_to_structure[k[1]]+=distances[k]
781  distances_to_structure_normalization[k[0]]+=1
782  distances_to_structure_normalization[k[1]]+=1
783 
784  for n in structure_pointers:
785  distances_to_structure[n]=distances_to_structure[n]/distances_to_structure_normalization[n]
786 
787  min_distance=min([distances_to_structure[n] for n in distances_to_structure])
788  centroid_index=[k for k, v in distances_to_structure.items() if v == min_distance][0]
789  centroid_rmf_name=self.rmf_names_frames[structure_set_name1][centroid_index]
790 
791  centroid_distance=0.0
792  for n in range(number_of_structures):
793  centroid_distance+=self._get_distance(structure_set_name1,structure_set_name1,
794  selection_name,centroid_index,n)
795 
796  #pairwise_distance=distance/len(distances.keys())
797  centroid_distance/=number_of_structures
798  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
799  if outfile is not None:
800  of.write(str(selection_name)+" "+structure_set_name1+
801  " average centroid distance "+str(centroid_distance)+"\n")
802  of.write(str(selection_name)+" "+structure_set_name1+
803  " centroid index "+str(centroid_index)+"\n")
804  of.write(str(selection_name)+" "+structure_set_name1+
805  " centroid rmf name "+str(centroid_rmf_name)+"\n")
806 
807  average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
808  if outfile is not None:
809  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
810  " average pairwise distance "+str(average_pairwise_distances)+"\n")
811  if outfile is not None:
812  of.close()
813  return centroid_index
814 
815  def get_rmsf(self,
816  structure_set_name,
817  outdir="./",
818  skip=1,
819  set_plot_yaxis_range=None):
820  """ Calculate the residue mean square fluctuations (RMSF).
821  Automatically outputs as data file and pdf
822  @param structure_set_name Which structure set to calculate RMSF for
823  @param outdir Where to write the files
824  @param skip Skip this number of structures
825  @param set_plot_yaxis_range In case you need to change the plot
826  """
827  # get the centroid structure for the whole complex
828  centroid_index=self.get_precision(
829  structure_set_name,
830  structure_set_name,
831  outfile=None,
832  skip=skip)
833  if self.rank==0:
834  for sel_name in self.protein_names:
835  self.selection_dictionary.update({sel_name:[sel_name]})
836  try:
837  number_of_structures=len(self.structures_dictionary[structure_set_name][sel_name])
838  except KeyError:
839  # that protein was not included in the selection
840  continue
841  rpim=self.residue_particle_index_map[sel_name]
842  outfile=outdir+"/rmsf."+sel_name+".dat"
843  of=open(outfile,"w")
844  residue_distances={}
845  residue_nblock={}
846  for index in range(number_of_structures):
847  distances=self._get_particle_distances(structure_set_name,
848  structure_set_name,
849  sel_name,
850  centroid_index,index)
851  for nblock,block in enumerate(rpim):
852  for residue_number in block:
853  residue_nblock[residue_number]=nblock
854  if residue_number not in residue_distances:
855  residue_distances[residue_number]=[distances[nblock]]
856  else:
857  residue_distances[residue_number].append(distances[nblock])
858 
859  residues=[]
860  rmsfs=[]
861  for rn in residue_distances:
862  residues.append(rn)
863  rmsf=np.std(residue_distances[rn])
864  rmsfs.append(rmsf)
865  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
866 
867  IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
868  out_fn=outdir+"/rmsf."+sel_name,display=False,
869  set_plot_yaxis_range=set_plot_yaxis_range,
870  xlabel='Residue Number',ylabel='Standard error')
871  of.close()
872 
873 
874  def set_reference_structure(self,rmf_name,rmf_frame_index):
875  """Read in a structure used for reference computation.
876  Needed before calling get_average_distance_wrt_reference_structure()
877  @param rmf_name The RMF file to read the reference
878  @param rmf_frame_index The index in that file
879  """
880  (particles_resolution_one, prot)=self._get_structure(rmf_frame_index,rmf_name)
881  self.reference_rmf_names_frames=(rmf_name,rmf_frame_index)
882 
883 
884  for selection_name in self.selection_dictionary:
885  selection_tuple=self.selection_dictionary[selection_name]
886  coords=self._select_coordinates(selection_tuple,
887  particles_resolution_one,prot)
888  self.reference_structures_dictionary[selection_name]=coords
889 
890 
891  def get_average_distance_wrt_reference_structure(self,structure_set_name):
892  """Compare the structure set to the reference structure.
893  @param structure_set_name The structure set to compute this on
894  @note First call set_reference_structure()
895  """
896  if self.reference_structures_dictionary=={}:
897  print("Cannot compute until you set a reference structure")
898  return
899  for selection_name in self.selection_dictionary:
900  reference_coordinates=self.reference_structures_dictionary[selection_name]
901  coordinates2=[IMP.algebra.Vector3D(c) for c in reference_coordinates]
902  distances=[]
903 
904  for sc in self.structures_dictionary[structure_set_name][selection_name]:
905  coordinates1=[IMP.algebra.Vector3D(c) for c in sc]
906  if self.style=='pairwise_drmsd_k':
907  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
908  if self.style=='pairwise_drms_k':
909  distance=IMP.atom.get_drms(coordinates1,coordinates2)
910  if self.style=='pairwise_drmsd_Q':
911  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
912  if self.style=='pairwise_rmsd':
913  distance=IMP.atom.get_rmsd(coordinates1,coordinates2,
915  distances.append(distance)
916 
917  print(selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances))
918 
919  def get_coordinates(self):
920  pass
921 
922  def set_precision_style(self, style):
923  if style in self.styles:
924  self.style=style
925  else:
926  raise ValueError("No such style")
927 
928 
929 class GetModelDensity(object):
930  """Compute mean density maps from structures.
931 
932  Keeps a dictionary of density maps,
933  keys are in the custom ranges. When you call add_subunits_density, it adds
934  particle coordinates to the existing density maps.
935  """
936 
937  def __init__(self, custom_ranges, representation=None, voxel=5.0):
938  """Constructor.
939  @param custom_ranges Required. It's a dictionary, keys are the
940  density component names, values are selection tuples
941  e.g. {'kin28':[['kin28',1,-1]],
942  'density_name_1' :[('ccl1')],
943  'density_name_2' :[(1,142,'tfb3d1'),
944  (143,700,'tfb3d2')],
945  @param representation PMI representation, for doing selections.
946  Not needed if you only pass hierarchies
947  @param voxel The voxel size for the output map (lower is slower)
948  """
949 
950  self.representation = representation
951  self.voxel = voxel
952  self.densities = {}
953  self.count_models = 0.0
954  self.custom_ranges = custom_ranges
955 
956  def add_subunits_density(self, hierarchy=None):
957  """Add a frame to the densities.
958  @param hierarchy Optionally read the hierarchy from somewhere.
959  If not passed, will just read the representation.
960  """
961  self.count_models += 1.0
962 
963  if hierarchy:
964  part_dict = get_particles_at_resolution_one(hierarchy)
965  all_particles_by_resolution = []
966  for name in part_dict:
967  all_particles_by_resolution += part_dict[name]
968 
969 
970  for density_name in self.custom_ranges:
971  parts = []
972  if hierarchy:
973  all_particles_by_segments = []
974 
975  for seg in self.custom_ranges[density_name]:
976  if not hierarchy:
977  parts += IMP.tools.select_by_tuple(self.representation,
978  seg, resolution=1, name_is_ambiguous=False)
979  else:
980  if type(seg) == str:
981  s = IMP.atom.Selection(hierarchy,molecule=seg)
982  elif type(seg) == tuple:
983  s = IMP.atom.Selection(
984  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
985  else:
986  raise Exception('could not understand selection tuple '+str(seg))
987 
988  all_particles_by_segments += s.get_selected_particles()
989 
990  if hierarchy:
991  parts = list(
992  set(all_particles_by_segments) & set(all_particles_by_resolution))
993 
994  self._create_density_from_particles(parts, density_name)
995 
996  def normalize_density(self):
997  pass
998 
999  def _create_density_from_particles(self, ps, name,
1000  resolution=1,
1001  kernel_type='GAUSSIAN'):
1002  '''Internal function for adding to densities.
1003  pass XYZR particles with mass and create a density from them.
1004  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1005  kd = {
1006  'GAUSSIAN': IMP.em.GAUSSIAN,
1007  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1008  'SPHERE': IMP.em.SPHERE}
1009 
1010  dmap = IMP.em.SampledDensityMap(ps, resolution, self.voxel)
1011  dmap.calcRMS()
1012  if name not in self.densities:
1013  self.densities[name] = dmap
1014  else:
1015  bbox1 = IMP.em.get_bounding_box(self.densities[name])
1016  bbox2 = IMP.em.get_bounding_box(dmap)
1017  bbox1 += bbox2
1018  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
1019  dmap3.add(dmap)
1020  dmap3.add(self.densities[name])
1021  self.densities[name] = dmap3
1022 
1023  def get_density_keys(self):
1024  return list(self.densities.keys())
1025 
1026  def get_density(self,name):
1027  """Get the current density for some component name"""
1028  if name not in self.densities:
1029  return None
1030  else:
1031  return self.densities[name]
1032 
1033  def write_mrc(self, path="./"):
1034  for density_name in self.densities:
1035  self.densities[density_name].multiply(1. / self.count_models)
1037  self.densities[density_name],
1038  path + "/" + density_name + ".mrc",
1040 
1041 
1042 class GetContactMap(object):
1043 
1044  def __init__(self, distance=15.):
1045  self.distance = distance
1046  self.contactmap = ''
1047  self.namelist = []
1048  self.xlinks = 0
1049  self.XL = {}
1050  self.expanded = {}
1051  self.resmap = {}
1052 
1053  def set_prot(self, prot):
1054  from scipy.spatial.distance import cdist
1055  self.prot = prot
1056  self.protnames = []
1057  coords = []
1058  radii = []
1059  namelist = []
1060 
1061  particles_dictionary = get_particles_at_resolution_one(self.prot)
1062 
1063  for name in particles_dictionary:
1064  residue_indexes = []
1065  for p in particles_dictionary[name]:
1066  print(p.get_name())
1067  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1068  #residue_indexes.add( )
1069 
1070  if len(residue_indexes) != 0:
1071  self.protnames.append(name)
1072  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1073  d = IMP.core.XYZR(p)
1074  new_name = name + ":" + str(res)
1075  if name not in self.resmap:
1076  self.resmap[name] = {}
1077  if res not in self.resmap:
1078  self.resmap[name][res] = {}
1079 
1080  self.resmap[name][res] = new_name
1081  namelist.append(new_name)
1082 
1083  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1084  coords.append(crd)
1085  radii.append(d.get_radius())
1086 
1087  coords = np.array(coords)
1088  radii = np.array(radii)
1089 
1090  if len(self.namelist) == 0:
1091  self.namelist = namelist
1092  self.contactmap = np.zeros((len(coords), len(coords)))
1093 
1094  distances = cdist(coords, coords)
1095  distances = (distances - radii).T - radii
1096  distances = distances <= self.distance
1097 
1098  print(coords)
1099  print(radii)
1100  print(distances)
1101 
1102  self.contactmap += distances
1103 
1104  def get_subunit_coords(self, frame, align=0):
1105  from scipy.spatial.distance import cdist
1106  coords = []
1107  radii = []
1108  namelist = []
1109  test, testr = [], []
1110  for part in self.prot.get_children():
1111  SortedSegments = []
1112  print(part)
1113  for chl in part.get_children():
1114  start = IMP.atom.get_leaves(chl)[0]
1115  end = IMP.atom.get_leaves(chl)[-1]
1116 
1117  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
1118  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
1119  SortedSegments.append((chl, startres))
1120  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1121 
1122  for sgmnt in SortedSegments:
1123  for leaf in IMP.atom.get_leaves(sgmnt[0]):
1124  p = IMP.core.XYZR(leaf)
1125  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1126 
1127  coords.append(crd)
1128  radii.append(p.get_radius())
1129 
1130  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
1131  '_' + \
1132  str(IMP.atom.Fragment(leaf)
1133  .get_residue_indexes()[0])
1134  namelist.append(new_name)
1135  self.expanded[new_name] = len(
1136  IMP.atom.Fragment(leaf).get_residue_indexes())
1137  if part.get_name() not in self.resmap:
1138  self.resmap[part.get_name()] = {}
1139  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
1140  self.resmap[part.get_name()][res] = new_name
1141 
1142  coords = np.array(coords)
1143  radii = np.array(radii)
1144  if len(self.namelist) == 0:
1145  self.namelist = namelist
1146  self.contactmap = np.zeros((len(coords), len(coords)))
1147  distances = cdist(coords, coords)
1148  distances = (distances - radii).T - radii
1149  distances = distances <= self.distance
1150  self.contactmap += distances
1151 
1152  def add_xlinks(
1153  self,
1154  filname,
1155  identification_string='ISDCrossLinkMS_Distance_'):
1156  # 'ISDCrossLinkMS_Distance_interrb_6629-State:0-20:RPS30_218:eIF3j-1-1-0.1_None'
1157  self.xlinks = 1
1158  data = open(filname)
1159  D = data.readlines()
1160  data.close()
1161 
1162  for d in D:
1163  if identification_string in d:
1164  d = d.replace(
1165  "_",
1166  " ").replace("-",
1167  " ").replace(":",
1168  " ").split()
1169 
1170  t1, t2 = (d[0], d[1]), (d[1], d[0])
1171  if t1 not in self.XL:
1172  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1173  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1174  else:
1175  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1176  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1177 
1178  def dist_matrix(self, skip_cmap=0, skip_xl=1):
1179  K = self.namelist
1180  M = self.contactmap
1181  C, R = [], []
1182  L = sum(self.expanded.values())
1183  proteins = self.protnames
1184 
1185  # exp new
1186  if skip_cmap == 0:
1187  Matrices = {}
1188  proteins = [p.get_name() for p in self.prot.get_children()]
1189  missing = []
1190  for p1 in range(len(proteins)):
1191  for p2 in range(p1, len(proteins)):
1192  pl1, pl2 = max(
1193  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1194  pn1, pn2 = proteins[p1], proteins[p2]
1195  mtr = np.zeros((pl1 + 1, pl2 + 1))
1196  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1197  for i1 in range(1, pl1 + 1):
1198  for i2 in range(1, pl2 + 1):
1199  try:
1200  r1 = K.index(self.resmap[pn1][i1])
1201  r2 = K.index(self.resmap[pn2][i2])
1202  r = M[r1, r2]
1203  mtr[i1 - 1, i2 - 1] = r
1204  except KeyError:
1205  missing.append((pn1, pn2, i1, i2))
1206  pass
1207  Matrices[(pn1, pn2)] = mtr
1208 
1209  # add cross-links
1210  if skip_xl == 0:
1211  if self.XL == {}:
1212  raise ValueError("cross-links were not provided, use add_xlinks function!")
1213  Matrices_xl = {}
1214  missing_xl = []
1215  for p1 in range(len(proteins)):
1216  for p2 in range(p1, len(proteins)):
1217  pl1, pl2 = max(
1218  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1219  pn1, pn2 = proteins[p1], proteins[p2]
1220  mtr = np.zeros((pl1 + 1, pl2 + 1))
1221  flg = 0
1222  try:
1223  xls = self.XL[(pn1, pn2)]
1224  except KeyError:
1225  try:
1226  xls = self.XL[(pn2, pn1)]
1227  flg = 1
1228  except KeyError:
1229  flg = 2
1230  if flg == 0:
1231  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1232  for xl1, xl2 in xls:
1233  if xl1 > pl1:
1234  print('X' * 10, xl1, xl2)
1235  xl1 = pl1
1236  if xl2 > pl2:
1237  print('X' * 10, xl1, xl2)
1238  xl2 = pl2
1239  mtr[xl1 - 1, xl2 - 1] = 100
1240  elif flg == 1:
1241  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1242  for xl1, xl2 in xls:
1243  if xl1 > pl1:
1244  print('X' * 10, xl1, xl2)
1245  xl1 = pl1
1246  if xl2 > pl2:
1247  print('X' * 10, xl1, xl2)
1248  xl2 = pl2
1249  mtr[xl2 - 1, xl1 - 1] = 100
1250  else:
1251  raise RuntimeError('WTF!')
1252  Matrices_xl[(pn1, pn2)] = mtr
1253 
1254  # expand the matrix to individual residues
1255  #NewM = []
1256  # for x1 in xrange(len(K)):
1257  # lst = []
1258  # for x2 in xrange(len(K)):
1259  # lst += [M[x1,x2]]*self.expanded[K[x2]]
1260  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
1261  #NewM = np.array(NewM)
1262 
1263  # make list of protein names and create coordinate lists
1264  C = proteins
1265  # W is the component length list,
1266  # R is the contiguous coordinates list
1267  W, R = [], []
1268  for i, c in enumerate(C):
1269  cl = max(self.resmap[c].keys())
1270  W.append(cl)
1271  if i == 0:
1272  R.append(cl)
1273  else:
1274  R.append(R[-1] + cl)
1275 
1276  # start plotting
1277  if filename:
1278  # Don't require a display
1279  import matplotlib as mpl
1280  mpl.use('Agg')
1281  import matplotlib.pyplot as plt
1282  import matplotlib.gridspec as gridspec
1283  import scipy.sparse as sparse
1284 
1285  f = plt.figure()
1286  gs = gridspec.GridSpec(len(W), len(W),
1287  width_ratios=W,
1288  height_ratios=W)
1289 
1290  cnt = 0
1291  for x1, r1 in enumerate(R):
1292  if x1 == 0:
1293  s1 = 0
1294  else:
1295  s1 = R[x1 - 1]
1296  for x2, r2 in enumerate(R):
1297  if x2 == 0:
1298  s2 = 0
1299  else:
1300  s2 = R[x2 - 1]
1301 
1302  ax = plt.subplot(gs[cnt])
1303  if skip_cmap == 0:
1304  try:
1305  mtr = Matrices[(C[x1], C[x2])]
1306  except KeyError:
1307  mtr = Matrices[(C[x2], C[x1])].T
1308  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
1309  cax = ax.imshow(
1310  log(mtr),
1311  interpolation='nearest',
1312  vmin=0.,
1313  vmax=log(NewM.max()))
1314  ax.set_xticks([])
1315  ax.set_yticks([])
1316  if skip_xl == 0:
1317  try:
1318  mtr = Matrices_xl[(C[x1], C[x2])]
1319  except KeyError:
1320  mtr = Matrices_xl[(C[x2], C[x1])].T
1321  cax = ax.spy(
1322  sparse.csr_matrix(mtr),
1323  markersize=10,
1324  color='white',
1325  linewidth=100,
1326  alpha=0.5)
1327  ax.set_xticks([])
1328  ax.set_yticks([])
1329 
1330  cnt += 1
1331  if x2 == 0:
1332  ax.set_ylabel(C[x1], rotation=90)
1333  plt.show()
1334 
1335 
1336 # ------------------------------------------------------------------
1337 # a few random tools
1338 
1339 def get_hier_from_rmf(model, frame_number, rmf_file,state_number=0):
1340  # I have to deprecate this function
1341  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1342 
1343  # load the frame
1344  rh = RMF.open_rmf_file_read_only(rmf_file)
1345 
1346  try:
1347  prots = IMP.rmf.create_hierarchies(rh, model)
1348  except:
1349  print("Unable to open rmf file %s" % (rmf_file))
1350  prot = None
1351  return prot
1352  #IMP.rmf.link_hierarchies(rh, prots)
1353  prot = prots[state_number]
1354  try:
1355  IMP.rmf.load_frame(rh, frame_number)
1356  except:
1357  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1358  prot = None
1359  model.update()
1360  del rh
1361  return prot
1362 
1363 def get_hier_and_restraints_from_rmf(model, frame_number, rmf_file, state_number=0):
1364  # I have to deprecate this function
1365  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1366 
1367  # load the frame
1368  rh = RMF.open_rmf_file_read_only(rmf_file)
1369 
1370  try:
1371  prots = IMP.rmf.create_hierarchies(rh, model)
1372  rs = IMP.rmf.create_restraints(rh, model)
1373  except:
1374  print("Unable to open rmf file %s" % (rmf_file))
1375  prot = None
1376  rs = None
1377  return prot,rs
1378  #IMP.rmf.link_hierarchies(rh, prots)
1379  prot = prots[state_number]
1380  try:
1381  IMP.rmf.load_frame(rh, frame_number)
1382  except:
1383  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1384  prot = None
1385  model.update()
1386  del rh
1387  return prot,rs
1388 
1389 def get_hiers_from_rmf(model, frame_number, rmf_file):
1390  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1391 
1392  # load the frame
1393  rh = RMF.open_rmf_file_read_only(rmf_file)
1394 
1395  try:
1396  prots = IMP.rmf.create_hierarchies(rh, model)
1397  except:
1398  print("Unable to open rmf file %s" % (rmf_file))
1399  prot = None
1400  return prot
1401  #IMP.rmf.link_hierarchies(rh, prots)
1402  try:
1403  IMP.rmf.load_frame(rh, frame_number)
1404  except:
1405  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1406  prots = None
1407  model.update()
1408  del rh
1409  return prots
1410 
1411 
1413  """
1414  Get particles at res 1, or any beads, based on the name.
1415  No Representation is needed. This is mainly used when the hierarchy
1416  is read from an RMF file.
1417  @return a dictionary of component names and their particles
1418  """
1419  particle_dict = {}
1420  allparticles = []
1421  for c in prot.get_children():
1422  name = c.get_name()
1423  particle_dict[name] = IMP.atom.get_leaves(c)
1424  for s in c.get_children():
1425  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
1426  allparticles += IMP.atom.get_leaves(s)
1427  if "Beads" in s.get_name():
1428  allparticles += IMP.atom.get_leaves(s)
1429 
1430  particle_align = []
1431  for name in particle_dict:
1432  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1433  list(set(particle_dict[name]) & set(allparticles)))
1434  return particle_dict
1435 
1437  """
1438  Get particles at res 10, or any beads, based on the name.
1439  No Representation is needed.
1440  This is mainly used when the hierarchy is read from an RMF file.
1441  @return a dictionary of component names and their particles
1442  """
1443  particle_dict = {}
1444  allparticles = []
1445  for c in prot.get_children():
1446  name = c.get_name()
1447  particle_dict[name] = IMP.atom.get_leaves(c)
1448  for s in c.get_children():
1449  if "_Res:10" in s.get_name():
1450  allparticles += IMP.atom.get_leaves(s)
1451  if "Beads" in s.get_name():
1452  allparticles += IMP.atom.get_leaves(s)
1453  particle_align = []
1454  for name in particle_dict:
1455  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1456  list(set(particle_dict[name]) & set(allparticles)))
1457  return particle_dict
1458 
1459 
1460 
1461 def select_by_tuple(first_res_last_res_name_tuple):
1462  first_res = first_res_last_res_hier_tuple[0]
1463  last_res = first_res_last_res_hier_tuple[1]
1464  name = first_res_last_res_hier_tuple[2]
1465 
1466 class CrossLinkTable(object):
1467  """Visualization of crosslinks"""
1468  def __init__(self):
1469  self.crosslinks = []
1470  self.external_csv_data = None
1471  self.crosslinkedprots = set()
1472  self.mindist = +10000000.0
1473  self.maxdist = -10000000.0
1474  self.contactmap = None
1475 
1476  def set_hierarchy(self, prot):
1477  self.prot_length_dict = {}
1478  self.model=prot.get_model()
1479 
1480  for i in prot.get_children():
1481  name = i.get_name()
1482  residue_indexes = []
1483  for p in IMP.atom.get_leaves(i):
1484  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1485 
1486  if len(residue_indexes) != 0:
1487  self.prot_length_dict[name] = max(residue_indexes)
1488 
1489  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1490  from scipy.spatial.distance import cdist
1491 
1492  rh= RMF.open_rmf_file_read_only(rmf_name)
1493  prots=IMP.rmf.create_hierarchies(rh, self.model)
1494  IMP.rmf.load_frame(rh, rmf_frame_index)
1495  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1496  del rh
1497 
1498 
1499  coords = []
1500  radii = []
1501  namelist = []
1502 
1503  particles_dictionary = get_particles_at_resolution_one(prots[0])
1504 
1505  resindex = 0
1506  self.index_dictionary = {}
1507 
1508  for name in particles_dictionary:
1509  residue_indexes = []
1510  for p in particles_dictionary[name]:
1511  print(p.get_name())
1512  residue_indexes = IMP.pmi.tools.get_residue_indexes(p)
1513  #residue_indexes.add( )
1514 
1515  if len(residue_indexes) != 0:
1516 
1517  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1518  d = IMP.core.XYZR(p)
1519 
1520  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1521  coords.append(crd)
1522  radii.append(d.get_radius())
1523  if name not in self.index_dictionary:
1524  self.index_dictionary[name] = [resindex]
1525  else:
1526  self.index_dictionary[name].append(resindex)
1527  resindex += 1
1528 
1529  coords = np.array(coords)
1530  radii = np.array(radii)
1531 
1532  distances = cdist(coords, coords)
1533  distances = (distances - radii).T - radii
1534 
1535  distances = np.where(distances <= 20.0, 1.0, 0)
1536  if self.contactmap is None:
1537  self.contactmap = np.zeros((len(coords), len(coords)))
1538  self.contactmap += distances
1539 
1540  for prot in prots: IMP.atom.destroy(prot)
1541 
1542  def set_crosslinks(
1543  self, data_file, search_label='ISDCrossLinkMS_Distance_',
1544  mapping=None,
1545  filter_label=None,
1546  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
1547  external_csv_data_file=None,
1548  external_csv_data_file_unique_id_key="Unique ID"):
1549 
1550  # example key ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None
1551  # mapping is a dictionary that maps standard keywords to entry positions in the key string
1552  # confidence class is a filter that
1553  # external datafile is a datafile that contains further information on the crosslinks
1554  # it will use the unique id to create the dictionary keys
1555 
1556  po = IMP.pmi.output.ProcessOutput(data_file)
1557  keys = po.get_keys()
1558 
1559  xl_keys = [k for k in keys if search_label in k]
1560 
1561  if filter_rmf_file_names is not None:
1562  rmf_file_key="local_rmf_file_name"
1563  fs = po.get_fields(xl_keys+[rmf_file_key])
1564  else:
1565  fs = po.get_fields(xl_keys)
1566 
1567  # this dictionary stores the occurency of given crosslinks
1568  self.cross_link_frequency = {}
1569 
1570  # this dictionary stores the series of distances for given crosslinked
1571  # residues
1572  self.cross_link_distances = {}
1573 
1574  # this dictionary stores the series of distances for given crosslinked
1575  # residues
1576  self.cross_link_distances_unique = {}
1577 
1578  if not external_csv_data_file is None:
1579  # this dictionary stores the further information on crosslinks
1580  # labeled by unique ID
1581  self.external_csv_data = {}
1582  xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1583 
1584  for xl in xldb:
1585  self.external_csv_data[
1586  xl[external_csv_data_file_unique_id_key]] = xl
1587 
1588  # this list keeps track the tuple of cross-links and sample
1589  # so that we don't count twice the same crosslinked residues in the
1590  # same sample
1591  cross_link_frequency_list = []
1592 
1593  self.unique_cross_link_list = []
1594 
1595  for key in xl_keys:
1596  print(key)
1597  keysplit = key.replace(
1598  "_",
1599  " ").replace(
1600  "-",
1601  " ").replace(
1602  ":",
1603  " ").split(
1604  )
1605 
1606  if filter_label!=None:
1607  if filter_label not in keysplit: continue
1608 
1609  if mapping is None:
1610  r1 = int(keysplit[5])
1611  c1 = keysplit[6]
1612  r2 = int(keysplit[7])
1613  c2 = keysplit[8]
1614  try:
1615  confidence = keysplit[12]
1616  except:
1617  confidence = '0.0'
1618  try:
1619  unique_identifier = keysplit[3]
1620  except:
1621  unique_identifier = '0'
1622  else:
1623  r1 = int(keysplit[mapping["Residue1"]])
1624  c1 = keysplit[mapping["Protein1"]]
1625  r2 = int(keysplit[mapping["Residue2"]])
1626  c2 = keysplit[mapping["Protein2"]]
1627  try:
1628  confidence = keysplit[mapping["Confidence"]]
1629  except:
1630  confidence = '0.0'
1631  try:
1632  unique_identifier = keysplit[mapping["Unique Identifier"]]
1633  except:
1634  unique_identifier = '0'
1635 
1636  self.crosslinkedprots.add(c1)
1637  self.crosslinkedprots.add(c2)
1638 
1639  # construct the list of distances corresponding to the input rmf
1640  # files
1641 
1642  dists=[]
1643  if filter_rmf_file_names is not None:
1644  for n,d in enumerate(fs[key]):
1645  if fs[rmf_file_key][n] in filter_rmf_file_names:
1646  dists.append(float(d))
1647  else:
1648  dists=[float(f) for f in fs[key]]
1649 
1650  # check if the input confidence class corresponds to the
1651  # one of the cross-link
1652 
1653  mdist = self.median(dists)
1654 
1655  stdv = np.std(np.array(dists))
1656  if self.mindist > mdist:
1657  self.mindist = mdist
1658  if self.maxdist < mdist:
1659  self.maxdist = mdist
1660 
1661  # calculate the frequency of unique crosslinks within the same
1662  # sample
1663  if not self.external_csv_data is None:
1664  sample = self.external_csv_data[unique_identifier]["Sample"]
1665  else:
1666  sample = "None"
1667 
1668  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1669  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1670  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1671  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1672  else:
1673  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1674  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1675  cross_link_frequency_list.append((r1, c1, r2, c2))
1676  cross_link_frequency_list.append((r2, c2, r1, c1))
1677  self.unique_cross_link_list.append(
1678  (r1, c1, r2, c2,mdist))
1679 
1680  if (r1, c1, r2, c2) not in self.cross_link_distances:
1681  self.cross_link_distances[(
1682  r1,
1683  c1,
1684  r2,
1685  c2,
1686  mdist,
1687  confidence)] = dists
1688  self.cross_link_distances[(
1689  r2,
1690  c2,
1691  r1,
1692  c1,
1693  mdist,
1694  confidence)] = dists
1695  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1696  else:
1697  self.cross_link_distances[(
1698  r2,
1699  c2,
1700  r1,
1701  c1,
1702  mdist,
1703  confidence)] += dists
1704  self.cross_link_distances[(
1705  r1,
1706  c1,
1707  r2,
1708  c2,
1709  mdist,
1710  confidence)] += dists
1711 
1712  self.crosslinks.append(
1713  (r1,
1714  c1,
1715  r2,
1716  c2,
1717  mdist,
1718  stdv,
1719  confidence,
1720  unique_identifier,
1721  'original'))
1722  self.crosslinks.append(
1723  (r2,
1724  c2,
1725  r1,
1726  c1,
1727  mdist,
1728  stdv,
1729  confidence,
1730  unique_identifier,
1731  'reversed'))
1732 
1733  self.cross_link_frequency_inverted = {}
1734  for xl in self.unique_cross_link_list:
1735  (r1, c1, r2, c2, mdist) = xl
1736  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1737  if frequency not in self.cross_link_frequency_inverted:
1738  self.cross_link_frequency_inverted[
1739  frequency] = [(r1, c1, r2, c2)]
1740  else:
1741  self.cross_link_frequency_inverted[
1742  frequency].append((r1, c1, r2, c2))
1743 
1744  # -------------
1745 
1746  def median(self, mylist):
1747  sorts = sorted(mylist)
1748  length = len(sorts)
1749  print(length)
1750  if length == 1:
1751  return mylist[0]
1752  if not length % 2:
1753  return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1754  return sorts[length / 2]
1755 
1756  def set_threshold(self,threshold):
1757  self.threshold=threshold
1758 
1759  def set_tolerance(self,tolerance):
1760  self.tolerance=tolerance
1761 
1762  def colormap(self, dist):
1763  if dist < self.threshold - self.tolerance:
1764  return "Green"
1765  elif dist >= self.threshold + self.tolerance:
1766  return "Orange"
1767  else:
1768  return "Red"
1769 
1770  def write_cross_link_database(self, filename, format='csv'):
1771  import csv
1772 
1773  fieldnames = [
1774  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1775  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1776 
1777  if not self.external_csv_data is None:
1778  keys = list(self.external_csv_data.keys())
1779  innerkeys = list(self.external_csv_data[keys[0]].keys())
1780  innerkeys.sort()
1781  fieldnames += innerkeys
1782 
1783  dw = csv.DictWriter(
1784  open(filename,
1785  "w"),
1786  delimiter=',',
1787  fieldnames=fieldnames)
1788  dw.writeheader()
1789  for xl in self.crosslinks:
1790  (r1, c1, r2, c2, mdist, stdv, confidence,
1791  unique_identifier, descriptor) = xl
1792  if descriptor == 'original':
1793  outdict = {}
1794  outdict["Unique ID"] = unique_identifier
1795  outdict["Protein1"] = c1
1796  outdict["Protein2"] = c2
1797  outdict["Residue1"] = r1
1798  outdict["Residue2"] = r2
1799  outdict["Median Distance"] = mdist
1800  outdict["Standard Deviation"] = stdv
1801  outdict["Confidence"] = confidence
1802  outdict["Frequency"] = self.cross_link_frequency[
1803  (r1, c1, r2, c2)]
1804  if c1 == c2:
1805  arrangement = "Intra"
1806  else:
1807  arrangement = "Inter"
1808  outdict["Arrangement"] = arrangement
1809  if not self.external_csv_data is None:
1810  outdict.update(self.external_csv_data[unique_identifier])
1811 
1812  dw.writerow(outdict)
1813 
1814  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1815  no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
1816  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
1817  gap_between_components=0,
1818  rename_protein_map=None):
1819  # layout can be:
1820  # "lowerdiagonal" print only the lower diagonal plot
1821  # "upperdiagonal" print only the upper diagonal plot
1822  # "whole" print all
1823  # crosslinkedonly: plot only components that have crosslinks
1824  # no_dist_info: if True will plot only the cross-links as grey spots
1825  # filter = tuple the tuple contains a keyword to be search in the database
1826  # a relationship ">","==","<"
1827  # and a value
1828  # example ("ID_Score",">",40)
1829  # scale_symbol_size rescale the symbol for the crosslink
1830  # rename_protein_map is a dictionary to rename proteins
1831 
1832  import matplotlib.pyplot as plt
1833  import matplotlib.cm as cm
1834 
1835  fig = plt.figure(figsize=(10, 10))
1836  ax = fig.add_subplot(111)
1837 
1838  ax.set_xticks([])
1839  ax.set_yticks([])
1840 
1841  # set the list of proteins on the x axis
1842  if prot_listx is None:
1843  if crosslinkedonly:
1844  prot_listx = list(self.crosslinkedprots)
1845  else:
1846  prot_listx = list(self.prot_length_dict.keys())
1847  prot_listx.sort()
1848 
1849  nresx = gap_between_components + \
1850  sum([self.prot_length_dict[name]
1851  + gap_between_components for name in prot_listx])
1852 
1853  # set the list of proteins on the y axis
1854 
1855  if prot_listy is None:
1856  if crosslinkedonly:
1857  prot_listy = list(self.crosslinkedprots)
1858  else:
1859  prot_listy = list(self.prot_length_dict.keys())
1860  prot_listy.sort()
1861 
1862  nresy = gap_between_components + \
1863  sum([self.prot_length_dict[name]
1864  + gap_between_components for name in prot_listy])
1865 
1866  # this is the residue offset for each protein
1867  resoffsetx = {}
1868  resendx = {}
1869  res = gap_between_components
1870  for prot in prot_listx:
1871  resoffsetx[prot] = res
1872  res += self.prot_length_dict[prot]
1873  resendx[prot] = res
1874  res += gap_between_components
1875 
1876  resoffsety = {}
1877  resendy = {}
1878  res = gap_between_components
1879  for prot in prot_listy:
1880  resoffsety[prot] = res
1881  res += self.prot_length_dict[prot]
1882  resendy[prot] = res
1883  res += gap_between_components
1884 
1885  resoffsetdiagonal = {}
1886  res = gap_between_components
1887  for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
1888  resoffsetdiagonal[prot] = res
1889  res += self.prot_length_dict[prot]
1890  res += gap_between_components
1891 
1892  # plot protein boundaries
1893 
1894  xticks = []
1895  xlabels = []
1896  for n, prot in enumerate(prot_listx):
1897  res = resoffsetx[prot]
1898  end = resendx[prot]
1899  for proty in prot_listy:
1900  resy = resoffsety[proty]
1901  endy = resendy[proty]
1902  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
1903  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
1904  xticks.append((float(res) + float(end)) / 2)
1905  if rename_protein_map is not None:
1906  if prot in rename_protein_map:
1907  prot=rename_protein_map[prot]
1908  xlabels.append(prot)
1909 
1910  yticks = []
1911  ylabels = []
1912  for n, prot in enumerate(prot_listy):
1913  res = resoffsety[prot]
1914  end = resendy[prot]
1915  for protx in prot_listx:
1916  resx = resoffsetx[protx]
1917  endx = resendx[protx]
1918  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
1919  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
1920  yticks.append((float(res) + float(end)) / 2)
1921  if rename_protein_map is not None:
1922  if prot in rename_protein_map:
1923  prot=rename_protein_map[prot]
1924  ylabels.append(prot)
1925 
1926  # plot the contact map
1927  print(prot_listx, prot_listy)
1928 
1929  if not self.contactmap is None:
1930  import matplotlib.cm as cm
1931  tmp_array = np.zeros((nresx, nresy))
1932 
1933  for px in prot_listx:
1934  print(px)
1935  for py in prot_listy:
1936  print(py)
1937  resx = resoffsety[px]
1938  lengx = resendx[px] - 1
1939  resy = resoffsety[py]
1940  lengy = resendy[py] - 1
1941  indexes_x = self.index_dictionary[px]
1942  minx = min(indexes_x)
1943  maxx = max(indexes_x)
1944  indexes_y = self.index_dictionary[py]
1945  miny = min(indexes_y)
1946  maxy = max(indexes_y)
1947 
1948  print(px, py, minx, maxx, miny, maxy)
1949 
1950  try:
1951  tmp_array[
1952  resx:lengx,
1953  resy:lengy] = self.contactmap[
1954  minx:maxx,
1955  miny:maxy]
1956  except:
1957  continue
1958 
1959 
1960  ax.imshow(tmp_array,
1961  cmap=cm.binary,
1962  origin='lower',
1963  interpolation='nearest')
1964 
1965  ax.set_xticks(xticks)
1966  ax.set_xticklabels(xlabels, rotation=90)
1967  ax.set_yticks(yticks)
1968  ax.set_yticklabels(ylabels)
1969  ax.set_xlim(0,nresx)
1970  ax.set_ylim(0,nresy)
1971 
1972 
1973  # set the crosslinks
1974 
1975  already_added_xls = []
1976 
1977  for xl in self.crosslinks:
1978 
1979  (r1, c1, r2, c2, mdist, stdv, confidence,
1980  unique_identifier, descriptor) = xl
1981 
1982  if confidence_classes is not None:
1983  if confidence not in confidence_classes:
1984  continue
1985 
1986  try:
1987  pos1 = r1 + resoffsetx[c1]
1988  except:
1989  continue
1990  try:
1991  pos2 = r2 + resoffsety[c2]
1992  except:
1993  continue
1994 
1995  if not filter is None:
1996  xldb = self.external_csv_data[unique_identifier]
1997  xldb.update({"Distance": mdist})
1998  xldb.update({"Distance_stdv": stdv})
1999 
2000  if filter[1] == ">":
2001  if float(xldb[filter[0]]) <= float(filter[2]):
2002  continue
2003 
2004  if filter[1] == "<":
2005  if float(xldb[filter[0]]) >= float(filter[2]):
2006  continue
2007 
2008  if filter[1] == "==":
2009  if float(xldb[filter[0]]) != float(filter[2]):
2010  continue
2011 
2012  # all that below is used for plotting the diagonal
2013  # when you have a rectangolar plots
2014 
2015  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2016  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2017 
2018  if layout == 'lowerdiagonal':
2019  if pos_for_diagonal1 <= pos_for_diagonal2:
2020  continue
2021  if layout == 'upperdiagonal':
2022  if pos_for_diagonal1 >= pos_for_diagonal2:
2023  continue
2024 
2025  already_added_xls.append((r1, c1, r2, c2))
2026 
2027  if not no_confidence_info:
2028  if confidence == '0.01':
2029  markersize = 14 * scale_symbol_size
2030  elif confidence == '0.05':
2031  markersize = 9 * scale_symbol_size
2032  elif confidence == '0.1':
2033  markersize = 6 * scale_symbol_size
2034  else:
2035  markersize = 15 * scale_symbol_size
2036  else:
2037  markersize = 5 * scale_symbol_size
2038 
2039  if not no_dist_info:
2040  color = self.colormap(mdist)
2041  else:
2042  color = "gray"
2043 
2044  ax.plot(
2045  [pos1],
2046  [pos2],
2047  'o',
2048  c=color,
2049  alpha=alphablend,
2050  markersize=markersize)
2051 
2052 
2053 
2054  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2055 
2056  [i.set_linewidth(2.0) for i in ax.spines.values()]
2057 
2058  #plt.tight_layout()
2059 
2060  if filename:
2061  plt.savefig(filename + ".pdf", dpi=300, transparent="False")
2062  else:
2063  plt.show()
2064 
2065  def get_frequency_statistics(self, prot_list,
2066  prot_list2=None):
2067 
2068  violated_histogram = {}
2069  satisfied_histogram = {}
2070  unique_cross_links = []
2071 
2072  for xl in self.unique_cross_link_list:
2073  (r1, c1, r2, c2, mdist) = xl
2074 
2075  # here we filter by the protein
2076  if prot_list2 is None:
2077  if not c1 in prot_list:
2078  continue
2079  if not c2 in prot_list:
2080  continue
2081  else:
2082  if c1 in prot_list and c2 in prot_list2:
2083  pass
2084  elif c1 in prot_list2 and c2 in prot_list:
2085  pass
2086  else:
2087  continue
2088 
2089  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2090 
2091  if (r1, c1, r2, c2) not in unique_cross_links:
2092  if mdist > 35.0:
2093  if frequency not in violated_histogram:
2094  violated_histogram[frequency] = 1
2095  else:
2096  violated_histogram[frequency] += 1
2097  else:
2098  if frequency not in satisfied_histogram:
2099  satisfied_histogram[frequency] = 1
2100  else:
2101  satisfied_histogram[frequency] += 1
2102  unique_cross_links.append((r1, c1, r2, c2))
2103  unique_cross_links.append((r2, c2, r1, c1))
2104 
2105  print("# satisfied")
2106 
2107  total_number_of_crosslinks=0
2108 
2109  for i in satisfied_histogram:
2110  # if i in violated_histogram:
2111  # print i, satisfied_histogram[i]+violated_histogram[i]
2112  # else:
2113  if i in violated_histogram:
2114  print(i, violated_histogram[i]+satisfied_histogram[i])
2115  else:
2116  print(i, satisfied_histogram[i])
2117  total_number_of_crosslinks+=i*satisfied_histogram[i]
2118 
2119  print("# violated")
2120 
2121  for i in violated_histogram:
2122  print(i, violated_histogram[i])
2123  total_number_of_crosslinks+=i*violated_histogram[i]
2124 
2125  print(total_number_of_crosslinks)
2126 
2127 
2128 # ------------
2129  def print_cross_link_binary_symbols(self, prot_list,
2130  prot_list2=None):
2131  tmp_matrix = []
2132  confidence_list = []
2133  for xl in self.crosslinks:
2134  (r1, c1, r2, c2, mdist, stdv, confidence,
2135  unique_identifier, descriptor) = xl
2136 
2137  if prot_list2 is None:
2138  if not c1 in prot_list:
2139  continue
2140  if not c2 in prot_list:
2141  continue
2142  else:
2143  if c1 in prot_list and c2 in prot_list2:
2144  pass
2145  elif c1 in prot_list2 and c2 in prot_list:
2146  pass
2147  else:
2148  continue
2149 
2150  if descriptor != "original":
2151  continue
2152 
2153  confidence_list.append(confidence)
2154 
2155  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2156  tmp_dist_binary = []
2157  for d in dists:
2158  if d < 35:
2159  tmp_dist_binary.append(1)
2160  else:
2161  tmp_dist_binary.append(0)
2162  tmp_matrix.append(tmp_dist_binary)
2163 
2164  matrix = list(zip(*tmp_matrix))
2165 
2166  satisfied_high_sum = 0
2167  satisfied_mid_sum = 0
2168  satisfied_low_sum = 0
2169  total_satisfied_sum = 0
2170  for k, m in enumerate(matrix):
2171  satisfied_high = 0
2172  total_high = 0
2173  satisfied_mid = 0
2174  total_mid = 0
2175  satisfied_low = 0
2176  total_low = 0
2177  total_satisfied = 0
2178  total = 0
2179  for n, b in enumerate(m):
2180  if confidence_list[n] == "0.01":
2181  total_high += 1
2182  if b == 1:
2183  satisfied_high += 1
2184  satisfied_high_sum += 1
2185  elif confidence_list[n] == "0.05":
2186  total_mid += 1
2187  if b == 1:
2188  satisfied_mid += 1
2189  satisfied_mid_sum += 1
2190  elif confidence_list[n] == "0.1":
2191  total_low += 1
2192  if b == 1:
2193  satisfied_low += 1
2194  satisfied_low_sum += 1
2195  if b == 1:
2196  total_satisfied += 1
2197  total_satisfied_sum += 1
2198  total += 1
2199  print(k, satisfied_high, total_high)
2200  print(k, satisfied_mid, total_mid)
2201  print(k, satisfied_low, total_low)
2202  print(k, total_satisfied, total)
2203  print(float(satisfied_high_sum) / len(matrix))
2204  print(float(satisfied_mid_sum) / len(matrix))
2205  print(float(satisfied_low_sum) / len(matrix))
2206 # ------------
2207 
2208  def get_unique_crosslinks_statistics(self, prot_list,
2209  prot_list2=None):
2210 
2211  print(prot_list)
2212  print(prot_list2)
2213  satisfied_high = 0
2214  total_high = 0
2215  satisfied_mid = 0
2216  total_mid = 0
2217  satisfied_low = 0
2218  total_low = 0
2219  total = 0
2220  tmp_matrix = []
2221  satisfied_string = []
2222  for xl in self.crosslinks:
2223  (r1, c1, r2, c2, mdist, stdv, confidence,
2224  unique_identifier, descriptor) = xl
2225 
2226  if prot_list2 is None:
2227  if not c1 in prot_list:
2228  continue
2229  if not c2 in prot_list:
2230  continue
2231  else:
2232  if c1 in prot_list and c2 in prot_list2:
2233  pass
2234  elif c1 in prot_list2 and c2 in prot_list:
2235  pass
2236  else:
2237  continue
2238 
2239  if descriptor != "original":
2240  continue
2241 
2242  total += 1
2243  if confidence == "0.01":
2244  total_high += 1
2245  if mdist <= 35:
2246  satisfied_high += 1
2247  if confidence == "0.05":
2248  total_mid += 1
2249  if mdist <= 35:
2250  satisfied_mid += 1
2251  if confidence == "0.1":
2252  total_low += 1
2253  if mdist <= 35:
2254  satisfied_low += 1
2255  if mdist <= 35:
2256  satisfied_string.append(1)
2257  else:
2258  satisfied_string.append(0)
2259 
2260  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2261  tmp_dist_binary = []
2262  for d in dists:
2263  if d < 35:
2264  tmp_dist_binary.append(1)
2265  else:
2266  tmp_dist_binary.append(0)
2267  tmp_matrix.append(tmp_dist_binary)
2268 
2269  print("unique satisfied_high/total_high", satisfied_high, "/", total_high)
2270  print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)
2271  print("unique satisfied_low/total_low", satisfied_low, "/", total_low)
2272  print("total", total)
2273 
2274  matrix = list(zip(*tmp_matrix))
2275  satisfied_models = 0
2276  satstr = ""
2277  for b in satisfied_string:
2278  if b == 0:
2279  satstr += "-"
2280  if b == 1:
2281  satstr += "*"
2282 
2283  for m in matrix:
2284  all_satisfied = True
2285  string = ""
2286  for n, b in enumerate(m):
2287  if b == 0:
2288  string += "0"
2289  if b == 1:
2290  string += "1"
2291  if b == 1 and satisfied_string[n] == 1:
2292  pass
2293  elif b == 1 and satisfied_string[n] == 0:
2294  pass
2295  elif b == 0 and satisfied_string[n] == 0:
2296  pass
2297  elif b == 0 and satisfied_string[n] == 1:
2298  all_satisfied = False
2299  if all_satisfied:
2300  satisfied_models += 1
2301  print(string)
2302  print(satstr, all_satisfied)
2303  print("models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2304 
2305  def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2306  prot_list2=None):
2307 
2308  import pylab as pl
2309 
2310  tmp_matrix = []
2311  for kw in self.cross_link_distances_unique:
2312  (r1, c1, r2, c2) = kw
2313  dists = self.cross_link_distances_unique[kw]
2314 
2315  if prot_list2 is None:
2316  if not c1 in prot_list:
2317  continue
2318  if not c2 in prot_list:
2319  continue
2320  else:
2321  if c1 in prot_list and c2 in prot_list2:
2322  pass
2323  elif c1 in prot_list2 and c2 in prot_list:
2324  pass
2325  else:
2326  continue
2327  # append the sum of dists to order by that in the matrix plot
2328  dists.append(sum(dists))
2329  tmp_matrix.append(dists)
2330 
2331  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2332 
2333  # print len(tmp_matrix), len(tmp_matrix[0])-1
2334  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2335 
2336  for i in range(len(tmp_matrix)):
2337  for k in range(len(tmp_matrix[i]) - 1):
2338  matrix[i][k] = tmp_matrix[i][k]
2339 
2340  print(matrix)
2341 
2342  fig = pl.figure()
2343  ax = fig.add_subplot(211)
2344 
2345  cax = ax.imshow(matrix, interpolation='nearest')
2346  # ax.set_yticks(range(len(self.model_list_names)))
2347  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
2348  fig.colorbar(cax)
2349  pl.savefig(figurename, dpi=300)
2350  pl.show()
2351 
2352  def plot_bars(
2353  self,
2354  filename,
2355  prots1,
2356  prots2,
2357  nxl_per_row=20,
2358  arrangement="inter",
2359  confidence_input="None"):
2360 
2361  data = []
2362  for xl in self.cross_link_distances:
2363  (r1, c1, r2, c2, mdist, confidence) = xl
2364  if c1 in prots1 and c2 in prots2:
2365  if arrangement == "inter" and c1 == c2:
2366  continue
2367  if arrangement == "intra" and c1 != c2:
2368  continue
2369  if confidence_input == confidence:
2370  label = str(c1) + ":" + str(r1) + \
2371  "-" + str(c2) + ":" + str(r2)
2372  values = self.cross_link_distances[xl]
2373  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2374  data.append((label, values, mdist, frequency))
2375 
2376  sort_by_dist = sorted(data, key=lambda tup: tup[2])
2377  sort_by_dist = list(zip(*sort_by_dist))
2378  values = sort_by_dist[1]
2379  positions = list(range(len(values)))
2380  labels = sort_by_dist[0]
2381  frequencies = list(map(float, sort_by_dist[3]))
2382  frequencies = [f * 10.0 for f in frequencies]
2383 
2384  nchunks = int(float(len(values)) / nxl_per_row)
2385  values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2386  positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2387  positions,
2388  nchunks)
2389  frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2390  frequencies,
2391  nchunks)
2392  labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2393 
2394  for n, v in enumerate(values_chunks):
2395  p = positions_chunks[n]
2396  f = frequencies_chunks[n]
2397  l = labels_chunks[n]
2399  filename + "." + str(n), v, p, f,
2400  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
2401 
2402  def crosslink_distance_histogram(self, filename,
2403  prot_list=None,
2404  prot_list2=None,
2405  confidence_classes=None,
2406  bins=40,
2407  color='#66CCCC',
2408  yplotrange=[0, 1],
2409  format="png",
2410  normalized=False):
2411  if prot_list is None:
2412  prot_list = list(self.prot_length_dict.keys())
2413 
2414  distances = []
2415  for xl in self.crosslinks:
2416  (r1, c1, r2, c2, mdist, stdv, confidence,
2417  unique_identifier, descriptor) = xl
2418 
2419  if not confidence_classes is None:
2420  if confidence not in confidence_classes:
2421  continue
2422 
2423  if prot_list2 is None:
2424  if not c1 in prot_list:
2425  continue
2426  if not c2 in prot_list:
2427  continue
2428  else:
2429  if c1 in prot_list and c2 in prot_list2:
2430  pass
2431  elif c1 in prot_list2 and c2 in prot_list:
2432  pass
2433  else:
2434  continue
2435 
2436  distances.append(mdist)
2437 
2439  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
2440  bins=bins, color=color,
2441  format=format,
2442  reference_xline=35.0,
2443  yplotrange=yplotrange, normalized=normalized)
2444 
2445  def scatter_plot_xl_features(self, filename,
2446  feature1=None,
2447  feature2=None,
2448  prot_list=None,
2449  prot_list2=None,
2450  yplotrange=None,
2451  reference_ylines=None,
2452  distance_color=True,
2453  format="png"):
2454  import matplotlib.pyplot as plt
2455  import matplotlib.cm as cm
2456 
2457  fig = plt.figure(figsize=(10, 10))
2458  ax = fig.add_subplot(111)
2459 
2460  for xl in self.crosslinks:
2461  (r1, c1, r2, c2, mdist, stdv, confidence,
2462  unique_identifier, arrangement) = xl
2463 
2464  if prot_list2 is None:
2465  if not c1 in prot_list:
2466  continue
2467  if not c2 in prot_list:
2468  continue
2469  else:
2470  if c1 in prot_list and c2 in prot_list2:
2471  pass
2472  elif c1 in prot_list2 and c2 in prot_list:
2473  pass
2474  else:
2475  continue
2476 
2477  xldb = self.external_csv_data[unique_identifier]
2478  xldb.update({"Distance": mdist})
2479  xldb.update({"Distance_stdv": stdv})
2480 
2481  xvalue = float(xldb[feature1])
2482  yvalue = float(xldb[feature2])
2483 
2484  if distance_color:
2485  color = self.colormap(mdist)
2486  else:
2487  color = "gray"
2488 
2489  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
2490 
2491  if not yplotrange is None:
2492  ax.set_ylim(yplotrange)
2493  if not reference_ylines is None:
2494  for rl in reference_ylines:
2495  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
2496 
2497  if filename:
2498  plt.savefig(filename + "." + format, dpi=150, transparent="False")
2499 
2500  plt.show()
Simple 3D transformation class.
Visualization of crosslinks.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
DensityMap * create_density_map(const algebra::GridD< 3, algebra::DenseGridStorageD< 3, float >, float > &grid)
A class for reading stat files.
Definition: output.py:613
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:768
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:833
kernel::Restraints create_restraints(RMF::FileConstHandle fh, kernel::Model *m)
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Definition: atom/distance.h:88
Miscellaneous utilities.
Definition: tools.py:1
def __init__
Constructor.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
A class to evaluate the precision of an ensemble.
void load_frame(RMF::FileConstHandle file, unsigned int frame)
Definition: frames.h:27
log
Definition: log.py:1
double get_rmsd(const core::XYZs &s0, const core::XYZs &s1, const IMP::algebra::Transformation3D &tr_for_second)
A class to cluster structures.
void write_map(DensityMap *m, std::string filename)
Write a density map to a file.
def __init__
Constructor.
Definition: pmi/Analysis.py:32
Class for sampling a density map from particles.
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:23
def add_subunits_density
Add a frame to the densities.
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:972
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
def do_cluster
Run K-means clustering.
void destroy(Hierarchy d)
Delete the Hierarchy.
3D rotation class.
Definition: Rotation3D.h:46
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Definition: DensityMap.h:461
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
Definition: output.py:1
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
def get_precision
Evaluate the precision of two named structure groups.
VectorD< 3 > Vector3D
Definition: VectorD.h:395
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Select hierarchy particles identified by the biological name.
Definition: Selection.h:62
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:930
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27