IMP Reference Guide  develop.27926d84dc,2024/04/13 The Integrative Modeling Platform
pmi1/Analysis.py
1 #!/usr/bin/env python
2
3 """@namespace IMP.pmi1.analysis
4  Tools for clustering and cluster analysis
5 """
6 from __future__ import print_function
7 import IMP
8 import IMP.algebra
9 import IMP.em
10 import IMP.pmi1
11 import IMP.pmi1.tools
12 import IMP.pmi1.output
13 import IMP.rmf
14 import RMF
15 import IMP.pmi1.analysis
16 from operator import itemgetter
17 from copy import deepcopy
18 from math import log,sqrt
19 import itertools
20 import numpy as np
21
22
23 class Alignment(object):
24  """Performs alignment and RMSD calculation for two sets of coordinates
25
26  The class also takes into account non-equal stoichiometry of the proteins.
27  If this is the case, the protein names of proteins in multiple copies
28  should be specified in the following form:
29  nameA..1, nameA..2 (note two dots).
30  """
31
32  def __init__(self, template, query, weights=None):
33  """Constructor.
34  @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35  @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36  @param weights optional weights for each set of coordinates
37  """
38  self.query = query
39  self.template = template
40  self.weights=weights
41
42  if len(self.query.keys()) != len(self.template.keys()):
43  raise ValueError('''the number of proteins
44  in template and query does not match!''')
45
46  def permute(self):
47  # get unique protein names for each protein
48  # this is, unfortunately, expecting that names are all 'Molname..X'
49  # where X is different for each copy.
50  self.proteins = sorted(self.query.keys())
51  prots_uniq = [i.split('..')[0] for i in self.proteins]
52
53  # for each unique name, store list of permutations
54  # e.g. for keys A..1,A..2 store P[A] = [[A..1,A..2],[A..2,A..1]]
55  # then store the product: [[[A1,A2],[B1,B2]],[[A1,A2],[B2,B1]],
56  # [[A2,A1],[B1,B2]],[[A2,A1],[B2,B1]]]
57  P = {}
58  for p in prots_uniq:
59  np = prots_uniq.count(p)
60  copies = [i for i in self.proteins if i.split('..')[0] == p]
61  prmts = list(itertools.permutations(copies, len(copies)))
62  P[p] = prmts
63  self.P = P
64  self.Product = list(itertools.product(*P.values()))
65
66  def get_rmsd(self):
67
68  self.permute()
69
70  template_xyz = []
71  weights = []
72  torder = sum([list(i) for i in self.Product[0]], [])
73  for t in torder:
74  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
75  if self.weights is not None:
76  weights += [i for i in self.weights[t]]
77  #template_xyz = np.array(template_xyz)
78
79  self.rmsd = 10000000000.
80  for comb in self.Product:
81
82  order = sum([list(i) for i in comb], [])
83  query_xyz = []
84  for p in order:
85  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
86  #query_xyz = np.array(query_xyz)
87  #if len(template_xyz) != len(query_xyz):
88  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
89  # in template and query does not match!'''
90  # exit()
91
92  if self.weights is not None:
93  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
94  else:
95  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
96  #dist = sqrt(
97  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
98  if dist < self.rmsd:
99  self.rmsd = dist
100  return self.rmsd
101
102  def align(self):
103  from scipy.spatial.distance import cdist
104
105  self.permute()
106
107  # create flat coordinate list from template in standard order
108  # then loop through the permutations and try to align and get RMSD
109  # should allow you to try all mappings within each protein
110  template_xyz = []
111  torder = sum([list(i) for i in self.Product[0]], [])
112  for t in torder:
113  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
114  # template_xyz = np.array(template_xyz)
115  self.rmsd, Transformation = 10000000000., ''
116
117  # then for each permutation, get flat list of coords and get RMSD
118  for comb in self.Product:
119  order = sum([list(i) for i in comb], [])
120  query_xyz = []
121  for p in order:
122  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
123  #query_xyz = np.array(query_xyz)
124
125  if len(template_xyz) != len(query_xyz):
126  raise ValueError('''the number of coordinates
127  in template and query does not match!''')
128
130  query_xyz,
131  template_xyz)
132  query_xyz_tr = [transformation.get_transformed(n)
133  for n in query_xyz]
134
135  dist = sqrt(
136  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
137  if dist < self.rmsd:
138  self.rmsd = dist
139  Transformation = transformation
140
141  # return the transformation
142  return (self.rmsd, Transformation)
143
144
145 # TEST for the alignment ###
146 """
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148  'a..2':np.array([np.array([1.,1.,])]),
149  'a..3':np.array([np.array([-2.,1.])]),
150  'b':np.array([np.array([0.,-1.])]),
151  'c..1':np.array([np.array([-1.,-1.])]),
152  'c..2':np.array([np.array([1.,-1.])]),
153  'd':np.array([np.array([0.,0.])]),
154  'e':np.array([np.array([0.,1.])])}
155
156 Ali = Alignment(Proteins, Proteins)
157 Ali.permute()
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
160 """
161
162
163 # ----------------------------------
164 class Violations(object):
165
166  def __init__(self, filename):
167
168  self.violation_thresholds = {}
169  self.violation_counts = {}
170
171  data = open(filename)
173  data.close()
174
175  for d in D:
176  d = d.strip().split()
177  self.violation_thresholds[d[0]] = float(d[1])
178
179  def get_number_violated_restraints(self, rsts_dict):
180  num_violated = 0
181  for rst in self.violation_thresholds:
182  if rst not in rsts_dict:
183  continue # print rst;
184  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
185  num_violated += 1
186  if rst not in self.violation_counts:
187  self.violation_counts[rst] = 1
188  else:
189  self.violation_counts[rst] += 1
190  return num_violated
191
192
193 # ----------------------------------
194 class Clustering(object):
195  """A class to cluster structures.
196  Uses scipy's cdist function to compute distance matrices
197  and sklearn's kmeans clustering module.
198  """
199  def __init__(self,rmsd_weights=None):
200  """Constructor.
201  @param rmsd_weights Flat list of weights for each particle
202  (if they're coarse)
203  """
204  try:
205  from mpi4py import MPI
206  self.comm = MPI.COMM_WORLD
207  self.rank = self.comm.Get_rank()
208  self.number_of_processes = self.comm.size
209  except ImportError:
210  self.number_of_processes = 1
211  self.rank = 0
212  self.all_coords = {}
213  self.structure_cluster_ids = None
214  self.tmpl_coords = None
215  self.rmsd_weights=rmsd_weights
216
217  def set_template(self, part_coords):
218
219  self.tmpl_coords = part_coords
220
221  def fill(self, frame, Coords):
222  """Add coordinates for a single model."""
223
224  self.all_coords[frame] = Coords
225
226  def dist_matrix(self):
227
228  self.model_list_names = list(self.all_coords.keys())
229  self.model_indexes = list(range(len(self.model_list_names)))
230  self.model_indexes_dict = dict(
231  list(zip(self.model_list_names, self.model_indexes)))
232  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
233
234  my_model_indexes_unique_pairs = IMP.pmi1.tools.chunk_list_into_segments(
235  model_indexes_unique_pairs,
236  self.number_of_processes)[self.rank]
237
238  print("process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
239
240  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
241  self.tmpl_coords,
242  my_model_indexes_unique_pairs)
243
244  if self.number_of_processes > 1:
245  raw_distance_dict = IMP.pmi1.tools.scatter_and_gather(
246  raw_distance_dict)
247  pickable_transformations = self.get_pickable_transformation_distance_dict(
248  )
249  pickable_transformations = IMP.pmi1.tools.scatter_and_gather(
250  pickable_transformations)
251  self.set_transformation_distance_dict_from_pickable(
252  pickable_transformations)
253
254  self.raw_distance_matrix = np.zeros(
255  (len(self.model_list_names), len(self.model_list_names)))
256  for item in raw_distance_dict:
257  (f1, f2) = item
258  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
260
261  def get_dist_matrix(self):
262  return self.raw_distance_matrix
263
264  def do_cluster(self, number_of_clusters,seed=None):
265  """Run K-means clustering
266  @param number_of_clusters Num means
267  @param seed the random seed
268  """
269  from sklearn.cluster import KMeans
270  if seed is not None:
271  np.random.seed(seed)
272  try:
273  # check whether we have the right version of sklearn
274  kmeans = KMeans(n_clusters=number_of_clusters)
275  except TypeError:
276  # sklearn older than 0.12
277  kmeans = KMeans(k=number_of_clusters)
278  kmeans.fit_predict(self.raw_distance_matrix)
279
280  self.structure_cluster_ids = kmeans.labels_
281
282  def get_pickable_transformation_distance_dict(self):
283  pickable_transformations = {}
284  for label in self.transformation_distance_dict:
285  tr = self.transformation_distance_dict[label]
286  trans = tuple(tr.get_translation())
287  rot = tuple(tr.get_rotation().get_quaternion())
288  pickable_transformations[label] = (rot, trans)
289  return pickable_transformations
290
291  def set_transformation_distance_dict_from_pickable(
292  self,
293  pickable_transformations):
294  self.transformation_distance_dict = {}
295  for label in pickable_transformations:
296  tr = pickable_transformations[label]
297  trans = IMP.algebra.Vector3D(tr[1])
298  rot = IMP.algebra.Rotation3D(tr[0])
299  self.transformation_distance_dict[
300  label] = IMP.algebra.Transformation3D(rot, trans)
301
302  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
303  import pickle
304  outf = open(file_name + ".data", 'wb')
305
306  # to pickle the transformation dictionary
307  # you have to save the arrays correposnding to
308  # the transformations
309
310  pickable_transformations = self.get_pickable_transformation_distance_dict(
311  )
312  pickle.dump(
313  (self.structure_cluster_ids,
314  self.model_list_names,
315  pickable_transformations),
316  outf)
317
318  np.save(file_name + ".npy", self.raw_distance_matrix)
319
321  import pickle
322
323  inputf = open(file_name + ".data", 'rb')
324  (self.structure_cluster_ids, self.model_list_names,
326  inputf.close()
327
328  self.raw_distance_matrix = np.load(file_name + ".npy")
329
330  self.set_transformation_distance_dict_from_pickable(
331  pickable_transformations)
332  self.model_indexes = list(range(len(self.model_list_names)))
333  self.model_indexes_dict = dict(
334  list(zip(self.model_list_names, self.model_indexes)))
335
336  def plot_matrix(self, figurename="clustermatrix.pdf"):
337  import matplotlib as mpl
338  mpl.use('Agg')
339  import matplotlib.pylab as pl
340  from scipy.cluster import hierarchy as hrc
341
342  fig = pl.figure(figsize=(10,8))
344  dendrogram = hrc.dendrogram(
346  color_threshold=7,
347  no_labels=True)
348  leaves_order = dendrogram['leaves']
349  ax.set_xlabel('Model')
350  ax.set_ylabel('RMSD [Angstroms]')
351
353  cax = ax2.imshow(
354  self.raw_distance_matrix[leaves_order,
355  :][:,
356  leaves_order],
357  interpolation='nearest')
358  cb = fig.colorbar(cax)
359  cb.set_label('RMSD [Angstroms]')
360  ax2.set_xlabel('Model')
361  ax2.set_ylabel('Model')
362
363  pl.savefig(figurename, dpi=300)
364  pl.close(fig)
365
366  def get_model_index_from_name(self, name):
367  return self.model_indexes_dict[name]
368
369  def get_cluster_labels(self):
370  # this list
371  return list(set(self.structure_cluster_ids))
372
373  def get_number_of_clusters(self):
374  return len(self.get_cluster_labels())
375
376  def get_cluster_label_indexes(self, label):
377  return (
378  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
379  )
380
381  def get_cluster_label_names(self, label):
382  return (
383  [self.model_list_names[i]
384  for i in self.get_cluster_label_indexes(label)]
385  )
386
387  def get_cluster_label_average_rmsd(self, label):
388
389  indexes = self.get_cluster_label_indexes(label)
390
391  if len(indexes) > 1:
392  sub_distance_matrix = self.raw_distance_matrix[
393  indexes, :][:, indexes]
394  average_rmsd = np.sum(sub_distance_matrix) / \
395  (len(sub_distance_matrix)
396  ** 2 - len(sub_distance_matrix))
397  else:
398  average_rmsd = 0.0
399  return average_rmsd
400
401  def get_cluster_label_size(self, label):
402  return len(self.get_cluster_label_indexes(label))
403
404  def get_transformation_to_first_member(
405  self,
406  cluster_label,
407  structure_index):
408  reference = self.get_cluster_label_indexes(cluster_label)[0]
409  return self.transformation_distance_dict[(reference, structure_index)]
410
411  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
412
413  model_list_names = list(all_coords.keys())
414  rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415  raw_distance_dict = {}
416  transformation_distance_dict = {}
417  if template_coords is None:
418  do_alignment = False
419  else:
420  do_alignment = True
421  alignment_template_protein_names = list(template_coords.keys())
422
423  for (f1, f2) in list_of_pairs:
424
425  if not do_alignment:
426  # here we only get the rmsd,
427  # we need that for instance when you want to cluster conformations
428  # globally, eg the EM map is a reference
430
431  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432  for pr in rmsd_protein_names])
433  coords_f2 = {}
434  for pr in rmsd_protein_names:
435  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
436
437  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
438  rmsd = Ali.get_rmsd()
439
440  elif do_alignment:
441  # here we actually align the conformations first
442  # and than calculate the rmsd. We need that when the
443  # protein(s) is the reference
444  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445  for pr in alignment_template_protein_names])
446  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447  for pr in alignment_template_protein_names])
448
449  Ali = Alignment(coords_f1, coords_f2)
450  template_rmsd, transformation = Ali.align()
451
452  # here we calculate the rmsd
453  # we will align two models based on the number of subunits provided
454  # and transform coordinates of model 2 to model 1
455  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456  for pr in rmsd_protein_names])
457  coords_f2 = {}
458  for pr in rmsd_protein_names:
459  coords_f2[pr] = [transformation.get_transformed(
460  i) for i in all_coords[model_list_names[f2]][pr]]
461
462  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
463  rmsd = Ali.get_rmsd()
464
465  raw_distance_dict[(f1, f2)] = rmsd
466  raw_distance_dict[(f2, f1)] = rmsd
467  transformation_distance_dict[(f1, f2)] = transformation
468  transformation_distance_dict[(f2, f1)] = transformation
469
470  return raw_distance_dict, transformation_distance_dict
471
472
473 class RMSD(object):
474  """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475  To be used with pmi2 hierarchies. Can be used for instance as follows:
476
477  rmsd=IMP.pmi1.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478  output_objects.append(rmsd)
479
480  before shuffling the coordinates
481  """
482
483  def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
484  """
485  @param hier0 first input hierarchy
486  @param hier1 second input hierarchy
487  @param molname the names of the molecules used for the RMSD
488  @dynamic0 if True stores the decorators XYZ and coordinates of hier0 can be update. If false coordinates are static (stored in Vector3Ds)
489  and will never be updated
490  @dynamic1 same as above
491  metric what metric should be used
492  """
493  self.moldict0,self.molcoords0,self.mol_XYZs0=self.get_moldict_coorddict(hier0,molnames)
494  self.moldict1,self.molcoords1,self.mol_XYZs1=self.get_moldict_coorddict(hier1,molnames)
495  self.dynamic0=dynamic0
496  self.dynamic1=dynamic1
497  self.metric=metric
498  self.label=label
499
500  def get_moldict_coorddict(self,hier,molnames):
501  """return data structure for the RMSD calculation"""
502  moldict={}
503  mol_coords={}
504  mol_XYZs={}
505  for mol in IMP.pmi1.tools.get_molecules(hier):
506  name=mol.get_name()
507  if name not in molnames:
508  continue
509  parts=True
510  mol_coords[mol]=[]
511  mol_XYZs[mol]=[]
512  i=1
513  while parts:
514  sel=IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515  parts=sel.get_selected_particles()
516  if parts:
517  mol_coords[mol].append(IMP.core.XYZ(parts[0]).get_coordinates())
518  mol_XYZs[mol].append(IMP.core.XYZ(parts[0]))
519  i=i+1
520  if name in moldict:
521  moldict[name].append(mol)
522  else:
523  moldict[name]=[mol]
524  return moldict, mol_coords, mol_XYZs
525
526  def get_rmsd_and_assigments(self):
527  best_orders=[]
528  total_rmsd=0
529  total_N=0
530  best_assignments=[]
531  rmsd_dict={}
532  for molname, ref_mols in self.moldict1.items():
533  ref_coords=[]
534  for ref_mol in ref_mols:
535  if self.dynamic1:
536  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
537  else:
538  coord1=self.molcoords1[ref_mol]
539  ref_coords+=coord1
540
541  rmsd=[]
542  rmf_mols_list=[]
543  for rmf_mols in itertools.permutations(self.moldict0[molname]):
544  rmf_coords=[]
545  for rmf_mol in rmf_mols:
546  if self.dynamic0:
547  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
548  else:
549  coord0=self.molcoords0[rmf_mol]
550  rmf_coords+=coord0
551
552  rmsd.append(IMP.algebra.get_rmsd(ref_coords, rmf_coords))
553  rmf_mols_list.append(rmf_mols)
554  m=min(rmsd)
555  rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
556
557
558  for n, (ref_mol,rmf_mol) in enumerate(zip(ref_mols,rmf_mols_best_order)):
559  best_assignments.append((rmf_mol,ref_mol))
560  if self.dynamic0:
561  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
562  else:
563  coord0=self.molcoords0[rmf_mol]
564
565  if self.dynamic1:
566  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
567  else:
568  coord1=self.molcoords1[ref_mol]
569  rmsd_pair=self.metric(coord1, coord0)
570  N=len(self.molcoords1[ref_mol])
571  total_N+=N
572  total_rmsd+=rmsd_pair*rmsd_pair*N
573  rmsd_dict[ref_mol]=rmsd_pair
574  total_rmsd = sqrt(total_rmsd/total_N)
576
577  def get_output(self):
578  """Returns output for IMP.pmi1.output.Output object"""
579  total_rmsd,best_assignments=self.get_rmsd_and_assigments()
580
581  assignments_out=[]
582  for rmf_mol,ref_mol in best_assignments:
583  ref_name=ref_mol.get_name()
584  ref_copy=IMP.atom.Copy(ref_mol).get_copy_index()
585  rmf_name=rmf_mol.get_name()
586  rmf_copy=IMP.atom.Copy(rmf_mol).get_copy_index()
587  assignments_out.append(rmf_name+"."+str(rmf_copy)+"->"+ref_name+"."+str(ref_copy))
588  return {"RMSD_"+self.label:str(total_rmsd),"RMSD_assignments_"+self.label:str(assignments_out)}
589
590
591
592
593 class Precision(object):
594  """A class to evaluate the precision of an ensemble.
595
596  Also can evaluate the cross-precision of multiple ensembles.
597  Supports MPI for coordinate reading.
598  Recommended procedure:
599  -# initialize object and pass the selection for evaluating precision
600  -# call add_structures() to read in the data (specify group name)
601  -# call get_precision() to evaluate inter/intra precision
602  -# call get_rmsf() to evaluate within-group fluctuations
603  """
604  def __init__(self,model,
605  resolution=1,
606  selection_dictionary={}):
607  """Constructor.
608  @param model The IMP Model
609  @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610  part of the hier name)
611  @param selection_dictionary Dictionary where keys are names for
612  selections and values are selection tuples for scoring
613  precision. "All" is automatically made as well
614  """
615  try:
616  from mpi4py import MPI
617  self.comm = MPI.COMM_WORLD
618  self.rank = self.comm.Get_rank()
619  self.number_of_processes = self.comm.size
620  except ImportError:
621  self.number_of_processes = 1
622  self.rank = 0
623
624  self.styles = ['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q',
625  'pairwise_drms_k','pairwise_rmsd','drmsd_from_center']
626  self.style = 'pairwise_drmsd_k'
627  self.structures_dictionary = {}
628  self.reference_structures_dictionary = {}
629  self.prots = []
630  self.protein_names = None
631  self.len_particles_resolution_one = None
632  self.model = model
633  self.rmf_names_frames = {}
634  self.reference_rmf_names_frames = None
635  self.reference_structure = None
636  self.reference_prot = None
637  self.selection_dictionary = selection_dictionary
638  self.threshold = 40.0
639  self.residue_particle_index_map = None
640  self.prots = None
641  if resolution in [1,10]:
642  self.resolution = resolution
643  else:
644  raise KeyError("Currently only allow resolution 1 or 10")
645
646  def _get_structure(self,rmf_frame_index,rmf_name):
647  """Read an RMF file and return the particles"""
649  if not self.prots:
650  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
651  self.prots = IMP.rmf.create_hierarchies(rh, self.model)
653  else:
654  print("linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
657  del rh
658
659  if self.resolution==1:
660  particle_dict = get_particles_at_resolution_one(self.prots[0])
661  elif self.resolution==10:
662  particle_dict = get_particles_at_resolution_ten(self.prots[0])
663
664  protein_names = list(particle_dict.keys())
665  particles_resolution_one = []
666  for k in particle_dict:
667  particles_resolution_one += (particle_dict[k])
668
669  if self.protein_names==None:
670  self.protein_names = protein_names
671  else:
672  if self.protein_names!=protein_names:
673  print("Error: the protein names of the new coordinate set is not compatible with the previous one")
674
675  if self.len_particles_resolution_one==None:
676  self.len_particles_resolution_one = len(particles_resolution_one)
677  else:
678  if self.len_particles_resolution_one!=len(particles_resolution_one):
679  raise ValueError("the new coordinate set is not compatible with the previous one")
680
681  return particles_resolution_one
682
684  rmf_name,
685  rmf_frame_index,
686  structure_set_name,
687  setup_index_map=False):
688  """ Read a structure into the ensemble and store (as coordinates).
689  @param rmf_name The name of the RMF file
690  @param rmf_frame_index The frame to read
691  @param structure_set_name Name for the set that includes this structure
692  (e.g. "cluster 1")
693  @param setup_index_map if requested, set up a dictionary to help
694  find residue indexes
695  """
696
697  # decide where to put this structure
698  if structure_set_name in self.structures_dictionary:
699  cdict = self.structures_dictionary[structure_set_name]
700  rmflist = self.rmf_names_frames[structure_set_name]
701  else:
702  self.structures_dictionary[structure_set_name]={}
703  self.rmf_names_frames[structure_set_name]=[]
704  cdict = self.structures_dictionary[structure_set_name]
705  rmflist = self.rmf_names_frames[structure_set_name]
706
708  try:
709  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
710  except ValueError:
711  print("something wrong with the rmf")
712  return 0
713  self.selection_dictionary.update({"All":self.protein_names})
714
715  for selection_name in self.selection_dictionary:
716  selection_tuple = self.selection_dictionary[selection_name]
717  coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718  if selection_name not in cdict:
719  cdict[selection_name] = [coords]
720  else:
721  cdict[selection_name].append(coords)
722
723  rmflist.append((rmf_name,rmf_frame_index))
724
725  # if requested, set up a dictionary to help find residue indexes
726  if setup_index_map:
727  self.residue_particle_index_map = {}
728  for prot_name in self.protein_names:
729  self.residue_particle_index_map[prot_name] = \
730  self._get_residue_particle_index_map(
731  prot_name,
732  particles_resolution_one,self.prots[0])
733
735  rmf_name_frame_tuples,
736  structure_set_name):
737  """Read a list of RMFs, supports parallel
738  @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739  @param structure_set_name Name this set of structures (e.g. "cluster.1")
740  """
741
742  # split up the requested list to read in parallel
743  my_rmf_name_frame_tuples=IMP.pmi1.tools.chunk_list_into_segments(
744  rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
746  rmf_name=tup[0]
747  rmf_frame_index=tup[1]
748  # the first frame stores the map between residues and particles
749  if self.residue_particle_index_map is None:
750  setup_index_map = True
751  else:
752  setup_index_map = False
754  rmf_frame_index,
755  structure_set_name,
756  setup_index_map)
757
758  # synchronize the structures
759  if self.number_of_processes > 1:
760  self.rmf_names_frames=IMP.pmi1.tools.scatter_and_gather(self.rmf_names_frames)
761  if self.rank != 0:
762  self.comm.send(self.structures_dictionary, dest=0, tag=11)
763  elif self.rank == 0:
764  for i in range(1, self.number_of_processes):
765  data_tmp = self.comm.recv(source=i, tag=11)
766  for key in self.structures_dictionary:
767  self.structures_dictionary[key].update(data_tmp[key])
768  for i in range(1, self.number_of_processes):
769  self.comm.send(self.structures_dictionary, dest=i, tag=11)
770  if self.rank != 0:
771  self.structures_dictionary = self.comm.recv(source=0, tag=11)
772
773  def _get_residue_particle_index_map(self,prot_name,structure,hier):
774  # Creates map from all particles to residue numbers
775  residue_particle_index_map = []
776  s = IMP.atom.Selection(hier,molecules=[prot_name])
777  all_selected_particles = s.get_selected_particles()
778  intersection = list(set(all_selected_particles) & set(structure))
779  sorted_intersection = IMP.pmi1.tools.sort_by_residues(intersection)
780  for p in sorted_intersection:
781  residue_particle_index_map.append(IMP.pmi1.tools.get_residue_indexes(p))
782  return residue_particle_index_map
783
784
785  def _select_coordinates(self,tuple_selections,structure,prot):
786  selected_coordinates=[]
787  for t in tuple_selections:
788  if type(t)==tuple and len(t)==3:
789  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1))
790  all_selected_particles = s.get_selected_particles()
791  intersection = list(set(all_selected_particles) & set(structure))
792  sorted_intersection = IMP.pmi1.tools.sort_by_residues(intersection)
793  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
794  selected_coordinates += cc
795  elif type(t)==str:
796  s = IMP.atom.Selection(prot,molecules=[t])
797  all_selected_particles = s.get_selected_particles()
798  intersection = list(set(all_selected_particles) & set(structure))
799  sorted_intersection = IMP.pmi1.tools.sort_by_residues(intersection)
800  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
801  selected_coordinates += cc
802  else:
803  raise ValueError("Selection error")
804  return selected_coordinates
805
806  def set_threshold(self,threshold):
807  self.threshold = threshold
808
809  def _get_distance(self,
810  structure_set_name1,
811  structure_set_name2,
812  selection_name,
813  index1,
814  index2):
815  """ Compute distance between structures with various metrics """
816  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
817  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
818  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
819  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
820
821  if self.style=='pairwise_drmsd_k':
822  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
823  if self.style=='pairwise_drms_k':
824  distance=IMP.atom.get_drms(coordinates1,coordinates2)
825  if self.style=='pairwise_drmsd_Q':
826  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
827
828  if self.style=='pairwise_rmsd':
829  distance=IMP.algebra.get_rmsd(coordinates1,coordinates2)
830  return distance
831
832  def _get_particle_distances(self,structure_set_name1,structure_set_name2,
833  selection_name,index1,index2):
834  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
835  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
836
837  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
838  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
839
840  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
841
842  return distances
843
844  def get_precision(self,
845  structure_set_name1,
846  structure_set_name2,
847  outfile=None,
848  skip=1,
849  selection_keywords=None):
850  """ Evaluate the precision of two named structure groups. Supports MPI.
851  When the structure_set_name1 is different from the structure_set_name2,
852  this evaluates the cross-precision (average pairwise distances).
853  @param outfile Name of the precision output file
854  @param structure_set_name1 string name of the first structure set
855  @param structure_set_name2 string name of the second structure set
856  @param skip analyze every (skip) structure for the distance matrix calculation
857  @param selection_keywords Specify the selection name you want to calculate on.
858  By default this is computed for everything you provided in the constructor,
859  plus all the subunits together.
860  """
861  if selection_keywords is None:
862  sel_keys = list(self.selection_dictionary.keys())
863  else:
864  for k in selection_keywords:
865  if k not in self.selection_dictionary:
866  raise KeyError("you are trying to find named selection " \
867  + k + " which was not requested in the constructor")
868  sel_keys = selection_keywords
869
870  if outfile is not None:
871  of = open(outfile,"w")
872  centroid_index = 0
873  for selection_name in sel_keys:
874  number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
875  number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
876  distances={}
877  structure_pointers_1 = list(range(0,number_of_structures_1,skip))
878  structure_pointers_2 = list(range(0,number_of_structures_2,skip))
879  pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
880  if len(pair_combination_list)==0:
881  raise ValueError("no structure selected. Check the skip parameter.")
882
883  # compute pairwise distances in parallel
884  my_pair_combination_list = IMP.pmi1.tools.chunk_list_into_segments(
885  pair_combination_list,self.number_of_processes)[self.rank]
886  my_length = len(my_pair_combination_list)
887  for n,pair in enumerate(my_pair_combination_list):
888  progression = int(float(n)/my_length*100.0)
889  distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
890  selection_name,pair[0],pair[1])
891  if self.number_of_processes > 1:
892  distances = IMP.pmi1.tools.scatter_and_gather(distances)
893
894  # Finally compute distance to centroid
895  if self.rank == 0:
896  if structure_set_name1==structure_set_name2:
897  structure_pointers = structure_pointers_1
898  number_of_structures = number_of_structures_1
899
900  # calculate the distance from the first centroid
901  # and determine the centroid
902  distance = 0.0
903  distances_to_structure = {}
904  distances_to_structure_normalization = {}
905
906  for n in structure_pointers:
907  distances_to_structure[n] = 0.0
908  distances_to_structure_normalization[n]=0
909
910  for k in distances:
911  distance += distances[k]
912  distances_to_structure[k[0]] += distances[k]
913  distances_to_structure[k[1]] += distances[k]
914  distances_to_structure_normalization[k[0]] += 1
915  distances_to_structure_normalization[k[1]] += 1
916
917  for n in structure_pointers:
918  distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
919
920  min_distance = min([distances_to_structure[n] for n in distances_to_structure])
921  centroid_index = [k for k, v in distances_to_structure.items() if v == min_distance][0]
922  centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
923
924  centroid_distance = 0.0
925  distance_list = []
926  for n in range(number_of_structures):
927  dist = self._get_distance(structure_set_name1,structure_set_name1,
928  selection_name,centroid_index,n)
929  centroid_distance += dist
930  distance_list.append(dist)
931
932  #pairwise_distance=distance/len(distances.keys())
933  centroid_distance /= number_of_structures
934  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
935  if outfile is not None:
936  of.write(str(selection_name)+" "+structure_set_name1+
937  " average centroid distance "+str(centroid_distance)+"\n")
938  of.write(str(selection_name)+" "+structure_set_name1+
939  " centroid index "+str(centroid_index)+"\n")
940  of.write(str(selection_name)+" "+structure_set_name1+
941  " centroid rmf name "+str(centroid_rmf_name)+"\n")
942  of.write(str(selection_name)+" "+structure_set_name1+
943  " median centroid distance "+str(np.median(distance_list))+"\n")
944
945  average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
946  if outfile is not None:
947  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
948  " average pairwise distance "+str(average_pairwise_distances)+"\n")
949  if outfile is not None:
950  of.close()
951  return centroid_index
952
953  def get_rmsf(self,
954  structure_set_name,
955  outdir="./",
956  skip=1,
957  set_plot_yaxis_range=None):
958  """ Calculate the residue mean square fluctuations (RMSF).
959  Automatically outputs as data file and pdf
960  @param structure_set_name Which structure set to calculate RMSF for
961  @param outdir Where to write the files
962  @param skip Skip this number of structures
963  @param set_plot_yaxis_range In case you need to change the plot
964  """
965  # get the centroid structure for the whole complex
966  centroid_index = self.get_precision(structure_set_name,
967  structure_set_name,
968  outfile=None,
969  skip=skip)
970  if self.rank==0:
971  for sel_name in self.protein_names:
972  self.selection_dictionary.update({sel_name:[sel_name]})
973  try:
974  number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
975  except KeyError:
976  # that protein was not included in the selection
977  continue
978  rpim = self.residue_particle_index_map[sel_name]
979  outfile = outdir+"/rmsf."+sel_name+".dat"
980  of = open(outfile,"w")
981  residue_distances = {}
982  residue_nblock = {}
983  for index in range(number_of_structures):
984  distances = self._get_particle_distances(structure_set_name,
985  structure_set_name,
986  sel_name,
987  centroid_index,index)
988  for nblock,block in enumerate(rpim):
989  for residue_number in block:
990  residue_nblock[residue_number] = nblock
991  if residue_number not in residue_distances:
992  residue_distances[residue_number] = [distances[nblock]]
993  else:
994  residue_distances[residue_number].append(distances[nblock])
995
996  residues = []
997  rmsfs = []
998  for rn in residue_distances:
999  residues.append(rn)
1000  rmsf = np.std(residue_distances[rn])
1001  rmsfs.append(rmsf)
1002  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
1003
1004  IMP.pmi1.output.plot_xy_data(residues,rmsfs,title=sel_name,
1005  out_fn=outdir+"/rmsf."+sel_name,display=False,
1006  set_plot_yaxis_range=set_plot_yaxis_range,
1007  xlabel='Residue Number',ylabel='Standard error')
1008  of.close()
1009
1010
1011  def set_reference_structure(self,rmf_name,rmf_frame_index):
1012  """Read in a structure used for reference computation.
1013  Needed before calling get_average_distance_wrt_reference_structure()
1014  @param rmf_name The RMF file to read the reference
1015  @param rmf_frame_index The index in that file
1016  """
1017  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1018  self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1019
1020  for selection_name in self.selection_dictionary:
1021  selection_tuple = self.selection_dictionary[selection_name]
1022  coords = self._select_coordinates(selection_tuple,
1023  particles_resolution_one,self.prots[0])
1024  self.reference_structures_dictionary[selection_name] = coords
1025
1026  def get_rmsd_wrt_reference_structure_with_alignment(self,structure_set_name,alignment_selection_key):
1027  """First align then calculate RMSD
1028  @param structure_set_name: the name of the structure set
1029  @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
1030  @return: for each structure in the structure set, returns the rmsd
1031  """
1032  if self.reference_structures_dictionary=={}:
1033  print("Cannot compute until you set a reference structure")
1034  return
1035
1036  align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1037  align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1038  transformations = []
1039  for c in align_coordinates:
1040  Ali = IMP.pmi1.analysis.Alignment({"All":align_reference_coordinates}, {"All":c})
1041  transformation = Ali.align()[1]
1042  transformations.append(transformation)
1043  for selection_name in self.selection_dictionary:
1044  reference_coordinates = self.reference_structures_dictionary[selection_name]
1045  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1046  distances = []
1047  for n,sc in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1048  coordinates1 = [transformations[n].get_transformed(IMP.algebra.Vector3D(c)) for c in sc]
1049  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1050  distances.append(distance)
1051  print(selection_name,"average rmsd",sum(distances)/len(distances),"median",self._get_median(distances),"minimum distance",min(distances))
1052
1053  def _get_median(self,list_of_values):
1054  return np.median(np.array(list_of_values))
1055
1056  def get_average_distance_wrt_reference_structure(self,structure_set_name):
1057  """Compare the structure set to the reference structure.
1058  @param structure_set_name The structure set to compute this on
1059  @note First call set_reference_structure()
1060  """
1061  ret = {}
1062  if self.reference_structures_dictionary=={}:
1063  print("Cannot compute until you set a reference structure")
1064  return
1065  for selection_name in self.selection_dictionary:
1066  reference_coordinates = self.reference_structures_dictionary[selection_name]
1067  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1068  distances = []
1069  for sc in self.structures_dictionary[structure_set_name][selection_name]:
1070  coordinates1 = [IMP.algebra.Vector3D(c) for c in sc]
1071  if self.style=='pairwise_drmsd_k':
1072  distance = IMP.atom.get_drmsd(coordinates1,coordinates2)
1073  if self.style=='pairwise_drms_k':
1074  distance = IMP.atom.get_drms(coordinates1,coordinates2)
1075  if self.style=='pairwise_drmsd_Q':
1076  distance = IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
1077  if self.style=='pairwise_rmsd':
1078  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1079  distances.append(distance)
1080
1081  print(selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances),'nframes',len(distances))
1082  ret[selection_name] = {'average_distance':sum(distances)/len(distances),'minimum_distance':min(distances)}
1083  return ret
1084
1085  def get_coordinates(self):
1086  pass
1087
1088  def set_precision_style(self, style):
1089  if style in self.styles:
1090  self.style=style
1091  else:
1092  raise ValueError("No such style")
1093
1094
1095 class GetModelDensity(object):
1096  """Compute mean density maps from structures.
1097
1098  Keeps a dictionary of density maps,
1099  keys are in the custom ranges. When you call add_subunits_density, it adds
1100  particle coordinates to the existing density maps.
1101  """
1102
1103  def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1104  """Constructor.
1105  @param custom_ranges Required. It's a dictionary, keys are the
1106  density component names, values are selection tuples
1107  e.g. {'kin28':[['kin28',1,-1]],
1108  'density_name_1' :[('ccl1')],
1109  'density_name_2' :[(1,142,'tfb3d1'),
1110  (143,700,'tfb3d2')],
1111  @param representation PMI representation, for doing selections.
1112  Not needed if you only pass hierarchies
1113  @param resolution The MRC resolution of the output map (in Angstrom unit)
1114  @param voxel The voxel size for the output map (lower is slower)
1115  """
1116
1117  self.representation = representation
1118  self.MRCresolution = resolution
1119  self.voxel = voxel
1120  self.densities = {}
1121  self.count_models = 0.0
1122  self.custom_ranges = custom_ranges
1123
1125  """Add a frame to the densities.
1126  @param hierarchy Optionally read the hierarchy from somewhere.
1127  If not passed, will just read the representation.
1128  """
1129  self.count_models += 1.0
1130
1131  if hierarchy:
1132  part_dict = get_particles_at_resolution_one(hierarchy)
1133  all_particles_by_resolution = []
1134  for name in part_dict:
1135  all_particles_by_resolution += part_dict[name]
1136
1137  for density_name in self.custom_ranges:
1138  parts = []
1139  if hierarchy:
1140  all_particles_by_segments = []
1141
1142  for seg in self.custom_ranges[density_name]:
1143  if not hierarchy:
1144  # when you have a IMP.pmi1.representation.Representation class
1145  parts += IMP.tools.select_by_tuple(self.representation,
1146  seg, resolution=1, name_is_ambiguous=False)
1147  else:
1148  # else, when you have a hierarchy, but not a representation
1149  for h in hierarchy.get_children():
1151  IMP.atom.Molecule.setup_particle(h.get_particle())
1152
1153  if type(seg) == str:
1154  s = IMP.atom.Selection(hierarchy,molecule=seg)
1155  elif type(seg) == tuple and len(seg) == 2:
1156  s = IMP.atom.Selection(
1157  hierarchy, molecule=seg[0],copy_index=seg[1])
1158  elif type(seg) == tuple and len(seg) == 3:
1159  s = IMP.atom.Selection(
1160  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1161  elif type(seg) == tuple and len(seg) == 4:
1162  s = IMP.atom.Selection(
1163  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1164  else:
1165  raise Exception('could not understand selection tuple '+str(seg))
1166
1167  all_particles_by_segments += s.get_selected_particles()
1168  if hierarchy:
1169  parts = list(
1170  set(all_particles_by_segments) & set(all_particles_by_resolution))
1171  self._create_density_from_particles(parts, density_name)
1172
1173  def normalize_density(self):
1174  pass
1175
1176  def _create_density_from_particles(self, ps, name,
1177  kernel_type='GAUSSIAN'):
1178  '''Internal function for adding to densities.
1179  pass XYZR particles with mass and create a density from them.
1180  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1181  kd = {
1182  'GAUSSIAN': IMP.em.GAUSSIAN,
1183  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1184  'SPHERE': IMP.em.SPHERE}
1185
1186  dmap = IMP.em.SampledDensityMap(ps, self.MRCresolution, self.voxel)
1187  dmap.calcRMS()
1188  dmap.set_was_used(True)
1189  if name not in self.densities:
1190  self.densities[name] = dmap
1191  else:
1192  bbox1 = IMP.em.get_bounding_box(self.densities[name])
1193  bbox2 = IMP.em.get_bounding_box(dmap)
1194  bbox1 += bbox2
1195  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
1196  dmap3.set_was_used(True)
1199  self.densities[name] = dmap3
1200
1201  def get_density_keys(self):
1202  return list(self.densities.keys())
1203
1204  def get_density(self,name):
1205  """Get the current density for some component name"""
1206  if name not in self.densities:
1207  return None
1208  else:
1209  return self.densities[name]
1210
1211  def write_mrc(self, path="./",suffix=None):
1212  import os, errno
1213  for density_name in self.densities:
1214  self.densities[density_name].multiply(1. / self.count_models)
1215  if suffix is None:
1216  name=path + "/" + density_name + ".mrc"
1217  else:
1218  name=path + "/" + density_name + "." + suffix + ".mrc"
1219  path, file = os.path.split(name)
1220  if not os.path.exists(path):
1221  try:
1222  os.makedirs(path)
1223  except OSError as e:
1224  if e.errno != errno.EEXIST:
1225  raise
1226  IMP.em.write_map(
1227  self.densities[density_name],name,
1229
1230
1231 class GetContactMap(object):
1232
1233  def __init__(self, distance=15.):
1234  self.distance = distance
1235  self.contactmap = ''
1236  self.namelist = []
1238  self.XL = {}
1239  self.expanded = {}
1240  self.resmap = {}
1241
1242  def set_prot(self, prot):
1243  from scipy.spatial.distance import cdist
1244  self.prot = prot
1245  self.protnames = []
1246  coords = []
1248  namelist = []
1249
1250  particles_dictionary = get_particles_at_resolution_one(self.prot)
1251
1252  for name in particles_dictionary:
1253  residue_indexes = []
1254  for p in particles_dictionary[name]:
1255  print(p.get_name())
1256  residue_indexes.extend(IMP.pmi1.tools.get_residue_indexes(p))
1257
1258  if len(residue_indexes) != 0:
1259  self.protnames.append(name)
1260
1261  def get_subunit_coords(self, frame, align=0):
1262  from scipy.spatial.distance import cdist
1263  coords = []
1265  namelist = []
1266  test, testr = [], []
1267  for part in self.prot.get_children():
1268  SortedSegments = []
1269  print(part)
1270  for chl in part.get_children():
1271  start = IMP.atom.get_leaves(chl)[0]
1272  end = IMP.atom.get_leaves(chl)[-1]
1273
1274  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
1275  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
1276  SortedSegments.append((chl, startres))
1277  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1278
1279  for sgmnt in SortedSegments:
1280  for leaf in IMP.atom.get_leaves(sgmnt[0]):
1281  p = IMP.core.XYZR(leaf)
1282  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1283
1284  coords.append(crd)
1286
1287  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
1288  '_' + \
1289  str(IMP.atom.Fragment(leaf)
1290  .get_residue_indexes()[0])
1291  namelist.append(new_name)
1292  self.expanded[new_name] = len(
1293  IMP.atom.Fragment(leaf).get_residue_indexes())
1294  if part.get_name() not in self.resmap:
1295  self.resmap[part.get_name()] = {}
1296  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
1297  self.resmap[part.get_name()][res] = new_name
1298
1299  coords = np.array(coords)
1301  if len(self.namelist) == 0:
1302  self.namelist = namelist
1303  self.contactmap = np.zeros((len(coords), len(coords)))
1304  distances = cdist(coords, coords)
1306  distances = distances <= self.distance
1307  self.contactmap += distances
1308
1310  self,
1311  filename,
1315  data = open(filename)
1317  data.close()
1318
1319  for d in D:
1320  if identification_string in d:
1321  d = d.replace(
1322  "_",
1323  " ").replace("-",
1324  " ").replace(":",
1325  " ").split()
1326
1327  t1, t2 = (d[0], d[1]), (d[1], d[0])
1328  if t1 not in self.XL:
1329  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1330  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1331  else:
1332  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1333  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1334
1335  def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1336  K = self.namelist
1337  M = self.contactmap
1338  C, R = [], []
1339  L = sum(self.expanded.values())
1340  proteins = self.protnames
1341
1342  # exp new
1343  if skip_cmap == 0:
1344  Matrices = {}
1345  proteins = [p.get_name() for p in self.prot.get_children()]
1346  missing = []
1347  for p1 in range(len(proteins)):
1348  for p2 in range(p1, len(proteins)):
1349  pl1, pl2 = max(
1350  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1351  pn1, pn2 = proteins[p1], proteins[p2]
1352  mtr = np.zeros((pl1 + 1, pl2 + 1))
1353  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1354  for i1 in range(1, pl1 + 1):
1355  for i2 in range(1, pl2 + 1):
1356  try:
1357  r1 = K.index(self.resmap[pn1][i1])
1358  r2 = K.index(self.resmap[pn2][i2])
1359  r = M[r1, r2]
1360  mtr[i1 - 1, i2 - 1] = r
1361  except KeyError:
1362  missing.append((pn1, pn2, i1, i2))
1363  pass
1364  Matrices[(pn1, pn2)] = mtr
1365
1367  if skip_xl == 0:
1368  if self.XL == {}:
1370  Matrices_xl = {}
1371  missing_xl = []
1372  for p1 in range(len(proteins)):
1373  for p2 in range(p1, len(proteins)):
1374  pl1, pl2 = max(
1375  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1376  pn1, pn2 = proteins[p1], proteins[p2]
1377  mtr = np.zeros((pl1 + 1, pl2 + 1))
1378  flg = 0
1379  try:
1380  xls = self.XL[(pn1, pn2)]
1381  except KeyError:
1382  try:
1383  xls = self.XL[(pn2, pn1)]
1384  flg = 1
1385  except KeyError:
1386  flg = 2
1387  if flg == 0:
1388  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1389  for xl1, xl2 in xls:
1390  if xl1 > pl1:
1391  print('X' * 10, xl1, xl2)
1392  xl1 = pl1
1393  if xl2 > pl2:
1394  print('X' * 10, xl1, xl2)
1395  xl2 = pl2
1396  mtr[xl1 - 1, xl2 - 1] = 100
1397  elif flg == 1:
1398  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1399  for xl1, xl2 in xls:
1400  if xl1 > pl1:
1401  print('X' * 10, xl1, xl2)
1402  xl1 = pl1
1403  if xl2 > pl2:
1404  print('X' * 10, xl1, xl2)
1405  xl2 = pl2
1406  mtr[xl2 - 1, xl1 - 1] = 100
1407  else:
1408  print('No cross links between: ', pn1, pn2)
1409  Matrices_xl[(pn1, pn2)] = mtr
1410
1411  # expand the matrix to individual residues
1412  #NewM = []
1413  # for x1 in xrange(len(K)):
1414  # lst = []
1415  # for x2 in xrange(len(K)):
1416  # lst += [M[x1,x2]]*self.expanded[K[x2]]
1417  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
1418  #NewM = np.array(NewM)
1419
1420  # make list of protein names and create coordinate lists
1421  C = proteins
1422  # W is the component length list,
1423  # R is the contiguous coordinates list
1424  W, R = [], []
1425  for i, c in enumerate(C):
1426  cl = max(self.resmap[c].keys())
1427  W.append(cl)
1428  if i == 0:
1429  R.append(cl)
1430  else:
1431  R.append(R[-1] + cl)
1432
1433  # start plotting
1434  if outname:
1435  # Don't require a display
1436  import matplotlib as mpl
1437  mpl.use('Agg')
1438  import matplotlib.pyplot as plt
1439  import matplotlib.gridspec as gridspec
1440  import scipy.sparse as sparse
1441
1442  f = plt.figure()
1443  gs = gridspec.GridSpec(len(W), len(W),
1444  width_ratios=W,
1445  height_ratios=W)
1446
1447  cnt = 0
1448  for x1, r1 in enumerate(R):
1449  if x1 == 0:
1450  s1 = 0
1451  else:
1452  s1 = R[x1 - 1]
1453  for x2, r2 in enumerate(R):
1454  if x2 == 0:
1455  s2 = 0
1456  else:
1457  s2 = R[x2 - 1]
1458
1459  ax = plt.subplot(gs[cnt])
1460  if skip_cmap == 0:
1461  try:
1462  mtr = Matrices[(C[x1], C[x2])]
1463  except KeyError:
1464  mtr = Matrices[(C[x2], C[x1])].T
1465  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
1466  cax = ax.imshow(
1467  np.log(mtr),
1468  interpolation='nearest',
1469  vmin=0.,
1470  vmax=log(mtr.max()) if mtr.max() > 0. else 1.)
1471  ax.set_xticks([])
1472  ax.set_yticks([])
1473  if skip_xl == 0:
1474  try:
1475  mtr = Matrices_xl[(C[x1], C[x2])]
1476  except KeyError:
1477  mtr = Matrices_xl[(C[x2], C[x1])].T
1478  cax = ax.spy(
1479  sparse.csr_matrix(mtr),
1480  markersize=10,
1481  color='white',
1482  linewidth=100,
1483  alpha=0.5)
1484  ax.set_xticks([])
1485  ax.set_yticks([])
1486
1487  cnt += 1
1488  if x2 == 0:
1489  ax.set_ylabel(C[x1], rotation=90)
1490  if outname:
1491  plt.savefig(outname + ".pdf", dpi=300, transparent="False")
1492  else:
1493  plt.show()
1494
1495
1496 # ------------------------------------------------------------------
1497 # a few random tools
1498
1499 def get_hiers_from_rmf(model, frame_number, rmf_file):
1500  # I have to deprecate this function
1501  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1502
1505
1506  try:
1507  prots = IMP.rmf.create_hierarchies(rh, model)
1508  except IOError:
1509  print("Unable to open rmf file %s" % (rmf_file))
1510  return None
1512
1513  try:
1515  except IOError:
1516  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1517  return None
1518  model.update()
1519  del rh
1520
1521  return prots
1522
1524  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1527  try:
1529  except:
1530  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1531  return False
1532  model.update()
1533  del rh
1534  return True
1535
1536
1537 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1538  # I have to deprecate this function
1539  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1540
1543
1544  try:
1545  prots = IMP.rmf.create_hierarchies(rh, model)
1546  rs = IMP.rmf.create_restraints(rh, model)
1547  except:
1548  print("Unable to open rmf file %s" % (rmf_file))
1549  return None,None
1550  try:
1552  except:
1553  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1554  return None,None
1555  model.update()
1556  del rh
1557
1558  return prots,rs
1559
1561  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1565  try:
1567  except:
1568  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1569  return False
1570  model.update()
1571  del rh
1572  return True
1573
1574
1576  """Get particles at res 1, or any beads, based on the name.
1577  No Representation is needed. This is mainly used when the hierarchy
1578  is read from an RMF file.
1579  @return a dictionary of component names and their particles
1580  \note If the root node is named "System" or is a "State", do proper selection.
1581  """
1582  particle_dict = {}
1583
1584  allparticles = []
1585  for c in prot.get_children():
1586  name = c.get_name()
1587  particle_dict[name] = IMP.atom.get_leaves(c)
1588  for s in c.get_children():
1589  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
1590  allparticles += IMP.atom.get_leaves(s)
1592  allparticles += IMP.atom.get_leaves(s)
1593
1594  particle_align = []
1595  for name in particle_dict:
1596  particle_dict[name] = IMP.pmi1.tools.sort_by_residues(
1597  list(set(particle_dict[name]) & set(allparticles)))
1598  return particle_dict
1599
1601  """Get particles at res 10, or any beads, based on the name.
1602  No Representation is needed.
1603  This is mainly used when the hierarchy is read from an RMF file.
1604  @return a dictionary of component names and their particles
1605  \note If the root node is named "System" or is a "State", do proper selection.
1606  """
1607  particle_dict = {}
1608  allparticles = []
1609  for c in prot.get_children():
1610  name = c.get_name()
1611  particle_dict[name] = IMP.atom.get_leaves(c)
1612  for s in c.get_children():
1613  if "_Res:10" in s.get_name():
1614  allparticles += IMP.atom.get_leaves(s)
1616  allparticles += IMP.atom.get_leaves(s)
1617  particle_align = []
1618  for name in particle_dict:
1619  particle_dict[name] = IMP.pmi1.tools.sort_by_residues(
1620  list(set(particle_dict[name]) & set(allparticles)))
1621  return particle_dict
1622
1623 def select_by_tuple(first_res_last_res_name_tuple):
1624  first_res = first_res_last_res_hier_tuple[0]
1625  last_res = first_res_last_res_hier_tuple[1]
1626  name = first_res_last_res_hier_tuple[2]
1627
1630  def __init__(self):
1632  self.external_csv_data = None
1634  self.mindist = +10000000.0
1635  self.maxdist = -10000000.0
1636  self.contactmap = None
1637
1638  def set_hierarchy(self, prot):
1639  self.prot_length_dict = {}
1640  self.model=prot.get_model()
1641
1642  for i in prot.get_children():
1643  name = i.get_name()
1644  residue_indexes = []
1645  for p in IMP.atom.get_leaves(i):
1646  residue_indexes.extend(IMP.pmi1.tools.get_residue_indexes(p))
1647
1648  if len(residue_indexes) != 0:
1649  self.prot_length_dict[name] = max(residue_indexes)
1650
1651  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1652  from scipy.spatial.distance import cdist
1653
1655  prots=IMP.rmf.create_hierarchies(rh, self.model)
1657  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1658  del rh
1659
1660
1661  coords = []
1663  namelist = []
1664
1665  particles_dictionary = get_particles_at_resolution_one(prots[0])
1666
1667  resindex = 0
1668  self.index_dictionary = {}
1669
1670  for name in particles_dictionary:
1671  residue_indexes = []
1672  for p in particles_dictionary[name]:
1673  print(p.get_name())
1674  residue_indexes = IMP.pmi1.tools.get_residue_indexes(p)
1676
1677  if len(residue_indexes) != 0:
1678
1679  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1680  d = IMP.core.XYZR(p)
1681
1682  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1683  coords.append(crd)
1685  if name not in self.index_dictionary:
1686  self.index_dictionary[name] = [resindex]
1687  else:
1688  self.index_dictionary[name].append(resindex)
1689  resindex += 1
1690
1691  coords = np.array(coords)
1693
1694  distances = cdist(coords, coords)
1696
1697  distances = np.where(distances <= 20.0, 1.0, 0)
1698  if self.contactmap is None:
1699  self.contactmap = np.zeros((len(coords), len(coords)))
1700  self.contactmap += distances
1701
1702  for prot in prots: IMP.atom.destroy(prot)
1703
1706  mapping=None,
1707  filter_label=None,
1708  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
1709  external_csv_data_file=None,
1710  external_csv_data_file_unique_id_key="Unique ID"):
1711
1713  # mapping is a dictionary that maps standard keywords to entry positions in the key string
1714  # confidence class is a filter that
1715  # external datafile is a datafile that contains further information on the crosslinks
1716  # it will use the unique id to create the dictionary keys
1717
1718  po = IMP.pmi1.output.ProcessOutput(data_file)
1719  keys = po.get_keys()
1720
1721  xl_keys = [k for k in keys if search_label in k]
1722
1723  if filter_rmf_file_names is not None:
1724  rmf_file_key="local_rmf_file_name"
1725  fs = po.get_fields(xl_keys+[rmf_file_key])
1726  else:
1727  fs = po.get_fields(xl_keys)
1728
1729  # this dictionary stores the occurency of given crosslinks
1731
1732  # this dictionary stores the series of distances for given crosslinked
1733  # residues
1735
1736  # this dictionary stores the series of distances for given crosslinked
1737  # residues
1739
1740  if not external_csv_data_file is None:
1741  # this dictionary stores the further information on crosslinks
1742  # labeled by unique ID
1743  self.external_csv_data = {}
1744  xldb = IMP.pmi1.tools.get_db_from_csv(external_csv_data_file)
1745
1746  for xl in xldb:
1747  self.external_csv_data[
1748  xl[external_csv_data_file_unique_id_key]] = xl
1749
1750  # this list keeps track the tuple of cross-links and sample
1751  # so that we don't count twice the same crosslinked residues in the
1752  # same sample
1754
1756
1757  for key in xl_keys:
1758  print(key)
1759  keysplit = key.replace(
1760  "_",
1761  " ").replace(
1762  "-",
1763  " ").replace(
1764  ":",
1765  " ").split(
1766  )
1767
1768  if filter_label!=None:
1769  if filter_label not in keysplit: continue
1770
1771  if mapping is None:
1772  r1 = int(keysplit[5])
1773  c1 = keysplit[6]
1774  r2 = int(keysplit[7])
1775  c2 = keysplit[8]
1776  try:
1777  confidence = keysplit[12]
1778  except:
1779  confidence = '0.0'
1780  try:
1781  unique_identifier = keysplit[3]
1782  except:
1783  unique_identifier = '0'
1784  else:
1785  r1 = int(keysplit[mapping["Residue1"]])
1786  c1 = keysplit[mapping["Protein1"]]
1787  r2 = int(keysplit[mapping["Residue2"]])
1788  c2 = keysplit[mapping["Protein2"]]
1789  try:
1790  confidence = keysplit[mapping["Confidence"]]
1791  except:
1792  confidence = '0.0'
1793  try:
1794  unique_identifier = keysplit[mapping["Unique Identifier"]]
1795  except:
1796  unique_identifier = '0'
1797
1800
1801  # construct the list of distances corresponding to the input rmf
1802  # files
1803
1804  dists=[]
1805  if filter_rmf_file_names is not None:
1806  for n,d in enumerate(fs[key]):
1807  if fs[rmf_file_key][n] in filter_rmf_file_names:
1808  dists.append(float(d))
1809  else:
1810  dists=[float(f) for f in fs[key]]
1811
1812  # check if the input confidence class corresponds to the
1813  # one of the cross-link
1814
1815  mdist = self.median(dists)
1816
1817  stdv = np.std(np.array(dists))
1818  if self.mindist > mdist:
1819  self.mindist = mdist
1820  if self.maxdist < mdist:
1821  self.maxdist = mdist
1822
1823  # calculate the frequency of unique crosslinks within the same
1824  # sample
1825  if not self.external_csv_data is None:
1826  sample = self.external_csv_data[unique_identifier]["Sample"]
1827  else:
1828  sample = "None"
1829
1830  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1831  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1832  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1833  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1834  else:
1835  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1836  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1840  (r1, c1, r2, c2,mdist))
1841
1842  if (r1, c1, r2, c2) not in self.cross_link_distances:
1844  r1,
1845  c1,
1846  r2,
1847  c2,
1848  mdist,
1849  confidence)] = dists
1851  r2,
1852  c2,
1853  r1,
1854  c1,
1855  mdist,
1856  confidence)] = dists
1857  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1858  else:
1860  r2,
1861  c2,
1862  r1,
1863  c1,
1864  mdist,
1865  confidence)] += dists
1867  r1,
1868  c1,
1869  r2,
1870  c2,
1871  mdist,
1872  confidence)] += dists
1873
1875  (r1,
1876  c1,
1877  r2,
1878  c2,
1879  mdist,
1880  stdv,
1881  confidence,
1882  unique_identifier,
1883  'original'))
1885  (r2,
1886  c2,
1887  r1,
1888  c1,
1889  mdist,
1890  stdv,
1891  confidence,
1892  unique_identifier,
1893  'reversed'))
1894
1897  (r1, c1, r2, c2, mdist) = xl
1898  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1899  if frequency not in self.cross_link_frequency_inverted:
1901  frequency] = [(r1, c1, r2, c2)]
1902  else:
1904  frequency].append((r1, c1, r2, c2))
1905
1906  # -------------
1907
1908  def median(self, mylist):
1909  sorts = sorted(mylist)
1910  length = len(sorts)
1911  print(length)
1912  if length == 1:
1913  return mylist[0]
1914  if not length % 2:
1915  return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1916  return sorts[length / 2]
1917
1918  def set_threshold(self,threshold):
1919  self.threshold=threshold
1920
1921  def set_tolerance(self,tolerance):
1922  self.tolerance=tolerance
1923
1924  def colormap(self, dist):
1925  if dist < self.threshold - self.tolerance:
1926  return "Green"
1927  elif dist >= self.threshold + self.tolerance:
1928  return "Orange"
1929  else:
1930  return "Red"
1931
1933  import csv
1934
1935  fieldnames = [
1936  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1937  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1938
1939  if not self.external_csv_data is None:
1940  keys = list(self.external_csv_data.keys())
1941  innerkeys = list(self.external_csv_data[keys[0]].keys())
1942  innerkeys.sort()
1943  fieldnames += innerkeys
1944
1945  dw = csv.DictWriter(
1946  open(filename,
1947  "w"),
1948  delimiter=',',
1949  fieldnames=fieldnames)
1952  (r1, c1, r2, c2, mdist, stdv, confidence,
1953  unique_identifier, descriptor) = xl
1954  if descriptor == 'original':
1955  outdict = {}
1956  outdict["Unique ID"] = unique_identifier
1957  outdict["Protein1"] = c1
1958  outdict["Protein2"] = c2
1959  outdict["Residue1"] = r1
1960  outdict["Residue2"] = r2
1961  outdict["Median Distance"] = mdist
1962  outdict["Standard Deviation"] = stdv
1963  outdict["Confidence"] = confidence
1965  (r1, c1, r2, c2)]
1966  if c1 == c2:
1967  arrangement = "Intra"
1968  else:
1969  arrangement = "Inter"
1970  outdict["Arrangement"] = arrangement
1971  if not self.external_csv_data is None:
1972  outdict.update(self.external_csv_data[unique_identifier])
1973
1974  dw.writerow(outdict)
1975
1976  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
1978  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
1979  gap_between_components=0,
1980  rename_protein_map=None):
1981  # layout can be:
1982  # "lowerdiagonal" print only the lower diagonal plot
1983  # "upperdiagonal" print only the upper diagonal plot
1984  # "whole" print all
1986  # no_dist_info: if True will plot only the cross-links as grey spots
1987  # filter = tuple the tuple contains a keyword to be search in the database
1988  # a relationship ">","==","<"
1989  # and a value
1990  # example ("ID_Score",">",40)
1991  # scale_symbol_size rescale the symbol for the crosslink
1992  # rename_protein_map is a dictionary to rename proteins
1993
1994  import matplotlib as mpl
1995  mpl.use('Agg')
1996  import matplotlib.pyplot as plt
1997  import matplotlib.cm as cm
1998
1999  fig = plt.figure(figsize=(10, 10))
2001
2002  ax.set_xticks([])
2003  ax.set_yticks([])
2004
2005  # set the list of proteins on the x axis
2006  if prot_listx is None:
2009  else:
2010  prot_listx = list(self.prot_length_dict.keys())
2011  prot_listx.sort()
2012
2013  nresx = gap_between_components + \
2014  sum([self.prot_length_dict[name]
2015  + gap_between_components for name in prot_listx])
2016
2017  # set the list of proteins on the y axis
2018
2019  if prot_listy is None:
2022  else:
2023  prot_listy = list(self.prot_length_dict.keys())
2024  prot_listy.sort()
2025
2026  nresy = gap_between_components + \
2027  sum([self.prot_length_dict[name]
2028  + gap_between_components for name in prot_listy])
2029
2030  # this is the residue offset for each protein
2031  resoffsetx = {}
2032  resendx = {}
2033  res = gap_between_components
2034  for prot in prot_listx:
2035  resoffsetx[prot] = res
2036  res += self.prot_length_dict[prot]
2037  resendx[prot] = res
2038  res += gap_between_components
2039
2040  resoffsety = {}
2041  resendy = {}
2042  res = gap_between_components
2043  for prot in prot_listy:
2044  resoffsety[prot] = res
2045  res += self.prot_length_dict[prot]
2046  resendy[prot] = res
2047  res += gap_between_components
2048
2049  resoffsetdiagonal = {}
2050  res = gap_between_components
2051  for prot in IMP.pmi1.tools.OrderedSet(prot_listx + prot_listy):
2052  resoffsetdiagonal[prot] = res
2053  res += self.prot_length_dict[prot]
2054  res += gap_between_components
2055
2056  # plot protein boundaries
2057
2058  xticks = []
2059  xlabels = []
2060  for n, prot in enumerate(prot_listx):
2061  res = resoffsetx[prot]
2062  end = resendx[prot]
2063  for proty in prot_listy:
2064  resy = resoffsety[proty]
2065  endy = resendy[proty]
2066  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
2067  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
2068  xticks.append((float(res) + float(end)) / 2)
2069  if rename_protein_map is not None:
2070  if prot in rename_protein_map:
2071  prot=rename_protein_map[prot]
2072  xlabels.append(prot)
2073
2074  yticks = []
2075  ylabels = []
2076  for n, prot in enumerate(prot_listy):
2077  res = resoffsety[prot]
2078  end = resendy[prot]
2079  for protx in prot_listx:
2080  resx = resoffsetx[protx]
2081  endx = resendx[protx]
2082  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
2083  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
2084  yticks.append((float(res) + float(end)) / 2)
2085  if rename_protein_map is not None:
2086  if prot in rename_protein_map:
2087  prot=rename_protein_map[prot]
2088  ylabels.append(prot)
2089
2090  # plot the contact map
2091  print(prot_listx, prot_listy)
2092
2093  if not self.contactmap is None:
2094  import matplotlib.cm as cm
2095  tmp_array = np.zeros((nresx, nresy))
2096
2097  for px in prot_listx:
2098  print(px)
2099  for py in prot_listy:
2100  print(py)
2101  resx = resoffsety[px]
2102  lengx = resendx[px] - 1
2103  resy = resoffsety[py]
2104  lengy = resendy[py] - 1
2105  indexes_x = self.index_dictionary[px]
2106  minx = min(indexes_x)
2107  maxx = max(indexes_x)
2108  indexes_y = self.index_dictionary[py]
2109  miny = min(indexes_y)
2110  maxy = max(indexes_y)
2111
2112  print(px, py, minx, maxx, miny, maxy)
2113
2114  try:
2115  tmp_array[
2116  resx:lengx,
2117  resy:lengy] = self.contactmap[
2118  minx:maxx,
2119  miny:maxy]
2120  except:
2121  continue
2122
2123
2124  ax.imshow(tmp_array,
2125  cmap=cm.binary,
2126  origin='lower',
2127  interpolation='nearest')
2128
2129  ax.set_xticks(xticks)
2130  ax.set_xticklabels(xlabels, rotation=90)
2131  ax.set_yticks(yticks)
2132  ax.set_yticklabels(ylabels)
2133  ax.set_xlim(0,nresx)
2134  ax.set_ylim(0,nresy)
2135
2136
2138
2140
2142
2143  (r1, c1, r2, c2, mdist, stdv, confidence,
2144  unique_identifier, descriptor) = xl
2145
2146  if confidence_classes is not None:
2147  if confidence not in confidence_classes:
2148  continue
2149
2150  try:
2151  pos1 = r1 + resoffsetx[c1]
2152  except:
2153  continue
2154  try:
2155  pos2 = r2 + resoffsety[c2]
2156  except:
2157  continue
2158
2159  if not filter is None:
2160  xldb = self.external_csv_data[unique_identifier]
2161  xldb.update({"Distance": mdist})
2162  xldb.update({"Distance_stdv": stdv})
2163
2164  if filter[1] == ">":
2165  if float(xldb[filter[0]]) <= float(filter[2]):
2166  continue
2167
2168  if filter[1] == "<":
2169  if float(xldb[filter[0]]) >= float(filter[2]):
2170  continue
2171
2172  if filter[1] == "==":
2173  if float(xldb[filter[0]]) != float(filter[2]):
2174  continue
2175
2176  # all that below is used for plotting the diagonal
2177  # when you have a rectangolar plots
2178
2179  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2180  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2181
2182  if layout == 'lowerdiagonal':
2183  if pos_for_diagonal1 <= pos_for_diagonal2:
2184  continue
2185  if layout == 'upperdiagonal':
2186  if pos_for_diagonal1 >= pos_for_diagonal2:
2187  continue
2188
2190
2191  if not no_confidence_info:
2192  if confidence == '0.01':
2193  markersize = 14 * scale_symbol_size
2194  elif confidence == '0.05':
2195  markersize = 9 * scale_symbol_size
2196  elif confidence == '0.1':
2197  markersize = 6 * scale_symbol_size
2198  else:
2199  markersize = 15 * scale_symbol_size
2200  else:
2201  markersize = 5 * scale_symbol_size
2202
2203  if not no_dist_info:
2204  color = self.colormap(mdist)
2205  else:
2206  color = "gray"
2207
2208  ax.plot(
2209  [pos1],
2210  [pos2],
2211  'o',
2212  c=color,
2213  alpha=alphablend,
2214  markersize=markersize)
2215
2216
2217
2218  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2219
2220  [i.set_linewidth(2.0) for i in ax.spines.values()]
2221
2222  #plt.tight_layout()
2223
2224  if filename:
2225  plt.savefig(filename + ".pdf", dpi=300, transparent="False")
2226  else:
2227  plt.show()
2228
2229  def get_frequency_statistics(self, prot_list,
2230  prot_list2=None):
2231
2232  violated_histogram = {}
2233  satisfied_histogram = {}
2235
2237  (r1, c1, r2, c2, mdist) = xl
2238
2239  # here we filter by the protein
2240  if prot_list2 is None:
2241  if not c1 in prot_list:
2242  continue
2243  if not c2 in prot_list:
2244  continue
2245  else:
2246  if c1 in prot_list and c2 in prot_list2:
2247  pass
2248  elif c1 in prot_list2 and c2 in prot_list:
2249  pass
2250  else:
2251  continue
2252
2253  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2254
2255  if (r1, c1, r2, c2) not in unique_cross_links:
2256  if mdist > 35.0:
2257  if frequency not in violated_histogram:
2258  violated_histogram[frequency] = 1
2259  else:
2260  violated_histogram[frequency] += 1
2261  else:
2262  if frequency not in satisfied_histogram:
2263  satisfied_histogram[frequency] = 1
2264  else:
2265  satisfied_histogram[frequency] += 1
2268
2269  print("# satisfied")
2270
2272
2273  for i in satisfied_histogram:
2274  # if i in violated_histogram:
2275  # print i, satisfied_histogram[i]+violated_histogram[i]
2276  # else:
2277  if i in violated_histogram:
2278  print(i, violated_histogram[i]+satisfied_histogram[i])
2279  else:
2280  print(i, satisfied_histogram[i])
2282
2283  print("# violated")
2284
2285  for i in violated_histogram:
2286  print(i, violated_histogram[i])
2288
2290
2291
2292 # ------------
2294  prot_list2=None):
2295  tmp_matrix = []
2296  confidence_list = []
2298  (r1, c1, r2, c2, mdist, stdv, confidence,
2299  unique_identifier, descriptor) = xl
2300
2301  if prot_list2 is None:
2302  if not c1 in prot_list:
2303  continue
2304  if not c2 in prot_list:
2305  continue
2306  else:
2307  if c1 in prot_list and c2 in prot_list2:
2308  pass
2309  elif c1 in prot_list2 and c2 in prot_list:
2310  pass
2311  else:
2312  continue
2313
2314  if descriptor != "original":
2315  continue
2316
2317  confidence_list.append(confidence)
2318
2319  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2320  tmp_dist_binary = []
2321  for d in dists:
2322  if d < 35:
2323  tmp_dist_binary.append(1)
2324  else:
2325  tmp_dist_binary.append(0)
2326  tmp_matrix.append(tmp_dist_binary)
2327
2328  matrix = list(zip(*tmp_matrix))
2329
2330  satisfied_high_sum = 0
2331  satisfied_mid_sum = 0
2332  satisfied_low_sum = 0
2333  total_satisfied_sum = 0
2334  for k, m in enumerate(matrix):
2335  satisfied_high = 0
2336  total_high = 0
2337  satisfied_mid = 0
2338  total_mid = 0
2339  satisfied_low = 0
2340  total_low = 0
2341  total_satisfied = 0
2342  total = 0
2343  for n, b in enumerate(m):
2344  if confidence_list[n] == "0.01":
2345  total_high += 1
2346  if b == 1:
2347  satisfied_high += 1
2348  satisfied_high_sum += 1
2349  elif confidence_list[n] == "0.05":
2350  total_mid += 1
2351  if b == 1:
2352  satisfied_mid += 1
2353  satisfied_mid_sum += 1
2354  elif confidence_list[n] == "0.1":
2355  total_low += 1
2356  if b == 1:
2357  satisfied_low += 1
2358  satisfied_low_sum += 1
2359  if b == 1:
2360  total_satisfied += 1
2361  total_satisfied_sum += 1
2362  total += 1
2363  print(k, satisfied_high, total_high)
2364  print(k, satisfied_mid, total_mid)
2365  print(k, satisfied_low, total_low)
2366  print(k, total_satisfied, total)
2367  print(float(satisfied_high_sum) / len(matrix))
2368  print(float(satisfied_mid_sum) / len(matrix))
2369  print(float(satisfied_low_sum) / len(matrix))
2370 # ------------
2371
2373  prot_list2=None):
2374
2375  print(prot_list)
2376  print(prot_list2)
2377  satisfied_high = 0
2378  total_high = 0
2379  satisfied_mid = 0
2380  total_mid = 0
2381  satisfied_low = 0
2382  total_low = 0
2383  total = 0
2384  tmp_matrix = []
2385  satisfied_string = []
2387  (r1, c1, r2, c2, mdist, stdv, confidence,
2388  unique_identifier, descriptor) = xl
2389
2390  if prot_list2 is None:
2391  if not c1 in prot_list:
2392  continue
2393  if not c2 in prot_list:
2394  continue
2395  else:
2396  if c1 in prot_list and c2 in prot_list2:
2397  pass
2398  elif c1 in prot_list2 and c2 in prot_list:
2399  pass
2400  else:
2401  continue
2402
2403  if descriptor != "original":
2404  continue
2405
2406  total += 1
2407  if confidence == "0.01":
2408  total_high += 1
2409  if mdist <= 35:
2410  satisfied_high += 1
2411  if confidence == "0.05":
2412  total_mid += 1
2413  if mdist <= 35:
2414  satisfied_mid += 1
2415  if confidence == "0.1":
2416  total_low += 1
2417  if mdist <= 35:
2418  satisfied_low += 1
2419  if mdist <= 35:
2420  satisfied_string.append(1)
2421  else:
2422  satisfied_string.append(0)
2423
2424  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2425  tmp_dist_binary = []
2426  for d in dists:
2427  if d < 35:
2428  tmp_dist_binary.append(1)
2429  else:
2430  tmp_dist_binary.append(0)
2431  tmp_matrix.append(tmp_dist_binary)
2432
2433  print("unique satisfied_high/total_high", satisfied_high, "/", total_high)
2434  print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)
2435  print("unique satisfied_low/total_low", satisfied_low, "/", total_low)
2436  print("total", total)
2437
2438  matrix = list(zip(*tmp_matrix))
2439  satisfied_models = 0
2440  satstr = ""
2441  for b in satisfied_string:
2442  if b == 0:
2443  satstr += "-"
2444  if b == 1:
2445  satstr += "*"
2446
2447  for m in matrix:
2448  all_satisfied = True
2449  string = ""
2450  for n, b in enumerate(m):
2451  if b == 0:
2452  string += "0"
2453  if b == 1:
2454  string += "1"
2455  if b == 1 and satisfied_string[n] == 1:
2456  pass
2457  elif b == 1 and satisfied_string[n] == 0:
2458  pass
2459  elif b == 0 and satisfied_string[n] == 0:
2460  pass
2461  elif b == 0 and satisfied_string[n] == 1:
2462  all_satisfied = False
2463  if all_satisfied:
2464  satisfied_models += 1
2465  print(string)
2466  print(satstr, all_satisfied)
2467  print("models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2468
2470  prot_list2=None):
2471
2472  import matplotlib as mpl
2473  mpl.use('Agg')
2474  import matplotlib.pylab as pl
2475
2476  tmp_matrix = []
2478  (r1, c1, r2, c2) = kw
2480
2481  if prot_list2 is None:
2482  if not c1 in prot_list:
2483  continue
2484  if not c2 in prot_list:
2485  continue
2486  else:
2487  if c1 in prot_list and c2 in prot_list2:
2488  pass
2489  elif c1 in prot_list2 and c2 in prot_list:
2490  pass
2491  else:
2492  continue
2493  # append the sum of dists to order by that in the matrix plot
2494  dists.append(sum(dists))
2495  tmp_matrix.append(dists)
2496
2497  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2498
2499  # print len(tmp_matrix), len(tmp_matrix[0])-1
2500  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2501
2502  for i in range(len(tmp_matrix)):
2503  for k in range(len(tmp_matrix[i]) - 1):
2504  matrix[i][k] = tmp_matrix[i][k]
2505
2506  print(matrix)
2507
2508  fig = pl.figure()
2510
2511  cax = ax.imshow(matrix, interpolation='nearest')
2512  # ax.set_yticks(range(len(self.model_list_names)))
2513  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
2514  fig.colorbar(cax)
2515  pl.savefig(figurename, dpi=300)
2516  pl.show()
2517
2518  def plot_bars(
2519  self,
2520  filename,
2521  prots1,
2522  prots2,
2523  nxl_per_row=20,
2524  arrangement="inter",
2525  confidence_input="None"):
2526
2527  data = []
2529  (r1, c1, r2, c2, mdist, confidence) = xl
2530  if c1 in prots1 and c2 in prots2:
2531  if arrangement == "inter" and c1 == c2:
2532  continue
2533  if arrangement == "intra" and c1 != c2:
2534  continue
2535  if confidence_input == confidence:
2536  label = str(c1) + ":" + str(r1) + \
2537  "-" + str(c2) + ":" + str(r2)
2539  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2540  data.append((label, values, mdist, frequency))
2541
2542  sort_by_dist = sorted(data, key=lambda tup: tup[2])
2543  sort_by_dist = list(zip(*sort_by_dist))
2544  values = sort_by_dist[1]
2545  positions = list(range(len(values)))
2546  labels = sort_by_dist[0]
2547  frequencies = list(map(float, sort_by_dist[3]))
2548  frequencies = [f * 10.0 for f in frequencies]
2549
2550  nchunks = int(float(len(values)) / nxl_per_row)
2551  values_chunks = IMP.pmi1.tools.chunk_list_into_segments(values, nchunks)
2552  positions_chunks = IMP.pmi1.tools.chunk_list_into_segments(
2553  positions,
2554  nchunks)
2555  frequencies_chunks = IMP.pmi1.tools.chunk_list_into_segments(
2556  frequencies,
2557  nchunks)
2558  labels_chunks = IMP.pmi1.tools.chunk_list_into_segments(labels, nchunks)
2559
2560  for n, v in enumerate(values_chunks):
2561  p = positions_chunks[n]
2562  f = frequencies_chunks[n]
2563  l = labels_chunks[n]
2565  filename + "." + str(n), v, p, f,
2566  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
2567
2569  prot_list=None,
2570  prot_list2=None,
2571  confidence_classes=None,
2572  bins=40,
2573  color='#66CCCC',
2574  yplotrange=[0, 1],
2575  format="png",
2576  normalized=False):
2577  if prot_list is None:
2578  prot_list = list(self.prot_length_dict.keys())
2579
2580  distances = []
2582  (r1, c1, r2, c2, mdist, stdv, confidence,
2583  unique_identifier, descriptor) = xl
2584
2585  if not confidence_classes is None:
2586  if confidence not in confidence_classes:
2587  continue
2588
2589  if prot_list2 is None:
2590  if not c1 in prot_list:
2591  continue
2592  if not c2 in prot_list:
2593  continue
2594  else:
2595  if c1 in prot_list and c2 in prot_list2:
2596  pass
2597  elif c1 in prot_list2 and c2 in prot_list:
2598  pass
2599  else:
2600  continue
2601
2602  distances.append(mdist)
2603
2605  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
2606  bins=bins, color=color,
2607  format=format,
2608  reference_xline=35.0,
2609  yplotrange=yplotrange, normalized=normalized)
2610
2611  def scatter_plot_xl_features(self, filename,
2612  feature1=None,
2613  feature2=None,
2614  prot_list=None,
2615  prot_list2=None,
2616  yplotrange=None,
2617  reference_ylines=None,
2618  distance_color=True,
2619  format="png"):
2620  import matplotlib as mpl
2621  mpl.use('Agg')
2622  import matplotlib.pyplot as plt
2623  import matplotlib.cm as cm
2624
2625  fig = plt.figure(figsize=(10, 10))
2627
2629  (r1, c1, r2, c2, mdist, stdv, confidence,
2630  unique_identifier, arrangement) = xl
2631
2632  if prot_list2 is None:
2633  if not c1 in prot_list:
2634  continue
2635  if not c2 in prot_list:
2636  continue
2637  else:
2638  if c1 in prot_list and c2 in prot_list2:
2639  pass
2640  elif c1 in prot_list2 and c2 in prot_list:
2641  pass
2642  else:
2643  continue
2644
2645  xldb = self.external_csv_data[unique_identifier]
2646  xldb.update({"Distance": mdist})
2647  xldb.update({"Distance_stdv": stdv})
2648
2649  xvalue = float(xldb[feature1])
2650  yvalue = float(xldb[feature2])
2651
2652  if distance_color:
2653  color = self.colormap(mdist)
2654  else:
2655  color = "gray"
2656
2657  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
2658
2659  if not yplotrange is None:
2660  ax.set_ylim(yplotrange)
2661  if not reference_ylines is None:
2662  for rl in reference_ylines:
2663  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
2664
2665  if filename:
2666  plt.savefig(filename + "." + format, dpi=150, transparent="False")
2667
2668  plt.show()
Definition: Molecule.h:35
Simple 3D transformation class.
def get_moldict_coorddict
return data structure for the RMSD calculation
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
Performs alignment and RMSD calculation for two sets of coordinates.
A class to cluster structures.
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Definition: atom/distance.h:49
Legacy PMI1 module to represent, score, sample and analyze models.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
def do_cluster
Run K-means clustering.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
log
Definition: log.py:1
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: /tools.py:1064
Tools for clustering and cluster analysis.
Definition: pmi1/Analysis.py:1
static Molecule setup_particle(Model *m, ParticleIndex pi)
Definition: Molecule.h:37
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: /output.py:1579
def get_molecules
This function returns the parent molecule hierarchies of given objects.
Definition: /tools.py:1914
Miscellaneous utilities.
Definition: /tools.py:1
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
Add a frame to the densities.
A class to evaluate the precision of an ensemble.
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
Definition: DensityMap.h:674
def fill
Add coordinates for a single model.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
Basic utilities for handling cryo-electron microscopy 3D density maps.
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_density
Get the current density for some component name.
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
Read a structure into the ensemble and store (as coordinates).
def get_precision
Evaluate the precision of two named structure groups.
Read a list of RMFs, supports parallel.
3D rotation class.
Definition: Rotation3D.h:52
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Definition: DensityMap.h:505
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Definition: exception.h:48
VectorD< 3 > Vector3D
Definition: VectorD.h:408
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
A class for reading stat files (either rmf or ascii v1 and v2)
Definition: /output.py:771
Compute the RMSD (without alignment) taking into account the copy ambiguity.
Compute mean density maps from structures.
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def scatter_and_gather
Synchronize data over a parallel run.
Definition: /tools.py:1114
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_output
Returns output for IMP.pmi1.output.Output object.
Classes for writing output files and processing them.
Definition: /output.py:1
def set_reference_structure
Read in a structure used for reference computation.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
def plot_fields_box_plots
Plot time series as boxplots.
Definition: /output.py:1644
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Definition: atom/distance.h:85
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27