IMP Reference Guide 2.8.0
The Integrative Modeling Platform
pmi/Analysis.py
1 #!/usr/bin/env python
2 
3 """@namespace IMP.pmi.analysis
4  Tools for clustering and cluster analysis
5 """
6 from __future__ import print_function
7 import IMP
8 import IMP.algebra
9 import IMP.em
10 import IMP.pmi
11 import IMP.pmi.tools
12 import IMP.pmi.output
13 import IMP.rmf
14 import RMF
15 import IMP.pmi.analysis
16 from operator import itemgetter
17 from copy import deepcopy
18 from math import log,sqrt
19 import itertools
20 import numpy as np
21 
22 
23 class Alignment(object):
24  """Performs alignment and RMSD calculation for two sets of coordinates
25 
26  The class also takes into account non-equal stoichiometry of the proteins.
27  If this is the case, the protein names of proteins in multiple copies
28  should be specified in the following form:
29  nameA..1, nameA..2 (note two dots).
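 A minimal usage sketch (the coordinate dictionaries below are illustrative;
 each value is a list or array of xyz coordinates for one copy):

 template = {'protA..1': [(0., 0., 0.)], 'protA..2': [(10., 0., 0.)]}
 query    = {'protA..1': [(10., 0., 0.)], 'protA..2': [(0., 0., 0.)]}
 ali = IMP.pmi.analysis.Alignment(template, query)
 best_rmsd = ali.get_rmsd()   # 0.0 once the two copies are matched up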
30  """
31 
32  def __init__(self, template, query, weights=None):
33  """Constructor.
34  @param template {'p1':coords(L,3), 'p2':coords(L,3)}
35  @param query {'p1':coords(L,3), 'p2':coords(L,3)}
36  @param weights optional weights for each set of coordinates
37  """
38  self.query = query
39  self.template = template
40  self.weights=weights
41 
42  if len(self.query.keys()) != len(self.template.keys()):
43  raise ValueError('''the number of proteins
44  in template and query does not match!''')
45 
46  def permute(self):
47  # get unique protein names for each protein
48  # this is, unfortunately, expecting that names are all 'Molname..X'
49  # where X is different for each copy.
50  self.proteins = sorted(self.query.keys())
51  prots_uniq = [i.split('..')[0] for i in self.proteins]
52 
53  # for each unique name, store list of permutations
54  # e.g. for keys A..1,A..2 store P[A] = [[A..1,A..2],[A..2,A..1]]
55  # then store the product: [[[A1,A2],[B1,B2]],[[A1,A2],[B2,B1]],
56  # [[A2,A1],[B1,B2]],[[A2,A1],[B2,B1]]]
57  P = {}
58  for p in prots_uniq:
59  ncopies = prots_uniq.count(p)  # number of copies of this protein
60  copies = [i for i in self.proteins if i.split('..')[0] == p]
61  prmts = list(itertools.permutations(copies, len(copies)))
62  P[p] = prmts
63  self.P = P
64  self.Product = list(itertools.product(*P.values()))
65 
66  def get_rmsd(self):
67 
68  self.permute()
69 
70  template_xyz = []
71  weights = []
72  torder = sum([list(i) for i in self.Product[0]], [])
73  for t in torder:
74  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
75  if self.weights is not None:
76  weights += [i for i in self.weights[t]]
77  #template_xyz = np.array(template_xyz)
78 
79  self.rmsd = 10000000000.
80  for comb in self.Product:
81 
82  order = sum([list(i) for i in comb], [])
83  query_xyz = []
84  for p in order:
85  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
86  #query_xyz = np.array(query_xyz)
87  #if len(template_xyz) != len(query_xyz):
88  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
89  # in template and query does not match!'''
90  # exit()
91 
92  if self.weights is not None:
93  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
94  else:
95  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
96  #dist = sqrt(
97  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
98  if dist < self.rmsd:
99  self.rmsd = dist
100  return self.rmsd
101 
102  def align(self):
103  from scipy.spatial.distance import cdist
104 
105  self.permute()
106 
107  # create flat coordinate list from template in standard order
108  # then loop through the permutations and try to align and get RMSD
109  # should allow you to try all mappings within each protein
110  template_xyz = []
111  torder = sum([list(i) for i in self.Product[0]], [])
112  for t in torder:
113  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
114  # template_xyz = np.array(template_xyz)
115  self.rmsd, Transformation = 10000000000., ''
116 
117  # then for each permutation, get flat list of coords and get RMSD
118  for comb in self.Product:
119  order = sum([list(i) for i in comb], [])
120  query_xyz = []
121  for p in order:
122  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
123  #query_xyz = np.array(query_xyz)
124 
125  if len(template_xyz) != len(query_xyz):
126  raise ValueError('''the number of coordinates
127  in template and query does not match!''')
128 
129  transformation = IMP.algebra.get_transformation_aligning_first_to_second(
130  query_xyz,
131  template_xyz)
132  query_xyz_tr = [transformation.get_transformed(n)
133  for n in query_xyz]
134 
135  dist = sqrt(
136  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
137  if dist < self.rmsd:
138  self.rmsd = dist
139  Transformation = transformation
140 
141  # return the transformation
142  return (self.rmsd, Transformation)
143 
144 
145 # TEST for the alignment ###
146 """
147 Proteins = {'a..1':np.array([np.array([-1., 1., 0.])]),
148  'a..2':np.array([np.array([ 1., 1., 0.])]),
149  'a..3':np.array([np.array([-2., 1., 0.])]),
150  'b':np.array([np.array([ 0.,-1., 0.])]),
151  'c..1':np.array([np.array([-1.,-1., 0.])]),
152  'c..2':np.array([np.array([ 1.,-1., 0.])]),
153  'd':np.array([np.array([ 0., 0., 0.])]),
154  'e':np.array([np.array([ 0., 1., 0.])])}
155 
156 Ali = Alignment(Proteins, Proteins)
157 Ali.permute()
158 if Ali.get_rmsd() == 0.0: print('successful test!')
159 else: print('ERROR!'); exit()
160 """
161 
162 
163 # ----------------------------------
164 class Violations(object):
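 """Count the number of violated restraints in a set of models.

 The input file lists, one per line, a restraint name and the threshold
 above which that restraint is considered violated. Call
 get_number_violated_restraints() once per model with a
 {restraint_name: value} dictionary; per-restraint violation counts are
 accumulated in self.violation_counts."""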
165 
166  def __init__(self, filename):
167 
168  self.violation_thresholds = {}
169  self.violation_counts = {}
170 
171  data = open(filename)
172  D = data.readlines()
173  data.close()
174 
175  for d in D:
176  d = d.strip().split()
177  self.violation_thresholds[d[0]] = float(d[1])
178 
179  def get_number_violated_restraints(self, rsts_dict):
180  num_violated = 0
181  for rst in self.violation_thresholds:
182  if rst not in rsts_dict:
183  continue # print rst;
184  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
185  num_violated += 1
186  if rst not in self.violation_counts:
187  self.violation_counts[rst] = 1
188  else:
189  self.violation_counts[rst] += 1
190  return num_violated
191 
192 
193 # ----------------------------------
194 class Clustering(object):
195  """A class to cluster structures.
196  Uses scipy's cdist function to compute distance matrices
197  and sklearn's kmeans clustering module.
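 A minimal usage sketch (frame labels and coordinate dictionaries are
 illustrative; each value uses the same {protein_name: coordinates} layout
 expected by Alignment):

 clus = IMP.pmi.analysis.Clustering()
 clus.fill("frame_0", coords_frame_0)
 clus.fill("frame_1", coords_frame_1)
 clus.dist_matrix()
 clus.do_cluster(number_of_clusters=2)
 print(clus.get_cluster_labels())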
198  """
199  def __init__(self,rmsd_weights=None):
200  """Constructor.
201  @param rmsd_weights Flat list of weights for each particle
202  (if they're coarse)
203  """
204  try:
205  from mpi4py import MPI
206  self.comm = MPI.COMM_WORLD
207  self.rank = self.comm.Get_rank()
208  self.number_of_processes = self.comm.size
209  except ImportError:
210  self.number_of_processes = 1
211  self.rank = 0
212  self.all_coords = {}
213  self.structure_cluster_ids = None
214  self.tmpl_coords = None
215  self.rmsd_weights=rmsd_weights
216 
217  def set_template(self, part_coords):
218 
219  self.tmpl_coords = part_coords
220 
221  def fill(self, frame, Coords):
222  """Add coordinates for a single model."""
223 
224  self.all_coords[frame] = Coords
225 
226  def dist_matrix(self):
227 
228  self.model_list_names = list(self.all_coords.keys())
229  self.model_indexes = list(range(len(self.model_list_names)))
230  self.model_indexes_dict = dict(
231  list(zip(self.model_list_names, self.model_indexes)))
232  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
233 
234  my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235  model_indexes_unique_pairs,
236  self.number_of_processes)[self.rank]
237 
238  print("process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
239 
240  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
241  self.tmpl_coords,
242  my_model_indexes_unique_pairs)
243 
244  if self.number_of_processes > 1:
245  raw_distance_dict = IMP.pmi.tools.scatter_and_gather(
246  raw_distance_dict)
247  pickable_transformations = self.get_pickable_transformation_distance_dict(
248  )
249  pickable_transformations = IMP.pmi.tools.scatter_and_gather(
250  pickable_transformations)
251  self.set_transformation_distance_dict_from_pickable(
252  pickable_transformations)
253 
254  self.raw_distance_matrix = np.zeros(
255  (len(self.model_list_names), len(self.model_list_names)))
256  for item in raw_distance_dict:
257  (f1, f2) = item
258  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
260 
261  def get_dist_matrix(self):
262  return self.raw_distance_matrix
263 
264  def do_cluster(self, number_of_clusters,seed=None):
265  """Run K-means clustering
266  @param number_of_clusters Number of clusters (the K in K-means)
267  @param seed the random seed
268  """
269  from sklearn.cluster import KMeans
270  if seed is not None:
271  np.random.seed(seed)
272  try:
273  # check whether we have the right version of sklearn
274  kmeans = KMeans(n_clusters=number_of_clusters)
275  except TypeError:
276  # sklearn older than 0.12
277  kmeans = KMeans(k=number_of_clusters)
278  kmeans.fit_predict(self.raw_distance_matrix)
279 
280  self.structure_cluster_ids = kmeans.labels_
281 
282  def get_pickable_transformation_distance_dict(self):
283  pickable_transformations = {}
284  for label in self.transformation_distance_dict:
285  tr = self.transformation_distance_dict[label]
286  trans = tuple(tr.get_translation())
287  rot = tuple(tr.get_rotation().get_quaternion())
288  pickable_transformations[label] = (rot, trans)
289  return pickable_transformations
290 
291  def set_transformation_distance_dict_from_pickable(
292  self,
293  pickable_transformations):
294  self.transformation_distance_dict = {}
295  for label in pickable_transformations:
296  tr = pickable_transformations[label]
297  trans = IMP.algebra.Vector3D(tr[1])
298  rot = IMP.algebra.Rotation3D(tr[0])
299  self.transformation_distance_dict[
300  label] = IMP.algebra.Transformation3D(rot, trans)
301 
302  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
303  import pickle
304  outf = open(file_name + ".data", 'wb')
305 
306  # to pickle the transformation dictionary
307  # you have to save the arrays corresponding to
308  # the transformations
309 
310  pickable_transformations = self.get_pickable_transformation_distance_dict(
311  )
312  pickle.dump(
313  (self.structure_cluster_ids,
314  self.model_list_names,
315  pickable_transformations),
316  outf)
317 
318  np.save(file_name + ".npy", self.raw_distance_matrix)
319 
320  def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
321  import pickle
322 
323  inputf = open(file_name + ".data", 'rb')
324  (self.structure_cluster_ids, self.model_list_names,
325  pickable_transformations) = pickle.load(inputf)
326  inputf.close()
327 
328  self.raw_distance_matrix = np.load(file_name + ".npy")
329 
330  self.set_transformation_distance_dict_from_pickable(
331  pickable_transformations)
332  self.model_indexes = list(range(len(self.model_list_names)))
333  self.model_indexes_dict = dict(
334  list(zip(self.model_list_names, self.model_indexes)))
335 
336  def plot_matrix(self, figurename="clustermatrix.pdf"):
337  import matplotlib as mpl
338  mpl.use('Agg')
339  import matplotlib.pylab as pl
340  from scipy.cluster import hierarchy as hrc
341 
342  fig = pl.figure(figsize=(10,8))
343  ax = fig.add_subplot(212)
344  dendrogram = hrc.dendrogram(
345  hrc.linkage(self.raw_distance_matrix),
346  color_threshold=7,
347  no_labels=True)
348  leaves_order = dendrogram['leaves']
349  ax.set_xlabel('Model')
350  ax.set_ylabel('RMSD [Angstroms]')
351 
352  ax2 = fig.add_subplot(221)
353  cax = ax2.imshow(
354  self.raw_distance_matrix[leaves_order,
355  :][:,
356  leaves_order],
357  interpolation='nearest')
358  cb = fig.colorbar(cax)
359  cb.set_label('RMSD [Angstroms]')
360  ax2.set_xlabel('Model')
361  ax2.set_ylabel('Model')
362 
363  pl.savefig(figurename, dpi=300)
364  pl.close(fig)
365 
366  def get_model_index_from_name(self, name):
367  return self.model_indexes_dict[name]
368 
369  def get_cluster_labels(self):
370  # return the list of unique cluster labels
371  return list(set(self.structure_cluster_ids))
372 
373  def get_number_of_clusters(self):
374  return len(self.get_cluster_labels())
375 
376  def get_cluster_label_indexes(self, label):
377  return (
378  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
379  )
380 
381  def get_cluster_label_names(self, label):
382  return (
383  [self.model_list_names[i]
384  for i in self.get_cluster_label_indexes(label)]
385  )
386 
387  def get_cluster_label_average_rmsd(self, label):
388 
389  indexes = self.get_cluster_label_indexes(label)
390 
391  if len(indexes) > 1:
392  sub_distance_matrix = self.raw_distance_matrix[
393  indexes, :][:, indexes]
394  average_rmsd = np.sum(sub_distance_matrix) / \
395  (len(sub_distance_matrix)
396  ** 2 - len(sub_distance_matrix))
397  else:
398  average_rmsd = 0.0
399  return average_rmsd
400 
401  def get_cluster_label_size(self, label):
402  return len(self.get_cluster_label_indexes(label))
403 
404  def get_transformation_to_first_member(
405  self,
406  cluster_label,
407  structure_index):
408  reference = self.get_cluster_label_indexes(cluster_label)[0]
409  return self.transformation_distance_dict[(reference, structure_index)]
410 
411  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
412 
413  model_list_names = list(all_coords.keys())
414  rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415  raw_distance_dict = {}
416  transformation_distance_dict = {}
417  if template_coords is None:
418  do_alignment = False
419  else:
420  do_alignment = True
421  alignment_template_protein_names = list(template_coords.keys())
422 
423  for (f1, f2) in list_of_pairs:
424 
425  if not do_alignment:
426  # here we only get the rmsd,
427  # we need that for instance when you want to cluster conformations
428  # globally, eg the EM map is a reference
429  transformation = IMP.algebra.get_identity_transformation_3d()
430 
431  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432  for pr in rmsd_protein_names])
433  coords_f2 = {}
434  for pr in rmsd_protein_names:
435  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
436 
437  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
438  rmsd = Ali.get_rmsd()
439 
440  elif do_alignment:
441  # here we actually align the conformations first
442  # and then calculate the rmsd. We need that when the
443  # protein(s) is the reference
444  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445  for pr in alignment_template_protein_names])
446  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447  for pr in alignment_template_protein_names])
448 
449  Ali = Alignment(coords_f1, coords_f2)
450  template_rmsd, transformation = Ali.align()
451 
452  # here we calculate the rmsd
453  # we will align two models based on the number of subunits provided
454  # and transform coordinates of model 2 to model 1
455  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456  for pr in rmsd_protein_names])
457  coords_f2 = {}
458  for pr in rmsd_protein_names:
459  coords_f2[pr] = [transformation.get_transformed(
460  i) for i in all_coords[model_list_names[f2]][pr]]
461 
462  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
463  rmsd = Ali.get_rmsd()
464 
465  raw_distance_dict[(f1, f2)] = rmsd
466  raw_distance_dict[(f2, f1)] = rmsd
467  transformation_distance_dict[(f1, f2)] = transformation
468  transformation_distance_dict[(f2, f1)] = transformation
469 
470  return raw_distance_dict, transformation_distance_dict
471 
472 
473 class RMSD(object):
474  """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475  To be used with pmi2 hierarchies. Can be used for instance as follows:
476 
477  rmsd=IMP.pmi.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478  output_objects.append(rmsd)
479 
480  before shuffling the coordinates
481  """
482 
483  def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
484  """
485  @param hier0 first input hierarchy
486  @param hier1 second input hierarchy
487  @param molnames the names of the molecules used for the RMSD
488  @param dynamic0 if True, store the XYZ decorators so the coordinates of hier0 can be
489  updated; if False, the coordinates are static (stored as Vector3Ds) and never updated
490  @param dynamic1 same as dynamic0, but for hier1
491  @param metric the function used to compute the distance (default: IMP.algebra.get_rmsd)
492  """
493  self.moldict0,self.molcoords0,self.mol_XYZs0=self.get_moldict_coorddict(hier0,molnames)
494  self.moldict1,self.molcoords1,self.mol_XYZs1=self.get_moldict_coorddict(hier1,molnames)
495  self.dynamic0=dynamic0
496  self.dynamic1=dynamic1
497  self.metric=metric
498  self.label=label
499 
500  def get_moldict_coorddict(self,hier,molnames):
501  """return data structure for the RMSD calculation"""
502  moldict={}
503  mol_coords={}
504  mol_XYZs={}
505  for mol in IMP.pmi.tools.get_molecules(hier):
506  name=mol.get_name()
507  if name not in molnames:
508  continue
509  parts=True
510  mol_coords[mol]=[]
511  mol_XYZs[mol]=[]
512  i=1
513  while parts:
514  sel=IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515  parts=sel.get_selected_particles()
516  if parts:
517  mol_coords[mol].append(IMP.core.XYZ(parts[0]).get_coordinates())
518  mol_XYZs[mol].append(IMP.core.XYZ(parts[0]))
519  i=i+1
520  if name in moldict:
521  moldict[name].append(mol)
522  else:
523  moldict[name]=[mol]
524  return moldict, mol_coords, mol_XYZs
525 
526  def get_rmsd_and_assigments(self):
527  best_orders=[]
528  total_rmsd=0
529  total_N=0
530  best_assignments=[]
531  rmsd_dict={}
532  for molname, ref_mols in self.moldict1.items():
533  ref_coords=[]
534  for ref_mol in ref_mols:
535  if self.dynamic1:
536  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
537  else:
538  coord1=self.molcoords1[ref_mol]
539  ref_coords+=coord1
540 
541  rmsd=[]
542  rmf_mols_list=[]
543  for rmf_mols in itertools.permutations(self.moldict0[molname]):
544  rmf_coords=[]
545  for rmf_mol in rmf_mols:
546  if self.dynamic0:
547  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
548  else:
549  coord0=self.molcoords0[rmf_mol]
550  rmf_coords+=coord0
551 
552  rmsd.append(IMP.algebra.get_rmsd(ref_coords, rmf_coords))
553  rmf_mols_list.append(rmf_mols)
554  m=min(rmsd)
555  rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
556 
557 
558  for n, (ref_mol,rmf_mol) in enumerate(zip(ref_mols,rmf_mols_best_order)):
559  best_assignments.append((rmf_mol,ref_mol))
560  if self.dynamic0:
561  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
562  else:
563  coord0=self.molcoords0[rmf_mol]
564 
565  if self.dynamic1:
566  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
567  else:
568  coord1=self.molcoords1[ref_mol]
569  rmsd_pair=self.metric(coord1, coord0)
570  N=len(self.molcoords1[ref_mol])
571  total_N+=N
572  total_rmsd+=rmsd_pair*rmsd_pair*N
573  rmsd_dict[ref_mol]=rmsd_pair
574  total_rmsd = sqrt(total_rmsd/total_N)
575  return total_rmsd,best_assignments
576 
577  def get_output(self):
578  """Returns output for IMP.pmi.output.Output object"""
579  total_rmsd,best_assignments=self.get_rmsd_and_assigments()
580 
581  assignments_out=[]
582  for rmf_mol,ref_mol in best_assignments:
583  ref_name=ref_mol.get_name()
584  ref_copy=IMP.atom.Copy(ref_mol).get_copy_index()
585  rmf_name=rmf_mol.get_name()
586  rmf_copy=IMP.atom.Copy(rmf_mol).get_copy_index()
587  assignments_out.append(rmf_name+"."+str(rmf_copy)+"->"+ref_name+"."+str(ref_copy))
588  return {"RMSD_"+self.label:str(total_rmsd),"RMSD_assignments_"+self.label:str(assignments_out)}
589 
590 
591 
592 
593 class Precision(object):
594  """A class to evaluate the precision of an ensemble.
595 
596  Also can evaluate the cross-precision of multiple ensembles.
597  Supports MPI for coordinate reading.
598  Recommended procedure:
599  -# initialize object and pass the selection for evaluating precision
600  -# call add_structures() to read in the data (specify group name)
601  -# call get_precision() to evaluate inter/intra precision
602  -# call get_rmsf() to evaluate within-group fluctuations
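 A minimal usage sketch (file names and the selection are illustrative;
 model is the IMP Model):

 pr = IMP.pmi.analysis.Precision(model, resolution=1,
 selection_dictionary={"Sub1": ["Sub1"]})
 pr.add_structures([("run1/0.rmf3", 0), ("run1/1.rmf3", 0)], "cluster.0")
 pr.get_precision("cluster.0", "cluster.0", outfile="precision.dat")
 pr.get_rmsf("cluster.0", outdir="./")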
603  """
604  def __init__(self,model,
605  resolution=1,
606  selection_dictionary={}):
607  """Constructor.
608  @param model The IMP Model
609  @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610  part of the hier name)
611  @param selection_dictionary Dictionary where keys are names for
612  selections and values are selection tuples for scoring
613  precision. "All" is automatically made as well
614  """
615  try:
616  from mpi4py import MPI
617  self.comm = MPI.COMM_WORLD
618  self.rank = self.comm.Get_rank()
619  self.number_of_processes = self.comm.size
620  except ImportError:
621  self.number_of_processes = 1
622  self.rank = 0
623 
624  self.styles = ['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q',
625  'pairwise_drms_k','pairwise_rmsd','drmsd_from_center']
626  self.style = 'pairwise_drmsd_k'
627  self.structures_dictionary = {}
628  self.reference_structures_dictionary = {}
629  self.prots = []
630  self.protein_names = None
631  self.len_particles_resolution_one = None
632  self.model = model
633  self.rmf_names_frames = {}
634  self.reference_rmf_names_frames = None
635  self.reference_structure = None
636  self.reference_prot = None
637  self.selection_dictionary = selection_dictionary
638  self.threshold = 40.0
639  self.residue_particle_index_map = None
640  self.prots = None
641  if resolution in [1,10]:
642  self.resolution = resolution
643  else:
644  raise KeyError("Currently only allow resolution 1 or 10")
645 
646  def _get_structure(self,rmf_frame_index,rmf_name):
647  """Read an RMF file and return the particles"""
648  rh = RMF.open_rmf_file_read_only(rmf_name)
649  if not self.prots:
650  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
651  self.prots = IMP.rmf.create_hierarchies(rh, self.model)
652  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
653  else:
654  print("linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
655  IMP.rmf.link_hierarchies(rh, self.prots)
656  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
657  del rh
658 
659  if self.resolution==1:
660  particle_dict = get_particles_at_resolution_one(self.prots[0])
661  elif self.resolution==10:
662  particle_dict = get_particles_at_resolution_ten(self.prots[0])
663 
664  protein_names = list(particle_dict.keys())
665  particles_resolution_one = []
666  for k in particle_dict:
667  particles_resolution_one += (particle_dict[k])
668 
669  if self.protein_names==None:
670  self.protein_names = protein_names
671  else:
672  if self.protein_names!=protein_names:
673  print("Error: the protein names of the new coordinate set are not compatible with the previous one")
674 
675  if self.len_particles_resolution_one==None:
676  self.len_particles_resolution_one = len(particles_resolution_one)
677  else:
678  if self.len_particles_resolution_one!=len(particles_resolution_one):
679  raise ValueError("the new coordinate set is not compatible with the previous one")
680 
681  return particles_resolution_one
682 
683  def add_structure(self,
684  rmf_name,
685  rmf_frame_index,
686  structure_set_name,
687  setup_index_map=False):
688  """ Read a structure into the ensemble and store (as coordinates).
689  @param rmf_name The name of the RMF file
690  @param rmf_frame_index The frame to read
691  @param structure_set_name Name for the set that includes this structure
692  (e.g. "cluster 1")
693  @param setup_index_map if requested, set up a dictionary to help
694  find residue indexes
695  """
696 
697  # decide where to put this structure
698  if structure_set_name in self.structures_dictionary:
699  cdict = self.structures_dictionary[structure_set_name]
700  rmflist = self.rmf_names_frames[structure_set_name]
701  else:
702  self.structures_dictionary[structure_set_name]={}
703  self.rmf_names_frames[structure_set_name]=[]
704  cdict = self.structures_dictionary[structure_set_name]
705  rmflist = self.rmf_names_frames[structure_set_name]
706 
707  # read the particles
708  try:
709  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
710  except ValueError:
711  print("something went wrong reading the rmf file")
712  return 0
713  self.selection_dictionary.update({"All":self.protein_names})
714 
715  for selection_name in self.selection_dictionary:
716  selection_tuple = self.selection_dictionary[selection_name]
717  coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718  if selection_name not in cdict:
719  cdict[selection_name] = [coords]
720  else:
721  cdict[selection_name].append(coords)
722 
723  rmflist.append((rmf_name,rmf_frame_index))
724 
725  # if requested, set up a dictionary to help find residue indexes
726  if setup_index_map:
727  self.residue_particle_index_map = {}
728  for prot_name in self.protein_names:
729  self.residue_particle_index_map[prot_name] = \
730  self._get_residue_particle_index_map(
731  prot_name,
732  particles_resolution_one,self.prots[0])
733 
734  def add_structures(self,
735  rmf_name_frame_tuples,
736  structure_set_name):
737  """Read a list of RMF files; supports parallel reading via MPI.
738  @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739  @param structure_set_name Name this set of structures (e.g. "cluster.1")
740  """
741 
742  # split up the requested list to read in parallel
743  my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
744  rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
746  rmf_name=tup[0]
747  rmf_frame_index=tup[1]
748  # the first frame stores the map between residues and particles
749  if self.residue_particle_index_map is None:
750  setup_index_map = True
751  else:
752  setup_index_map = False
753  self.add_structure(rmf_name,
754  rmf_frame_index,
755  structure_set_name,
756  setup_index_map)
757 
758  # synchronize the structures
759  if self.number_of_processes > 1:
760  self.rmf_names_frames=IMP.pmi.tools.scatter_and_gather(self.rmf_names_frames)
761  if self.rank != 0:
762  self.comm.send(self.structures_dictionary, dest=0, tag=11)
763  elif self.rank == 0:
764  for i in range(1, self.number_of_processes):
765  data_tmp = self.comm.recv(source=i, tag=11)
766  for key in self.structures_dictionary:
767  self.structures_dictionary[key].update(data_tmp[key])
768  for i in range(1, self.number_of_processes):
769  self.comm.send(self.structures_dictionary, dest=i, tag=11)
770  if self.rank != 0:
771  self.structures_dictionary = self.comm.recv(source=0, tag=11)
772 
773  def _get_residue_particle_index_map(self,prot_name,structure,hier):
774  # Creates map from all particles to residue numbers
775  residue_particle_index_map = []
776  if IMP.pmi.get_is_canonical(hier):
777  s = IMP.atom.Selection(hier,molecules=[prot_name],
778  resolution=1)
779  else:
780  s = IMP.atom.Selection(hier,molecules=[prot_name])
781  all_selected_particles = s.get_selected_particles()
782  intersection = list(set(all_selected_particles) & set(structure))
783  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
784  for p in sorted_intersection:
785  residue_particle_index_map.append(IMP.pmi.tools.get_residue_indexes(p))
786  return residue_particle_index_map
787 
788 
789  def _select_coordinates(self,tuple_selections,structure,prot):
790  selected_coordinates=[]
791  for t in tuple_selections:
792  if type(t)==tuple and len(t)==3:
793  if IMP.pmi.get_is_canonical(prot):
794  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1),
795  resolution=1)
796  else:
797  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1))
798  all_selected_particles = s.get_selected_particles()
799  intersection = list(set(all_selected_particles) & set(structure))
800  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
801  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
802  selected_coordinates += cc
803  elif type(t)==str:
804  if IMP.pmi.get_is_canonical(prot):
805  s = IMP.atom.Selection(prot,molecules=[t],resolution=1)
806  else:
807  s = IMP.atom.Selection(prot,molecules=[t])
808  all_selected_particles = s.get_selected_particles()
809  intersection = list(set(all_selected_particles) & set(structure))
810  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
811  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
812  selected_coordinates += cc
813  else:
814  raise ValueError("Selection error")
815  return selected_coordinates
816 
817  def set_threshold(self,threshold):
818  self.threshold = threshold
819 
820  def _get_distance(self,
821  structure_set_name1,
822  structure_set_name2,
823  selection_name,
824  index1,
825  index2):
826  """ Compute distance between structures with various metrics """
827  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
828  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
829  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
830  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
831 
832  if self.style=='pairwise_drmsd_k':
833  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
834  if self.style=='pairwise_drms_k':
835  distance=IMP.atom.get_drms(coordinates1,coordinates2)
836  if self.style=='pairwise_drmsd_Q':
837  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
838 
839  if self.style=='pairwise_rmsd':
840  distance=IMP.algebra.get_rmsd(coordinates1,coordinates2)
841  return distance
842 
843  def _get_particle_distances(self,structure_set_name1,structure_set_name2,
844  selection_name,index1,index2):
845  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
846  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
847 
848  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
849  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
850 
851  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
852 
853  return distances
854 
855  def get_precision(self,
856  structure_set_name1,
857  structure_set_name2,
858  outfile=None,
859  skip=1,
860  selection_keywords=None):
861  """ Evaluate the precision of two named structure groups. Supports MPI.
862  When the structure_set_name1 is different from the structure_set_name2,
863  this evaluates the cross-precision (average pairwise distances).
864  @param outfile Name of the precision output file
865  @param structure_set_name1 string name of the first structure set
866  @param structure_set_name2 string name of the second structure set
867  @param skip analyze only every (skip)th structure for the distance matrix calculation
868  @param selection_keywords Specify the selection name you want to calculate on.
869  By default this is computed for everything you provided in the constructor,
870  plus all the subunits together.
871  """
872  if selection_keywords is None:
873  sel_keys = list(self.selection_dictionary.keys())
874  else:
875  for k in selection_keywords:
876  if k not in self.selection_dictionary:
877  raise KeyError("you are trying to find named selection " \
878  + k + " which was not requested in the constructor")
879  sel_keys = selection_keywords
880 
881  if outfile is not None:
882  of = open(outfile,"w")
883  centroid_index = 0
884  for selection_name in sel_keys:
885  number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
886  number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
887  distances={}
888  structure_pointers_1 = list(range(0,number_of_structures_1,skip))
889  structure_pointers_2 = list(range(0,number_of_structures_2,skip))
890  pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
891  if len(pair_combination_list)==0:
892  raise ValueError("no structure selected. Check the skip parameter.")
893 
894  # compute pairwise distances in parallel
895  my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
896  pair_combination_list,self.number_of_processes)[self.rank]
897  my_length = len(my_pair_combination_list)
898  for n,pair in enumerate(my_pair_combination_list):
899  progression = int(float(n)/my_length*100.0)
900  distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
901  selection_name,pair[0],pair[1])
902  if self.number_of_processes > 1:
903  distances = IMP.pmi.tools.scatter_and_gather(distances)
904 
905  # Finally compute distance to centroid
906  if self.rank == 0:
907  if structure_set_name1==structure_set_name2:
908  structure_pointers = structure_pointers_1
909  number_of_structures = number_of_structures_1
910 
911  # calculate the distance from the first centroid
912  # and determine the centroid
913  distance = 0.0
914  distances_to_structure = {}
915  distances_to_structure_normalization = {}
916 
917  for n in structure_pointers:
918  distances_to_structure[n] = 0.0
919  distances_to_structure_normalization[n]=0
920 
921  for k in distances:
922  distance += distances[k]
923  distances_to_structure[k[0]] += distances[k]
924  distances_to_structure[k[1]] += distances[k]
925  distances_to_structure_normalization[k[0]] += 1
926  distances_to_structure_normalization[k[1]] += 1
927 
928  for n in structure_pointers:
929  distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
930 
931  min_distance = min([distances_to_structure[n] for n in distances_to_structure])
932  centroid_index = [k for k, v in distances_to_structure.items() if v == min_distance][0]
933  centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
934 
935  centroid_distance = 0.0
936  distance_list = []
937  for n in range(number_of_structures):
938  dist = self._get_distance(structure_set_name1,structure_set_name1,
939  selection_name,centroid_index,n)
940  centroid_distance += dist
941  distance_list.append(dist)
942 
943  #pairwise_distance=distance/len(distances.keys())
944  centroid_distance /= number_of_structures
945  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
946  if outfile is not None:
947  of.write(str(selection_name)+" "+structure_set_name1+
948  " average centroid distance "+str(centroid_distance)+"\n")
949  of.write(str(selection_name)+" "+structure_set_name1+
950  " centroid index "+str(centroid_index)+"\n")
951  of.write(str(selection_name)+" "+structure_set_name1+
952  " centroid rmf name "+str(centroid_rmf_name)+"\n")
953  of.write(str(selection_name)+" "+structure_set_name1+
954  " median centroid distance "+str(np.median(distance_list))+"\n")
955 
956  average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
957  if outfile is not None:
958  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
959  " average pairwise distance "+str(average_pairwise_distances)+"\n")
960  if outfile is not None:
961  of.close()
962  return centroid_index
963 
964  def get_rmsf(self,
965  structure_set_name,
966  outdir="./",
967  skip=1,
968  set_plot_yaxis_range=None):
969  """ Calculate the residue root mean square fluctuations (RMSF).
970  Automatically writes a data file and a PDF for each component.
971  @param structure_set_name Which structure set to calculate RMSF for
972  @param outdir Where to write the files
973  @param skip Skip this number of structures
974  @param set_plot_yaxis_range Optionally set the y-axis range of the plot
975  """
976  # get the centroid structure for the whole complex
977  centroid_index = self.get_precision(structure_set_name,
978  structure_set_name,
979  outfile=None,
980  skip=skip)
981  if self.rank==0:
982  for sel_name in self.protein_names:
983  self.selection_dictionary.update({sel_name:[sel_name]})
984  try:
985  number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
986  except KeyError:
987  # that protein was not included in the selection
988  continue
989  rpim = self.residue_particle_index_map[sel_name]
990  outfile = outdir+"/rmsf."+sel_name+".dat"
991  of = open(outfile,"w")
992  residue_distances = {}
993  residue_nblock = {}
994  for index in range(number_of_structures):
995  distances = self._get_particle_distances(structure_set_name,
996  structure_set_name,
997  sel_name,
998  centroid_index,index)
999  for nblock,block in enumerate(rpim):
1000  for residue_number in block:
1001  residue_nblock[residue_number] = nblock
1002  if residue_number not in residue_distances:
1003  residue_distances[residue_number] = [distances[nblock]]
1004  else:
1005  residue_distances[residue_number].append(distances[nblock])
1006 
1007  residues = []
1008  rmsfs = []
1009  for rn in residue_distances:
1010  residues.append(rn)
1011  rmsf = np.std(residue_distances[rn])
1012  rmsfs.append(rmsf)
1013  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
1014 
1015  IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
1016  out_fn=outdir+"/rmsf."+sel_name,display=False,
1017  set_plot_yaxis_range=set_plot_yaxis_range,
1018  xlabel='Residue Number',ylabel='Standard error')
1019  of.close()
1020 
1021 
1022  def set_reference_structure(self,rmf_name,rmf_frame_index):
1023  """Read in a structure used for reference computation.
1024  Needed before calling get_average_distance_wrt_reference_structure()
1025  @param rmf_name The RMF file to read the reference
1026  @param rmf_frame_index The index in that file
1027  """
1028  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1029  self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1030 
1031  for selection_name in self.selection_dictionary:
1032  selection_tuple = self.selection_dictionary[selection_name]
1033  coords = self._select_coordinates(selection_tuple,
1034  particles_resolution_one,self.prots[0])
1035  self.reference_structures_dictionary[selection_name] = coords
1036 
1037  def get_rmsd_wrt_reference_structure_with_alignment(self,structure_set_name,alignment_selection_key):
1038  """Align each structure to the reference, then calculate the RMSD.
1039  @param structure_set_name: the name of the structure set
1040  @param alignment_selection_key: the key in self.selection_dictionary containing the selection tuples used for the alignment
1041  Prints, for each selection, the average, median and minimum RMSD over the structure set.
1042  """
1043  if self.reference_structures_dictionary=={}:
1044  print("Cannot compute until you set a reference structure")
1045  return
1046 
1047  align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1048  align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1049  transformations = []
1050  for c in align_coordinates:
1051  Ali = IMP.pmi.analysis.Alignment({"All":align_reference_coordinates}, {"All":c})
1052  transformation = Ali.align()[1]
1053  transformations.append(transformation)
1054  for selection_name in self.selection_dictionary:
1055  reference_coordinates = self.reference_structures_dictionary[selection_name]
1056  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1057  distances = []
1058  for n,sc in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1059  coordinates1 = [transformations[n].get_transformed(IMP.algebra.Vector3D(c)) for c in sc]
1060  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1061  distances.append(distance)
1062  print(selection_name,"average rmsd",sum(distances)/len(distances),"median",self._get_median(distances),"minimum distance",min(distances))
1063 
1064  def _get_median(self,list_of_values):
1065  return np.median(np.array(list_of_values))
1066 
1067  def get_average_distance_wrt_reference_structure(self,structure_set_name):
1068  """Compare the structure set to the reference structure.
1069  @param structure_set_name The structure set to compute this on
1070  @note First call set_reference_structure()
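 A minimal usage sketch (the file name is illustrative; pr is a Precision object):
 pr.set_reference_structure("reference.rmf3", 0)
 results = pr.get_average_distance_wrt_reference_structure("cluster.0")
 # results maps each selection name to its average and minimum distance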
1071  """
1072  ret = {}
1073  if self.reference_structures_dictionary=={}:
1074  print("Cannot compute until you set a reference structure")
1075  return
1076  for selection_name in self.selection_dictionary:
1077  reference_coordinates = self.reference_structures_dictionary[selection_name]
1078  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1079  distances = []
1080  for sc in self.structures_dictionary[structure_set_name][selection_name]:
1081  coordinates1 = [IMP.algebra.Vector3D(c) for c in sc]
1082  if self.style=='pairwise_drmsd_k':
1083  distance = IMP.atom.get_drmsd(coordinates1,coordinates2)
1084  if self.style=='pairwise_drms_k':
1085  distance = IMP.atom.get_drms(coordinates1,coordinates2)
1086  if self.style=='pairwise_drmsd_Q':
1087  distance = IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
1088  if self.style=='pairwise_rmsd':
1089  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1090  distances.append(distance)
1091 
1092  print(selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances),'nframes',len(distances))
1093  ret[selection_name] = {'average_distance':sum(distances)/len(distances),'minimum_distance':min(distances)}
1094  return ret
1095 
1096  def get_coordinates(self):
1097  pass
1098 
1099  def set_precision_style(self, style):
1100  if style in self.styles:
1101  self.style=style
1102  else:
1103  raise ValueError("No such style")
1104 
1105 
1106 class GetModelDensity(object):
1107  """Compute mean density maps from structures.
1108 
1109  Keeps a dictionary of density maps, keyed by the component names given in
1110  the custom ranges. Each call to add_subunits_density() adds the particle
1111  coordinates of one model to the existing density maps.
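 A minimal usage sketch (the component name and hierarchies are illustrative):

 dens = IMP.pmi.analysis.GetModelDensity({"Sub1": ["Sub1"]})
 for hier in hierarchies:     # e.g. one hierarchy per frame/model
     dens.add_subunits_density(hier)
 dens.write_mrc(path="./densities")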
1112  """
1113 
1114  def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1115  """Constructor.
1116  @param custom_ranges Required. It's a dictionary, keys are the
1117  density component names, values are selection tuples
1118  e.g. {'kin28':[['kin28',1,-1]],
1119  'density_name_1' :[('ccl1')],
1120  'density_name_2' :[(1,142,'tfb3d1'),
1121  (143,700,'tfb3d2')]}
1122  @param representation PMI representation, for doing selections.
1123  Not needed if you only pass hierarchies
1124  @param resolution The MRC resolution of the output map (in Angstrom unit)
1125  @param voxel The voxel size for the output map (lower is slower)
1126  """
1127 
1128  self.representation = representation
1129  self.MRCresolution = resolution
1130  self.voxel = voxel
1131  self.densities = {}
1132  self.count_models = 0.0
1133  self.custom_ranges = custom_ranges
1134 
1135  def add_subunits_density(self, hierarchy=None):
1136  """Add a frame to the densities.
1137  @param hierarchy The hierarchy to read particles from. If not passed,
1138  the representation given to the constructor is used instead.
1139  """
1140  self.count_models += 1.0
1141 
1142  if hierarchy:
1143  part_dict = get_particles_at_resolution_one(hierarchy)
1144  all_particles_by_resolution = []
1145  for name in part_dict:
1146  all_particles_by_resolution += part_dict[name]
1147 
1148  for density_name in self.custom_ranges:
1149  parts = []
1150  if hierarchy:
1151  all_particles_by_segments = []
1152 
1153  for seg in self.custom_ranges[density_name]:
1154  if not hierarchy:
1155  # when you have an IMP.pmi.representation.Representation class
1156  parts += IMP.pmi.tools.select_by_tuple(self.representation,
1157  seg, resolution=1, name_is_ambiguous=False)
1158  else:
1159  # else, when you have a hierarchy, but not a representation
1160  if not IMP.pmi.get_is_canonical(hierarchy):
1161  for h in hierarchy.get_children():
1162  if not IMP.atom.Molecule.get_is_setup(h):
1163  IMP.atom.Molecule.setup_particle(h.get_particle())
1164 
1165  if type(seg) == str:
1166  s = IMP.atom.Selection(hierarchy,molecule=seg)
1167  elif type(seg) == tuple and len(seg) == 2:
1168  s = IMP.atom.Selection(
1169  hierarchy, molecule=seg[0],copy_index=seg[1])
1170  elif type(seg) == tuple and len(seg) == 3:
1171  s = IMP.atom.Selection(
1172  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1173  elif type(seg) == tuple and len(seg) == 4:
1174  s = IMP.atom.Selection(
1175  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1176  else:
1177  raise Exception('could not understand selection tuple '+str(seg))
1178 
1179  all_particles_by_segments += s.get_selected_particles()
1180  if hierarchy:
1181  if IMP.pmi.get_is_canonical(hierarchy):
1182  parts = all_particles_by_segments
1183  else:
1184  parts = list(
1185  set(all_particles_by_segments) & set(all_particles_by_resolution))
1186  self._create_density_from_particles(parts, density_name)
1187 
1188  def normalize_density(self):
1189  pass
1190 
1191  def _create_density_from_particles(self, ps, name,
1192  kernel_type='GAUSSIAN'):
1193  '''Internal function for adding to densities.
1194  pass XYZR particles with mass and create a density from them.
1195  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1196  kd = {
1197  'GAUSSIAN': IMP.em.GAUSSIAN,
1198  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1199  'SPHERE': IMP.em.SPHERE}
1200 
1201  dmap = IMP.em.SampledDensityMap(ps, self.MRCresolution, self.voxel)
1202  dmap.calcRMS()
1203  dmap.set_was_used(True)
1204  if name not in self.densities:
1205  self.densities[name] = dmap
1206  else:
1207  bbox1 = IMP.em.get_bounding_box(self.densities[name])
1208  bbox2 = IMP.em.get_bounding_box(dmap)
1209  bbox1 += bbox2
1210  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
1211  dmap3.set_was_used(True)
1212  dmap3.add(dmap)
1213  dmap3.add(self.densities[name])
1214  self.densities[name] = dmap3
1215 
1216  def get_density_keys(self):
1217  return list(self.densities.keys())
1218 
1219  def get_density(self,name):
1220  """Get the current density for some component name"""
1221  if name not in self.densities:
1222  return None
1223  else:
1224  return self.densities[name]
1225 
1226  def write_mrc(self, path="./"):
1227  for density_name in self.densities:
1228  self.densities[density_name].multiply(1. / self.count_models)
1229  IMP.em.write_map(
1230  self.densities[density_name],
1231  path + "/" + density_name + ".mrc",
1232  IMP.em.MRCReaderWriter())
1233 
1234 
1235 class GetContactMap(object):
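 """Compute a residue-level contact map from a hierarchy and plot it.

 Call set_prot() to set the hierarchy, get_subunit_coords() once per frame
 to accumulate contacts, optionally add_xlinks() to overlay cross-links read
 from a stat file, and dist_matrix() to build and plot the map."""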
1236 
1237  def __init__(self, distance=15.):
1238  self.distance = distance
1239  self.contactmap = ''
1240  self.namelist = []
1241  self.xlinks = 0
1242  self.XL = {}
1243  self.expanded = {}
1244  self.resmap = {}
1245 
1246  def set_prot(self, prot):
1247  from scipy.spatial.distance import cdist
1248  self.prot = prot
1249  self.protnames = []
1250  coords = []
1251  radii = []
1252  namelist = []
1253 
1254  particles_dictionary = get_particles_at_resolution_one(self.prot)
1255 
1256  for name in particles_dictionary:
1257  residue_indexes = []
1258  for p in particles_dictionary[name]:
1259  print(p.get_name())
1260  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1261 
1262  if len(residue_indexes) != 0:
1263  self.protnames.append(name)
1264 
1265  def get_subunit_coords(self, frame, align=0):
1266  from scipy.spatial.distance import cdist
1267  coords = []
1268  radii = []
1269  namelist = []
1270  test, testr = [], []
1271  for part in self.prot.get_children():
1272  SortedSegments = []
1273  print(part)
1274  for chl in part.get_children():
1275  start = IMP.atom.get_leaves(chl)[0]
1276  end = IMP.atom.get_leaves(chl)[-1]
1277 
1278  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
1279  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
1280  SortedSegments.append((chl, startres))
1281  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1282 
1283  for sgmnt in SortedSegments:
1284  for leaf in IMP.atom.get_leaves(sgmnt[0]):
1285  p = IMP.core.XYZR(leaf)
1286  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1287 
1288  coords.append(crd)
1289  radii.append(p.get_radius())
1290 
1291  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
1292  '_' + \
1293  str(IMP.atom.Fragment(leaf)
1294  .get_residue_indexes()[0])
1295  namelist.append(new_name)
1296  self.expanded[new_name] = len(
1297  IMP.atom.Fragment(leaf).get_residue_indexes())
1298  if part.get_name() not in self.resmap:
1299  self.resmap[part.get_name()] = {}
1300  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
1301  self.resmap[part.get_name()][res] = new_name
1302 
1303  coords = np.array(coords)
1304  radii = np.array(radii)
1305  if len(self.namelist) == 0:
1306  self.namelist = namelist
1307  self.contactmap = np.zeros((len(coords), len(coords)))
1308  distances = cdist(coords, coords)
1309  distances = (distances - radii).T - radii
1310  distances = distances <= self.distance
1311  self.contactmap += distances
1312 
1313  def add_xlinks(
1314  self,
1315  filname,
1316  identification_string='ISDCrossLinkMS_Distance_'):
1317  # 'ISDCrossLinkMS_Distance_interrb_6629-State:0-20:RPS30_218:eIF3j-1-1-0.1_None'
1318  self.xlinks = 1
1319  data = open(filname)
1320  D = data.readlines()
1321  data.close()
1322 
1323  for d in D:
1324  if identification_string in d:
1325  d = d.replace(
1326  "_",
1327  " ").replace("-",
1328  " ").replace(":",
1329  " ").split()
1330 
1331  t1, t2 = (d[0], d[1]), (d[1], d[0])
1332  if t1 not in self.XL:
1333  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1334  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1335  else:
1336  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1337  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1338 
1339  def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1340  K = self.namelist
1341  M = self.contactmap
1342  C, R = [], []
1343  L = sum(self.expanded.values())
1344  proteins = self.protnames
1345 
1346  # exp new
1347  if skip_cmap == 0:
1348  Matrices = {}
1349  proteins = [p.get_name() for p in self.prot.get_children()]
1350  missing = []
1351  for p1 in range(len(proteins)):
1352  for p2 in range(p1, len(proteins)):
1353  pl1, pl2 = max(
1354  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1355  pn1, pn2 = proteins[p1], proteins[p2]
1356  mtr = np.zeros((pl1 + 1, pl2 + 1))
1357  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1358  for i1 in range(1, pl1 + 1):
1359  for i2 in range(1, pl2 + 1):
1360  try:
1361  r1 = K.index(self.resmap[pn1][i1])
1362  r2 = K.index(self.resmap[pn2][i2])
1363  r = M[r1, r2]
1364  mtr[i1 - 1, i2 - 1] = r
1365  except KeyError:
1366  missing.append((pn1, pn2, i1, i2))
1367  pass
1368  Matrices[(pn1, pn2)] = mtr
1369 
1370  # add cross-links
1371  if skip_xl == 0:
1372  if self.XL == {}:
1373  raise ValueError("cross-links were not provided, use add_xlinks function!")
1374  Matrices_xl = {}
1375  missing_xl = []
1376  for p1 in range(len(proteins)):
1377  for p2 in range(p1, len(proteins)):
1378  pl1, pl2 = max(
1379  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1380  pn1, pn2 = proteins[p1], proteins[p2]
1381  mtr = np.zeros((pl1 + 1, pl2 + 1))
1382  flg = 0
1383  try:
1384  xls = self.XL[(pn1, pn2)]
1385  except KeyError:
1386  try:
1387  xls = self.XL[(pn2, pn1)]
1388  flg = 1
1389  except KeyError:
1390  flg = 2
1391  if flg == 0:
1392  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1393  for xl1, xl2 in xls:
1394  if xl1 > pl1:
1395  print('X' * 10, xl1, xl2)
1396  xl1 = pl1
1397  if xl2 > pl2:
1398  print('X' * 10, xl1, xl2)
1399  xl2 = pl2
1400  mtr[xl1 - 1, xl2 - 1] = 100
1401  elif flg == 1:
1402  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1403  for xl1, xl2 in xls:
1404  if xl1 > pl1:
1405  print('X' * 10, xl1, xl2)
1406  xl1 = pl1
1407  if xl2 > pl2:
1408  print('X' * 10, xl1, xl2)
1409  xl2 = pl2
1410  mtr[xl2 - 1, xl1 - 1] = 100
1411  else:
1412  print('No cross links between: ', pn1, pn2)
1413  Matrices_xl[(pn1, pn2)] = mtr
1414 
1415  # expand the matrix to individual residues
1416  #NewM = []
1417  # for x1 in xrange(len(K)):
1418  # lst = []
1419  # for x2 in xrange(len(K)):
1420  # lst += [M[x1,x2]]*self.expanded[K[x2]]
1421  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
1422  #NewM = np.array(NewM)
1423 
1424  # make list of protein names and create coordinate lists
1425  C = proteins
1426  # W is the component length list,
1427  # R is the contiguous coordinates list
1428  W, R = [], []
1429  for i, c in enumerate(C):
1430  cl = max(self.resmap[c].keys())
1431  W.append(cl)
1432  if i == 0:
1433  R.append(cl)
1434  else:
1435  R.append(R[-1] + cl)
1436 
1437  # start plotting
1438  if outname:
1439  # Don't require a display
1440  import matplotlib as mpl
1441  mpl.use('Agg')
1442  import matplotlib.pyplot as plt
1443  import matplotlib.gridspec as gridspec
1444  import scipy.sparse as sparse
1445 
1446  f = plt.figure()
1447  gs = gridspec.GridSpec(len(W), len(W),
1448  width_ratios=W,
1449  height_ratios=W)
1450 
1451  cnt = 0
1452  for x1, r1 in enumerate(R):
1453  if x1 == 0:
1454  s1 = 0
1455  else:
1456  s1 = R[x1 - 1]
1457  for x2, r2 in enumerate(R):
1458  if x2 == 0:
1459  s2 = 0
1460  else:
1461  s2 = R[x2 - 1]
1462 
1463  ax = plt.subplot(gs[cnt])
1464  if skip_cmap == 0:
1465  try:
1466  mtr = Matrices[(C[x1], C[x2])]
1467  except KeyError:
1468  mtr = Matrices[(C[x2], C[x1])].T
1469  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
1470  cax = ax.imshow(
1471  np.log(mtr),
1472  interpolation='nearest',
1473  vmin=0.,
1474  vmax=log(mtr.max()))
1475  ax.set_xticks([])
1476  ax.set_yticks([])
1477  if skip_xl == 0:
1478  try:
1479  mtr = Matrices_xl[(C[x1], C[x2])]
1480  except KeyError:
1481  mtr = Matrices_xl[(C[x2], C[x1])].T
1482  cax = ax.spy(
1483  sparse.csr_matrix(mtr),
1484  markersize=10,
1485  color='white',
1486  linewidth=100,
1487  alpha=0.5)
1488  ax.set_xticks([])
1489  ax.set_yticks([])
1490 
1491  cnt += 1
1492  if x2 == 0:
1493  ax.set_ylabel(C[x1], rotation=90)
1494  if outname:
1495  plt.savefig(outname + ".pdf", dpi=300, transparent=False)
1496  else:
1497  plt.show()
1498 
1499 
1500 # ------------------------------------------------------------------
1501 # a few random tools
1502 
1503 def get_hiers_from_rmf(model, frame_number, rmf_file):
1504  # I have to deprecate this function
1505  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1506 
1507  # load the frame
1508  rh = RMF.open_rmf_file_read_only(rmf_file)
1509 
1510  try:
1511  prots = IMP.rmf.create_hierarchies(rh, model)
1512  except IOError:
1513  print("Unable to open rmf file %s" % (rmf_file))
1514  return None
1515  #IMP.rmf.link_hierarchies(rh, prots)
1516 
1517  try:
1518  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1519  except IOError:
1520  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1521  return None
1522  model.update()
1523  del rh
1524 
1525  return prots
1526 
1527 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1528  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1529  rh = RMF.open_rmf_file_read_only(rmf_file)
1530  IMP.rmf.link_hierarchies(rh, hiers)
1531  try:
1532  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1533  except:
1534  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1535  return False
1536  model.update()
1537  del rh
1538  return True
1539 
1540 
1541 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1542  # I have to deprecate this function
1543  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1544 
1545  # load the frame
1546  rh = RMF.open_rmf_file_read_only(rmf_file)
1547 
1548  try:
1549  prots = IMP.rmf.create_hierarchies(rh, model)
1550  rs = IMP.rmf.create_restraints(rh, model)
1551  except:
1552  print("Unable to open rmf file %s" % (rmf_file))
1553  return None,None
1554  try:
1555  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1556  except:
1557  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1558  return None,None
1559  model.update()
1560  del rh
1561 
1562  return prots,rs
1563 
1564 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1565  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1566  rh = RMF.open_rmf_file_read_only(rmf_file)
1567  IMP.rmf.link_hierarchies(rh, hiers)
1568  IMP.rmf.link_restraints(rh, rs)
1569  try:
1570  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1571  except:
1572  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1573  return False
1574  model.update()
1575  del rh
1576  return True
1577 
1578 
1579 def get_hiers_from_rmf(model, frame_number, rmf_file):
1580  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1581 
1582  # load the frame
1583  rh = RMF.open_rmf_file_read_only(rmf_file)
1584 
1585  try:
1586  prots = IMP.rmf.create_hierarchies(rh, model)
1587  except:
1588  print("Unable to open rmf file %s" % (rmf_file))
1589  prot = None
1590  return prot
1591  #IMP.rmf.link_hierarchies(rh, prots)
1592  try:
1593  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1594  except:
1595  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1596  prots = None
1597  model.update()
1598  del rh
1599  return prots
1600 
1601 
1602 def get_particles_at_resolution_one(prot):
1603  """Get particles at res 1, or any beads, based on the name.
1604  No Representation is needed. This is mainly used when the hierarchy
1605  is read from an RMF file.
1606  @return a dictionary of component names and their particles
1607  \note If the root node is named "System" or is a "State", do proper selection.
1608  """
1609  particle_dict = {}
1610 
1611  # attempt to give good results for PMI2
1612  if IMP.pmi.get_is_canonical(prot):
1613  for mol in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1614  sel = IMP.atom.Selection(mol,resolution=1)
1615  particle_dict[mol.get_name()] = sel.get_selected_particles()
1616  else:
1617  allparticles = []
1618  for c in prot.get_children():
1619  name = c.get_name()
1620  particle_dict[name] = IMP.atom.get_leaves(c)
1621  for s in c.get_children():
1622  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
1623  allparticles += IMP.atom.get_leaves(s)
1624  if "Beads" in s.get_name():
1625  allparticles += IMP.atom.get_leaves(s)
1626 
1627  particle_align = []
1628  for name in particle_dict:
1629  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1630  list(set(particle_dict[name]) & set(allparticles)))
1631  return particle_dict
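# Illustrative usage (a minimal sketch; the RMF file name is hypothetical):
#
#     model = IMP.Model()
#     prots = get_hiers_from_rmf(model, 0, "models.rmf3")
#     particle_dict = get_particles_at_resolution_one(prots[0])
#     for name, particles in particle_dict.items():
#         print(name, len(particles))
#
# get_particles_at_resolution_ten() below is used in the same way, but selects
# the resolution-10 representation (or beads) instead.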
1632 
1633 def get_particles_at_resolution_ten(prot):
1634  """Get particles at res 10, or any beads, based on the name.
1635  No Representation is needed.
1636  This is mainly used when the hierarchy is read from an RMF file.
1637  @return a dictionary of component names and their particles
1638  \note If the root node is named "System" or is a "State", do proper selection.
1639  """
1640  particle_dict = {}
1641  # attempt to give good results for PMI2
1642  if IMP.pmi.get_is_canonical(prot):
1643  for mol in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1644  sel = IMP.atom.Selection(mol,resolution=10)
1645  particle_dict[mol.get_name()] = sel.get_selected_particles()
1646  else:
1647  allparticles = []
1648  for c in prot.get_children():
1649  name = c.get_name()
1650  particle_dict[name] = IMP.atom.get_leaves(c)
1651  for s in c.get_children():
1652  if "_Res:10" in s.get_name():
1653  allparticles += IMP.atom.get_leaves(s)
1654  if "Beads" in s.get_name():
1655  allparticles += IMP.atom.get_leaves(s)
1656  particle_align = []
1657  for name in particle_dict:
1658  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1659  list(set(particle_dict[name]) & set(allparticles)))
1660  return particle_dict
1661 
1662 def select_by_tuple(first_res_last_res_name_tuple):
1663  first_res = first_res_last_res_name_tuple[0]
1664  last_res = first_res_last_res_name_tuple[1]
1665  name = first_res_last_res_name_tuple[2]
1666 
1667 class CrossLinkTable(object):
1668  """Visualization of crosslinks"""
1669  def __init__(self):
1670  self.crosslinks = []
1671  self.external_csv_data = None
1672  self.crosslinkedprots = set()
1673  self.mindist = +10000000.0
1674  self.maxdist = -10000000.0
1675  self.contactmap = None
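# Typical workflow for this class (a minimal sketch; file names are hypothetical):
#
#     model = IMP.Model()
#     prots = get_hiers_from_rmf(model, 0, "models.rmf3")
#     xlt = CrossLinkTable()
#     xlt.set_hierarchy(prots[0])
#     xlt.set_crosslinks("stat.0.out")
#     xlt.set_threshold(35.0)
#     xlt.set_tolerance(2.5)
#     xlt.plot(filename="xl_map")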
1676 
1677  def set_hierarchy(self, prot):
1678  self.prot_length_dict = {}
1679  self.model=prot.get_model()
1680 
1681  for i in prot.get_children():
1682  name = i.get_name()
1683  residue_indexes = []
1684  for p in IMP.atom.get_leaves(i):
1685  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1686 
1687  if len(residue_indexes) != 0:
1688  self.prot_length_dict[name] = max(residue_indexes)
1689 
1690  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1691  from scipy.spatial.distance import cdist
1692 
1693  rh= RMF.open_rmf_file_read_only(rmf_name)
1694  prots=IMP.rmf.create_hierarchies(rh, self.model)
1695  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
1696  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1697  del rh
1698 
1699 
1700  coords = []
1701  radii = []
1702  namelist = []
1703 
1704  particles_dictionary = get_particles_at_resolution_one(prots[0])
1705 
1706  resindex = 0
1707  self.index_dictionary = {}
1708 
1709  for name in particles_dictionary:
1710  residue_indexes = []
1711  for p in particles_dictionary[name]:
1712  print(p.get_name())
1713  residue_indexes = IMP.pmi.tools.get_residue_indexes(p)
1714  #residue_indexes.add( )
1715 
1716  if len(residue_indexes) != 0:
1717 
1718  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1719  d = IMP.core.XYZR(p)
1720 
1721  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1722  coords.append(crd)
1723  radii.append(d.get_radius())
1724  if name not in self.index_dictionary:
1725  self.index_dictionary[name] = [resindex]
1726  else:
1727  self.index_dictionary[name].append(resindex)
1728  resindex += 1
1729 
1730  coords = np.array(coords)
1731  radii = np.array(radii)
1732 
1733  distances = cdist(coords, coords)
1734  distances = (distances - radii).T - radii
1735 
1736  distances = np.where(distances <= 20.0, 1.0, 0)
1737  if self.contactmap is None:
1738  self.contactmap = np.zeros((len(coords), len(coords)))
1739  self.contactmap += distances
1740 
1741  for prot in prots: IMP.atom.destroy(prot)
1742 
1743  def set_crosslinks(
1744  self, data_file, search_label='ISDCrossLinkMS_Distance_',
1745  mapping=None,
1746  filter_label=None,
1747  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
1748  external_csv_data_file=None,
1749  external_csv_data_file_unique_id_key="Unique ID"):
1750 
1751  # example key: ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None
1752  # mapping is a dictionary that maps standard keywords to entry positions in the key string
1753  # filter_label, if given, keeps only keys that contain that label
1754  # external_csv_data_file is a CSV file with further information on the crosslinks;
1755  # its unique id column is used to create the dictionary keys
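# For instance, a mapping for a key laid out like the example above could look
# like the following (the index values are illustrative and depend on how the
# key splits into fields; the dictionary keys are the ones used below):
#
#     mapping = {"Unique Identifier": 3,
#                "Residue1": 6, "Protein1": 7,
#                "Residue2": 8, "Protein2": 9,
#                "Confidence": 12}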
1756 
1757  po = IMP.pmi.output.ProcessOutput(data_file)
1758  keys = po.get_keys()
1759 
1760  xl_keys = [k for k in keys if search_label in k]
1761 
1762  if filter_rmf_file_names is not None:
1763  rmf_file_key="local_rmf_file_name"
1764  fs = po.get_fields(xl_keys+[rmf_file_key])
1765  else:
1766  fs = po.get_fields(xl_keys)
1767 
1768  # this dictionary stores the occurrence of given crosslinks
1769  self.cross_link_frequency = {}
1770 
1771  # this dictionary stores the series of distances for given crosslinked
1772  # residues
1773  self.cross_link_distances = {}
1774 
1775  # this dictionary stores the series of distances for unique crosslinked
1776  # residue pairs
1777  self.cross_link_distances_unique = {}
1778 
1779  if not external_csv_data_file is None:
1780  # this dictionary stores the further information on crosslinks
1781  # labeled by unique ID
1782  self.external_csv_data = {}
1783  xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1784 
1785  for xl in xldb:
1786  self.external_csv_data[
1787  xl[external_csv_data_file_unique_id_key]] = xl
1788 
1789  # this list keeps track of the cross-link tuples and sample
1790  # so that we don't count the same crosslinked residues twice in the
1791  # same sample
1792  cross_link_frequency_list = []
1793 
1794  self.unique_cross_link_list = []
1795 
1796  for key in xl_keys:
1797  print(key)
1798  keysplit = key.replace(
1799  "_",
1800  " ").replace(
1801  "-",
1802  " ").replace(
1803  ":",
1804  " ").split(
1805  )
1806 
1807  if filter_label!=None:
1808  if filter_label not in keysplit: continue
1809 
1810  if mapping is None:
1811  r1 = int(keysplit[5])
1812  c1 = keysplit[6]
1813  r2 = int(keysplit[7])
1814  c2 = keysplit[8]
1815  try:
1816  confidence = keysplit[12]
1817  except:
1818  confidence = '0.0'
1819  try:
1820  unique_identifier = keysplit[3]
1821  except:
1822  unique_identifier = '0'
1823  else:
1824  r1 = int(keysplit[mapping["Residue1"]])
1825  c1 = keysplit[mapping["Protein1"]]
1826  r2 = int(keysplit[mapping["Residue2"]])
1827  c2 = keysplit[mapping["Protein2"]]
1828  try:
1829  confidence = keysplit[mapping["Confidence"]]
1830  except:
1831  confidence = '0.0'
1832  try:
1833  unique_identifier = keysplit[mapping["Unique Identifier"]]
1834  except:
1835  unique_identifier = '0'
1836 
1837  self.crosslinkedprots.add(c1)
1838  self.crosslinkedprots.add(c2)
1839 
1840  # construct the list of distances corresponding to the input rmf
1841  # files
1842 
1843  dists=[]
1844  if filter_rmf_file_names is not None:
1845  for n,d in enumerate(fs[key]):
1846  if fs[rmf_file_key][n] in filter_rmf_file_names:
1847  dists.append(float(d))
1848  else:
1849  dists=[float(f) for f in fs[key]]
1850 
1851  # check whether the input confidence class corresponds to
1852  # that of the cross-link
1853 
1854  mdist = self.median(dists)
1855 
1856  stdv = np.std(np.array(dists))
1857  if self.mindist > mdist:
1858  self.mindist = mdist
1859  if self.maxdist < mdist:
1860  self.maxdist = mdist
1861 
1862  # calculate the frequency of unique crosslinks within the same
1863  # sample
1864  if not self.external_csv_data is None:
1865  sample = self.external_csv_data[unique_identifier]["Sample"]
1866  else:
1867  sample = "None"
1868 
1869  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1870  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1871  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1872  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1873  else:
1874  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1875  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1876  cross_link_frequency_list.append((r1, c1, r2, c2))
1877  cross_link_frequency_list.append((r2, c2, r1, c1))
1878  self.unique_cross_link_list.append(
1879  (r1, c1, r2, c2,mdist))
1880 
1881  if (r1, c1, r2, c2) not in self.cross_link_distances:
1882  self.cross_link_distances[(
1883  r1,
1884  c1,
1885  r2,
1886  c2,
1887  mdist,
1888  confidence)] = dists
1889  self.cross_link_distances[(
1890  r2,
1891  c2,
1892  r1,
1893  c1,
1894  mdist,
1895  confidence)] = dists
1896  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1897  else:
1898  self.cross_link_distances[(
1899  r2,
1900  c2,
1901  r1,
1902  c1,
1903  mdist,
1904  confidence)] += dists
1905  self.cross_link_distances[(
1906  r1,
1907  c1,
1908  r2,
1909  c2,
1910  mdist,
1911  confidence)] += dists
1912 
1913  self.crosslinks.append(
1914  (r1,
1915  c1,
1916  r2,
1917  c2,
1918  mdist,
1919  stdv,
1920  confidence,
1921  unique_identifier,
1922  'original'))
1923  self.crosslinks.append(
1924  (r2,
1925  c2,
1926  r1,
1927  c1,
1928  mdist,
1929  stdv,
1930  confidence,
1931  unique_identifier,
1932  'reversed'))
1933 
1934  self.cross_link_frequency_inverted = {}
1935  for xl in self.unique_cross_link_list:
1936  (r1, c1, r2, c2, mdist) = xl
1937  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1938  if frequency not in self.cross_link_frequency_inverted:
1939  self.cross_link_frequency_inverted[
1940  frequency] = [(r1, c1, r2, c2)]
1941  else:
1942  self.cross_link_frequency_inverted[
1943  frequency].append((r1, c1, r2, c2))
1944 
1945  # -------------
1946 
1947  def median(self, mylist):
1948  sorts = sorted(mylist)
1949  length = len(sorts)
1950  print(length)
1951  if length == 1:
1952  return mylist[0]
1953  if not length % 2:
1954  return (sorts[length // 2] + sorts[length // 2 - 1]) / 2.0
1955  return sorts[length // 2]
1956 
1957  def set_threshold(self,threshold):
1958  self.threshold=threshold
1959 
1960  def set_tolerance(self,tolerance):
1961  self.tolerance=tolerance
1962 
1963  def colormap(self, dist):
1964  if dist < self.threshold - self.tolerance:
1965  return "Green"
1966  elif dist >= self.threshold + self.tolerance:
1967  return "Orange"
1968  else:
1969  return "Red"
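# Worked example (threshold and tolerance values are illustrative): with
# threshold=35.0 and tolerance=2.5, a distance of 30.0 returns "Green"
# (below threshold - tolerance), 40.0 returns "Orange" (at or above
# threshold + tolerance), and 34.0 falls in the intermediate band ("Red").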
1970 
1971  def write_cross_link_database(self, filename, format='csv'):
1972  import csv
1973 
1974  fieldnames = [
1975  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1976  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1977 
1978  if not self.external_csv_data is None:
1979  keys = list(self.external_csv_data.keys())
1980  innerkeys = list(self.external_csv_data[keys[0]].keys())
1981  innerkeys.sort()
1982  fieldnames += innerkeys
1983 
1984  dw = csv.DictWriter(
1985  open(filename,
1986  "w"),
1987  delimiter=',',
1988  fieldnames=fieldnames)
1989  dw.writeheader()
1990  for xl in self.crosslinks:
1991  (r1, c1, r2, c2, mdist, stdv, confidence,
1992  unique_identifier, descriptor) = xl
1993  if descriptor == 'original':
1994  outdict = {}
1995  outdict["Unique ID"] = unique_identifier
1996  outdict["Protein1"] = c1
1997  outdict["Protein2"] = c2
1998  outdict["Residue1"] = r1
1999  outdict["Residue2"] = r2
2000  outdict["Median Distance"] = mdist
2001  outdict["Standard Deviation"] = stdv
2002  outdict["Confidence"] = confidence
2003  outdict["Frequency"] = self.cross_link_frequency[
2004  (r1, c1, r2, c2)]
2005  if c1 == c2:
2006  arrangement = "Intra"
2007  else:
2008  arrangement = "Inter"
2009  outdict["Arrangement"] = arrangement
2010  if not self.external_csv_data is None:
2011  outdict.update(self.external_csv_data[unique_identifier])
2012 
2013  dw.writerow(outdict)
2014 
2015  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
2016  no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
2017  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
2018  gap_between_components=0,
2019  rename_protein_map=None):
2020  # layout can be:
2021  # "lowerdiagonal" print only the lower diagonal plot
2022  # "upperdiagonal" print only the upper diagonal plot
2023  # "whole" print all
2024  # crosslinkedonly: plot only components that have crosslinks
2025  # no_dist_info: if True, plot only the cross-links as grey spots
2026  # filter: a tuple containing a keyword to be searched in the database,
2027  # a relationship ">","==","<",
2028  # and a value;
2029  # example ("ID_Score",">",40)
2030  # scale_symbol_size rescales the symbol for the crosslink
2031  # rename_protein_map is a dictionary to rename proteins
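# For instance (a minimal sketch; the score key and cutoff are illustrative),
# to draw only the lower diagonal and keep cross-links with ID_Score > 40:
#
#     xlt.plot(layout="lowerdiagonal",
#              filter=("ID_Score", ">", 40),
#              crosslinkedonly=True,
#              filename="xl_map_filtered")
#
# The filter option requires that an external_csv_data_file containing an
# "ID_Score" column was passed to set_crosslinks().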
2032 
2033  import matplotlib as mpl
2034  mpl.use('Agg')
2035  import matplotlib.pyplot as plt
2036  import matplotlib.cm as cm
2037 
2038  fig = plt.figure(figsize=(10, 10))
2039  ax = fig.add_subplot(111)
2040 
2041  ax.set_xticks([])
2042  ax.set_yticks([])
2043 
2044  # set the list of proteins on the x axis
2045  if prot_listx is None:
2046  if crosslinkedonly:
2047  prot_listx = list(self.crosslinkedprots)
2048  else:
2049  prot_listx = list(self.prot_length_dict.keys())
2050  prot_listx.sort()
2051 
2052  nresx = gap_between_components + \
2053  sum([self.prot_length_dict[name]
2054  + gap_between_components for name in prot_listx])
2055 
2056  # set the list of proteins on the y axis
2057 
2058  if prot_listy is None:
2059  if crosslinkedonly:
2060  prot_listy = list(self.crosslinkedprots)
2061  else:
2062  prot_listy = list(self.prot_length_dict.keys())
2063  prot_listy.sort()
2064 
2065  nresy = gap_between_components + \
2066  sum([self.prot_length_dict[name]
2067  + gap_between_components for name in prot_listy])
2068 
2069  # this is the residue offset for each protein
2070  resoffsetx = {}
2071  resendx = {}
2072  res = gap_between_components
2073  for prot in prot_listx:
2074  resoffsetx[prot] = res
2075  res += self.prot_length_dict[prot]
2076  resendx[prot] = res
2077  res += gap_between_components
2078 
2079  resoffsety = {}
2080  resendy = {}
2081  res = gap_between_components
2082  for prot in prot_listy:
2083  resoffsety[prot] = res
2084  res += self.prot_length_dict[prot]
2085  resendy[prot] = res
2086  res += gap_between_components
2087 
2088  resoffsetdiagonal = {}
2089  res = gap_between_components
2090  for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2091  resoffsetdiagonal[prot] = res
2092  res += self.prot_length_dict[prot]
2093  res += gap_between_components
2094 
2095  # plot protein boundaries
2096 
2097  xticks = []
2098  xlabels = []
2099  for n, prot in enumerate(prot_listx):
2100  res = resoffsetx[prot]
2101  end = resendx[prot]
2102  for proty in prot_listy:
2103  resy = resoffsety[proty]
2104  endy = resendy[proty]
2105  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
2106  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
2107  xticks.append((float(res) + float(end)) / 2)
2108  if rename_protein_map is not None:
2109  if prot in rename_protein_map:
2110  prot=rename_protein_map[prot]
2111  xlabels.append(prot)
2112 
2113  yticks = []
2114  ylabels = []
2115  for n, prot in enumerate(prot_listy):
2116  res = resoffsety[prot]
2117  end = resendy[prot]
2118  for protx in prot_listx:
2119  resx = resoffsetx[protx]
2120  endx = resendx[protx]
2121  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
2122  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
2123  yticks.append((float(res) + float(end)) / 2)
2124  if rename_protein_map is not None:
2125  if prot in rename_protein_map:
2126  prot=rename_protein_map[prot]
2127  ylabels.append(prot)
2128 
2129  # plot the contact map
2130  print(prot_listx, prot_listy)
2131 
2132  if not self.contactmap is None:
2133  import matplotlib.cm as cm
2134  tmp_array = np.zeros((nresx, nresy))
2135 
2136  for px in prot_listx:
2137  print(px)
2138  for py in prot_listy:
2139  print(py)
2140  resx = resoffsetx[px]
2141  lengx = resendx[px] - 1
2142  resy = resoffsety[py]
2143  lengy = resendy[py] - 1
2144  indexes_x = self.index_dictionary[px]
2145  minx = min(indexes_x)
2146  maxx = max(indexes_x)
2147  indexes_y = self.index_dictionary[py]
2148  miny = min(indexes_y)
2149  maxy = max(indexes_y)
2150 
2151  print(px, py, minx, maxx, miny, maxy)
2152 
2153  try:
2154  tmp_array[
2155  resx:lengx,
2156  resy:lengy] = self.contactmap[
2157  minx:maxx,
2158  miny:maxy]
2159  except:
2160  continue
2161 
2162 
2163  ax.imshow(tmp_array,
2164  cmap=cm.binary,
2165  origin='lower',
2166  interpolation='nearest')
2167 
2168  ax.set_xticks(xticks)
2169  ax.set_xticklabels(xlabels, rotation=90)
2170  ax.set_yticks(yticks)
2171  ax.set_yticklabels(ylabels)
2172  ax.set_xlim(0,nresx)
2173  ax.set_ylim(0,nresy)
2174 
2175 
2176  # set the crosslinks
2177 
2178  already_added_xls = []
2179 
2180  for xl in self.crosslinks:
2181 
2182  (r1, c1, r2, c2, mdist, stdv, confidence,
2183  unique_identifier, descriptor) = xl
2184 
2185  if confidence_classes is not None:
2186  if confidence not in confidence_classes:
2187  continue
2188 
2189  try:
2190  pos1 = r1 + resoffsetx[c1]
2191  except:
2192  continue
2193  try:
2194  pos2 = r2 + resoffsety[c2]
2195  except:
2196  continue
2197 
2198  if not filter is None:
2199  xldb = self.external_csv_data[unique_identifier]
2200  xldb.update({"Distance": mdist})
2201  xldb.update({"Distance_stdv": stdv})
2202 
2203  if filter[1] == ">":
2204  if float(xldb[filter[0]]) <= float(filter[2]):
2205  continue
2206 
2207  if filter[1] == "<":
2208  if float(xldb[filter[0]]) >= float(filter[2]):
2209  continue
2210 
2211  if filter[1] == "==":
2212  if float(xldb[filter[0]]) != float(filter[2]):
2213  continue
2214 
2215  # everything below is used for plotting the diagonal
2216  # when you have rectangular plots
2217 
2218  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2219  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2220 
2221  if layout == 'lowerdiagonal':
2222  if pos_for_diagonal1 <= pos_for_diagonal2:
2223  continue
2224  if layout == 'upperdiagonal':
2225  if pos_for_diagonal1 >= pos_for_diagonal2:
2226  continue
2227 
2228  already_added_xls.append((r1, c1, r2, c2))
2229 
2230  if not no_confidence_info:
2231  if confidence == '0.01':
2232  markersize = 14 * scale_symbol_size
2233  elif confidence == '0.05':
2234  markersize = 9 * scale_symbol_size
2235  elif confidence == '0.1':
2236  markersize = 6 * scale_symbol_size
2237  else:
2238  markersize = 15 * scale_symbol_size
2239  else:
2240  markersize = 5 * scale_symbol_size
2241 
2242  if not no_dist_info:
2243  color = self.colormap(mdist)
2244  else:
2245  color = "gray"
2246 
2247  ax.plot(
2248  [pos1],
2249  [pos2],
2250  'o',
2251  c=color,
2252  alpha=alphablend,
2253  markersize=markersize)
2254 
2255 
2256 
2257  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2258 
2259  [i.set_linewidth(2.0) for i in ax.spines.values()]
2260 
2261  #plt.tight_layout()
2262 
2263  if filename:
2264  plt.savefig(filename + ".pdf", dpi=300, transparent=False)
2265  else:
2266  plt.show()
2267 
2268  def get_frequency_statistics(self, prot_list,
2269  prot_list2=None):
2270 
2271  violated_histogram = {}
2272  satisfied_histogram = {}
2273  unique_cross_links = []
2274 
2275  for xl in self.unique_cross_link_list:
2276  (r1, c1, r2, c2, mdist) = xl
2277 
2278  # here we filter by the protein
2279  if prot_list2 is None:
2280  if not c1 in prot_list:
2281  continue
2282  if not c2 in prot_list:
2283  continue
2284  else:
2285  if c1 in prot_list and c2 in prot_list2:
2286  pass
2287  elif c1 in prot_list2 and c2 in prot_list:
2288  pass
2289  else:
2290  continue
2291 
2292  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2293 
2294  if (r1, c1, r2, c2) not in unique_cross_links:
2295  if mdist > 35.0:
2296  if frequency not in violated_histogram:
2297  violated_histogram[frequency] = 1
2298  else:
2299  violated_histogram[frequency] += 1
2300  else:
2301  if frequency not in satisfied_histogram:
2302  satisfied_histogram[frequency] = 1
2303  else:
2304  satisfied_histogram[frequency] += 1
2305  unique_cross_links.append((r1, c1, r2, c2))
2306  unique_cross_links.append((r2, c2, r1, c1))
2307 
2308  print("# satisfied")
2309 
2310  total_number_of_crosslinks=0
2311 
2312  for i in satisfied_histogram:
2313  # if i in violated_histogram:
2314  # print i, satisfied_histogram[i]+violated_histogram[i]
2315  # else:
2316  if i in violated_histogram:
2317  print(i, violated_histogram[i]+satisfied_histogram[i])
2318  else:
2319  print(i, satisfied_histogram[i])
2320  total_number_of_crosslinks+=i*satisfied_histogram[i]
2321 
2322  print("# violated")
2323 
2324  for i in violated_histogram:
2325  print(i, violated_histogram[i])
2326  total_number_of_crosslinks+=i*violated_histogram[i]
2327 
2328  print(total_number_of_crosslinks)
2329 
2330 
2331 # ------------
2332  def print_cross_link_binary_symbols(self, prot_list,
2333  prot_list2=None):
2334  tmp_matrix = []
2335  confidence_list = []
2336  for xl in self.crosslinks:
2337  (r1, c1, r2, c2, mdist, stdv, confidence,
2338  unique_identifier, descriptor) = xl
2339 
2340  if prot_list2 is None:
2341  if not c1 in prot_list:
2342  continue
2343  if not c2 in prot_list:
2344  continue
2345  else:
2346  if c1 in prot_list and c2 in prot_list2:
2347  pass
2348  elif c1 in prot_list2 and c2 in prot_list:
2349  pass
2350  else:
2351  continue
2352 
2353  if descriptor != "original":
2354  continue
2355 
2356  confidence_list.append(confidence)
2357 
2358  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2359  tmp_dist_binary = []
2360  for d in dists:
2361  if d < 35:
2362  tmp_dist_binary.append(1)
2363  else:
2364  tmp_dist_binary.append(0)
2365  tmp_matrix.append(tmp_dist_binary)
2366 
2367  matrix = list(zip(*tmp_matrix))
2368 
2369  satisfied_high_sum = 0
2370  satisfied_mid_sum = 0
2371  satisfied_low_sum = 0
2372  total_satisfied_sum = 0
2373  for k, m in enumerate(matrix):
2374  satisfied_high = 0
2375  total_high = 0
2376  satisfied_mid = 0
2377  total_mid = 0
2378  satisfied_low = 0
2379  total_low = 0
2380  total_satisfied = 0
2381  total = 0
2382  for n, b in enumerate(m):
2383  if confidence_list[n] == "0.01":
2384  total_high += 1
2385  if b == 1:
2386  satisfied_high += 1
2387  satisfied_high_sum += 1
2388  elif confidence_list[n] == "0.05":
2389  total_mid += 1
2390  if b == 1:
2391  satisfied_mid += 1
2392  satisfied_mid_sum += 1
2393  elif confidence_list[n] == "0.1":
2394  total_low += 1
2395  if b == 1:
2396  satisfied_low += 1
2397  satisfied_low_sum += 1
2398  if b == 1:
2399  total_satisfied += 1
2400  total_satisfied_sum += 1
2401  total += 1
2402  print(k, satisfied_high, total_high)
2403  print(k, satisfied_mid, total_mid)
2404  print(k, satisfied_low, total_low)
2405  print(k, total_satisfied, total)
2406  print(float(satisfied_high_sum) / len(matrix))
2407  print(float(satisfied_mid_sum) / len(matrix))
2408  print(float(satisfied_low_sum) / len(matrix))
2409 # ------------
2410 
2411  def get_unique_crosslinks_statistics(self, prot_list,
2412  prot_list2=None):
2413 
2414  print(prot_list)
2415  print(prot_list2)
2416  satisfied_high = 0
2417  total_high = 0
2418  satisfied_mid = 0
2419  total_mid = 0
2420  satisfied_low = 0
2421  total_low = 0
2422  total = 0
2423  tmp_matrix = []
2424  satisfied_string = []
2425  for xl in self.crosslinks:
2426  (r1, c1, r2, c2, mdist, stdv, confidence,
2427  unique_identifier, descriptor) = xl
2428 
2429  if prot_list2 is None:
2430  if not c1 in prot_list:
2431  continue
2432  if not c2 in prot_list:
2433  continue
2434  else:
2435  if c1 in prot_list and c2 in prot_list2:
2436  pass
2437  elif c1 in prot_list2 and c2 in prot_list:
2438  pass
2439  else:
2440  continue
2441 
2442  if descriptor != "original":
2443  continue
2444 
2445  total += 1
2446  if confidence == "0.01":
2447  total_high += 1
2448  if mdist <= 35:
2449  satisfied_high += 1
2450  if confidence == "0.05":
2451  total_mid += 1
2452  if mdist <= 35:
2453  satisfied_mid += 1
2454  if confidence == "0.1":
2455  total_low += 1
2456  if mdist <= 35:
2457  satisfied_low += 1
2458  if mdist <= 35:
2459  satisfied_string.append(1)
2460  else:
2461  satisfied_string.append(0)
2462 
2463  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2464  tmp_dist_binary = []
2465  for d in dists:
2466  if d < 35:
2467  tmp_dist_binary.append(1)
2468  else:
2469  tmp_dist_binary.append(0)
2470  tmp_matrix.append(tmp_dist_binary)
2471 
2472  print("unique satisfied_high/total_high", satisfied_high, "/", total_high)
2473  print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)
2474  print("unique satisfied_low/total_low", satisfied_low, "/", total_low)
2475  print("total", total)
2476 
2477  matrix = list(zip(*tmp_matrix))
2478  satisfied_models = 0
2479  satstr = ""
2480  for b in satisfied_string:
2481  if b == 0:
2482  satstr += "-"
2483  if b == 1:
2484  satstr += "*"
2485 
2486  for m in matrix:
2487  all_satisfied = True
2488  string = ""
2489  for n, b in enumerate(m):
2490  if b == 0:
2491  string += "0"
2492  if b == 1:
2493  string += "1"
2494  if b == 1 and satisfied_string[n] == 1:
2495  pass
2496  elif b == 1 and satisfied_string[n] == 0:
2497  pass
2498  elif b == 0 and satisfied_string[n] == 0:
2499  pass
2500  elif b == 0 and satisfied_string[n] == 1:
2501  all_satisfied = False
2502  if all_satisfied:
2503  satisfied_models += 1
2504  print(string)
2505  print(satstr, all_satisfied)
2506  print("models that satisfy the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2507 
2508  def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2509  prot_list2=None):
2510 
2511  import matplotlib as mpl
2512  mpl.use('Agg')
2513  import matplotlib.pylab as pl
2514 
2515  tmp_matrix = []
2516  for kw in self.cross_link_distances_unique:
2517  (r1, c1, r2, c2) = kw
2518  dists = self.cross_link_distances_unique[kw]
2519 
2520  if prot_list2 is None:
2521  if not c1 in prot_list:
2522  continue
2523  if not c2 in prot_list:
2524  continue
2525  else:
2526  if c1 in prot_list and c2 in prot_list2:
2527  pass
2528  elif c1 in prot_list2 and c2 in prot_list:
2529  pass
2530  else:
2531  continue
2532  # append the sum of dists to order by that in the matrix plot
2533  dists.append(sum(dists))
2534  tmp_matrix.append(dists)
2535 
2536  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2537 
2538  # print len(tmp_matrix), len(tmp_matrix[0])-1
2539  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2540 
2541  for i in range(len(tmp_matrix)):
2542  for k in range(len(tmp_matrix[i]) - 1):
2543  matrix[i][k] = tmp_matrix[i][k]
2544 
2545  print(matrix)
2546 
2547  fig = pl.figure()
2548  ax = fig.add_subplot(211)
2549 
2550  cax = ax.imshow(matrix, interpolation='nearest')
2551  # ax.set_yticks(range(len(self.model_list_names)))
2552  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
2553  fig.colorbar(cax)
2554  pl.savefig(figurename, dpi=300)
2555  pl.show()
2556 
2557  def plot_bars(
2558  self,
2559  filename,
2560  prots1,
2561  prots2,
2562  nxl_per_row=20,
2563  arrangement="inter",
2564  confidence_input="None"):
2565 
2566  data = []
2567  for xl in self.cross_link_distances:
2568  (r1, c1, r2, c2, mdist, confidence) = xl
2569  if c1 in prots1 and c2 in prots2:
2570  if arrangement == "inter" and c1 == c2:
2571  continue
2572  if arrangement == "intra" and c1 != c2:
2573  continue
2574  if confidence_input == confidence:
2575  label = str(c1) + ":" + str(r1) + \
2576  "-" + str(c2) + ":" + str(r2)
2577  values = self.cross_link_distances[xl]
2578  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2579  data.append((label, values, mdist, frequency))
2580 
2581  sort_by_dist = sorted(data, key=lambda tup: tup[2])
2582  sort_by_dist = list(zip(*sort_by_dist))
2583  values = sort_by_dist[1]
2584  positions = list(range(len(values)))
2585  labels = sort_by_dist[0]
2586  frequencies = list(map(float, sort_by_dist[3]))
2587  frequencies = [f * 10.0 for f in frequencies]
2588 
2589  nchunks = int(float(len(values)) / nxl_per_row)
2590  values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2591  positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2592  positions,
2593  nchunks)
2594  frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2595  frequencies,
2596  nchunks)
2597  labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2598 
2599  for n, v in enumerate(values_chunks):
2600  p = positions_chunks[n]
2601  f = frequencies_chunks[n]
2602  l = labels_chunks[n]
2603  IMP.pmi.output.plot_fields_box_plots(
2604  filename + "." + str(n), v, p, f,
2605  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
2606 
2607  def crosslink_distance_histogram(self, filename,
2608  prot_list=None,
2609  prot_list2=None,
2610  confidence_classes=None,
2611  bins=40,
2612  color='#66CCCC',
2613  yplotrange=[0, 1],
2614  format="png",
2615  normalized=False):
2616  if prot_list is None:
2617  prot_list = list(self.prot_length_dict.keys())
2618 
2619  distances = []
2620  for xl in self.crosslinks:
2621  (r1, c1, r2, c2, mdist, stdv, confidence,
2622  unique_identifier, descriptor) = xl
2623 
2624  if not confidence_classes is None:
2625  if confidence not in confidence_classes:
2626  continue
2627 
2628  if prot_list2 is None:
2629  if not c1 in prot_list:
2630  continue
2631  if not c2 in prot_list:
2632  continue
2633  else:
2634  if c1 in prot_list and c2 in prot_list2:
2635  pass
2636  elif c1 in prot_list2 and c2 in prot_list:
2637  pass
2638  else:
2639  continue
2640 
2641  distances.append(mdist)
2642 
2643  IMP.pmi.output.plot_field_histogram(
2644  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
2645  bins=bins, color=color,
2646  format=format,
2647  reference_xline=35.0,
2648  yplotrange=yplotrange, normalized=normalized)
2649 
2650  def scatter_plot_xl_features(self, filename,
2651  feature1=None,
2652  feature2=None,
2653  prot_list=None,
2654  prot_list2=None,
2655  yplotrange=None,
2656  reference_ylines=None,
2657  distance_color=True,
2658  format="png"):
2659  import matplotlib as mpl
2660  mpl.use('Agg')
2661  import matplotlib.pyplot as plt
2662  import matplotlib.cm as cm
2663 
2664  fig = plt.figure(figsize=(10, 10))
2665  ax = fig.add_subplot(111)
2666 
2667  for xl in self.crosslinks:
2668  (r1, c1, r2, c2, mdist, stdv, confidence,
2669  unique_identifier, arrangement) = xl
2670 
2671  if prot_list2 is None:
2672  if not c1 in prot_list:
2673  continue
2674  if not c2 in prot_list:
2675  continue
2676  else:
2677  if c1 in prot_list and c2 in prot_list2:
2678  pass
2679  elif c1 in prot_list2 and c2 in prot_list:
2680  pass
2681  else:
2682  continue
2683 
2684  xldb = self.external_csv_data[unique_identifier]
2685  xldb.update({"Distance": mdist})
2686  xldb.update({"Distance_stdv": stdv})
2687 
2688  xvalue = float(xldb[feature1])
2689  yvalue = float(xldb[feature2])
2690 
2691  if distance_color:
2692  color = self.colormap(mdist)
2693  else:
2694  color = "gray"
2695 
2696  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
2697 
2698  if not yplotrange is None:
2699  ax.set_ylim(yplotrange)
2700  if not reference_ylines is None:
2701  for rl in reference_ylines:
2702  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
2703 
2704  if filename:
2705  plt.savefig(filename + "." + format, dpi=150, transparent=False)
2706 
2707  plt.show()