IMP Reference Guide 2.12.0
The Integrative Modeling Platform
pmi/Analysis.py
1 #!/usr/bin/env python
2 
3 """@namespace IMP.pmi.analysis
4  Tools for clustering and cluster analysis
5 """
6 from __future__ import print_function
7 import IMP
8 import IMP.algebra
9 import IMP.em
10 import IMP.pmi
11 import IMP.pmi.tools
12 import IMP.pmi.output
13 import IMP.rmf
14 import RMF
15 import IMP.pmi.analysis
16 from operator import itemgetter
17 from copy import deepcopy
18 from math import log,sqrt
19 import itertools
20 import numpy as np
21 
22 
23 class Alignment(object):
24  """Performs alignment and RMSD calculation for two sets of coordinates
25 
26  The class also takes into account non-equal stoichiometry of the proteins.
27  If this is the case, the names of proteins present in multiple copies
28  should be specified in the following form:
29  nameA..1, nameA..2 (note two dots).
30  """
31 
32  def __init__(self, template, query, weights=None):
33  """Constructor.
34  @param template {'p1':coords(L,3), 'p2':coords(L,3)}
35  @param query {'p1':coords(L,3), 'p2':coords(L,3)}
36  @param weights optional per-particle weights (a dictionary keyed like template and query), used for weighted RMSD
37  """
38  self.query = query
39  self.template = template
40  self.weights=weights
41 
42  if len(self.query.keys()) != len(self.template.keys()):
43  raise ValueError('''the number of proteins
44  in template and query does not match!''')
45 
46  def permute(self):
47  # get unique protein names for each protein
48  # this is, unfortunately, expecting that names are all 'Molname..X'
49  # where X is different for each copy.
50  self.proteins = sorted(self.query.keys())
51  prots_uniq = [i.split('..')[0] for i in self.proteins]
52 
53  # for each unique name, store list of permutations
54  # e.g. for keys A..1,A..2 store P[A] = [[A..1,A..2],[A..2,A..1]]
55  # then store the product: [[[A1,A2],[B1,B2]],[[A1,A2],[B2,B1]],
56  # [[A2,A1],[B1,B2]],[[A2,A1],[B2,B1]]]
57  P = {}
58  for p in prots_uniq:
59  ncopies = prots_uniq.count(p)  # number of copies (avoids shadowing numpy's 'np')
60  copies = [i for i in self.proteins if i.split('..')[0] == p]
61  prmts = list(itertools.permutations(copies, len(copies)))
62  P[p] = prmts
63  self.P = P
64  self.Product = list(itertools.product(*P.values()))
65 
66  def get_rmsd(self):
67 
68  self.permute()
69 
70  template_xyz = []
71  weights = []
72  torder = sum([list(i) for i in self.Product[0]], [])
73  for t in torder:
74  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
75  if self.weights is not None:
76  weights += [i for i in self.weights[t]]
77  #template_xyz = np.array(template_xyz)
78 
79  self.rmsd = 10000000000.
80  for comb in self.Product:
81 
82  order = sum([list(i) for i in comb], [])
83  query_xyz = []
84  for p in order:
85  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
86  #query_xyz = np.array(query_xyz)
87  #if len(template_xyz) != len(query_xyz):
88  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
89  # in template and query does not match!'''
90  # exit()
91 
92  if self.weights is not None:
93  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
94  else:
95  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
96  #dist = sqrt(
97  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
98  if dist < self.rmsd:
99  self.rmsd = dist
100  return self.rmsd
101 
102  def align(self):
103  from scipy.spatial.distance import cdist
104 
105  self.permute()
106 
107  # create flat coordinate list from template in standard order
108  # then loop through the permutations and try to align and get RMSD
109  # should allow you to try all mappings within each protein
110  template_xyz = []
111  torder = sum([list(i) for i in self.Product[0]], [])
112  for t in torder:
113  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
114  # template_xyz = np.array(template_xyz)
115  self.rmsd, Transformation = 10000000000., ''
116 
117  # then for each permutation, get flat list of coords and get RMSD
118  for comb in self.Product:
119  order = sum([list(i) for i in comb], [])
120  query_xyz = []
121  for p in order:
122  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
123  #query_xyz = np.array(query_xyz)
124 
125  if len(template_xyz) != len(query_xyz):
126  raise ValueError('''the number of coordinates
127  in template and query does not match!''')
128 
129  transformation = IMP.algebra.get_transformation_aligning_first_to_second(
130  query_xyz,
131  template_xyz)
132  query_xyz_tr = [transformation.get_transformed(n)
133  for n in query_xyz]
134 
135  dist = sqrt(
136  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
137  if dist < self.rmsd:
138  self.rmsd = dist
139  Transformation = transformation
140 
141  # return the transformation
142  return (self.rmsd, Transformation)
143 
144 
145 # TEST for the alignment ###
146 """
147 Proteins = {'a..1':np.array([np.array([-1., 1., 0.])]),
148  'a..2':np.array([np.array([1., 1., 0.])]),
149  'a..3':np.array([np.array([-2., 1., 0.])]),
150  'b':np.array([np.array([0., -1., 0.])]),
151  'c..1':np.array([np.array([-1., -1., 0.])]),
152  'c..2':np.array([np.array([1., -1., 0.])]),
153  'd':np.array([np.array([0., 0., 0.])]),
154  'e':np.array([np.array([0., 1., 0.])])}
155 
156 Ali = Alignment(Proteins, Proteins)
157 Ali.permute()
158 if Ali.get_rmsd() == 0.0: print('successful test!')
159 else: print('ERROR!'); exit()
160 """
161 
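# Example usage of Alignment (sketch; the dictionary keys and coordinate
# values below are made up for illustration). Two copies of ProtA are given
# in swapped order in the query, so the best permutation yields zero RMSD.
"""
template = {'ProtA..1': np.array([[0., 0., 0.], [1., 0., 0.]]),
            'ProtA..2': np.array([[5., 0., 0.], [6., 0., 0.]]),
            'ProtB':    np.array([[0., 5., 0.]])}
query = {'ProtA..1': template['ProtA..2'],
         'ProtA..2': template['ProtA..1'],
         'ProtB':    template['ProtB']}

ali = Alignment(template, query)
print(ali.get_rmsd())                 # 0.0, found by permuting the copies
rmsd, transformation = ali.align()    # also returns the best-fit transformation
"""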
162 
163 # ----------------------------------
164 class Violations(object):
165 
166  def __init__(self, filename):
167 
168  self.violation_thresholds = {}
169  self.violation_counts = {}
170 
171  data = open(filename)
172  D = data.readlines()
173  data.close()
174 
175  for d in D:
176  d = d.strip().split()
177  self.violation_thresholds[d[0]] = float(d[1])
178 
179  def get_number_violated_restraints(self, rsts_dict):
180  num_violated = 0
181  for rst in self.violation_thresholds:
182  if rst not in rsts_dict:
183  continue # print rst;
184  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
185  num_violated += 1
186  if rst not in self.violation_counts:
187  self.violation_counts[rst] = 1
188  else:
189  self.violation_counts[rst] += 1
190  return num_violated
191 
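# Example usage of Violations (sketch; 'thresholds.txt' and the restraint
# names are placeholders). The input file simply lists one
# "restraint_name threshold" pair per whitespace-separated line, matching
# the parsing in the constructor above.
"""
viol = Violations('thresholds.txt')
# rsts_dict maps restraint names to their current values, e.g. as read
# from a stat file
rsts_dict = {'DistanceRestraint_1': 12.3, 'DistanceRestraint_2': 3.2}
print(viol.get_number_violated_restraints(rsts_dict))
print(viol.violation_counts)   # per-restraint violation tallies
"""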
192 
193 # ----------------------------------
194 class Clustering(object):
195  """A class to cluster structures.
196  Uses scipy's cdist function to compute distance matrices
197  and sklearn's kmeans clustering module.
198  """
199  def __init__(self,rmsd_weights=None):
200  """Constructor.
201  @param rmsd_weights Flat list of weights for each particle
202  (useful when particles are coarse-grained beads)
203  """
204  try:
205  from mpi4py import MPI
206  self.comm = MPI.COMM_WORLD
207  self.rank = self.comm.Get_rank()
208  self.number_of_processes = self.comm.size
209  except ImportError:
210  self.number_of_processes = 1
211  self.rank = 0
212  self.all_coords = {}
213  self.structure_cluster_ids = None
214  self.tmpl_coords = None
215  self.rmsd_weights=rmsd_weights
216 
217  def set_template(self, part_coords):
218 
219  self.tmpl_coords = part_coords
220 
221  def fill(self, frame, Coords):
222  """Add coordinates for a single model."""
223 
224  self.all_coords[frame] = Coords
225 
226  def dist_matrix(self):
227 
228  self.model_list_names = list(self.all_coords.keys())
229  self.model_indexes = list(range(len(self.model_list_names)))
230  self.model_indexes_dict = dict(
231  list(zip(self.model_list_names, self.model_indexes)))
232  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
233 
234  my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235  model_indexes_unique_pairs,
236  self.number_of_processes)[self.rank]
237 
238  print("process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
239 
240  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
241  self.tmpl_coords,
242  my_model_indexes_unique_pairs)
243 
244  if self.number_of_processes > 1:
245  raw_distance_dict = IMP.pmi.tools.scatter_and_gather(
246  raw_distance_dict)
247  pickable_transformations = self.get_pickable_transformation_distance_dict(
248  )
249  pickable_transformations = IMP.pmi.tools.scatter_and_gather(
250  pickable_transformations)
251  self.set_transformation_distance_dict_from_pickable(
252  pickable_transformations)
253 
254  self.raw_distance_matrix = np.zeros(
255  (len(self.model_list_names), len(self.model_list_names)))
256  for item in raw_distance_dict:
257  (f1, f2) = item
258  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
260 
261  def get_dist_matrix(self):
262  return self.raw_distance_matrix
263 
264  def do_cluster(self, number_of_clusters,seed=None):
265  """Run K-means clustering
266  @param number_of_clusters Number of clusters (the K in K-means)
267  @param seed the random seed
268  """
269  from sklearn.cluster import KMeans
270  if seed is not None:
271  np.random.seed(seed)
272  try:
273  # check whether we have the right version of sklearn
274  kmeans = KMeans(n_clusters=number_of_clusters)
275  except TypeError:
276  # sklearn older than 0.12
277  kmeans = KMeans(k=number_of_clusters)
278  kmeans.fit_predict(self.raw_distance_matrix)
279 
280  self.structure_cluster_ids = kmeans.labels_
281 
282  def get_pickable_transformation_distance_dict(self):
283  pickable_transformations = {}
284  for label in self.transformation_distance_dict:
285  tr = self.transformation_distance_dict[label]
286  trans = tuple(tr.get_translation())
287  rot = tuple(tr.get_rotation().get_quaternion())
288  pickable_transformations[label] = (rot, trans)
289  return pickable_transformations
290 
291  def set_transformation_distance_dict_from_pickable(
292  self,
293  pickable_transformations):
294  self.transformation_distance_dict = {}
295  for label in pickable_transformations:
296  tr = pickable_transformations[label]
297  trans = IMP.algebra.Vector3D(tr[1])
298  rot = IMP.algebra.Rotation3D(tr[0])
299  self.transformation_distance_dict[
300  label] = IMP.algebra.Transformation3D(rot, trans)
301 
302  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
303  import pickle
304  outf = open(file_name + ".data", 'wb')
305 
306  # to pickle the transformation dictionary
307  # you have to save the arrays corresponding to
308  # the transformations
309 
310  pickable_transformations = self.get_pickable_transformation_distance_dict(
311  )
312  pickle.dump(
313  (self.structure_cluster_ids,
314  self.model_list_names,
315  pickable_transformations),
316  outf)
317 
318  np.save(file_name + ".npy", self.raw_distance_matrix)
319 
320  def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
321  import pickle
322 
323  inputf = open(file_name + ".data", 'rb')
324  (self.structure_cluster_ids, self.model_list_names,
325  pickable_transformations) = pickle.load(inputf)
326  inputf.close()
327 
328  self.raw_distance_matrix = np.load(file_name + ".npy")
329 
330  self.set_transformation_distance_dict_from_pickable(
331  pickable_transformations)
332  self.model_indexes = list(range(len(self.model_list_names)))
333  self.model_indexes_dict = dict(
334  list(zip(self.model_list_names, self.model_indexes)))
335 
336  def plot_matrix(self, figurename="clustermatrix.pdf"):
337  import matplotlib as mpl
338  mpl.use('Agg')
339  import matplotlib.pylab as pl
340  from scipy.cluster import hierarchy as hrc
341 
342  fig = pl.figure(figsize=(10,8))
343  ax = fig.add_subplot(212)
344  dendrogram = hrc.dendrogram(
345  hrc.linkage(self.raw_distance_matrix),
346  color_threshold=7,
347  no_labels=True)
348  leaves_order = dendrogram['leaves']
349  ax.set_xlabel('Model')
350  ax.set_ylabel('RMSD [Angstroms]')
351 
352  ax2 = fig.add_subplot(221)
353  cax = ax2.imshow(
354  self.raw_distance_matrix[leaves_order,
355  :][:,
356  leaves_order],
357  interpolation='nearest')
358  cb = fig.colorbar(cax)
359  cb.set_label('RMSD [Angstroms]')
360  ax2.set_xlabel('Model')
361  ax2.set_ylabel('Model')
362 
363  pl.savefig(figurename, dpi=300)
364  pl.close(fig)
365 
366  def get_model_index_from_name(self, name):
367  return self.model_indexes_dict[name]
368 
369  def get_cluster_labels(self):
370  # return the unique cluster labels
371  return list(set(self.structure_cluster_ids))
372 
373  def get_number_of_clusters(self):
374  return len(self.get_cluster_labels())
375 
376  def get_cluster_label_indexes(self, label):
377  return (
378  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
379  )
380 
381  def get_cluster_label_names(self, label):
382  return (
383  [self.model_list_names[i]
384  for i in self.get_cluster_label_indexes(label)]
385  )
386 
387  def get_cluster_label_average_rmsd(self, label):
388 
389  indexes = self.get_cluster_label_indexes(label)
390 
391  if len(indexes) > 1:
392  sub_distance_matrix = self.raw_distance_matrix[
393  indexes, :][:, indexes]
394  average_rmsd = np.sum(sub_distance_matrix) / \
395  (len(sub_distance_matrix)
396  ** 2 - len(sub_distance_matrix))
397  else:
398  average_rmsd = 0.0
399  return average_rmsd
400 
401  def get_cluster_label_size(self, label):
402  return len(self.get_cluster_label_indexes(label))
403 
404  def get_transformation_to_first_member(
405  self,
406  cluster_label,
407  structure_index):
408  reference = self.get_cluster_label_indexes(cluster_label)[0]
409  return self.transformation_distance_dict[(reference, structure_index)]
410 
411  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
412 
413  model_list_names = list(all_coords.keys())
414  rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415  raw_distance_dict = {}
416  transformation_distance_dict = {}
417  if template_coords is None:
418  do_alignment = False
419  else:
420  do_alignment = True
421  alignment_template_protein_names = list(template_coords.keys())
422 
423  for (f1, f2) in list_of_pairs:
424 
425  if not do_alignment:
426  # here we only get the rmsd,
427  # we need that, for instance, when you want to cluster conformations
428  # globally, e.g. when an EM map is the reference
429  transformation = IMP.algebra.get_identity_transformation_3d()
430 
431  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432  for pr in rmsd_protein_names])
433  coords_f2 = {}
434  for pr in rmsd_protein_names:
435  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
436 
437  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
438  rmsd = Ali.get_rmsd()
439 
440  elif do_alignment:
441  # here we actually align the conformations first
442  # and then calculate the rmsd. We need that when the
443  # protein(s) is the reference
444  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445  for pr in alignment_template_protein_names])
446  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447  for pr in alignment_template_protein_names])
448 
449  Ali = Alignment(coords_f1, coords_f2)
450  template_rmsd, transformation = Ali.align()
451 
452  # here we calculate the rmsd
453  # we will align two models based on the number of subunits provided
454  # and transform coordinates of model 2 to model 1
455  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456  for pr in rmsd_protein_names])
457  coords_f2 = {}
458  for pr in rmsd_protein_names:
459  coords_f2[pr] = [transformation.get_transformed(
460  i) for i in all_coords[model_list_names[f2]][pr]]
461 
462  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
463  rmsd = Ali.get_rmsd()
464 
465  raw_distance_dict[(f1, f2)] = rmsd
466  raw_distance_dict[(f2, f1)] = rmsd
467  transformation_distance_dict[(f1, f2)] = transformation
468  transformation_distance_dict[(f2, f1)] = transformation
469 
470  return raw_distance_dict, transformation_distance_dict
471 
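# Example Clustering workflow (sketch; coords_by_frame is a placeholder for
# coordinate dictionaries extracted beforehand, one per model, each of the
# form {'ProtA..1': [(x, y, z), ...], ...}).
"""
clustering = Clustering()
for frame_name, coords in coords_by_frame.items():
    clustering.fill(frame_name, coords)

clustering.dist_matrix()                      # all-vs-all RMSD (MPI-aware)
clustering.do_cluster(number_of_clusters=3)
for label in clustering.get_cluster_labels():
    print(label,
          clustering.get_cluster_label_size(label),
          clustering.get_cluster_label_average_rmsd(label))
clustering.save_distance_matrix_file('cluster.rawmatrix.pkl')
clustering.plot_matrix('clustermatrix.pdf')
"""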
472 
473 class RMSD(object):
474  """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475  To be used with pmi2 hierarchies. Can be used for instance as follows:
476 
477  rmsd=IMP.pmi.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478  output_objects.append(rmsd)
479 
479 
480  The object should be created before shuffling the coordinates.
481  """
482 
483  def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
484  """
485  @param hier0 first input hierarchy
486  @param hier1 second input hierarchy
487  @param molnames the names of the molecules used for the RMSD
488  @param dynamic0 if True, store the XYZ decorators so that the coordinates of hier0 can be updated; if False, the coordinates are static (stored as Vector3Ds)
489  and will never be updated
490  @param dynamic1 same as above, for hier1
491  @param metric the function used to compare the two coordinate sets (default: IMP.algebra.get_rmsd)
492  """
493  self.moldict0,self.molcoords0,self.mol_XYZs0=self.get_moldict_coorddict(hier0,molnames)
494  self.moldict1,self.molcoords1,self.mol_XYZs1=self.get_moldict_coorddict(hier1,molnames)
495  self.dynamic0=dynamic0
496  self.dynamic1=dynamic1
497  self.metric=metric
498  self.label=label
499 
500  def get_moldict_coorddict(self,hier,molnames):
501  """return data structure for the RMSD calculation"""
502  moldict={}
503  mol_coords={}
504  mol_XYZs={}
505  for mol in IMP.pmi.tools.get_molecules(hier):
506  name=mol.get_name()
507  if name not in molnames:
508  continue
509  parts=True
510  mol_coords[mol]=[]
511  mol_XYZs[mol]=[]
512  i=1
513  while parts:
514  sel=IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515  parts=sel.get_selected_particles()
516  if parts:
517  mol_coords[mol].append(IMP.core.XYZ(parts[0]).get_coordinates())
518  mol_XYZs[mol].append(IMP.core.XYZ(parts[0]))
519  i=i+1
520  if name in moldict:
521  moldict[name].append(mol)
522  else:
523  moldict[name]=[mol]
524  return moldict, mol_coords, mol_XYZs
525 
526  def get_rmsd_and_assigments(self):
527  best_orders=[]
528  total_rmsd=0
529  total_N=0
530  best_assignments=[]
531  rmsd_dict={}
532  for molname, ref_mols in self.moldict1.items():
533  ref_coords=[]
534  for ref_mol in ref_mols:
535  if self.dynamic1:
536  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
537  else:
538  coord1=self.molcoords1[ref_mol]
539  ref_coords+=coord1
540 
541  rmsd=[]
542  rmf_mols_list=[]
543  for rmf_mols in itertools.permutations(self.moldict0[molname]):
544  rmf_coords=[]
545  for rmf_mol in rmf_mols:
546  if self.dynamic0:
547  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
548  else:
549  coord0=self.molcoords0[rmf_mol]
550  rmf_coords+=coord0
551 
552  rmsd.append(IMP.algebra.get_rmsd(ref_coords, rmf_coords))
553  rmf_mols_list.append(rmf_mols)
554  m=min(rmsd)
555  rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
556 
557 
558  for n, (ref_mol,rmf_mol) in enumerate(zip(ref_mols,rmf_mols_best_order)):
559  best_assignments.append((rmf_mol,ref_mol))
560  if self.dynamic0:
561  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
562  else:
563  coord0=self.molcoords0[rmf_mol]
564 
565  if self.dynamic1:
566  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
567  else:
568  coord1=self.molcoords1[ref_mol]
569  rmsd_pair=self.metric(coord1, coord0)
570  N=len(self.molcoords1[ref_mol])
571  total_N+=N
572  total_rmsd+=rmsd_pair*rmsd_pair*N
573  rmsd_dict[ref_mol]=rmsd_pair
574  total_rmsd = sqrt(total_rmsd/total_N)
575  return total_rmsd,best_assignments
576 
577  def get_output(self):
578  """Returns output for IMP.pmi.output.Output object"""
579  total_rmsd,best_assignments=self.get_rmsd_and_assigments()
580 
581  assignments_out=[]
582  for rmf_mol,ref_mol in best_assignments:
583  ref_name=ref_mol.get_name()
584  ref_copy=IMP.atom.Copy(ref_mol).get_copy_index()
585  rmf_name=rmf_mol.get_name()
586  rmf_copy=IMP.atom.Copy(rmf_mol).get_copy_index()
587  assignments_out.append(rmf_name+"."+str(rmf_copy)+"->"+ref_name+"."+str(ref_copy))
588  return {"RMSD_"+self.label:str(total_rmsd),"RMSD_assignments_"+self.label:str(assignments_out)}
589 
590 
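# Example of the output produced by RMSD.get_output() (sketch; 'hier',
# 'mols' and the label are placeholders, as in the class docstring above).
"""
rmsd = RMSD(hier, hier, [mol.get_name() for mol in mols],
            label="wrt_start", dynamic0=True, dynamic1=False)
# ... sample or shuffle the coordinates of the first hierarchy ...
out = rmsd.get_output()
# out is a dictionary of strings, e.g.
# {"RMSD_wrt_start": "12.4",
#  "RMSD_assignments_wrt_start": "['ProtA.0->ProtA.1', ...]"}
"""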
591 
592 
593 class Precision(object):
594  """A class to evaluate the precision of an ensemble.
595 
596  It can also evaluate the cross-precision of multiple ensembles.
597  Supports MPI for coordinate reading.
598  Recommended procedure:
599  -# initialize object and pass the selection for evaluating precision
600  -# call add_structures() to read in the data (specify group name)
601  -# call get_precision() to evaluate inter/intra precision
602  -# call get_rmsf() to evaluate within-group fluctuations
603  """
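 # Typical workflow (sketch; the file names, group names and selection
 # dictionary below are placeholders):
 #
 #   model = IMP.Model()
 #   pr = Precision(model, resolution=1,
 #                  selection_dictionary={"ProtA": ["ProtA"]})
 #   pr.add_structures([("cluster0.rmf3", 0), ("cluster0.rmf3", 1)],
 #                     structure_set_name="cluster0")
 #   pr.add_structures([("cluster1.rmf3", 0)],
 #                     structure_set_name="cluster1")
 #   pr.get_precision("cluster0", "cluster0", outfile="precision.intra.dat")
 #   pr.get_precision("cluster0", "cluster1", outfile="precision.cross.dat")
 #   pr.get_rmsf("cluster0", outdir="./")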
604  def __init__(self,model,
605  resolution=1,
606  selection_dictionary={}):
607  """Constructor.
608  @param model The IMP Model
609  @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610  part of the hier name)
611  @param selection_dictionary Dictionary where keys are names for
612  selections and values are selection tuples for scoring
613  precision. "All" is automatically made as well
614  """
615  try:
616  from mpi4py import MPI
617  self.comm = MPI.COMM_WORLD
618  self.rank = self.comm.Get_rank()
619  self.number_of_processes = self.comm.size
620  except ImportError:
621  self.number_of_processes = 1
622  self.rank = 0
623 
624  self.styles = ['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q',
625  'pairwise_drms_k','drmsd_from_center']
626  self.style = 'pairwise_drmsd_k'
627  self.structures_dictionary = {}
628  self.reference_structures_dictionary = {}
629  self.prots = []
630  self.protein_names = None
631  self.len_particles_resolution_one = None
632  self.model = model
633  self.rmf_names_frames = {}
634  self.reference_rmf_names_frames = None
635  self.reference_structure = None
636  self.reference_prot = None
637  self.selection_dictionary = selection_dictionary
638  self.threshold = 40.0
639  self.residue_particle_index_map = None
640  self.prots = None
641  if resolution in [1,10]:
642  self.resolution = resolution
643  else:
644  raise KeyError("Currently only allow resolution 1 or 10")
645 
646  def _get_structure(self,rmf_frame_index,rmf_name):
647  """Read an RMF file and return the particles"""
648  rh = RMF.open_rmf_file_read_only(rmf_name)
649  if not self.prots:
650  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
651  self.prots = IMP.rmf.create_hierarchies(rh, self.model)
652  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
653  else:
654  print("linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
655  IMP.rmf.link_hierarchies(rh, self.prots)
656  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
657  del rh
658 
659  if self.resolution==1:
660  particle_dict = get_particles_at_resolution_one(self.prots[0])
661  elif self.resolution==10:
662  particle_dict = get_particles_at_resolution_ten(self.prots[0])
663 
664  protein_names = list(particle_dict.keys())
665  particles_resolution_one = []
666  for k in particle_dict:
667  particles_resolution_one += (particle_dict[k])
668 
669  if self.protein_names==None:
670  self.protein_names = protein_names
671  else:
672  if self.protein_names!=protein_names:
673  print("Error: the protein names of the new coordinate set is not compatible with the previous one")
674 
675  if self.len_particles_resolution_one==None:
676  self.len_particles_resolution_one = len(particles_resolution_one)
677  else:
678  if self.len_particles_resolution_one!=len(particles_resolution_one):
679  raise ValueError("the new coordinate set is not compatible with the previous one")
680 
681  return particles_resolution_one
682 
683  def add_structure(self,
684  rmf_name,
685  rmf_frame_index,
686  structure_set_name,
687  setup_index_map=False):
688  """ Read a structure into the ensemble and store (as coordinates).
689  @param rmf_name The name of the RMF file
690  @param rmf_frame_index The frame to read
691  @param structure_set_name Name for the set that includes this structure
692  (e.g. "cluster 1")
693  @param setup_index_map if requested, set up a dictionary to help
694  find residue indexes
695  """
696 
697  # decide where to put this structure
698  if structure_set_name in self.structures_dictionary:
699  cdict = self.structures_dictionary[structure_set_name]
700  rmflist = self.rmf_names_frames[structure_set_name]
701  else:
702  self.structures_dictionary[structure_set_name]={}
703  self.rmf_names_frames[structure_set_name]=[]
704  cdict = self.structures_dictionary[structure_set_name]
705  rmflist = self.rmf_names_frames[structure_set_name]
706 
707  # read the particles
708  try:
709  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
710  except ValueError:
711  print("something wrong with the rmf")
712  return 0
713  self.selection_dictionary.update({"All":self.protein_names})
714 
715  for selection_name in self.selection_dictionary:
716  selection_tuple = self.selection_dictionary[selection_name]
717  coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718  if selection_name not in cdict:
719  cdict[selection_name] = [coords]
720  else:
721  cdict[selection_name].append(coords)
722 
723  rmflist.append((rmf_name,rmf_frame_index))
724 
725  # if requested, set up a dictionary to help find residue indexes
726  if setup_index_map:
727  self.residue_particle_index_map = {}
728  for prot_name in self.protein_names:
729  self.residue_particle_index_map[prot_name] = \
730  self._get_residue_particle_index_map(
731  prot_name,
732  particles_resolution_one,self.prots[0])
733 
734  def add_structures(self,
735  rmf_name_frame_tuples,
736  structure_set_name):
737  """Read a list of RMFs, supports parallel
738  @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739  @param structure_set_name Name this set of structures (e.g. "cluster.1")
740  """
741 
742  # split up the requested list to read in parallel
743  my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
744  rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
746  rmf_name=tup[0]
747  rmf_frame_index=tup[1]
748  # the first frame stores the map between residues and particles
749  if self.residue_particle_index_map is None:
750  setup_index_map = True
751  else:
752  setup_index_map = False
753  self.add_structure(rmf_name,
754  rmf_frame_index,
755  structure_set_name,
756  setup_index_map)
757 
758  # synchronize the structures
759  if self.number_of_processes > 1:
760  self.rmf_names_frames=IMP.pmi.tools.scatter_and_gather(self.rmf_names_frames)
761  if self.rank != 0:
762  self.comm.send(self.structures_dictionary, dest=0, tag=11)
763  elif self.rank == 0:
764  for i in range(1, self.number_of_processes):
765  data_tmp = self.comm.recv(source=i, tag=11)
766  for key in self.structures_dictionary:
767  self.structures_dictionary[key].update(data_tmp[key])
768  for i in range(1, self.number_of_processes):
769  self.comm.send(self.structures_dictionary, dest=i, tag=11)
770  if self.rank != 0:
771  self.structures_dictionary = self.comm.recv(source=0, tag=11)
772 
773  def _get_residue_particle_index_map(self,prot_name,structure,hier):
774  # Creates map from all particles to residue numbers
775  residue_particle_index_map = []
776  if IMP.pmi.get_is_canonical(hier):
777  s = IMP.atom.Selection(hier,molecules=[prot_name],
778  resolution=1)
779  else:
780  s = IMP.atom.Selection(hier,molecules=[prot_name])
781  all_selected_particles = s.get_selected_particles()
782  intersection = list(set(all_selected_particles) & set(structure))
783  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
784  for p in sorted_intersection:
785  residue_particle_index_map.append(IMP.pmi.tools.get_residue_indexes(p))
786  return residue_particle_index_map
787 
788 
789  def _select_coordinates(self,tuple_selections,structure,prot):
790  selected_coordinates=[]
791  for t in tuple_selections:
792  if isinstance(t, tuple) and len(t)==3:
793  if IMP.pmi.get_is_canonical(prot):
794  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1),
795  resolution=1)
796  else:
797  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1))
798  all_selected_particles = s.get_selected_particles()
799  intersection = list(set(all_selected_particles) & set(structure))
800  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
801  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
802  selected_coordinates += cc
803  elif isinstance(t, str):
804  if IMP.pmi.get_is_canonical(prot):
805  s = IMP.atom.Selection(prot,molecules=[t],resolution=1)
806  else:
807  s = IMP.atom.Selection(prot,molecules=[t])
808  all_selected_particles = s.get_selected_particles()
809  intersection = list(set(all_selected_particles) & set(structure))
810  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
811  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
812  selected_coordinates += cc
813  else:
814  raise ValueError("Selection error")
815  return selected_coordinates
816 
817  def set_threshold(self,threshold):
818  self.threshold = threshold
819 
820  def _get_distance(self,
821  structure_set_name1,
822  structure_set_name2,
823  selection_name,
824  index1,
825  index2):
826  """ Compute distance between structures with various metrics """
827  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
828  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
829  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
830  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
831 
832  if self.style=='pairwise_drmsd_k':
833  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
834  if self.style=='pairwise_drms_k':
835  distance=IMP.atom.get_drms(coordinates1,coordinates2)
836  if self.style=='pairwise_drmsd_Q':
837  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
838 
839  if self.style=='pairwise_rmsd':
840  distance=IMP.algebra.get_rmsd(coordinates1,coordinates2)
841  return distance
842 
843  def _get_particle_distances(self,structure_set_name1,structure_set_name2,
844  selection_name,index1,index2):
845  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
846  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
847 
848  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
849  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
850 
851  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
852 
853  return distances
854 
855  def get_precision(self,
856  structure_set_name1,
857  structure_set_name2,
858  outfile=None,
859  skip=1,
860  selection_keywords=None):
861  """ Evaluate the precision of two named structure groups. Supports MPI.
862  When structure_set_name1 differs from structure_set_name2,
863  this evaluates the cross-precision (average pairwise distances).
864  @param outfile Name of the precision output file
865  @param structure_set_name1 string name of the first structure set
866  @param structure_set_name2 string name of the second structure set
867  @param skip analyze every (skip)-th structure for the distance matrix calculation
868  @param selection_keywords Specify the selection name you want to calculate on.
869  By default this is computed for everything you provided in the constructor,
870  plus all the subunits together.
871  """
872  if selection_keywords is None:
873  sel_keys = list(self.selection_dictionary.keys())
874  else:
875  for k in selection_keywords:
876  if k not in self.selection_dictionary:
877  raise KeyError("you are trying to find named selection " \
878  + k + " which was not requested in the constructor")
879  sel_keys = selection_keywords
880 
881  if outfile is not None:
882  of = open(outfile,"w")
883  centroid_index = 0
884  for selection_name in sel_keys:
885  number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
886  number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
887  distances={}
888  structure_pointers_1 = list(range(0,number_of_structures_1,skip))
889  structure_pointers_2 = list(range(0,number_of_structures_2,skip))
890  pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
891  if len(pair_combination_list)==0:
892  raise ValueError("no structure selected. Check the skip parameter.")
893 
894  # compute pairwise distances in parallel
895  my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
896  pair_combination_list,self.number_of_processes)[self.rank]
897  my_length = len(my_pair_combination_list)
898  for n,pair in enumerate(my_pair_combination_list):
899  progression = int(float(n)/my_length*100.0)
900  distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
901  selection_name,pair[0],pair[1])
902  if self.number_of_processes > 1:
903  distances = IMP.pmi.tools.scatter_and_gather(distances)
904 
905  # Finally compute distance to centroid
906  if self.rank == 0:
907  if structure_set_name1==structure_set_name2:
908  structure_pointers = structure_pointers_1
909  number_of_structures = number_of_structures_1
910 
911  # calculate the distance from the first centroid
912  # and determine the centroid
913  distance = 0.0
914  distances_to_structure = {}
915  distances_to_structure_normalization = {}
916 
917  for n in structure_pointers:
918  distances_to_structure[n] = 0.0
919  distances_to_structure_normalization[n]=0
920 
921  for k in distances:
922  distance += distances[k]
923  distances_to_structure[k[0]] += distances[k]
924  distances_to_structure[k[1]] += distances[k]
925  distances_to_structure_normalization[k[0]] += 1
926  distances_to_structure_normalization[k[1]] += 1
927 
928  for n in structure_pointers:
929  distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
930 
931  min_distance = min([distances_to_structure[n] for n in distances_to_structure])
932  centroid_index = [k for k, v in distances_to_structure.items() if v == min_distance][0]
933  centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
934 
935  centroid_distance = 0.0
936  distance_list = []
937  for n in range(number_of_structures):
938  dist = self._get_distance(structure_set_name1,structure_set_name1,
939  selection_name,centroid_index,n)
940  centroid_distance += dist
941  distance_list.append(dist)
942 
943  #pairwise_distance=distance/len(distances.keys())
944  centroid_distance /= number_of_structures
945  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
946  if outfile is not None:
947  of.write(str(selection_name)+" "+structure_set_name1+
948  " average centroid distance "+str(centroid_distance)+"\n")
949  of.write(str(selection_name)+" "+structure_set_name1+
950  " centroid index "+str(centroid_index)+"\n")
951  of.write(str(selection_name)+" "+structure_set_name1+
952  " centroid rmf name "+str(centroid_rmf_name)+"\n")
953  of.write(str(selection_name)+" "+structure_set_name1+
954  " median centroid distance "+str(np.median(distance_list))+"\n")
955 
956  average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
957  if outfile is not None:
958  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
959  " average pairwise distance "+str(average_pairwise_distances)+"\n")
960  if outfile is not None:
961  of.close()
962  return centroid_index
963 
964  def get_rmsf(self,
965  structure_set_name,
966  outdir="./",
967  skip=1,
968  set_plot_yaxis_range=None):
969  """ Calculate the residue mean square fluctuations (RMSF).
970  Automatically writes a data file and a PDF plot for each component.
971  @param structure_set_name Which structure set to calculate RMSF for
972  @param outdir Where to write the files
973  @param skip Analyze every (skip)-th structure
974  @param set_plot_yaxis_range In case you need to change the y-axis range of the plot
975  """
976  # get the centroid structure for the whole complex
977  centroid_index = self.get_precision(structure_set_name,
978  structure_set_name,
979  outfile=None,
980  skip=skip)
981  if self.rank==0:
982  for sel_name in self.protein_names:
983  self.selection_dictionary.update({sel_name:[sel_name]})
984  try:
985  number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
986  except KeyError:
987  # that protein was not included in the selection
988  continue
989  rpim = self.residue_particle_index_map[sel_name]
990  outfile = outdir+"/rmsf."+sel_name+".dat"
991  of = open(outfile,"w")
992  residue_distances = {}
993  residue_nblock = {}
994  for index in range(number_of_structures):
995  distances = self._get_particle_distances(structure_set_name,
996  structure_set_name,
997  sel_name,
998  centroid_index,index)
999  for nblock,block in enumerate(rpim):
1000  for residue_number in block:
1001  residue_nblock[residue_number] = nblock
1002  if residue_number not in residue_distances:
1003  residue_distances[residue_number] = [distances[nblock]]
1004  else:
1005  residue_distances[residue_number].append(distances[nblock])
1006 
1007  residues = []
1008  rmsfs = []
1009  for rn in residue_distances:
1010  residues.append(rn)
1011  rmsf = np.std(residue_distances[rn])
1012  rmsfs.append(rmsf)
1013  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
1014 
1015  IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
1016  out_fn=outdir+"/rmsf."+sel_name,display=False,
1017  set_plot_yaxis_range=set_plot_yaxis_range,
1018  xlabel='Residue Number',ylabel='Standard error')
1019  of.close()
1020 
1021 
1022  def set_reference_structure(self,rmf_name,rmf_frame_index):
1023  """Read in a structure used for reference computation.
1024  Needed before calling get_average_distance_wrt_reference_structure()
1025  @param rmf_name The RMF file to read the reference
1026  @param rmf_frame_index The index in that file
1027  """
1028  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1029  self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1030 
1031  for selection_name in self.selection_dictionary:
1032  selection_tuple = self.selection_dictionary[selection_name]
1033  coords = self._select_coordinates(selection_tuple,
1034  particles_resolution_one,self.prots[0])
1035  self.reference_structures_dictionary[selection_name] = coords
1036 
1037  def get_rmsd_wrt_reference_structure_with_alignment(self,structure_set_name,alignment_selection_key):
1038  """First align then calculate RMSD
1039  @param structure_set_name: the name of the structure set
1040  @param alignment_selection_key: the key in self.selection_dictionary for the selection tuples used to compute the alignment
1041  @return: for each structure in the structure set, returns the rmsd
1042  """
1043  ret = {}
1044  if self.reference_structures_dictionary=={}:
1045  print("Cannot compute until you set a reference structure")
1046  return
1047 
1048  align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1049  align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1050  transformations = []
1051  for c in align_coordinates:
1052  Ali = IMP.pmi.analysis.Alignment({"All":align_reference_coordinates}, {"All":c})
1053  transformation = Ali.align()[1]
1054  transformations.append(transformation)
1055  for selection_name in self.selection_dictionary:
1056  reference_coordinates = self.reference_structures_dictionary[selection_name]
1057  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1058  distances = []
1059  for n,sc in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1060  coordinates1 = [transformations[n].get_transformed(IMP.algebra.Vector3D(c)) for c in sc]
1061  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1062  distances.append(distance)
1063  ret[selection_name] = {'all_distances':distances,'average_distance':sum(distances)/len(distances),'minimum_distance':min(distances)}
1064  return ret
1065 
1066  def _get_median(self,list_of_values):
1067  return np.median(np.array(list_of_values))
1068 
1069  def get_average_distance_wrt_reference_structure(self,structure_set_name):
1070  """Compare the structure set to the reference structure.
1071  @param structure_set_name The structure set to compute this on
1072  @note First call set_reference_structure()
1073  """
1074  ret = {}
1075  if self.reference_structures_dictionary=={}:
1076  print("Cannot compute until you set a reference structure")
1077  return
1078  for selection_name in self.selection_dictionary:
1079  reference_coordinates = self.reference_structures_dictionary[selection_name]
1080  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1081  distances = []
1082  for sc in self.structures_dictionary[structure_set_name][selection_name]:
1083  coordinates1 = [IMP.algebra.Vector3D(c) for c in sc]
1084  if self.style=='pairwise_drmsd_k':
1085  distance = IMP.atom.get_drmsd(coordinates1,coordinates2)
1086  if self.style=='pairwise_drms_k':
1087  distance = IMP.atom.get_drms(coordinates1,coordinates2)
1088  if self.style=='pairwise_drmsd_Q':
1089  distance = IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
1090  if self.style=='pairwise_rmsd':
1091  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1092  distances.append(distance)
1093 
1094  print(selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances),'nframes',len(distances))
1095  ret[selection_name] = {'average_distance':sum(distances)/len(distances),'minimum_distance':min(distances)}
1096  return ret
1097 
1098  def get_coordinates(self):
1099  pass
1100 
1101  def set_precision_style(self, style):
1102  if style in self.styles:
1103  self.style=style
1104  else:
1105  raise ValueError("No such style")
1106 
1107 
1108 class GetModelDensity(object):
1109  """Compute mean density maps from structures.
1110 
1111  Keeps a dictionary of density maps,
1112  keys are the component names given in custom_ranges. When you call add_subunits_density, it adds
1113  particle coordinates to the existing density maps.
1114  """
1115 
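 # Typical usage (sketch; the component names, selection tuples, hierarchy
 # list and output path are placeholders):
 #
 #   dens = GetModelDensity({'ProtA': ['ProtA'],
 #                           'ProtB_Nterm': [(1, 100, 'ProtB')]},
 #                          resolution=20.0, voxel=5.0)
 #   for hier in cluster_hierarchies:     # e.g. one hierarchy per RMF frame
 #       dens.add_subunits_density(hier)
 #   dens.write_mrc(path="./densities")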
1116  def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1117  """Constructor.
1118  @param custom_ranges Required. It's a dictionary, keys are the
1119  density component names, values are selection tuples
1120  e.g. {'kin28':[['kin28',1,-1]],
1121  'density_name_1' :[('ccl1')],
1122  'density_name_2' :[(1,142,'tfb3d1'),
1123  (143,700,'tfb3d2')]}
1124  @param representation PMI representation, for doing selections.
1125  Not needed if you only pass hierarchies
1126  @param resolution The MRC resolution of the output map (in Angstrom unit)
1127  @param voxel The voxel size for the output map (lower is slower)
1128  """
1129 
1130  self.representation = representation
1131  self.MRCresolution = resolution
1132  self.voxel = voxel
1133  self.densities = {}
1134  self.count_models = 0.0
1135  self.custom_ranges = custom_ranges
1136 
1137  def add_subunits_density(self, hierarchy=None):
1138  """Add a frame to the densities.
1139  @param hierarchy Optionally pass the hierarchy to read coordinates from.
1140  If not passed, coordinates are read from the representation.
1141  """
1142  self.count_models += 1.0
1143 
1144  if hierarchy:
1145  part_dict = get_particles_at_resolution_one(hierarchy)
1146  all_particles_by_resolution = []
1147  for name in part_dict:
1148  all_particles_by_resolution += part_dict[name]
1149 
1150  for density_name in self.custom_ranges:
1151  parts = []
1152  if hierarchy:
1153  all_particles_by_segments = []
1154 
1155  for seg in self.custom_ranges[density_name]:
1156  if not hierarchy:
1157  # when you have an IMP.pmi.representation.Representation class
1158  parts += IMP.pmi.tools.select_by_tuple(self.representation,
1159  seg, resolution=1, name_is_ambiguous=False)
1160  else:
1161  # else, when you have a hierarchy, but not a representation
1162  if not IMP.pmi.get_is_canonical(hierarchy):
1163  for h in hierarchy.get_children():
1164  if not IMP.atom.Molecule.get_is_setup(h):
1165  IMP.atom.Molecule.setup_particle(h.get_particle())
1166 
1167  if isinstance(seg, str):
1168  s = IMP.atom.Selection(hierarchy,molecule=seg)
1169  elif isinstance(seg, tuple) and len(seg) == 2:
1170  s = IMP.atom.Selection(
1171  hierarchy, molecule=seg[0],copy_index=seg[1])
1172  elif isinstance(seg, tuple) and len(seg) == 3:
1173  s = IMP.atom.Selection(
1174  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1175  elif isinstance(seg, tuple) and len(seg) == 4:
1176  s = IMP.atom.Selection(
1177  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1178  else:
1179  raise Exception('could not understand selection tuple '+str(seg))
1180 
1181  all_particles_by_segments += s.get_selected_particles()
1182  if hierarchy:
1183  if IMP.pmi.get_is_canonical(hierarchy):
1184  parts = all_particles_by_segments
1185  else:
1186  parts = list(
1187  set(all_particles_by_segments) & set(all_particles_by_resolution))
1188  self._create_density_from_particles(parts, density_name)
1189 
1190  def normalize_density(self):
1191  pass
1192 
1193  def _create_density_from_particles(self, ps, name,
1194  kernel_type='GAUSSIAN'):
1195  '''Internal function for adding to densities.
1196  pass XYZR particles with mass and create a density from them.
1197  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1198  kd = {
1199  'GAUSSIAN': IMP.em.GAUSSIAN,
1200  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1201  'SPHERE': IMP.em.SPHERE}
1202 
1203  dmap = IMP.em.SampledDensityMap(ps, self.MRCresolution, self.voxel)
1204  dmap.calcRMS()
1205  dmap.set_was_used(True)
1206  if name not in self.densities:
1207  self.densities[name] = dmap
1208  else:
1209  bbox1 = IMP.em.get_bounding_box(self.densities[name])
1210  bbox2 = IMP.em.get_bounding_box(dmap)
1211  bbox1 += bbox2
1212  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
1213  dmap3.set_was_used(True)
1214  dmap3.add(dmap)
1215  dmap3.add(self.densities[name])
1216  self.densities[name] = dmap3
1217 
1218  def get_density_keys(self):
1219  return list(self.densities.keys())
1220 
1221  def get_density(self,name):
1222  """Get the current density for some component name"""
1223  if name not in self.densities:
1224  return None
1225  else:
1226  return self.densities[name]
1227 
1228  def write_mrc(self, path="./",suffix=None):
1229  import os, errno
1230  for density_name in self.densities:
1231  self.densities[density_name].multiply(1. / self.count_models)
1232  if suffix is None:
1233  name=path + "/" + density_name + ".mrc"
1234  else:
1235  name=path + "/" + density_name + "." + suffix + ".mrc"
1236  path, file = os.path.split(name)
1237  if not os.path.exists(path):
1238  try:
1239  os.makedirs(path)
1240  except OSError as e:
1241  if e.errno != errno.EEXIST:
1242  raise
1243  IMP.em.write_map(
1244  self.densities[density_name],name,
1245  IMP.em.MRCReaderWriter())
1246 
1247 
1248 class GetContactMap(object):
1249 
1250  def __init__(self, distance=15.):
1251  self.distance = distance
1252  self.contactmap = ''
1253  self.namelist = []
1254  self.xlinks = 0
1255  self.XL = {}
1256  self.expanded = {}
1257  self.resmap = {}
1258 
1259  def set_prot(self, prot):
1260  from scipy.spatial.distance import cdist
1261  self.prot = prot
1262  self.protnames = []
1263  coords = []
1264  radii = []
1265  namelist = []
1266 
1267  particles_dictionary = get_particles_at_resolution_one(self.prot)
1268 
1269  for name in particles_dictionary:
1270  residue_indexes = []
1271  for p in particles_dictionary[name]:
1272  print(p.get_name())
1273  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1274 
1275  if len(residue_indexes) != 0:
1276  self.protnames.append(name)
1277 
1278  def get_subunit_coords(self, frame, align=0):
1279  from scipy.spatial.distance import cdist
1280  coords = []
1281  radii = []
1282  namelist = []
1283  test, testr = [], []
1284  for part in self.prot.get_children():
1285  SortedSegments = []
1286  print(part)
1287  for chl in part.get_children():
1288  start = IMP.atom.get_leaves(chl)[0]
1289  end = IMP.atom.get_leaves(chl)[-1]
1290 
1291  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
1292  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
1293  SortedSegments.append((chl, startres))
1294  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1295 
1296  for sgmnt in SortedSegments:
1297  for leaf in IMP.atom.get_leaves(sgmnt[0]):
1298  p = IMP.core.XYZR(leaf)
1299  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1300 
1301  coords.append(crd)
1302  radii.append(p.get_radius())
1303 
1304  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
1305  '_' + \
1306  str(IMP.atom.Fragment(leaf)
1307  .get_residue_indexes()[0])
1308  namelist.append(new_name)
1309  self.expanded[new_name] = len(
1310  IMP.atom.Fragment(leaf).get_residue_indexes())
1311  if part.get_name() not in self.resmap:
1312  self.resmap[part.get_name()] = {}
1313  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
1314  self.resmap[part.get_name()][res] = new_name
1315 
1316  coords = np.array(coords)
1317  radii = np.array(radii)
1318  if len(self.namelist) == 0:
1319  self.namelist = namelist
1320  self.contactmap = np.zeros((len(coords), len(coords)))
1321  distances = cdist(coords, coords)
1322  distances = (distances - radii).T - radii
1323  distances = distances <= self.distance
1324  self.contactmap += distances
1325 
1326  def add_xlinks(
1327  self,
1328  filname,
1329  identification_string='ISDCrossLinkMS_Distance_'):
1330  # 'ISDCrossLinkMS_Distance_interrb_6629-State:0-20:RPS30_218:eIF3j-1-1-0.1_None'
1331  self.xlinks = 1
1332  data = open(filname)
1333  D = data.readlines()
1334  data.close()
1335 
1336  for d in D:
1337  if identification_string in d:
1338  d = d.replace(
1339  "_",
1340  " ").replace("-",
1341  " ").replace(":",
1342  " ").split()
1343 
1344  t1, t2 = (d[0], d[1]), (d[1], d[0])
1345  if t1 not in self.XL:
1346  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1347  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1348  else:
1349  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1350  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1351 
1352  def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1353  K = self.namelist
1354  M = self.contactmap
1355  C, R = [], []
1356  L = sum(self.expanded.values())
1357  proteins = self.protnames
1358 
1359  # exp new
1360  if skip_cmap == 0:
1361  Matrices = {}
1362  proteins = [p.get_name() for p in self.prot.get_children()]
1363  missing = []
1364  for p1 in range(len(proteins)):
1365  for p2 in range(p1, len(proteins)):
1366  pl1, pl2 = max(
1367  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1368  pn1, pn2 = proteins[p1], proteins[p2]
1369  mtr = np.zeros((pl1 + 1, pl2 + 1))
1370  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1371  for i1 in range(1, pl1 + 1):
1372  for i2 in range(1, pl2 + 1):
1373  try:
1374  r1 = K.index(self.resmap[pn1][i1])
1375  r2 = K.index(self.resmap[pn2][i2])
1376  r = M[r1, r2]
1377  mtr[i1 - 1, i2 - 1] = r
1378  except KeyError:
1379  missing.append((pn1, pn2, i1, i2))
1380  Matrices[(pn1, pn2)] = mtr
1381 
1382  # add cross-links
1383  if skip_xl == 0:
1384  if self.XL == {}:
1385  raise ValueError("cross-links were not provided, use add_xlinks function!")
1386  Matrices_xl = {}
1387  for p1 in range(len(proteins)):
1388  for p2 in range(p1, len(proteins)):
1389  pl1, pl2 = max(
1390  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1391  pn1, pn2 = proteins[p1], proteins[p2]
1392  mtr = np.zeros((pl1 + 1, pl2 + 1))
1393  flg = 0
1394  try:
1395  xls = self.XL[(pn1, pn2)]
1396  except KeyError:
1397  try:
1398  xls = self.XL[(pn2, pn1)]
1399  flg = 1
1400  except KeyError:
1401  flg = 2
1402  if flg == 0:
1403  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1404  for xl1, xl2 in xls:
1405  if xl1 > pl1:
1406  print('X' * 10, xl1, xl2)
1407  xl1 = pl1
1408  if xl2 > pl2:
1409  print('X' * 10, xl1, xl2)
1410  xl2 = pl2
1411  mtr[xl1 - 1, xl2 - 1] = 100
1412  elif flg == 1:
1413  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1414  for xl1, xl2 in xls:
1415  if xl1 > pl1:
1416  print('X' * 10, xl1, xl2)
1417  xl1 = pl1
1418  if xl2 > pl2:
1419  print('X' * 10, xl1, xl2)
1420  xl2 = pl2
1421  mtr[xl2 - 1, xl1 - 1] = 100
1422  else:
1423  print('No cross links between: ', pn1, pn2)
1424  Matrices_xl[(pn1, pn2)] = mtr
1425 
1426  # expand the matrix to individual residues
1427  #NewM = []
1428  # for x1 in xrange(len(K)):
1429  # lst = []
1430  # for x2 in xrange(len(K)):
1431  # lst += [M[x1,x2]]*self.expanded[K[x2]]
1432  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
1433  #NewM = np.array(NewM)
1434 
1435  # make list of protein names and create coordinate lists
1436  C = proteins
1437  # W is the component length list,
1438  # R is the contiguous coordinates list
1439  W, R = [], []
1440  for i, c in enumerate(C):
1441  cl = max(self.resmap[c].keys())
1442  W.append(cl)
1443  if i == 0:
1444  R.append(cl)
1445  else:
1446  R.append(R[-1] + cl)
1447 
1448  # start plotting
1449  if outname:
1450  # Don't require a display
1451  import matplotlib as mpl
1452  mpl.use('Agg')
1453  import matplotlib.pyplot as plt
1454  import matplotlib.gridspec as gridspec
1455  import scipy.sparse as sparse
1456 
1457  f = plt.figure()
1458  gs = gridspec.GridSpec(len(W), len(W),
1459  width_ratios=W,
1460  height_ratios=W)
1461 
1462  cnt = 0
1463  for x1, r1 in enumerate(R):
1464  if x1 == 0:
1465  s1 = 0
1466  else:
1467  s1 = R[x1 - 1]
1468  for x2, r2 in enumerate(R):
1469  if x2 == 0:
1470  s2 = 0
1471  else:
1472  s2 = R[x2 - 1]
1473 
1474  ax = plt.subplot(gs[cnt])
1475  if skip_cmap == 0:
1476  try:
1477  mtr = Matrices[(C[x1], C[x2])]
1478  except KeyError:
1479  mtr = Matrices[(C[x2], C[x1])].T
1480  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
1481  cax = ax.imshow(
1482  np.log(mtr),
1483  interpolation='nearest',
1484  vmin=0.,
1485  vmax=log(mtr.max()))
1486  ax.set_xticks([])
1487  ax.set_yticks([])
1488  if skip_xl == 0:
1489  try:
1490  mtr = Matrices_xl[(C[x1], C[x2])]
1491  except KeyError:
1492  mtr = Matrices_xl[(C[x2], C[x1])].T
1493  cax = ax.spy(
1494  sparse.csr_matrix(mtr),
1495  markersize=10,
1496  color='white',
1497  linewidth=100,
1498  alpha=0.5)
1499  ax.set_xticks([])
1500  ax.set_yticks([])
1501 
1502  cnt += 1
1503  if x2 == 0:
1504  ax.set_ylabel(C[x1], rotation=90)
1505  if outname:
1506  plt.savefig(outname + ".pdf", dpi=300, transparent=False)
1507  else:
1508  plt.show()
1509 
1510 
1511 # ------------------------------------------------------------------
1512 # a few random tools
1513 
1514 def get_hiers_from_rmf(model, frame_number, rmf_file):
1515  # I have to deprecate this function
1516  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1517 
1518  # load the frame
1519  rh = RMF.open_rmf_file_read_only(rmf_file)
1520 
1521  try:
1522  prots = IMP.rmf.create_hierarchies(rh, model)
1523  except IOError:
1524  print("Unable to open rmf file %s" % (rmf_file))
1525  return None
1526  #IMP.rmf.link_hierarchies(rh, prots)
1527 
1528  try:
1529  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1530  except IOError:
1531  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1532  return None
1533  model.update()
1534  del rh
1535 
1536  return prots
1537 
1538 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1539  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1540  rh = RMF.open_rmf_file_read_only(rmf_file)
1541  IMP.rmf.link_hierarchies(rh, hiers)
1542  try:
1543  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1544  except:
1545  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1546  return False
1547  model.update()
1548  del rh
1549  return True
1550 
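# Example of the create-then-link pattern for reading many frames from one
# RMF file (sketch; 'traj.rmf3' and the frame range are placeholders):
"""
model = IMP.Model()
hiers = get_hiers_from_rmf(model, 0, "traj.rmf3")   # builds the hierarchies
for frame in range(1, 100):
    if link_hiers_to_rmf(model, hiers, frame, "traj.rmf3"):
        pass  # hiers now hold the coordinates of this frame
"""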
1551 
1552 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1553  # I have to deprecate this function
1554  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1555 
1556  # load the frame
1557  rh = RMF.open_rmf_file_read_only(rmf_file)
1558 
1559  try:
1560  prots = IMP.rmf.create_hierarchies(rh, model)
1561  rs = IMP.rmf.create_restraints(rh, model)
1562  except:
1563  print("Unable to open rmf file %s" % (rmf_file))
1564  return None,None
1565  try:
1566  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1567  except:
1568  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1569  return None,None
1570  model.update()
1571  del rh
1572 
1573  return prots,rs
1574 
1575 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1576  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1577  rh = RMF.open_rmf_file_read_only(rmf_file)
1578  IMP.rmf.link_hierarchies(rh, hiers)
1579  IMP.rmf.link_restraints(rh, rs)
1580  try:
1581  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1582  except:
1583  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1584  return False
1585  model.update()
1586  del rh
1587  return True
1588 
1589 
1590 def get_particles_at_resolution_one(prot):
1591  """Get particles at res 1, or any beads, based on the name.
1592  No Representation is needed. This is mainly used when the hierarchy
1593  is read from an RMF file.
1594  @return a dictionary of component names and their particles
1595  @note If the root node is named "System" or is a "State", do proper selection.
1596  """
1597  particle_dict = {}
1598 
1599  # attempt to give good results for PMI2
1600  if IMP.pmi.get_is_canonical(prot):
1601  for mol in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1602  sel = IMP.atom.Selection(mol,resolution=1)
1603  particle_dict[mol.get_name()] = sel.get_selected_particles()
1604  else:
1605  allparticles = []
1606  for c in prot.get_children():
1607  name = c.get_name()
1608  particle_dict[name] = IMP.atom.get_leaves(c)
1609  for s in c.get_children():
1610  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
1611  allparticles += IMP.atom.get_leaves(s)
1612  if "Beads" in s.get_name():
1613  allparticles += IMP.atom.get_leaves(s)
1614 
1615  particle_align = []
1616  for name in particle_dict:
1617  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1618  list(set(particle_dict[name]) & set(allparticles)))
1619  return particle_dict
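 # A minimal sketch (the RMF file name is a placeholder): read a hierarchy
 # from an RMF file and collect its resolution-1 particles per component.
 #
 #   m = IMP.Model()
 #   prots = get_hiers_from_rmf(m, 0, "best_model.rmf3")
 #   particle_dict = get_particles_at_resolution_one(prots[0])
 #   for name, ps in particle_dict.items():
 #       print(name, len(ps))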
1620 
1621 def get_particles_at_resolution_ten(prot):
1622  """Get particles at res 10, or any beads, based on the name.
1623  No Representation is needed.
1624  This is mainly used when the hierarchy is read from an RMF file.
1625  @return a dictionary of component names and their particles
1626  @note If the root node is named "System" or is a "State", do proper selection.
1627  """
1628  particle_dict = {}
1629  # attempt to give good results for PMI2
1630  if IMP.pmi.get_is_canonical(prot):
1631  for mol in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1632  sel = IMP.atom.Selection(mol,resolution=10)
1633  particle_dict[mol.get_name()] = sel.get_selected_particles()
1634  else:
1635  allparticles = []
1636  for c in prot.get_children():
1637  name = c.get_name()
1638  particle_dict[name] = IMP.atom.get_leaves(c)
1639  for s in c.get_children():
1640  if "_Res:10" in s.get_name():
1641  allparticles += IMP.atom.get_leaves(s)
1642  if "Beads" in s.get_name():
1643  allparticles += IMP.atom.get_leaves(s)
1644  particle_align = []
1645  for name in particle_dict:
1646  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1647  list(set(particle_dict[name]) & set(allparticles)))
1648  return particle_dict
1649 
1650 def select_by_tuple(first_res_last_res_name_tuple):
1651  first_res = first_res_last_res_name_tuple[0]
1652  last_res = first_res_last_res_name_tuple[1]
1653  name = first_res_last_res_name_tuple[2]
1654 
1655 class CrossLinkTable(object):
1656  """Visualization of crosslinks"""
1657  def __init__(self):
1658  self.crosslinks = []
1659  self.external_csv_data = None
1660  self.crosslinkedprots = set()
1661  self.mindist = +10000000.0
1662  self.maxdist = -10000000.0
1663  self.contactmap = None
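 # A sketch of typical usage (file names and labels below are hypothetical,
 # not part of this module):
 #
 #   clt = CrossLinkTable()
 #   clt.set_hierarchy(prot)              # hierarchy read e.g. from an RMF
 #   clt.set_coordinates_for_contact_map("best_model.rmf3", 0)
 #   clt.set_crosslinks("stat.out", search_label='ISDCrossLinkMS_Distance_')
 #   clt.set_threshold(35.0)              # needed by colormap()/plot()
 #   clt.set_tolerance(5.0)
 #   clt.plot(filename="xl_map", crosslinkedonly=True)
 #   clt.write_cross_link_database("xl_table.csv")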
1664 
1665  def set_hierarchy(self, prot):
1666  self.prot_length_dict = {}
1667  self.model=prot.get_model()
1668 
1669  for i in prot.get_children():
1670  name = i.get_name()
1671  residue_indexes = []
1672  for p in IMP.atom.get_leaves(i):
1673  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1674 
1675  if len(residue_indexes) != 0:
1676  self.prot_length_dict[name] = max(residue_indexes)
1677 
1678  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1679  from scipy.spatial.distance import cdist
1680 
1681  rh= RMF.open_rmf_file_read_only(rmf_name)
1682  prots=IMP.rmf.create_hierarchies(rh, self.model)
1683  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
1684  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1685  del rh
1686 
1687 
1688  coords = []
1689  radii = []
1690  namelist = []
1691 
1692  particles_dictionary = get_particles_at_resolution_one(prots[0])
1693 
1694  resindex = 0
1695  self.index_dictionary = {}
1696 
1697  for name in particles_dictionary:
1698  residue_indexes = []
1699  for p in particles_dictionary[name]:
1700  print(p.get_name())
1701  residue_indexes = IMP.pmi.tools.get_residue_indexes(p)
1702  #residue_indexes.add( )
1703 
1704  if len(residue_indexes) != 0:
1705 
1706  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1707  d = IMP.core.XYZR(p)
1708 
1709  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1710  coords.append(crd)
1711  radii.append(d.get_radius())
1712  if name not in self.index_dictionary:
1713  self.index_dictionary[name] = [resindex]
1714  else:
1715  self.index_dictionary[name].append(resindex)
1716  resindex += 1
1717 
1718  coords = np.array(coords)
1719  radii = np.array(radii)
1720 
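 # surface-to-surface distances: pairwise center-to-center distances minus
 # both radii; a pair of beads is counted as a contact if this is <= 20 A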
1721  distances = cdist(coords, coords)
1722  distances = (distances - radii).T - radii
1723 
1724  distances = np.where(distances <= 20.0, 1.0, 0)
1725  if self.contactmap is None:
1726  self.contactmap = np.zeros((len(coords), len(coords)))
1727  self.contactmap += distances
1728 
1729  for prot in prots: IMP.atom.destroy(prot)
1730 
1731  def set_crosslinks(
1732  self, data_file, search_label='ISDCrossLinkMS_Distance_',
1733  mapping=None,
1734  filter_label=None,
1735  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
1736  external_csv_data_file=None,
1737  external_csv_data_file_unique_id_key="Unique ID"):
1738 
1739  # example key ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None
1740  # mapping is a dictionary that maps standard keywords to entry positions in the key string
1741  # filter_label, if given, keeps only the stat file keys that contain that label
1742  # external_csv_data_file is a CSV file with further information on the crosslinks;
1743  # its unique id column is used to build the dictionary keys
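 # Example (derived from the default parsing below; positions refer to the
 # split key shown above): passing
 #   mapping = {"Residue1": 5, "Protein1": 6,
 #              "Residue2": 7, "Protein2": 8,
 #              "Confidence": 12, "Unique Identifier": 3}
 # reproduces the default field positions, and can be adapted when the stat
 # file keys have a different layout.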
1744 
1745  po = IMP.pmi.output.ProcessOutput(data_file)
1746  keys = po.get_keys()
1747 
1748  xl_keys = [k for k in keys if search_label in k]
1749 
1750  if filter_rmf_file_names is not None:
1751  rmf_file_key="local_rmf_file_name"
1752  fs = po.get_fields(xl_keys+[rmf_file_key])
1753  else:
1754  fs = po.get_fields(xl_keys)
1755 
1756  # this dictionary stores the occurrence of each crosslink
1757  self.cross_link_frequency = {}
1758 
1759  # this dictionary stores the series of distances for given crosslinked
1760  # residues
1761  self.cross_link_distances = {}
1762 
1763  # same as above, but keyed only by the crosslinked residue/protein pair
1764  # (one entry per unique crosslink)
1765  self.cross_link_distances_unique = {}
1766 
1767  if not external_csv_data_file is None:
1768  # this dictionary stores the further information on crosslinks
1769  # labeled by unique ID
1770  self.external_csv_data = {}
1771  xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1772 
1773  for xl in xldb:
1774  self.external_csv_data[
1775  xl[external_csv_data_file_unique_id_key]] = xl
1776 
1777  # this list keeps track of the cross-link/sample tuples so that the
1778  # same crosslinked residues are not counted twice within a sample
1780  cross_link_frequency_list = []
1781 
1782  self.unique_cross_link_list = []
1783 
1784  for key in xl_keys:
1785  print(key)
1786  keysplit = key.replace(
1787  "_",
1788  " ").replace(
1789  "-",
1790  " ").replace(
1791  ":",
1792  " ").split(
1793  )
1794 
1795  if filter_label!=None:
1796  if filter_label not in keysplit: continue
1797 
1798  if mapping is None:
1799  r1 = int(keysplit[5])
1800  c1 = keysplit[6]
1801  r2 = int(keysplit[7])
1802  c2 = keysplit[8]
1803  try:
1804  confidence = keysplit[12]
1805  except:
1806  confidence = '0.0'
1807  try:
1808  unique_identifier = keysplit[3]
1809  except:
1810  unique_identifier = '0'
1811  else:
1812  r1 = int(keysplit[mapping["Residue1"]])
1813  c1 = keysplit[mapping["Protein1"]]
1814  r2 = int(keysplit[mapping["Residue2"]])
1815  c2 = keysplit[mapping["Protein2"]]
1816  try:
1817  confidence = keysplit[mapping["Confidence"]]
1818  except:
1819  confidence = '0.0'
1820  try:
1821  unique_identifier = keysplit[mapping["Unique Identifier"]]
1822  except:
1823  unique_identifier = '0'
1824 
1825  self.crosslinkedprots.add(c1)
1826  self.crosslinkedprots.add(c2)
1827 
1828  # construct the list of distances corresponding to the input rmf
1829  # files
1830 
1831  dists=[]
1832  if filter_rmf_file_names is not None:
1833  for n,d in enumerate(fs[key]):
1834  if fs[rmf_file_key][n] in filter_rmf_file_names:
1835  dists.append(float(d))
1836  else:
1837  dists=[float(f) for f in fs[key]]
1838 
1839  # check if the input confidence class corresponds to the
1840  # one of the cross-link
1841 
1842  mdist = self.median(dists)
1843 
1844  stdv = np.std(np.array(dists))
1845  if self.mindist > mdist:
1846  self.mindist = mdist
1847  if self.maxdist < mdist:
1848  self.maxdist = mdist
1849 
1850  # calculate the frequency of unique crosslinks within the same
1851  # sample
1852  if not self.external_csv_data is None:
1853  sample = self.external_csv_data[unique_identifier]["Sample"]
1854  else:
1855  sample = "None"
1856 
1857  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1858  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1859  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1860  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1861  else:
1862  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1863  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1864  cross_link_frequency_list.append((r1, c1, r2, c2))
1865  cross_link_frequency_list.append((r2, c2, r1, c1))
1866  self.unique_cross_link_list.append(
1867  (r1, c1, r2, c2,mdist))
1868 
1869  if (r1, c1, r2, c2) not in self.cross_link_distances:
1870  self.cross_link_distances[(
1871  r1,
1872  c1,
1873  r2,
1874  c2,
1875  mdist,
1876  confidence)] = dists
1877  self.cross_link_distances[(
1878  r2,
1879  c2,
1880  r1,
1881  c1,
1882  mdist,
1883  confidence)] = dists
1884  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1885  else:
1886  self.cross_link_distances[(
1887  r2,
1888  c2,
1889  r1,
1890  c1,
1891  mdist,
1892  confidence)] += dists
1893  self.cross_link_distances[(
1894  r1,
1895  c1,
1896  r2,
1897  c2,
1898  mdist,
1899  confidence)] += dists
1900 
1901  self.crosslinks.append(
1902  (r1,
1903  c1,
1904  r2,
1905  c2,
1906  mdist,
1907  stdv,
1908  confidence,
1909  unique_identifier,
1910  'original'))
1911  self.crosslinks.append(
1912  (r2,
1913  c2,
1914  r1,
1915  c1,
1916  mdist,
1917  stdv,
1918  confidence,
1919  unique_identifier,
1920  'reversed'))
1921 
1922  self.cross_link_frequency_inverted = {}
1923  for xl in self.unique_cross_link_list:
1924  (r1, c1, r2, c2, mdist) = xl
1925  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1926  if frequency not in self.cross_link_frequency_inverted:
1927  self.cross_link_frequency_inverted[
1928  frequency] = [(r1, c1, r2, c2)]
1929  else:
1930  self.cross_link_frequency_inverted[
1931  frequency].append((r1, c1, r2, c2))
1932 
1933  # -------------
1934 
1935  def median(self, mylist):
1936  sorts = sorted(mylist)
1937  length = len(sorts)
1938  print(length)
1939  if length == 1:
1940  return mylist[0]
1941  if not length % 2:
1942  return (sorts[length // 2] + sorts[length // 2 - 1]) / 2.0
1943  return sorts[length // 2]
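 # worked example: median([4.0, 1.0, 3.0, 2.0]) == 2.5,
 # median([3.0, 1.0, 2.0]) == 2.0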
1944 
1945  def set_threshold(self,threshold):
1946  self.threshold=threshold
1947 
1948  def set_tolerance(self,tolerance):
1949  self.tolerance=tolerance
1950 
1951  def colormap(self, dist):
1952  if dist < self.threshold - self.tolerance:
1953  return "Green"
1954  elif dist >= self.threshold + self.tolerance:
1955  return "Orange"
1956  else:
1957  return "Red"
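 # For example, with threshold=35.0 and tolerance=5.0 (values chosen here
 # only for illustration): dist < 30 -> "Green", 30 <= dist < 40 -> "Red",
 # and dist >= 40 -> "Orange".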
1958 
1959  def write_cross_link_database(self, filename, format='csv'):
1960  import csv
1961 
1962  fieldnames = [
1963  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1964  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1965 
1966  if not self.external_csv_data is None:
1967  keys = list(self.external_csv_data.keys())
1968  innerkeys = list(self.external_csv_data[keys[0]].keys())
1969  innerkeys.sort()
1970  fieldnames += innerkeys
1971 
1972  dw = csv.DictWriter(
1973  open(filename,
1974  "w"),
1975  delimiter=',',
1976  fieldnames=fieldnames)
1977  dw.writeheader()
1978  for xl in self.crosslinks:
1979  (r1, c1, r2, c2, mdist, stdv, confidence,
1980  unique_identifier, descriptor) = xl
1981  if descriptor == 'original':
1982  outdict = {}
1983  outdict["Unique ID"] = unique_identifier
1984  outdict["Protein1"] = c1
1985  outdict["Protein2"] = c2
1986  outdict["Residue1"] = r1
1987  outdict["Residue2"] = r2
1988  outdict["Median Distance"] = mdist
1989  outdict["Standard Deviation"] = stdv
1990  outdict["Confidence"] = confidence
1991  outdict["Frequency"] = self.cross_link_frequency[
1992  (r1, c1, r2, c2)]
1993  if c1 == c2:
1994  arrangement = "Intra"
1995  else:
1996  arrangement = "Inter"
1997  outdict["Arrangement"] = arrangement
1998  if not self.external_csv_data is None:
1999  outdict.update(self.external_csv_data[unique_identifier])
2000 
2001  dw.writerow(outdict)
2002 
2003  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
2004  no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
2005  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
2006  gap_between_components=0,
2007  rename_protein_map=None):
2008  # layout can be:
2009  # "lowerdiagonal" plot only the lower diagonal
2010  # "upperdiagonal" plot only the upper diagonal
2011  # "whole" plot everything
2012  # crosslinkedonly: plot only components that have crosslinks
2013  # no_dist_info: if True, plot the cross-links as grey spots only
2014  # filter: a tuple containing a keyword to be searched in the database,
2015  # a relationship (">", "==", "<"),
2016  # and a value,
2017  # e.g. ("ID_Score", ">", 40)
2018  # scale_symbol_size rescales the symbol drawn for each crosslink
2019  # rename_protein_map is a dictionary used to rename proteins
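 # A usage sketch for a CrossLinkTable instance "clt" (protein names and the
 # output file name below are hypothetical):
 #
 #   clt.plot(prot_listx=["ProtA", "ProtB"], prot_listy=["ProtA", "ProtB"],
 #            confidence_classes=["0.01", "0.05"], layout="lowerdiagonal",
 #            filter=("ID_Score", ">", 40), filename="xl_map")
 #
 # Note that the filter option looks the keyword up in the external CSV data,
 # so it requires external_csv_data_file to have been passed to set_crosslinks.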
2020 
2021  import matplotlib as mpl
2022  mpl.use('Agg')
2023  import matplotlib.pyplot as plt
2024  import matplotlib.cm as cm
2025 
2026  fig = plt.figure(figsize=(10, 10))
2027  ax = fig.add_subplot(111)
2028 
2029  ax.set_xticks([])
2030  ax.set_yticks([])
2031 
2032  # set the list of proteins on the x axis
2033  if prot_listx is None:
2034  if crosslinkedonly:
2035  prot_listx = list(self.crosslinkedprots)
2036  else:
2037  prot_listx = list(self.prot_length_dict.keys())
2038  prot_listx.sort()
2039 
2040  nresx = gap_between_components + \
2041  sum([self.prot_length_dict[name]
2042  + gap_between_components for name in prot_listx])
2043 
2044  # set the list of proteins on the y axis
2045 
2046  if prot_listy is None:
2047  if crosslinkedonly:
2048  prot_listy = list(self.crosslinkedprots)
2049  else:
2050  prot_listy = list(self.prot_length_dict.keys())
2051  prot_listy.sort()
2052 
2053  nresy = gap_between_components + \
2054  sum([self.prot_length_dict[name]
2055  + gap_between_components for name in prot_listy])
2056 
2057  # this is the residue offset for each protein
2058  resoffsetx = {}
2059  resendx = {}
2060  res = gap_between_components
2061  for prot in prot_listx:
2062  resoffsetx[prot] = res
2063  res += self.prot_length_dict[prot]
2064  resendx[prot] = res
2065  res += gap_between_components
2066 
2067  resoffsety = {}
2068  resendy = {}
2069  res = gap_between_components
2070  for prot in prot_listy:
2071  resoffsety[prot] = res
2072  res += self.prot_length_dict[prot]
2073  resendy[prot] = res
2074  res += gap_between_components
2075 
2076  resoffsetdiagonal = {}
2077  res = gap_between_components
2078  for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2079  resoffsetdiagonal[prot] = res
2080  res += self.prot_length_dict[prot]
2081  res += gap_between_components
2082 
2083  # plot protein boundaries
2084 
2085  xticks = []
2086  xlabels = []
2087  for n, prot in enumerate(prot_listx):
2088  res = resoffsetx[prot]
2089  end = resendx[prot]
2090  for proty in prot_listy:
2091  resy = resoffsety[proty]
2092  endy = resendy[proty]
2093  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
2094  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
2095  xticks.append((float(res) + float(end)) / 2)
2096  if rename_protein_map is not None:
2097  if prot in rename_protein_map:
2098  prot=rename_protein_map[prot]
2099  xlabels.append(prot)
2100 
2101  yticks = []
2102  ylabels = []
2103  for n, prot in enumerate(prot_listy):
2104  res = resoffsety[prot]
2105  end = resendy[prot]
2106  for protx in prot_listx:
2107  resx = resoffsetx[protx]
2108  endx = resendx[protx]
2109  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
2110  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
2111  yticks.append((float(res) + float(end)) / 2)
2112  if rename_protein_map is not None:
2113  if prot in rename_protein_map:
2114  prot=rename_protein_map[prot]
2115  ylabels.append(prot)
2116 
2117  # plot the contact map
2118  print(prot_listx, prot_listy)
2119 
2120  if not self.contactmap is None:
2121  import matplotlib.cm as cm
2122  tmp_array = np.zeros((nresx, nresy))
2123 
2124  for px in prot_listx:
2125  print(px)
2126  for py in prot_listy:
2127  print(py)
2128  resx = resoffsetx[px]
2129  lengx = resendx[px] - 1
2130  resy = resoffsety[py]
2131  lengy = resendy[py] - 1
2132  indexes_x = self.index_dictionary[px]
2133  minx = min(indexes_x)
2134  maxx = max(indexes_x)
2135  indexes_y = self.index_dictionary[py]
2136  miny = min(indexes_y)
2137  maxy = max(indexes_y)
2138 
2139  print(px, py, minx, maxx, miny, maxy)
2140 
2141  try:
2142  tmp_array[
2143  resx:lengx,
2144  resy:lengy] = self.contactmap[
2145  minx:maxx,
2146  miny:maxy]
2147  except:
2148  continue
2149 
2150 
2151  ax.imshow(tmp_array,
2152  cmap=cm.binary,
2153  origin='lower',
2154  interpolation='nearest')
2155 
2156  ax.set_xticks(xticks)
2157  ax.set_xticklabels(xlabels, rotation=90)
2158  ax.set_yticks(yticks)
2159  ax.set_yticklabels(ylabels)
2160  ax.set_xlim(0,nresx)
2161  ax.set_ylim(0,nresy)
2162 
2163 
2164  # set the crosslinks
2165 
2166  already_added_xls = []
2167 
2168  for xl in self.crosslinks:
2169 
2170  (r1, c1, r2, c2, mdist, stdv, confidence,
2171  unique_identifier, descriptor) = xl
2172 
2173  if confidence_classes is not None:
2174  if confidence not in confidence_classes:
2175  continue
2176 
2177  try:
2178  pos1 = r1 + resoffsetx[c1]
2179  except:
2180  continue
2181  try:
2182  pos2 = r2 + resoffsety[c2]
2183  except:
2184  continue
2185 
2186  if not filter is None:
2187  xldb = self.external_csv_data[unique_identifier]
2188  xldb.update({"Distance": mdist})
2189  xldb.update({"Distance_stdv": stdv})
2190 
2191  if filter[1] == ">":
2192  if float(xldb[filter[0]]) <= float(filter[2]):
2193  continue
2194 
2195  if filter[1] == "<":
2196  if float(xldb[filter[0]]) >= float(filter[2]):
2197  continue
2198 
2199  if filter[1] == "==":
2200  if float(xldb[filter[0]]) != float(filter[2]):
2201  continue
2202 
2203  # the code below handles the diagonal layouts
2204  # when the plot is rectangular
2205 
2206  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2207  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2208 
2209  if layout == 'lowerdiagonal':
2210  if pos_for_diagonal1 <= pos_for_diagonal2:
2211  continue
2212  if layout == 'upperdiagonal':
2213  if pos_for_diagonal1 >= pos_for_diagonal2:
2214  continue
2215 
2216  already_added_xls.append((r1, c1, r2, c2))
2217 
2218  if not no_confidence_info:
2219  if confidence == '0.01':
2220  markersize = 14 * scale_symbol_size
2221  elif confidence == '0.05':
2222  markersize = 9 * scale_symbol_size
2223  elif confidence == '0.1':
2224  markersize = 6 * scale_symbol_size
2225  else:
2226  markersize = 15 * scale_symbol_size
2227  else:
2228  markersize = 5 * scale_symbol_size
2229 
2230  if not no_dist_info:
2231  color = self.colormap(mdist)
2232  else:
2233  color = "gray"
2234 
2235  ax.plot(
2236  [pos1],
2237  [pos2],
2238  'o',
2239  c=color,
2240  alpha=alphablend,
2241  markersize=markersize)
2242 
2243 
2244 
2245  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2246 
2247  for i in ax.spines.values():
2248  i.set_linewidth(2.0)
2249 
2250  #plt.tight_layout()
2251 
2252  if filename:
2253  plt.savefig(filename + ".pdf", dpi=300, transparent=False)
2254  else:
2255  plt.show()
2256 
2257  def get_frequency_statistics(self, prot_list,
2258  prot_list2=None):
2259 
2260  violated_histogram = {}
2261  satisfied_histogram = {}
2262  unique_cross_links = []
2263 
2264  for xl in self.unique_cross_link_list:
2265  (r1, c1, r2, c2, mdist) = xl
2266 
2267  # here we filter by the protein
2268  if prot_list2 is None:
2269  if not c1 in prot_list:
2270  continue
2271  if not c2 in prot_list:
2272  continue
2273  else:
2274  if c1 in prot_list and c2 in prot_list2:
2275  pass
2276  elif c1 in prot_list2 and c2 in prot_list:
2277  pass
2278  else:
2279  continue
2280 
2281  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2282 
2283  if (r1, c1, r2, c2) not in unique_cross_links:
2284  if mdist > 35.0:
2285  if frequency not in violated_histogram:
2286  violated_histogram[frequency] = 1
2287  else:
2288  violated_histogram[frequency] += 1
2289  else:
2290  if frequency not in satisfied_histogram:
2291  satisfied_histogram[frequency] = 1
2292  else:
2293  satisfied_histogram[frequency] += 1
2294  unique_cross_links.append((r1, c1, r2, c2))
2295  unique_cross_links.append((r2, c2, r1, c1))
2296 
2297  print("# satisfied")
2298 
2299  total_number_of_crosslinks=0
2300 
2301  for i in satisfied_histogram:
2302  # if i in violated_histogram:
2303  # print i, satisfied_histogram[i]+violated_histogram[i]
2304  # else:
2305  if i in violated_histogram:
2306  print(i, violated_histogram[i]+satisfied_histogram[i])
2307  else:
2308  print(i, satisfied_histogram[i])
2309  total_number_of_crosslinks+=i*satisfied_histogram[i]
2310 
2311  print("# violated")
2312 
2313  for i in violated_histogram:
2314  print(i, violated_histogram[i])
2315  total_number_of_crosslinks+=i*violated_histogram[i]
2316 
2317  print(total_number_of_crosslinks)
2318 
2319 
2320 # ------------
2321  def print_cross_link_binary_symbols(self, prot_list,
2322  prot_list2=None):
2323  tmp_matrix = []
2324  confidence_list = []
2325  for xl in self.crosslinks:
2326  (r1, c1, r2, c2, mdist, stdv, confidence,
2327  unique_identifier, descriptor) = xl
2328 
2329  if prot_list2 is None:
2330  if not c1 in prot_list:
2331  continue
2332  if not c2 in prot_list:
2333  continue
2334  else:
2335  if c1 in prot_list and c2 in prot_list2:
2336  pass
2337  elif c1 in prot_list2 and c2 in prot_list:
2338  pass
2339  else:
2340  continue
2341 
2342  if descriptor != "original":
2343  continue
2344 
2345  confidence_list.append(confidence)
2346 
2347  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2348  tmp_dist_binary = []
2349  for d in dists:
2350  if d < 35:
2351  tmp_dist_binary.append(1)
2352  else:
2353  tmp_dist_binary.append(0)
2354  tmp_matrix.append(tmp_dist_binary)
2355 
2356  matrix = list(zip(*tmp_matrix))
2357 
2358  satisfied_high_sum = 0
2359  satisfied_mid_sum = 0
2360  satisfied_low_sum = 0
2361  total_satisfied_sum = 0
2362  for k, m in enumerate(matrix):
2363  satisfied_high = 0
2364  total_high = 0
2365  satisfied_mid = 0
2366  total_mid = 0
2367  satisfied_low = 0
2368  total_low = 0
2369  total_satisfied = 0
2370  total = 0
2371  for n, b in enumerate(m):
2372  if confidence_list[n] == "0.01":
2373  total_high += 1
2374  if b == 1:
2375  satisfied_high += 1
2376  satisfied_high_sum += 1
2377  elif confidence_list[n] == "0.05":
2378  total_mid += 1
2379  if b == 1:
2380  satisfied_mid += 1
2381  satisfied_mid_sum += 1
2382  elif confidence_list[n] == "0.1":
2383  total_low += 1
2384  if b == 1:
2385  satisfied_low += 1
2386  satisfied_low_sum += 1
2387  if b == 1:
2388  total_satisfied += 1
2389  total_satisfied_sum += 1
2390  total += 1
2391  print(k, satisfied_high, total_high)
2392  print(k, satisfied_mid, total_mid)
2393  print(k, satisfied_low, total_low)
2394  print(k, total_satisfied, total)
2395  print(float(satisfied_high_sum) / len(matrix))
2396  print(float(satisfied_mid_sum) / len(matrix))
2397  print(float(satisfied_low_sum) / len(matrix))
2398 # ------------
2399 
2400  def get_unique_crosslinks_statistics(self, prot_list,
2401  prot_list2=None):
2402 
2403  print(prot_list)
2404  print(prot_list2)
2405  satisfied_high = 0
2406  total_high = 0
2407  satisfied_mid = 0
2408  total_mid = 0
2409  satisfied_low = 0
2410  total_low = 0
2411  total = 0
2412  tmp_matrix = []
2413  satisfied_string = []
2414  for xl in self.crosslinks:
2415  (r1, c1, r2, c2, mdist, stdv, confidence,
2416  unique_identifier, descriptor) = xl
2417 
2418  if prot_list2 is None:
2419  if not c1 in prot_list:
2420  continue
2421  if not c2 in prot_list:
2422  continue
2423  else:
2424  if c1 in prot_list and c2 in prot_list2:
2425  pass
2426  elif c1 in prot_list2 and c2 in prot_list:
2427  pass
2428  else:
2429  continue
2430 
2431  if descriptor != "original":
2432  continue
2433 
2434  total += 1
2435  if confidence == "0.01":
2436  total_high += 1
2437  if mdist <= 35:
2438  satisfied_high += 1
2439  if confidence == "0.05":
2440  total_mid += 1
2441  if mdist <= 35:
2442  satisfied_mid += 1
2443  if confidence == "0.1":
2444  total_low += 1
2445  if mdist <= 35:
2446  satisfied_low += 1
2447  if mdist <= 35:
2448  satisfied_string.append(1)
2449  else:
2450  satisfied_string.append(0)
2451 
2452  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2453  tmp_dist_binary = []
2454  for d in dists:
2455  if d < 35:
2456  tmp_dist_binary.append(1)
2457  else:
2458  tmp_dist_binary.append(0)
2459  tmp_matrix.append(tmp_dist_binary)
2460 
2461  print("unique satisfied_high/total_high", satisfied_high, "/", total_high)
2462  print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)
2463  print("unique satisfied_low/total_low", satisfied_low, "/", total_low)
2464  print("total", total)
2465 
2466  matrix = list(zip(*tmp_matrix))
2467  satisfied_models = 0
2468  satstr = ""
2469  for b in satisfied_string:
2470  if b == 0:
2471  satstr += "-"
2472  if b == 1:
2473  satstr += "*"
2474 
2475  for m in matrix:
2476  all_satisfied = True
2477  string = ""
2478  for n, b in enumerate(m):
2479  if b == 0:
2480  string += "0"
2481  if b == 1:
2482  string += "1"
2483  if b == 1 and satisfied_string[n] == 1:
2484  pass
2485  elif b == 1 and satisfied_string[n] == 0:
2486  pass
2487  elif b == 0 and satisfied_string[n] == 0:
2488  pass
2489  elif b == 0 and satisfied_string[n] == 1:
2490  all_satisfied = False
2491  if all_satisfied:
2492  satisfied_models += 1
2493  print(string)
2494  print(satstr, all_satisfied)
2495  print("models that satisfy the median-satisfied crosslinks / total models", satisfied_models, len(matrix))
2496 
2497  def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2498  prot_list2=None):
2499 
2500  import matplotlib as mpl
2501  mpl.use('Agg')
2502  import matplotlib.pylab as pl
2503 
2504  tmp_matrix = []
2505  for kw in self.cross_link_distances_unique:
2506  (r1, c1, r2, c2) = kw
2507  dists = self.cross_link_distances_unique[kw]
2508 
2509  if prot_list2 is None:
2510  if not c1 in prot_list:
2511  continue
2512  if not c2 in prot_list:
2513  continue
2514  else:
2515  if c1 in prot_list and c2 in prot_list2:
2516  pass
2517  elif c1 in prot_list2 and c2 in prot_list:
2518  pass
2519  else:
2520  continue
2521  # append the sum of dists to order by that in the matrix plot
2522  dists.append(sum(dists))
2523  tmp_matrix.append(dists)
2524 
2525  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2526 
2527  # print len(tmp_matrix), len(tmp_matrix[0])-1
2528  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2529 
2530  for i in range(len(tmp_matrix)):
2531  for k in range(len(tmp_matrix[i]) - 1):
2532  matrix[i][k] = tmp_matrix[i][k]
2533 
2534  print(matrix)
2535 
2536  fig = pl.figure()
2537  ax = fig.add_subplot(211)
2538 
2539  cax = ax.imshow(matrix, interpolation='nearest')
2540  # ax.set_yticks(range(len(self.model_list_names)))
2541  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
2542  fig.colorbar(cax)
2543  pl.savefig(figurename, dpi=300)
2544  pl.show()
2545 
2546  def plot_bars(
2547  self,
2548  filename,
2549  prots1,
2550  prots2,
2551  nxl_per_row=20,
2552  arrangement="inter",
2553  confidence_input="None"):
2554 
2555  data = []
2556  for xl in self.cross_link_distances:
2557  (r1, c1, r2, c2, mdist, confidence) = xl
2558  if c1 in prots1 and c2 in prots2:
2559  if arrangement == "inter" and c1 == c2:
2560  continue
2561  if arrangement == "intra" and c1 != c2:
2562  continue
2563  if confidence_input == confidence:
2564  label = str(c1) + ":" + str(r1) + \
2565  "-" + str(c2) + ":" + str(r2)
2566  values = self.cross_link_distances[xl]
2567  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2568  data.append((label, values, mdist, frequency))
2569 
2570  sort_by_dist = sorted(data, key=lambda tup: tup[2])
2571  sort_by_dist = list(zip(*sort_by_dist))
2572  values = sort_by_dist[1]
2573  positions = list(range(len(values)))
2574  labels = sort_by_dist[0]
2575  frequencies = list(map(float, sort_by_dist[3]))
2576  frequencies = [f * 10.0 for f in frequencies]
2577 
2578  nchunks = int(float(len(values)) / nxl_per_row)
2579  values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2580  positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2581  positions,
2582  nchunks)
2583  frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2584  frequencies,
2585  nchunks)
2586  labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2587 
2588  for n, v in enumerate(values_chunks):
2589  p = positions_chunks[n]
2590  f = frequencies_chunks[n]
2591  l = labels_chunks[n]
2592  IMP.pmi.output.plot_fields_box_plots(
2593  filename + "." + str(n), v, p, f,
2594  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
2595 
2596  def crosslink_distance_histogram(self, filename,
2597  prot_list=None,
2598  prot_list2=None,
2599  confidence_classes=None,
2600  bins=40,
2601  color='#66CCCC',
2602  yplotrange=[0, 1],
2603  format="png",
2604  normalized=False):
2605  if prot_list is None:
2606  prot_list = list(self.prot_length_dict.keys())
2607 
2608  distances = []
2609  for xl in self.crosslinks:
2610  (r1, c1, r2, c2, mdist, stdv, confidence,
2611  unique_identifier, descriptor) = xl
2612 
2613  if not confidence_classes is None:
2614  if confidence not in confidence_classes:
2615  continue
2616 
2617  if prot_list2 is None:
2618  if not c1 in prot_list:
2619  continue
2620  if not c2 in prot_list:
2621  continue
2622  else:
2623  if c1 in prot_list and c2 in prot_list2:
2624  pass
2625  elif c1 in prot_list2 and c2 in prot_list:
2626  pass
2627  else:
2628  continue
2629 
2630  distances.append(mdist)
2631 
2632  IMP.pmi.output.plot_field_histogram(
2633  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
2634  bins=bins, color=color,
2635  format=format,
2636  reference_xline=35.0,
2637  yplotrange=yplotrange, normalized=normalized)
2638 
2639  def scatter_plot_xl_features(self, filename,
2640  feature1=None,
2641  feature2=None,
2642  prot_list=None,
2643  prot_list2=None,
2644  yplotrange=None,
2645  reference_ylines=None,
2646  distance_color=True,
2647  format="png"):
2648  import matplotlib as mpl
2649  mpl.use('Agg')
2650  import matplotlib.pyplot as plt
2651  import matplotlib.cm as cm
2652 
2653  fig = plt.figure(figsize=(10, 10))
2654  ax = fig.add_subplot(111)
2655 
2656  for xl in self.crosslinks:
2657  (r1, c1, r2, c2, mdist, stdv, confidence,
2658  unique_identifier, arrangement) = xl
2659 
2660  if prot_list2 is None:
2661  if not c1 in prot_list:
2662  continue
2663  if not c2 in prot_list:
2664  continue
2665  else:
2666  if c1 in prot_list and c2 in prot_list2:
2667  pass
2668  elif c1 in prot_list2 and c2 in prot_list:
2669  pass
2670  else:
2671  continue
2672 
2673  xldb = self.external_csv_data[unique_identifier]
2674  xldb.update({"Distance": mdist})
2675  xldb.update({"Distance_stdv": stdv})
2676 
2677  xvalue = float(xldb[feature1])
2678  yvalue = float(xldb[feature2])
2679 
2680  if distance_color:
2681  color = self.colormap(mdist)
2682  else:
2683  color = "gray"
2684 
2685  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
2686 
2687  if not yplotrange is None:
2688  ax.set_ylim(yplotrange)
2689  if not reference_ylines is None:
2690  for rl in reference_ylines:
2691  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
2692 
2693  if filename:
2694  plt.savefig(filename + "." + format, dpi=150, transparent=False)
2695 
2696  plt.show()