IMP logo
IMP Reference Guide  2.9.0
The Integrative Modeling Platform
pmi/Analysis.py
1 #!/usr/bin/env python
2 
3 """@namespace IMP.pmi.analysis
4  Tools for clustering and cluster analysis
5 """
6 from __future__ import print_function
7 import IMP
8 import IMP.algebra
9 import IMP.em
10 import IMP.pmi
11 import IMP.pmi.tools
12 import IMP.pmi.output
13 import IMP.rmf
14 import RMF
15 import IMP.pmi.analysis
16 from operator import itemgetter
17 from copy import deepcopy
18 from math import log,sqrt
19 import itertools
20 import numpy as np
21 
22 
23 class Alignment(object):
24  """Performs alignment and RMSD calculation for two sets of coordinates
25 
26  The class also takes into account non-equal stoichiometry of the proteins.
27  If this is the case, the protein names of proteins in multiple copies
28  should be specified in the following form:
29  nameA..1, nameA..2 (note two dots).
30  """
31 
32  def __init__(self, template, query, weights=None):
33  """Constructor.
34  @param query {'p1':coords(L,3), 'p2':coords(L,3)}
35  @param template {'p1':coords(L,3), 'p2':coords(L,3)}
36  @param weights optional weights for each set of coordinates
37  """
38  self.query = query
39  self.template = template
40  self.weights=weights
41 
42  if len(self.query.keys()) != len(self.template.keys()):
43  raise ValueError('''the number of proteins
44  in template and query does not match!''')
45 
46  def permute(self):
47  # get unique protein names for each protein
48  # this is, unfortunately, expecting that names are all 'Molname..X'
49  # where X is different for each copy.
50  self.proteins = sorted(self.query.keys())
51  prots_uniq = [i.split('..')[0] for i in self.proteins]
52 
53  # for each unique name, store list of permutations
54  # e.g. for keys A..1,A..2 store P[A] = [[A..1,A..2],[A..2,A..1]]
55  # then store the product: [[[A1,A2],[B1,B2]],[[A1,A2],[B2,B1]],
56  # [[A2,A1],[B1,B2]],[[A2,A1],[B2,B1]]]
57  P = {}
58  for p in prots_uniq:
59  np = prots_uniq.count(p)
60  copies = [i for i in self.proteins if i.split('..')[0] == p]
61  prmts = list(itertools.permutations(copies, len(copies)))
62  P[p] = prmts
63  self.P = P
64  self.Product = list(itertools.product(*P.values()))
65 
66  def get_rmsd(self):
67 
68  self.permute()
69 
70  template_xyz = []
71  weights = []
72  torder = sum([list(i) for i in self.Product[0]], [])
73  for t in torder:
74  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
75  if self.weights is not None:
76  weights += [i for i in self.weights[t]]
77  #template_xyz = np.array(template_xyz)
78 
79  self.rmsd = 10000000000.
80  for comb in self.Product:
81 
82  order = sum([list(i) for i in comb], [])
83  query_xyz = []
84  for p in order:
85  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
86  #query_xyz = np.array(query_xyz)
87  #if len(template_xyz) != len(query_xyz):
88  # print '''Alignment.get_rmsd: ERROR: the number of coordinates
89  # in template and query does not match!'''
90  # exit()
91 
92  if self.weights is not None:
93  dist=IMP.algebra.get_weighted_rmsd(template_xyz, query_xyz, weights)
94  else:
95  dist=IMP.algebra.get_rmsd(template_xyz, query_xyz)
96  #dist = sqrt(
97  # sum(np.diagonal(cdist(template_xyz, query_xyz) ** 2)) / len(template_xyz))
98  if dist < self.rmsd:
99  self.rmsd = dist
100  return self.rmsd
101 
102  def align(self):
103  from scipy.spatial.distance import cdist
104 
105  self.permute()
106 
107  # create flat coordinate list from template in standard order
108  # then loop through the permutations and try to align and get RMSD
109  # should allow you to try all mappings within each protein
110  template_xyz = []
111  torder = sum([list(i) for i in self.Product[0]], [])
112  for t in torder:
113  template_xyz += [IMP.algebra.Vector3D(i) for i in self.template[t]]
114  # template_xyz = np.array(template_xyz)
115  self.rmsd, Transformation = 10000000000., ''
116 
117  # then for each permutation, get flat list of coords and get RMSD
118  for comb in self.Product:
119  order = sum([list(i) for i in comb], [])
120  query_xyz = []
121  for p in order:
122  query_xyz += [IMP.algebra.Vector3D(i) for i in self.query[p]]
123  #query_xyz = np.array(query_xyz)
124 
125  if len(template_xyz) != len(query_xyz):
126  raise ValueError('''the number of coordinates
127  in template and query does not match!''')
128 
130  query_xyz,
131  template_xyz)
132  query_xyz_tr = [transformation.get_transformed(n)
133  for n in query_xyz]
134 
135  dist = sqrt(
136  sum(np.diagonal(cdist(template_xyz, query_xyz_tr) ** 2)) / len(template_xyz))
137  if dist < self.rmsd:
138  self.rmsd = dist
139  Transformation = transformation
140 
141  # return the transformation
142  return (self.rmsd, Transformation)
143 
144 
145 # TEST for the alignment ###
146 """
147 Proteins = {'a..1':np.array([np.array([-1.,1.])]),
148  'a..2':np.array([np.array([1.,1.,])]),
149  'a..3':np.array([np.array([-2.,1.])]),
150  'b':np.array([np.array([0.,-1.])]),
151  'c..1':np.array([np.array([-1.,-1.])]),
152  'c..2':np.array([np.array([1.,-1.])]),
153  'd':np.array([np.array([0.,0.])]),
154  'e':np.array([np.array([0.,1.])])}
155 
156 Ali = Alignment(Proteins, Proteins)
157 Ali.permute()
158 if Ali.get_rmsd() == 0.0: print 'successful test!'
159 else: print 'ERROR!'; exit()
160 """
161 
162 
163 # ----------------------------------
164 class Violations(object):
165 
166  def __init__(self, filename):
167 
168  self.violation_thresholds = {}
169  self.violation_counts = {}
170 
171  data = open(filename)
172  D = data.readlines()
173  data.close()
174 
175  for d in D:
176  d = d.strip().split()
177  self.violation_thresholds[d[0]] = float(d[1])
178 
179  def get_number_violated_restraints(self, rsts_dict):
180  num_violated = 0
181  for rst in self.violation_thresholds:
182  if rst not in rsts_dict:
183  continue # print rst;
184  if float(rsts_dict[rst]) > self.violation_thresholds[rst]:
185  num_violated += 1
186  if rst not in self.violation_counts:
187  self.violation_counts[rst] = 1
188  else:
189  self.violation_counts[rst] += 1
190  return num_violated
191 
192 
193 # ----------------------------------
194 class Clustering(object):
195  """A class to cluster structures.
196  Uses scipy's cdist function to compute distance matrices
197  and sklearn's kmeans clustering module.
198  """
199  def __init__(self,rmsd_weights=None):
200  """Constructor.
201  @param rmsd_weights Flat list of weights for each particle
202  (if they're coarse)
203  """
204  try:
205  from mpi4py import MPI
206  self.comm = MPI.COMM_WORLD
207  self.rank = self.comm.Get_rank()
208  self.number_of_processes = self.comm.size
209  except ImportError:
210  self.number_of_processes = 1
211  self.rank = 0
212  self.all_coords = {}
213  self.structure_cluster_ids = None
214  self.tmpl_coords = None
215  self.rmsd_weights=rmsd_weights
216 
217  def set_template(self, part_coords):
218 
219  self.tmpl_coords = part_coords
220 
221  def fill(self, frame, Coords):
222  """Add coordinates for a single model."""
223 
224  self.all_coords[frame] = Coords
225 
226  def dist_matrix(self):
227 
228  self.model_list_names = list(self.all_coords.keys())
229  self.model_indexes = list(range(len(self.model_list_names)))
230  self.model_indexes_dict = dict(
231  list(zip(self.model_list_names, self.model_indexes)))
232  model_indexes_unique_pairs = list(itertools.combinations(self.model_indexes, 2))
233 
234  my_model_indexes_unique_pairs = IMP.pmi.tools.chunk_list_into_segments(
235  model_indexes_unique_pairs,
236  self.number_of_processes)[self.rank]
237 
238  print("process %s assigned with %s pairs" % (str(self.rank), str(len(my_model_indexes_unique_pairs))))
239 
240  (raw_distance_dict, self.transformation_distance_dict) = self.matrix_calculation(self.all_coords,
241  self.tmpl_coords,
242  my_model_indexes_unique_pairs)
243 
244  if self.number_of_processes > 1:
245  raw_distance_dict = IMP.pmi.tools.scatter_and_gather(
246  raw_distance_dict)
247  pickable_transformations = self.get_pickable_transformation_distance_dict(
248  )
249  pickable_transformations = IMP.pmi.tools.scatter_and_gather(
250  pickable_transformations)
251  self.set_transformation_distance_dict_from_pickable(
252  pickable_transformations)
253 
254  self.raw_distance_matrix = np.zeros(
255  (len(self.model_list_names), len(self.model_list_names)))
256  for item in raw_distance_dict:
257  (f1, f2) = item
258  self.raw_distance_matrix[f1, f2] = raw_distance_dict[item]
259  self.raw_distance_matrix[f2, f1] = raw_distance_dict[item]
260 
261  def get_dist_matrix(self):
262  return self.raw_distance_matrix
263 
264  def do_cluster(self, number_of_clusters,seed=None):
265  """Run K-means clustering
266  @param number_of_clusters Num means
267  @param seed the random seed
268  """
269  from sklearn.cluster import KMeans
270  if seed is not None:
271  np.random.seed(seed)
272  try:
273  # check whether we have the right version of sklearn
274  kmeans = KMeans(n_clusters=number_of_clusters)
275  except TypeError:
276  # sklearn older than 0.12
277  kmeans = KMeans(k=number_of_clusters)
278  kmeans.fit_predict(self.raw_distance_matrix)
279 
280  self.structure_cluster_ids = kmeans.labels_
281 
282  def get_pickable_transformation_distance_dict(self):
283  pickable_transformations = {}
284  for label in self.transformation_distance_dict:
285  tr = self.transformation_distance_dict[label]
286  trans = tuple(tr.get_translation())
287  rot = tuple(tr.get_rotation().get_quaternion())
288  pickable_transformations[label] = (rot, trans)
289  return pickable_transformations
290 
291  def set_transformation_distance_dict_from_pickable(
292  self,
293  pickable_transformations):
294  self.transformation_distance_dict = {}
295  for label in pickable_transformations:
296  tr = pickable_transformations[label]
297  trans = IMP.algebra.Vector3D(tr[1])
298  rot = IMP.algebra.Rotation3D(tr[0])
299  self.transformation_distance_dict[
300  label] = IMP.algebra.Transformation3D(rot, trans)
301 
302  def save_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
303  import pickle
304  outf = open(file_name + ".data", 'wb')
305 
306  # to pickle the transformation dictionary
307  # you have to save the arrays correposnding to
308  # the transformations
309 
310  pickable_transformations = self.get_pickable_transformation_distance_dict(
311  )
312  pickle.dump(
313  (self.structure_cluster_ids,
314  self.model_list_names,
315  pickable_transformations),
316  outf)
317 
318  np.save(file_name + ".npy", self.raw_distance_matrix)
319 
320  def load_distance_matrix_file(self, file_name='cluster.rawmatrix.pkl'):
321  import pickle
322 
323  inputf = open(file_name + ".data", 'rb')
324  (self.structure_cluster_ids, self.model_list_names,
325  pickable_transformations) = pickle.load(inputf)
326  inputf.close()
327 
328  self.raw_distance_matrix = np.load(file_name + ".npy")
329 
330  self.set_transformation_distance_dict_from_pickable(
331  pickable_transformations)
332  self.model_indexes = list(range(len(self.model_list_names)))
333  self.model_indexes_dict = dict(
334  list(zip(self.model_list_names, self.model_indexes)))
335 
336  def plot_matrix(self, figurename="clustermatrix.pdf"):
337  import matplotlib as mpl
338  mpl.use('Agg')
339  import matplotlib.pylab as pl
340  from scipy.cluster import hierarchy as hrc
341 
342  fig = pl.figure(figsize=(10,8))
343  ax = fig.add_subplot(212)
344  dendrogram = hrc.dendrogram(
345  hrc.linkage(self.raw_distance_matrix),
346  color_threshold=7,
347  no_labels=True)
348  leaves_order = dendrogram['leaves']
349  ax.set_xlabel('Model')
350  ax.set_ylabel('RMSD [Angstroms]')
351 
352  ax2 = fig.add_subplot(221)
353  cax = ax2.imshow(
354  self.raw_distance_matrix[leaves_order,
355  :][:,
356  leaves_order],
357  interpolation='nearest')
358  cb = fig.colorbar(cax)
359  cb.set_label('RMSD [Angstroms]')
360  ax2.set_xlabel('Model')
361  ax2.set_ylabel('Model')
362 
363  pl.savefig(figurename, dpi=300)
364  pl.close(fig)
365 
366  def get_model_index_from_name(self, name):
367  return self.model_indexes_dict[name]
368 
369  def get_cluster_labels(self):
370  # this list
371  return list(set(self.structure_cluster_ids))
372 
373  def get_number_of_clusters(self):
374  return len(self.get_cluster_labels())
375 
376  def get_cluster_label_indexes(self, label):
377  return (
378  [i for i, l in enumerate(self.structure_cluster_ids) if l == label]
379  )
380 
381  def get_cluster_label_names(self, label):
382  return (
383  [self.model_list_names[i]
384  for i in self.get_cluster_label_indexes(label)]
385  )
386 
387  def get_cluster_label_average_rmsd(self, label):
388 
389  indexes = self.get_cluster_label_indexes(label)
390 
391  if len(indexes) > 1:
392  sub_distance_matrix = self.raw_distance_matrix[
393  indexes, :][:, indexes]
394  average_rmsd = np.sum(sub_distance_matrix) / \
395  (len(sub_distance_matrix)
396  ** 2 - len(sub_distance_matrix))
397  else:
398  average_rmsd = 0.0
399  return average_rmsd
400 
401  def get_cluster_label_size(self, label):
402  return len(self.get_cluster_label_indexes(label))
403 
404  def get_transformation_to_first_member(
405  self,
406  cluster_label,
407  structure_index):
408  reference = self.get_cluster_label_indexes(cluster_label)[0]
409  return self.transformation_distance_dict[(reference, structure_index)]
410 
411  def matrix_calculation(self, all_coords, template_coords, list_of_pairs):
412 
413  model_list_names = list(all_coords.keys())
414  rmsd_protein_names = list(all_coords[model_list_names[0]].keys())
415  raw_distance_dict = {}
416  transformation_distance_dict = {}
417  if template_coords is None:
418  do_alignment = False
419  else:
420  do_alignment = True
421  alignment_template_protein_names = list(template_coords.keys())
422 
423  for (f1, f2) in list_of_pairs:
424 
425  if not do_alignment:
426  # here we only get the rmsd,
427  # we need that for instance when you want to cluster conformations
428  # globally, eg the EM map is a reference
430 
431  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
432  for pr in rmsd_protein_names])
433  coords_f2 = {}
434  for pr in rmsd_protein_names:
435  coords_f2[pr] = all_coords[model_list_names[f2]][pr]
436 
437  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
438  rmsd = Ali.get_rmsd()
439 
440  elif do_alignment:
441  # here we actually align the conformations first
442  # and than calculate the rmsd. We need that when the
443  # protein(s) is the reference
444  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
445  for pr in alignment_template_protein_names])
446  coords_f2 = dict([(pr, all_coords[model_list_names[f2]][pr])
447  for pr in alignment_template_protein_names])
448 
449  Ali = Alignment(coords_f1, coords_f2)
450  template_rmsd, transformation = Ali.align()
451 
452  # here we calculate the rmsd
453  # we will align two models based n the nuber of subunits provided
454  # and transform coordinates of model 2 to model 1
455  coords_f1 = dict([(pr, all_coords[model_list_names[f1]][pr])
456  for pr in rmsd_protein_names])
457  coords_f2 = {}
458  for pr in rmsd_protein_names:
459  coords_f2[pr] = [transformation.get_transformed(
460  i) for i in all_coords[model_list_names[f2]][pr]]
461 
462  Ali = Alignment(coords_f1, coords_f2, self.rmsd_weights)
463  rmsd = Ali.get_rmsd()
464 
465  raw_distance_dict[(f1, f2)] = rmsd
466  raw_distance_dict[(f2, f1)] = rmsd
467  transformation_distance_dict[(f1, f2)] = transformation
468  transformation_distance_dict[(f2, f1)] = transformation
469 
470  return raw_distance_dict, transformation_distance_dict
471 
472 
473 class RMSD(object):
474  """Compute the RMSD (without alignment) taking into account the copy ambiguity.
475  To be used with pmi2 hierarchies. Can be used for instance as follows:
476 
477  rmsd=IMP.pmi.analysis.RMSD(hier,hier,[mol.get_name() for mol in mols],dynamic0=True,dynamic1=False)
478  output_objects.append(rmsd)
479 
480  before shuffling the coordinates
481  """
482 
483  def __init__(self,hier0,hier1,molnames,label="None",dynamic0=True,dynamic1=True,metric=IMP.algebra.get_rmsd):
484  """
485  @param hier0 first input hierarchy
486  @param hier1 second input hierarchy
487  @param molname the names of the molecules used for the RMSD
488  @dynamic0 if True stores the decorators XYZ and coordinates of hier0 can be update. If false coordinates are static (stored in Vector3Ds)
489  and will never be updated
490  @dynamic1 same as above
491  metric what metric should be used
492  """
493  self.moldict0,self.molcoords0,self.mol_XYZs0=self.get_moldict_coorddict(hier0,molnames)
494  self.moldict1,self.molcoords1,self.mol_XYZs1=self.get_moldict_coorddict(hier1,molnames)
495  self.dynamic0=dynamic0
496  self.dynamic1=dynamic1
497  self.metric=metric
498  self.label=label
499 
500  def get_moldict_coorddict(self,hier,molnames):
501  """return data structure for the RMSD calculation"""
502  moldict={}
503  mol_coords={}
504  mol_XYZs={}
505  for mol in IMP.pmi.tools.get_molecules(hier):
506  name=mol.get_name()
507  if name not in molnames:
508  continue
509  parts=True
510  mol_coords[mol]=[]
511  mol_XYZs[mol]=[]
512  i=1
513  while parts:
514  sel=IMP.atom.Selection(mol,residue_index=i,representation_type=IMP.atom.BALLS,resolution=1)
515  parts=sel.get_selected_particles()
516  if parts:
517  mol_coords[mol].append(IMP.core.XYZ(parts[0]).get_coordinates())
518  mol_XYZs[mol].append(IMP.core.XYZ(parts[0]))
519  i=i+1
520  if name in moldict:
521  moldict[name].append(mol)
522  else:
523  moldict[name]=[mol]
524  return moldict, mol_coords, mol_XYZs
525 
526  def get_rmsd_and_assigments(self):
527  best_orders=[]
528  total_rmsd=0
529  total_N=0
530  best_assignments=[]
531  rmsd_dict={}
532  for molname, ref_mols in self.moldict1.items():
533  ref_coords=[]
534  for ref_mol in ref_mols:
535  if self.dynamic1:
536  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
537  else:
538  coord1=self.molcoords1[ref_mol]
539  ref_coords+=coord1
540 
541  rmsd=[]
542  rmf_mols_list=[]
543  for rmf_mols in itertools.permutations(self.moldict0[molname]):
544  rmf_coords=[]
545  for rmf_mol in rmf_mols:
546  if self.dynamic0:
547  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
548  else:
549  coord0=self.molcoords0[rmf_mol]
550  rmf_coords+=coord0
551 
552  rmsd.append(IMP.algebra.get_rmsd(ref_coords, rmf_coords))
553  rmf_mols_list.append(rmf_mols)
554  m=min(rmsd)
555  rmf_mols_best_order=rmf_mols_list[rmsd.index(m)]
556 
557 
558  for n, (ref_mol,rmf_mol) in enumerate(zip(ref_mols,rmf_mols_best_order)):
559  best_assignments.append((rmf_mol,ref_mol))
560  if self.dynamic0:
561  coord0=[XYZ.get_coordinates() for XYZ in self.mol_XYZs0[rmf_mol]]
562  else:
563  coord0=self.molcoords0[rmf_mol]
564 
565  if self.dynamic1:
566  coord1=[XYZ.get_coordinates() for XYZ in self.mol_XYZs1[ref_mol]]
567  else:
568  coord1=self.molcoords1[ref_mol]
569  rmsd_pair=self.metric(coord1, coord0)
570  N=len(self.molcoords1[ref_mol])
571  total_N+=N
572  total_rmsd+=rmsd_pair*rmsd_pair*N
573  rmsd_dict[ref_mol]=rmsd_pair
574  total_rmsd = sqrt(total_rmsd/total_N)
575  return total_rmsd,best_assignments
576 
577  def get_output(self):
578  """Returns output for IMP.pmi.output.Output object"""
579  total_rmsd,best_assignments=self.get_rmsd_and_assigments()
580 
581  assignments_out=[]
582  for rmf_mol,ref_mol in best_assignments:
583  ref_name=ref_mol.get_name()
584  ref_copy=IMP.atom.Copy(ref_mol).get_copy_index()
585  rmf_name=rmf_mol.get_name()
586  rmf_copy=IMP.atom.Copy(rmf_mol).get_copy_index()
587  assignments_out.append(rmf_name+"."+str(rmf_copy)+"->"+ref_name+"."+str(ref_copy))
588  return {"RMSD_"+self.label:str(total_rmsd),"RMSD_assignments_"+self.label:str(assignments_out)}
589 
590 
591 
592 
593 class Precision(object):
594  """A class to evaluate the precision of an ensemble.
595 
596  Also can evaluate the cross-precision of multiple ensembles.
597  Supports MPI for coordinate reading.
598  Recommended procedure:
599  -# initialize object and pass the selection for evaluating precision
600  -# call add_structures() to read in the data (specify group name)
601  -# call get_precision() to evaluate inter/intra precision
602  -# call get_rmsf() to evaluate within-group fluctuations
603  """
604  def __init__(self,model,
605  resolution=1,
606  selection_dictionary={}):
607  """Constructor.
608  @param model The IMP Model
609  @param resolution Use 1 or 10 (kluge: requires that "_Res:X" is
610  part of the hier name)
611  @param selection_dictionary Dictionary where keys are names for
612  selections and values are selection tuples for scoring
613  precision. "All" is automatically made as well
614  """
615  try:
616  from mpi4py import MPI
617  self.comm = MPI.COMM_WORLD
618  self.rank = self.comm.Get_rank()
619  self.number_of_processes = self.comm.size
620  except ImportError:
621  self.number_of_processes = 1
622  self.rank = 0
623 
624  self.styles = ['pairwise_rmsd','pairwise_drmsd_k','pairwise_drmsd_Q',
625  'pairwise_drms_k','pairwise_rmsd','drmsd_from_center']
626  self.style = 'pairwise_drmsd_k'
627  self.structures_dictionary = {}
628  self.reference_structures_dictionary = {}
629  self.prots = []
630  self.protein_names = None
631  self.len_particles_resolution_one = None
632  self.model = model
633  self.rmf_names_frames = {}
634  self.reference_rmf_names_frames = None
635  self.reference_structure = None
636  self.reference_prot = None
637  self.selection_dictionary = selection_dictionary
638  self.threshold = 40.0
639  self.residue_particle_index_map = None
640  self.prots = None
641  if resolution in [1,10]:
642  self.resolution = resolution
643  else:
644  raise KeyError("Currently only allow resolution 1 or 10")
645 
646  def _get_structure(self,rmf_frame_index,rmf_name):
647  """Read an RMF file and return the particles"""
648  rh = RMF.open_rmf_file_read_only(rmf_name)
649  if not self.prots:
650  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
651  self.prots = IMP.rmf.create_hierarchies(rh, self.model)
652  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
653  else:
654  print("linking coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
655  IMP.rmf.link_hierarchies(rh, self.prots)
656  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
657  del rh
658 
659  if self.resolution==1:
660  particle_dict = get_particles_at_resolution_one(self.prots[0])
661  elif self.resolution==10:
662  particle_dict = get_particles_at_resolution_ten(self.prots[0])
663 
664  protein_names = list(particle_dict.keys())
665  particles_resolution_one = []
666  for k in particle_dict:
667  particles_resolution_one += (particle_dict[k])
668 
669  if self.protein_names==None:
670  self.protein_names = protein_names
671  else:
672  if self.protein_names!=protein_names:
673  print("Error: the protein names of the new coordinate set is not compatible with the previous one")
674 
675  if self.len_particles_resolution_one==None:
676  self.len_particles_resolution_one = len(particles_resolution_one)
677  else:
678  if self.len_particles_resolution_one!=len(particles_resolution_one):
679  raise ValueError("the new coordinate set is not compatible with the previous one")
680 
681  return particles_resolution_one
682 
683  def add_structure(self,
684  rmf_name,
685  rmf_frame_index,
686  structure_set_name,
687  setup_index_map=False):
688  """ Read a structure into the ensemble and store (as coordinates).
689  @param rmf_name The name of the RMF file
690  @param rmf_frame_index The frame to read
691  @param structure_set_name Name for the set that includes this structure
692  (e.g. "cluster 1")
693  @param setup_index_map if requested, set up a dictionary to help
694  find residue indexes
695  """
696 
697  # decide where to put this structure
698  if structure_set_name in self.structures_dictionary:
699  cdict = self.structures_dictionary[structure_set_name]
700  rmflist = self.rmf_names_frames[structure_set_name]
701  else:
702  self.structures_dictionary[structure_set_name]={}
703  self.rmf_names_frames[structure_set_name]=[]
704  cdict = self.structures_dictionary[structure_set_name]
705  rmflist = self.rmf_names_frames[structure_set_name]
706 
707  # read the particles
708  try:
709  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
710  except ValueError:
711  print("something wrong with the rmf")
712  return 0
713  self.selection_dictionary.update({"All":self.protein_names})
714 
715  for selection_name in self.selection_dictionary:
716  selection_tuple = self.selection_dictionary[selection_name]
717  coords = self._select_coordinates(selection_tuple,particles_resolution_one,self.prots[0])
718  if selection_name not in cdict:
719  cdict[selection_name] = [coords]
720  else:
721  cdict[selection_name].append(coords)
722 
723  rmflist.append((rmf_name,rmf_frame_index))
724 
725  # if requested, set up a dictionary to help find residue indexes
726  if setup_index_map:
727  self.residue_particle_index_map = {}
728  for prot_name in self.protein_names:
729  self.residue_particle_index_map[prot_name] = \
730  self._get_residue_particle_index_map(
731  prot_name,
732  particles_resolution_one,self.prots[0])
733 
734  def add_structures(self,
735  rmf_name_frame_tuples,
736  structure_set_name):
737  """Read a list of RMFs, supports parallel
738  @param rmf_name_frame_tuples list of (rmf_file_name,frame_number)
739  @param structure_set_name Name this set of structures (e.g. "cluster.1")
740  """
741 
742  # split up the requested list to read in parallel
743  my_rmf_name_frame_tuples=IMP.pmi.tools.chunk_list_into_segments(
744  rmf_name_frame_tuples,self.number_of_processes)[self.rank]
745  for nfr,tup in enumerate(my_rmf_name_frame_tuples):
746  rmf_name=tup[0]
747  rmf_frame_index=tup[1]
748  # the first frame stores the map between residues and particles
749  if self.residue_particle_index_map is None:
750  setup_index_map = True
751  else:
752  setup_index_map = False
753  self.add_structure(rmf_name,
754  rmf_frame_index,
755  structure_set_name,
756  setup_index_map)
757 
758  # synchronize the structures
759  if self.number_of_processes > 1:
760  self.rmf_names_frames=IMP.pmi.tools.scatter_and_gather(self.rmf_names_frames)
761  if self.rank != 0:
762  self.comm.send(self.structures_dictionary, dest=0, tag=11)
763  elif self.rank == 0:
764  for i in range(1, self.number_of_processes):
765  data_tmp = self.comm.recv(source=i, tag=11)
766  for key in self.structures_dictionary:
767  self.structures_dictionary[key].update(data_tmp[key])
768  for i in range(1, self.number_of_processes):
769  self.comm.send(self.structures_dictionary, dest=i, tag=11)
770  if self.rank != 0:
771  self.structures_dictionary = self.comm.recv(source=0, tag=11)
772 
773  def _get_residue_particle_index_map(self,prot_name,structure,hier):
774  # Creates map from all particles to residue numbers
775  residue_particle_index_map = []
776  if IMP.pmi.get_is_canonical(hier):
777  s = IMP.atom.Selection(hier,molecules=[prot_name],
778  resolution=1)
779  else:
780  s = IMP.atom.Selection(hier,molecules=[prot_name])
781  all_selected_particles = s.get_selected_particles()
782  intersection = list(set(all_selected_particles) & set(structure))
783  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
784  for p in sorted_intersection:
785  residue_particle_index_map.append(IMP.pmi.tools.get_residue_indexes(p))
786  return residue_particle_index_map
787 
788 
789  def _select_coordinates(self,tuple_selections,structure,prot):
790  selected_coordinates=[]
791  for t in tuple_selections:
792  if type(t)==tuple and len(t)==3:
793  if IMP.pmi.get_is_canonical(prot):
794  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1),
795  resolution=1)
796  else:
797  s = IMP.atom.Selection(prot,molecules=[t[2]],residue_indexes=range(t[0],t[1]+1))
798  all_selected_particles = s.get_selected_particles()
799  intersection = list(set(all_selected_particles) & set(structure))
800  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
801  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
802  selected_coordinates += cc
803  elif type(t)==str:
804  if IMP.pmi.get_is_canonical(prot):
805  s = IMP.atom.Selection(prot,molecules=[t],resolution=1)
806  else:
807  s = IMP.atom.Selection(prot,molecules=[t])
808  all_selected_particles = s.get_selected_particles()
809  intersection = list(set(all_selected_particles) & set(structure))
810  sorted_intersection = IMP.pmi.tools.sort_by_residues(intersection)
811  cc = [tuple(IMP.core.XYZ(p).get_coordinates()) for p in sorted_intersection]
812  selected_coordinates += cc
813  else:
814  raise ValueError("Selection error")
815  return selected_coordinates
816 
817  def set_threshold(self,threshold):
818  self.threshold = threshold
819 
820  def _get_distance(self,
821  structure_set_name1,
822  structure_set_name2,
823  selection_name,
824  index1,
825  index2):
826  """ Compute distance between structures with various metrics """
827  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
828  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
829  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
830  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
831 
832  if self.style=='pairwise_drmsd_k':
833  distance=IMP.atom.get_drmsd(coordinates1,coordinates2)
834  if self.style=='pairwise_drms_k':
835  distance=IMP.atom.get_drms(coordinates1,coordinates2)
836  if self.style=='pairwise_drmsd_Q':
837  distance=IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
838 
839  if self.style=='pairwise_rmsd':
840  distance=IMP.algebra.get_rmsd(coordinates1,coordinates2)
841  return distance
842 
843  def _get_particle_distances(self,structure_set_name1,structure_set_name2,
844  selection_name,index1,index2):
845  c1 = self.structures_dictionary[structure_set_name1][selection_name][index1]
846  c2 = self.structures_dictionary[structure_set_name2][selection_name][index2]
847 
848  coordinates1 = [IMP.algebra.Vector3D(c) for c in c1]
849  coordinates2 = [IMP.algebra.Vector3D(c) for c in c2]
850 
851  distances=[np.linalg.norm(a-b) for (a,b) in zip(coordinates1,coordinates2)]
852 
853  return distances
854 
855  def get_precision(self,
856  structure_set_name1,
857  structure_set_name2,
858  outfile=None,
859  skip=1,
860  selection_keywords=None):
861  """ Evaluate the precision of two named structure groups. Supports MPI.
862  When the structure_set_name1 is different from the structure_set_name2,
863  this evaluates the cross-precision (average pairwise distances).
864  @param outfile Name of the precision output file
865  @param structure_set_name1 string name of the first structure set
866  @param structure_set_name2 string name of the second structure set
867  @param skip analyze every (skip) structure for the distance matrix calculation
868  @param selection_keywords Specify the selection name you want to calculate on.
869  By default this is computed for everything you provided in the constructor,
870  plus all the subunits together.
871  """
872  if selection_keywords is None:
873  sel_keys = list(self.selection_dictionary.keys())
874  else:
875  for k in selection_keywords:
876  if k not in self.selection_dictionary:
877  raise KeyError("you are trying to find named selection " \
878  + k + " which was not requested in the constructor")
879  sel_keys = selection_keywords
880 
881  if outfile is not None:
882  of = open(outfile,"w")
883  centroid_index = 0
884  for selection_name in sel_keys:
885  number_of_structures_1 = len(self.structures_dictionary[structure_set_name1][selection_name])
886  number_of_structures_2 = len(self.structures_dictionary[structure_set_name2][selection_name])
887  distances={}
888  structure_pointers_1 = list(range(0,number_of_structures_1,skip))
889  structure_pointers_2 = list(range(0,number_of_structures_2,skip))
890  pair_combination_list = list(itertools.product(structure_pointers_1,structure_pointers_2))
891  if len(pair_combination_list)==0:
892  raise ValueError("no structure selected. Check the skip parameter.")
893 
894  # compute pairwise distances in parallel
895  my_pair_combination_list = IMP.pmi.tools.chunk_list_into_segments(
896  pair_combination_list,self.number_of_processes)[self.rank]
897  my_length = len(my_pair_combination_list)
898  for n,pair in enumerate(my_pair_combination_list):
899  progression = int(float(n)/my_length*100.0)
900  distances[pair] = self._get_distance(structure_set_name1,structure_set_name2,
901  selection_name,pair[0],pair[1])
902  if self.number_of_processes > 1:
903  distances = IMP.pmi.tools.scatter_and_gather(distances)
904 
905  # Finally compute distance to centroid
906  if self.rank == 0:
907  if structure_set_name1==structure_set_name2:
908  structure_pointers = structure_pointers_1
909  number_of_structures = number_of_structures_1
910 
911  # calculate the distance from the first centroid
912  # and determine the centroid
913  distance = 0.0
914  distances_to_structure = {}
915  distances_to_structure_normalization = {}
916 
917  for n in structure_pointers:
918  distances_to_structure[n] = 0.0
919  distances_to_structure_normalization[n]=0
920 
921  for k in distances:
922  distance += distances[k]
923  distances_to_structure[k[0]] += distances[k]
924  distances_to_structure[k[1]] += distances[k]
925  distances_to_structure_normalization[k[0]] += 1
926  distances_to_structure_normalization[k[1]] += 1
927 
928  for n in structure_pointers:
929  distances_to_structure[n] = distances_to_structure[n]/distances_to_structure_normalization[n]
930 
931  min_distance = min([distances_to_structure[n] for n in distances_to_structure])
932  centroid_index = [k for k, v in distances_to_structure.items() if v == min_distance][0]
933  centroid_rmf_name = self.rmf_names_frames[structure_set_name1][centroid_index]
934 
935  centroid_distance = 0.0
936  distance_list = []
937  for n in range(number_of_structures):
938  dist = self._get_distance(structure_set_name1,structure_set_name1,
939  selection_name,centroid_index,n)
940  centroid_distance += dist
941  distance_list.append(dist)
942 
943  #pairwise_distance=distance/len(distances.keys())
944  centroid_distance /= number_of_structures
945  #average_centroid_distance=sum(distances_to_structure)/len(distances_to_structure)
946  if outfile is not None:
947  of.write(str(selection_name)+" "+structure_set_name1+
948  " average centroid distance "+str(centroid_distance)+"\n")
949  of.write(str(selection_name)+" "+structure_set_name1+
950  " centroid index "+str(centroid_index)+"\n")
951  of.write(str(selection_name)+" "+structure_set_name1+
952  " centroid rmf name "+str(centroid_rmf_name)+"\n")
953  of.write(str(selection_name)+" "+structure_set_name1+
954  " median centroid distance "+str(np.median(distance_list))+"\n")
955 
956  average_pairwise_distances=sum(distances.values())/len(list(distances.values()))
957  if outfile is not None:
958  of.write(str(selection_name)+" "+structure_set_name1+" "+structure_set_name2+
959  " average pairwise distance "+str(average_pairwise_distances)+"\n")
960  if outfile is not None:
961  of.close()
962  return centroid_index
963 
964  def get_rmsf(self,
965  structure_set_name,
966  outdir="./",
967  skip=1,
968  set_plot_yaxis_range=None):
969  """ Calculate the residue mean square fluctuations (RMSF).
970  Automatically outputs as data file and pdf
971  @param structure_set_name Which structure set to calculate RMSF for
972  @param outdir Where to write the files
973  @param skip Skip this number of structures
974  @param set_plot_yaxis_range In case you need to change the plot
975  """
976  # get the centroid structure for the whole complex
977  centroid_index = self.get_precision(structure_set_name,
978  structure_set_name,
979  outfile=None,
980  skip=skip)
981  if self.rank==0:
982  for sel_name in self.protein_names:
983  self.selection_dictionary.update({sel_name:[sel_name]})
984  try:
985  number_of_structures = len(self.structures_dictionary[structure_set_name][sel_name])
986  except KeyError:
987  # that protein was not included in the selection
988  continue
989  rpim = self.residue_particle_index_map[sel_name]
990  outfile = outdir+"/rmsf."+sel_name+".dat"
991  of = open(outfile,"w")
992  residue_distances = {}
993  residue_nblock = {}
994  for index in range(number_of_structures):
995  distances = self._get_particle_distances(structure_set_name,
996  structure_set_name,
997  sel_name,
998  centroid_index,index)
999  for nblock,block in enumerate(rpim):
1000  for residue_number in block:
1001  residue_nblock[residue_number] = nblock
1002  if residue_number not in residue_distances:
1003  residue_distances[residue_number] = [distances[nblock]]
1004  else:
1005  residue_distances[residue_number].append(distances[nblock])
1006 
1007  residues = []
1008  rmsfs = []
1009  for rn in residue_distances:
1010  residues.append(rn)
1011  rmsf = np.std(residue_distances[rn])
1012  rmsfs.append(rmsf)
1013  of.write(str(rn)+" "+str(residue_nblock[rn])+" "+str(rmsf)+"\n")
1014 
1015  IMP.pmi.output.plot_xy_data(residues,rmsfs,title=sel_name,
1016  out_fn=outdir+"/rmsf."+sel_name,display=False,
1017  set_plot_yaxis_range=set_plot_yaxis_range,
1018  xlabel='Residue Number',ylabel='Standard error')
1019  of.close()
1020 
1021 
1022  def set_reference_structure(self,rmf_name,rmf_frame_index):
1023  """Read in a structure used for reference computation.
1024  Needed before calling get_average_distance_wrt_reference_structure()
1025  @param rmf_name The RMF file to read the reference
1026  @param rmf_frame_index The index in that file
1027  """
1028  particles_resolution_one = self._get_structure(rmf_frame_index,rmf_name)
1029  self.reference_rmf_names_frames = (rmf_name,rmf_frame_index)
1030 
1031  for selection_name in self.selection_dictionary:
1032  selection_tuple = self.selection_dictionary[selection_name]
1033  coords = self._select_coordinates(selection_tuple,
1034  particles_resolution_one,self.prots[0])
1035  self.reference_structures_dictionary[selection_name] = coords
1036 
1037  def get_rmsd_wrt_reference_structure_with_alignment(self,structure_set_name,alignment_selection_key):
1038  """First align then calculate RMSD
1039  @param structure_set_name: the name of the structure set
1040  @param alignment_selection: the key containing the selection tuples needed to make the alignment stored in self.selection_dictionary
1041  @return: for each structure in the structure set, returns the rmsd
1042  """
1043  if self.reference_structures_dictionary=={}:
1044  print("Cannot compute until you set a reference structure")
1045  return
1046 
1047  align_reference_coordinates = self.reference_structures_dictionary[alignment_selection_key]
1048  align_coordinates = self.structures_dictionary[structure_set_name][alignment_selection_key]
1049  transformations = []
1050  for c in align_coordinates:
1051  Ali = IMP.pmi.analysis.Alignment({"All":align_reference_coordinates}, {"All":c})
1052  transformation = Ali.align()[1]
1053  transformations.append(transformation)
1054  for selection_name in self.selection_dictionary:
1055  reference_coordinates = self.reference_structures_dictionary[selection_name]
1056  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1057  distances = []
1058  for n,sc in enumerate(self.structures_dictionary[structure_set_name][selection_name]):
1059  coordinates1 = [transformations[n].get_transformed(IMP.algebra.Vector3D(c)) for c in sc]
1060  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1061  distances.append(distance)
1062  print(selection_name,"average rmsd",sum(distances)/len(distances),"median",self._get_median(distances),"minimum distance",min(distances))
1063 
1064  def _get_median(self,list_of_values):
1065  return np.median(np.array(list_of_values))
1066 
1067  def get_average_distance_wrt_reference_structure(self,structure_set_name):
1068  """Compare the structure set to the reference structure.
1069  @param structure_set_name The structure set to compute this on
1070  @note First call set_reference_structure()
1071  """
1072  ret = {}
1073  if self.reference_structures_dictionary=={}:
1074  print("Cannot compute until you set a reference structure")
1075  return
1076  for selection_name in self.selection_dictionary:
1077  reference_coordinates = self.reference_structures_dictionary[selection_name]
1078  coordinates2 = [IMP.algebra.Vector3D(c) for c in reference_coordinates]
1079  distances = []
1080  for sc in self.structures_dictionary[structure_set_name][selection_name]:
1081  coordinates1 = [IMP.algebra.Vector3D(c) for c in sc]
1082  if self.style=='pairwise_drmsd_k':
1083  distance = IMP.atom.get_drmsd(coordinates1,coordinates2)
1084  if self.style=='pairwise_drms_k':
1085  distance = IMP.atom.get_drms(coordinates1,coordinates2)
1086  if self.style=='pairwise_drmsd_Q':
1087  distance = IMP.atom.get_drmsd_Q(coordinates1,coordinates2,self.threshold)
1088  if self.style=='pairwise_rmsd':
1089  distance = IMP.algebra.get_rmsd(coordinates1,coordinates2)
1090  distances.append(distance)
1091 
1092  print(selection_name,"average distance",sum(distances)/len(distances),"minimum distance",min(distances),'nframes',len(distances))
1093  ret[selection_name] = {'average_distance':sum(distances)/len(distances),'minimum_distance':min(distances)}
1094  return ret
1095 
1096  def get_coordinates(self):
1097  pass
1098 
1099  def set_precision_style(self, style):
1100  if style in self.styles:
1101  self.style=style
1102  else:
1103  raise ValueError("No such style")
1104 
1105 
1106 class GetModelDensity(object):
1107  """Compute mean density maps from structures.
1108 
1109  Keeps a dictionary of density maps,
1110  keys are in the custom ranges. When you call add_subunits_density, it adds
1111  particle coordinates to the existing density maps.
1112  """
1113 
1114  def __init__(self, custom_ranges, representation=None, resolution=20.0, voxel=5.0):
1115  """Constructor.
1116  @param custom_ranges Required. It's a dictionary, keys are the
1117  density component names, values are selection tuples
1118  e.g. {'kin28':[['kin28',1,-1]],
1119  'density_name_1' :[('ccl1')],
1120  'density_name_2' :[(1,142,'tfb3d1'),
1121  (143,700,'tfb3d2')],
1122  @param representation PMI representation, for doing selections.
1123  Not needed if you only pass hierarchies
1124  @param resolution The MRC resolution of the output map (in Angstrom unit)
1125  @param voxel The voxel size for the output map (lower is slower)
1126  """
1127 
1128  self.representation = representation
1129  self.MRCresolution = resolution
1130  self.voxel = voxel
1131  self.densities = {}
1132  self.count_models = 0.0
1133  self.custom_ranges = custom_ranges
1134 
1135  def add_subunits_density(self, hierarchy=None):
1136  """Add a frame to the densities.
1137  @param hierarchy Optionally read the hierarchy from somewhere.
1138  If not passed, will just read the representation.
1139  """
1140  self.count_models += 1.0
1141 
1142  if hierarchy:
1143  part_dict = get_particles_at_resolution_one(hierarchy)
1144  all_particles_by_resolution = []
1145  for name in part_dict:
1146  all_particles_by_resolution += part_dict[name]
1147 
1148  for density_name in self.custom_ranges:
1149  parts = []
1150  if hierarchy:
1151  all_particles_by_segments = []
1152 
1153  for seg in self.custom_ranges[density_name]:
1154  if not hierarchy:
1155  # when you have a IMP.pmi.representation.Representation class
1156  parts += IMP.tools.select_by_tuple(self.representation,
1157  seg, resolution=1, name_is_ambiguous=False)
1158  else:
1159  # else, when you have a hierarchy, but not a representation
1160  if not IMP.pmi.get_is_canonical(hierarchy):
1161  for h in hierarchy.get_children():
1163  IMP.atom.Molecule.setup_particle(h.get_particle())
1164 
1165  if type(seg) == str:
1166  s = IMP.atom.Selection(hierarchy,molecule=seg)
1167  elif type(seg) == tuple and len(seg) == 2:
1168  s = IMP.atom.Selection(
1169  hierarchy, molecule=seg[0],copy_index=seg[1])
1170  elif type(seg) == tuple and len(seg) == 3:
1171  s = IMP.atom.Selection(
1172  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1))
1173  elif type(seg) == tuple and len(seg) == 4:
1174  s = IMP.atom.Selection(
1175  hierarchy, molecule=seg[2],residue_indexes=range(seg[0], seg[1] + 1),copy_index=seg[3])
1176  else:
1177  raise Exception('could not understand selection tuple '+str(seg))
1178 
1179  all_particles_by_segments += s.get_selected_particles()
1180  if hierarchy:
1181  if IMP.pmi.get_is_canonical(hierarchy):
1182  parts = all_particles_by_segments
1183  else:
1184  parts = list(
1185  set(all_particles_by_segments) & set(all_particles_by_resolution))
1186  self._create_density_from_particles(parts, density_name)
1187 
1188  def normalize_density(self):
1189  pass
1190 
1191  def _create_density_from_particles(self, ps, name,
1192  kernel_type='GAUSSIAN'):
1193  '''Internal function for adding to densities.
1194  pass XYZR particles with mass and create a density from them.
1195  kernel type options are GAUSSIAN, BINARIZED_SPHERE, and SPHERE.'''
1196  kd = {
1197  'GAUSSIAN': IMP.em.GAUSSIAN,
1198  'BINARIZED_SPHERE': IMP.em.BINARIZED_SPHERE,
1199  'SPHERE': IMP.em.SPHERE}
1200 
1201  dmap = IMP.em.SampledDensityMap(ps, self.MRCresolution, self.voxel)
1202  dmap.calcRMS()
1203  dmap.set_was_used(True)
1204  if name not in self.densities:
1205  self.densities[name] = dmap
1206  else:
1207  bbox1 = IMP.em.get_bounding_box(self.densities[name])
1208  bbox2 = IMP.em.get_bounding_box(dmap)
1209  bbox1 += bbox2
1210  dmap3 = IMP.em.create_density_map(bbox1,self.voxel)
1211  dmap3.set_was_used(True)
1212  dmap3.add(dmap)
1213  dmap3.add(self.densities[name])
1214  self.densities[name] = dmap3
1215 
1216  def get_density_keys(self):
1217  return list(self.densities.keys())
1218 
1219  def get_density(self,name):
1220  """Get the current density for some component name"""
1221  if name not in self.densities:
1222  return None
1223  else:
1224  return self.densities[name]
1225 
1226  def write_mrc(self, path="./",suffix=None):
1227  import os, errno
1228  for density_name in self.densities:
1229  self.densities[density_name].multiply(1. / self.count_models)
1230  if suffix is None:
1231  name=path + "/" + density_name + ".mrc"
1232  else:
1233  name=path + "/" + density_name + "." + suffix + ".mrc"
1234  path, file = os.path.split(name)
1235  if not os.path.exists(path):
1236  try:
1237  os.makedirs(path)
1238  except OSError as e:
1239  if e.errno != errno.EEXIST:
1240  raise
1241  IMP.em.write_map(
1242  self.densities[density_name],name,
1244 
1245 
1246 class GetContactMap(object):
1247 
1248  def __init__(self, distance=15.):
1249  self.distance = distance
1250  self.contactmap = ''
1251  self.namelist = []
1252  self.xlinks = 0
1253  self.XL = {}
1254  self.expanded = {}
1255  self.resmap = {}
1256 
1257  def set_prot(self, prot):
1258  from scipy.spatial.distance import cdist
1259  self.prot = prot
1260  self.protnames = []
1261  coords = []
1262  radii = []
1263  namelist = []
1264 
1265  particles_dictionary = get_particles_at_resolution_one(self.prot)
1266 
1267  for name in particles_dictionary:
1268  residue_indexes = []
1269  for p in particles_dictionary[name]:
1270  print(p.get_name())
1271  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1272 
1273  if len(residue_indexes) != 0:
1274  self.protnames.append(name)
1275 
1276  def get_subunit_coords(self, frame, align=0):
1277  from scipy.spatial.distance import cdist
1278  coords = []
1279  radii = []
1280  namelist = []
1281  test, testr = [], []
1282  for part in self.prot.get_children():
1283  SortedSegments = []
1284  print(part)
1285  for chl in part.get_children():
1286  start = IMP.atom.get_leaves(chl)[0]
1287  end = IMP.atom.get_leaves(chl)[-1]
1288 
1289  startres = IMP.atom.Fragment(start).get_residue_indexes()[0]
1290  endres = IMP.atom.Fragment(end).get_residue_indexes()[-1]
1291  SortedSegments.append((chl, startres))
1292  SortedSegments = sorted(SortedSegments, key=itemgetter(1))
1293 
1294  for sgmnt in SortedSegments:
1295  for leaf in IMP.atom.get_leaves(sgmnt[0]):
1296  p = IMP.core.XYZR(leaf)
1297  crd = np.array([p.get_x(), p.get_y(), p.get_z()])
1298 
1299  coords.append(crd)
1300  radii.append(p.get_radius())
1301 
1302  new_name = part.get_name() + '_' + sgmnt[0].get_name() +\
1303  '_' + \
1304  str(IMP.atom.Fragment(leaf)
1305  .get_residue_indexes()[0])
1306  namelist.append(new_name)
1307  self.expanded[new_name] = len(
1308  IMP.atom.Fragment(leaf).get_residue_indexes())
1309  if part.get_name() not in self.resmap:
1310  self.resmap[part.get_name()] = {}
1311  for res in IMP.atom.Fragment(leaf).get_residue_indexes():
1312  self.resmap[part.get_name()][res] = new_name
1313 
1314  coords = np.array(coords)
1315  radii = np.array(radii)
1316  if len(self.namelist) == 0:
1317  self.namelist = namelist
1318  self.contactmap = np.zeros((len(coords), len(coords)))
1319  distances = cdist(coords, coords)
1320  distances = (distances - radii).T - radii
1321  distances = distances <= self.distance
1322  self.contactmap += distances
1323 
1324  def add_xlinks(
1325  self,
1326  filname,
1327  identification_string='ISDCrossLinkMS_Distance_'):
1328  # 'ISDCrossLinkMS_Distance_interrb_6629-State:0-20:RPS30_218:eIF3j-1-1-0.1_None'
1329  self.xlinks = 1
1330  data = open(filname)
1331  D = data.readlines()
1332  data.close()
1333 
1334  for d in D:
1335  if identification_string in d:
1336  d = d.replace(
1337  "_",
1338  " ").replace("-",
1339  " ").replace(":",
1340  " ").split()
1341 
1342  t1, t2 = (d[0], d[1]), (d[1], d[0])
1343  if t1 not in self.XL:
1344  self.XL[t1] = [(int(d[2]) + 1, int(d[3]) + 1)]
1345  self.XL[t2] = [(int(d[3]) + 1, int(d[2]) + 1)]
1346  else:
1347  self.XL[t1].append((int(d[2]) + 1, int(d[3]) + 1))
1348  self.XL[t2].append((int(d[3]) + 1, int(d[2]) + 1))
1349 
1350  def dist_matrix(self, skip_cmap=0, skip_xl=1, outname=None):
1351  K = self.namelist
1352  M = self.contactmap
1353  C, R = [], []
1354  L = sum(self.expanded.values())
1355  proteins = self.protnames
1356 
1357  # exp new
1358  if skip_cmap == 0:
1359  Matrices = {}
1360  proteins = [p.get_name() for p in self.prot.get_children()]
1361  missing = []
1362  for p1 in range(len(proteins)):
1363  for p2 in range(p1, len(proteins)):
1364  pl1, pl2 = max(
1365  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1366  pn1, pn2 = proteins[p1], proteins[p2]
1367  mtr = np.zeros((pl1 + 1, pl2 + 1))
1368  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1369  for i1 in range(1, pl1 + 1):
1370  for i2 in range(1, pl2 + 1):
1371  try:
1372  r1 = K.index(self.resmap[pn1][i1])
1373  r2 = K.index(self.resmap[pn2][i2])
1374  r = M[r1, r2]
1375  mtr[i1 - 1, i2 - 1] = r
1376  except KeyError:
1377  missing.append((pn1, pn2, i1, i2))
1378  pass
1379  Matrices[(pn1, pn2)] = mtr
1380 
1381  # add cross-links
1382  if skip_xl == 0:
1383  if self.XL == {}:
1384  raise ValueError("cross-links were not provided, use add_xlinks function!")
1385  Matrices_xl = {}
1386  missing_xl = []
1387  for p1 in range(len(proteins)):
1388  for p2 in range(p1, len(proteins)):
1389  pl1, pl2 = max(
1390  self.resmap[proteins[p1]].keys()), max(self.resmap[proteins[p2]].keys())
1391  pn1, pn2 = proteins[p1], proteins[p2]
1392  mtr = np.zeros((pl1 + 1, pl2 + 1))
1393  flg = 0
1394  try:
1395  xls = self.XL[(pn1, pn2)]
1396  except KeyError:
1397  try:
1398  xls = self.XL[(pn2, pn1)]
1399  flg = 1
1400  except KeyError:
1401  flg = 2
1402  if flg == 0:
1403  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1404  for xl1, xl2 in xls:
1405  if xl1 > pl1:
1406  print('X' * 10, xl1, xl2)
1407  xl1 = pl1
1408  if xl2 > pl2:
1409  print('X' * 10, xl1, xl2)
1410  xl2 = pl2
1411  mtr[xl1 - 1, xl2 - 1] = 100
1412  elif flg == 1:
1413  print('Creating matrix for: ', p1, p2, pn1, pn2, mtr.shape, pl1, pl2)
1414  for xl1, xl2 in xls:
1415  if xl1 > pl1:
1416  print('X' * 10, xl1, xl2)
1417  xl1 = pl1
1418  if xl2 > pl2:
1419  print('X' * 10, xl1, xl2)
1420  xl2 = pl2
1421  mtr[xl2 - 1, xl1 - 1] = 100
1422  else:
1423  print('No cross links between: ', pn1, pn2)
1424  Matrices_xl[(pn1, pn2)] = mtr
1425 
1426  # expand the matrix to individual residues
1427  #NewM = []
1428  # for x1 in xrange(len(K)):
1429  # lst = []
1430  # for x2 in xrange(len(K)):
1431  # lst += [M[x1,x2]]*self.expanded[K[x2]]
1432  # for i in xrange(self.expanded[K[x1]]): NewM.append(np.array(lst))
1433  #NewM = np.array(NewM)
1434 
1435  # make list of protein names and create coordinate lists
1436  C = proteins
1437  # W is the component length list,
1438  # R is the contiguous coordinates list
1439  W, R = [], []
1440  for i, c in enumerate(C):
1441  cl = max(self.resmap[c].keys())
1442  W.append(cl)
1443  if i == 0:
1444  R.append(cl)
1445  else:
1446  R.append(R[-1] + cl)
1447 
1448  # start plotting
1449  if outname:
1450  # Don't require a display
1451  import matplotlib as mpl
1452  mpl.use('Agg')
1453  import matplotlib.pyplot as plt
1454  import matplotlib.gridspec as gridspec
1455  import scipy.sparse as sparse
1456 
1457  f = plt.figure()
1458  gs = gridspec.GridSpec(len(W), len(W),
1459  width_ratios=W,
1460  height_ratios=W)
1461 
1462  cnt = 0
1463  for x1, r1 in enumerate(R):
1464  if x1 == 0:
1465  s1 = 0
1466  else:
1467  s1 = R[x1 - 1]
1468  for x2, r2 in enumerate(R):
1469  if x2 == 0:
1470  s2 = 0
1471  else:
1472  s2 = R[x2 - 1]
1473 
1474  ax = plt.subplot(gs[cnt])
1475  if skip_cmap == 0:
1476  try:
1477  mtr = Matrices[(C[x1], C[x2])]
1478  except KeyError:
1479  mtr = Matrices[(C[x2], C[x1])].T
1480  #cax = ax.imshow(log(NewM[s1:r1,s2:r2] / 1.), interpolation='nearest', vmin=0., vmax=log(NewM.max()))
1481  cax = ax.imshow(
1482  np.log(mtr),
1483  interpolation='nearest',
1484  vmin=0.,
1485  vmax=log(mtr.max()))
1486  ax.set_xticks([])
1487  ax.set_yticks([])
1488  if skip_xl == 0:
1489  try:
1490  mtr = Matrices_xl[(C[x1], C[x2])]
1491  except KeyError:
1492  mtr = Matrices_xl[(C[x2], C[x1])].T
1493  cax = ax.spy(
1494  sparse.csr_matrix(mtr),
1495  markersize=10,
1496  color='white',
1497  linewidth=100,
1498  alpha=0.5)
1499  ax.set_xticks([])
1500  ax.set_yticks([])
1501 
1502  cnt += 1
1503  if x2 == 0:
1504  ax.set_ylabel(C[x1], rotation=90)
1505  if outname:
1506  plt.savefig(outname + ".pdf", dpi=300, transparent="False")
1507  else:
1508  plt.show()
1509 
1510 
1511 # ------------------------------------------------------------------
1512 # a few random tools
1513 
1514 def get_hiers_from_rmf(model, frame_number, rmf_file):
1515  # I have to deprecate this function
1516  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1517 
1518  # load the frame
1519  rh = RMF.open_rmf_file_read_only(rmf_file)
1520 
1521  try:
1522  prots = IMP.rmf.create_hierarchies(rh, model)
1523  except IOError:
1524  print("Unable to open rmf file %s" % (rmf_file))
1525  return None
1526  #IMP.rmf.link_hierarchies(rh, prots)
1527 
1528  try:
1529  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1530  except IOError:
1531  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1532  return None
1533  model.update()
1534  del rh
1535 
1536  return prots
1537 
1538 def link_hiers_to_rmf(model,hiers,frame_number, rmf_file):
1539  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1540  rh = RMF.open_rmf_file_read_only(rmf_file)
1541  IMP.rmf.link_hierarchies(rh, hiers)
1542  try:
1543  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1544  except:
1545  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1546  return False
1547  model.update()
1548  del rh
1549  return True
1550 
1551 
1552 def get_hiers_and_restraints_from_rmf(model, frame_number, rmf_file):
1553  # I have to deprecate this function
1554  print("getting coordinates for frame %i rmf file %s" % (frame_number, rmf_file))
1555 
1556  # load the frame
1557  rh = RMF.open_rmf_file_read_only(rmf_file)
1558 
1559  try:
1560  prots = IMP.rmf.create_hierarchies(rh, model)
1561  rs = IMP.rmf.create_restraints(rh, model)
1562  except:
1563  print("Unable to open rmf file %s" % (rmf_file))
1564  return None,None
1565  try:
1566  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1567  except:
1568  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1569  return None,None
1570  model.update()
1571  del rh
1572 
1573  return prots,rs
1574 
1575 def link_hiers_and_restraints_to_rmf(model,hiers,rs, frame_number, rmf_file):
1576  print("linking hierarchies for frame %i rmf file %s" % (frame_number, rmf_file))
1577  rh = RMF.open_rmf_file_read_only(rmf_file)
1578  IMP.rmf.link_hierarchies(rh, hiers)
1579  IMP.rmf.link_restraints(rh, rs)
1580  try:
1581  IMP.rmf.load_frame(rh, RMF.FrameID(frame_number))
1582  except:
1583  print("Unable to open frame %i of file %s" % (frame_number, rmf_file))
1584  return False
1585  model.update()
1586  del rh
1587  return True
1588 
1589 
1591  """Get particles at res 1, or any beads, based on the name.
1592  No Representation is needed. This is mainly used when the hierarchy
1593  is read from an RMF file.
1594  @return a dictionary of component names and their particles
1595  \note If the root node is named "System" or is a "State", do proper selection.
1596  """
1597  particle_dict = {}
1598 
1599  # attempt to give good results for PMI2
1600  if IMP.pmi.get_is_canonical(prot):
1601  for mol in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1602  sel = IMP.atom.Selection(mol,resolution=1)
1603  particle_dict[mol.get_name()] = sel.get_selected_particles()
1604  else:
1605  allparticles = []
1606  for c in prot.get_children():
1607  name = c.get_name()
1608  particle_dict[name] = IMP.atom.get_leaves(c)
1609  for s in c.get_children():
1610  if "_Res:1" in s.get_name() and "_Res:10" not in s.get_name():
1611  allparticles += IMP.atom.get_leaves(s)
1612  if "Beads" in s.get_name():
1613  allparticles += IMP.atom.get_leaves(s)
1614 
1615  particle_align = []
1616  for name in particle_dict:
1617  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1618  list(set(particle_dict[name]) & set(allparticles)))
1619  return particle_dict
1620 
1622  """Get particles at res 10, or any beads, based on the name.
1623  No Representation is needed.
1624  This is mainly used when the hierarchy is read from an RMF file.
1625  @return a dictionary of component names and their particles
1626  \note If the root node is named "System" or is a "State", do proper selection.
1627  """
1628  particle_dict = {}
1629  # attempt to give good results for PMI2
1630  if IMP.pmi.get_is_canonical(prot):
1631  for mol in IMP.atom.get_by_type(prot,IMP.atom.MOLECULE_TYPE):
1632  sel = IMP.atom.Selection(mol,resolution=10)
1633  particle_dict[mol.get_name()] = sel.get_selected_particles()
1634  else:
1635  allparticles = []
1636  for c in prot.get_children():
1637  name = c.get_name()
1638  particle_dict[name] = IMP.atom.get_leaves(c)
1639  for s in c.get_children():
1640  if "_Res:10" in s.get_name():
1641  allparticles += IMP.atom.get_leaves(s)
1642  if "Beads" in s.get_name():
1643  allparticles += IMP.atom.get_leaves(s)
1644  particle_align = []
1645  for name in particle_dict:
1646  particle_dict[name] = IMP.pmi.tools.sort_by_residues(
1647  list(set(particle_dict[name]) & set(allparticles)))
1648  return particle_dict
1649 
1650 def select_by_tuple(first_res_last_res_name_tuple):
1651  first_res = first_res_last_res_hier_tuple[0]
1652  last_res = first_res_last_res_hier_tuple[1]
1653  name = first_res_last_res_hier_tuple[2]
1654 
1655 class CrossLinkTable(object):
1656  """Visualization of crosslinks"""
1657  def __init__(self):
1658  self.crosslinks = []
1659  self.external_csv_data = None
1660  self.crosslinkedprots = set()
1661  self.mindist = +10000000.0
1662  self.maxdist = -10000000.0
1663  self.contactmap = None
1664 
1665  def set_hierarchy(self, prot):
1666  self.prot_length_dict = {}
1667  self.model=prot.get_model()
1668 
1669  for i in prot.get_children():
1670  name = i.get_name()
1671  residue_indexes = []
1672  for p in IMP.atom.get_leaves(i):
1673  residue_indexes += IMP.pmi.tools.get_residue_indexes(p)
1674 
1675  if len(residue_indexes) != 0:
1676  self.prot_length_dict[name] = max(residue_indexes)
1677 
1678  def set_coordinates_for_contact_map(self, rmf_name,rmf_frame_index):
1679  from scipy.spatial.distance import cdist
1680 
1681  rh= RMF.open_rmf_file_read_only(rmf_name)
1682  prots=IMP.rmf.create_hierarchies(rh, self.model)
1683  IMP.rmf.load_frame(rh, RMF.FrameID(rmf_frame_index))
1684  print("getting coordinates for frame %i rmf file %s" % (rmf_frame_index, rmf_name))
1685  del rh
1686 
1687 
1688  coords = []
1689  radii = []
1690  namelist = []
1691 
1692  particles_dictionary = get_particles_at_resolution_one(prots[0])
1693 
1694  resindex = 0
1695  self.index_dictionary = {}
1696 
1697  for name in particles_dictionary:
1698  residue_indexes = []
1699  for p in particles_dictionary[name]:
1700  print(p.get_name())
1701  residue_indexes = IMP.pmi.tools.get_residue_indexes(p)
1702  #residue_indexes.add( )
1703 
1704  if len(residue_indexes) != 0:
1705 
1706  for res in range(min(residue_indexes), max(residue_indexes) + 1):
1707  d = IMP.core.XYZR(p)
1708 
1709  crd = np.array([d.get_x(), d.get_y(), d.get_z()])
1710  coords.append(crd)
1711  radii.append(d.get_radius())
1712  if name not in self.index_dictionary:
1713  self.index_dictionary[name] = [resindex]
1714  else:
1715  self.index_dictionary[name].append(resindex)
1716  resindex += 1
1717 
1718  coords = np.array(coords)
1719  radii = np.array(radii)
1720 
1721  distances = cdist(coords, coords)
1722  distances = (distances - radii).T - radii
1723 
1724  distances = np.where(distances <= 20.0, 1.0, 0)
1725  if self.contactmap is None:
1726  self.contactmap = np.zeros((len(coords), len(coords)))
1727  self.contactmap += distances
1728 
1729  for prot in prots: IMP.atom.destroy(prot)
1730 
1731  def set_crosslinks(
1732  self, data_file, search_label='ISDCrossLinkMS_Distance_',
1733  mapping=None,
1734  filter_label=None,
1735  filter_rmf_file_names=None, #provide a list of rmf base names to filter the stat file
1736  external_csv_data_file=None,
1737  external_csv_data_file_unique_id_key="Unique ID"):
1738 
1739  # example key ISDCrossLinkMS_Distance_intrarb_937-State:0-108:RPS3_55:RPS30-1-1-0.1_None
1740  # mapping is a dictionary that maps standard keywords to entry positions in the key string
1741  # confidence class is a filter that
1742  # external datafile is a datafile that contains further information on the crosslinks
1743  # it will use the unique id to create the dictionary keys
1744 
1745  po = IMP.pmi.output.ProcessOutput(data_file)
1746  keys = po.get_keys()
1747 
1748  xl_keys = [k for k in keys if search_label in k]
1749 
1750  if filter_rmf_file_names is not None:
1751  rmf_file_key="local_rmf_file_name"
1752  fs = po.get_fields(xl_keys+[rmf_file_key])
1753  else:
1754  fs = po.get_fields(xl_keys)
1755 
1756  # this dictionary stores the occurency of given crosslinks
1757  self.cross_link_frequency = {}
1758 
1759  # this dictionary stores the series of distances for given crosslinked
1760  # residues
1761  self.cross_link_distances = {}
1762 
1763  # this dictionary stores the series of distances for given crosslinked
1764  # residues
1765  self.cross_link_distances_unique = {}
1766 
1767  if not external_csv_data_file is None:
1768  # this dictionary stores the further information on crosslinks
1769  # labeled by unique ID
1770  self.external_csv_data = {}
1771  xldb = IMP.pmi.tools.get_db_from_csv(external_csv_data_file)
1772 
1773  for xl in xldb:
1774  self.external_csv_data[
1775  xl[external_csv_data_file_unique_id_key]] = xl
1776 
1777  # this list keeps track the tuple of cross-links and sample
1778  # so that we don't count twice the same crosslinked residues in the
1779  # same sample
1780  cross_link_frequency_list = []
1781 
1782  self.unique_cross_link_list = []
1783 
1784  for key in xl_keys:
1785  print(key)
1786  keysplit = key.replace(
1787  "_",
1788  " ").replace(
1789  "-",
1790  " ").replace(
1791  ":",
1792  " ").split(
1793  )
1794 
1795  if filter_label!=None:
1796  if filter_label not in keysplit: continue
1797 
1798  if mapping is None:
1799  r1 = int(keysplit[5])
1800  c1 = keysplit[6]
1801  r2 = int(keysplit[7])
1802  c2 = keysplit[8]
1803  try:
1804  confidence = keysplit[12]
1805  except:
1806  confidence = '0.0'
1807  try:
1808  unique_identifier = keysplit[3]
1809  except:
1810  unique_identifier = '0'
1811  else:
1812  r1 = int(keysplit[mapping["Residue1"]])
1813  c1 = keysplit[mapping["Protein1"]]
1814  r2 = int(keysplit[mapping["Residue2"]])
1815  c2 = keysplit[mapping["Protein2"]]
1816  try:
1817  confidence = keysplit[mapping["Confidence"]]
1818  except:
1819  confidence = '0.0'
1820  try:
1821  unique_identifier = keysplit[mapping["Unique Identifier"]]
1822  except:
1823  unique_identifier = '0'
1824 
1825  self.crosslinkedprots.add(c1)
1826  self.crosslinkedprots.add(c2)
1827 
1828  # construct the list of distances corresponding to the input rmf
1829  # files
1830 
1831  dists=[]
1832  if filter_rmf_file_names is not None:
1833  for n,d in enumerate(fs[key]):
1834  if fs[rmf_file_key][n] in filter_rmf_file_names:
1835  dists.append(float(d))
1836  else:
1837  dists=[float(f) for f in fs[key]]
1838 
1839  # check if the input confidence class corresponds to the
1840  # one of the cross-link
1841 
1842  mdist = self.median(dists)
1843 
1844  stdv = np.std(np.array(dists))
1845  if self.mindist > mdist:
1846  self.mindist = mdist
1847  if self.maxdist < mdist:
1848  self.maxdist = mdist
1849 
1850  # calculate the frequency of unique crosslinks within the same
1851  # sample
1852  if not self.external_csv_data is None:
1853  sample = self.external_csv_data[unique_identifier]["Sample"]
1854  else:
1855  sample = "None"
1856 
1857  if (r1, c1, r2, c2,mdist) not in cross_link_frequency_list:
1858  if (r1, c1, r2, c2) not in self.cross_link_frequency:
1859  self.cross_link_frequency[(r1, c1, r2, c2)] = 1
1860  self.cross_link_frequency[(r2, c2, r1, c1)] = 1
1861  else:
1862  self.cross_link_frequency[(r2, c2, r1, c1)] += 1
1863  self.cross_link_frequency[(r1, c1, r2, c2)] += 1
1864  cross_link_frequency_list.append((r1, c1, r2, c2))
1865  cross_link_frequency_list.append((r2, c2, r1, c1))
1866  self.unique_cross_link_list.append(
1867  (r1, c1, r2, c2,mdist))
1868 
1869  if (r1, c1, r2, c2) not in self.cross_link_distances:
1870  self.cross_link_distances[(
1871  r1,
1872  c1,
1873  r2,
1874  c2,
1875  mdist,
1876  confidence)] = dists
1877  self.cross_link_distances[(
1878  r2,
1879  c2,
1880  r1,
1881  c1,
1882  mdist,
1883  confidence)] = dists
1884  self.cross_link_distances_unique[(r1, c1, r2, c2)] = dists
1885  else:
1886  self.cross_link_distances[(
1887  r2,
1888  c2,
1889  r1,
1890  c1,
1891  mdist,
1892  confidence)] += dists
1893  self.cross_link_distances[(
1894  r1,
1895  c1,
1896  r2,
1897  c2,
1898  mdist,
1899  confidence)] += dists
1900 
1901  self.crosslinks.append(
1902  (r1,
1903  c1,
1904  r2,
1905  c2,
1906  mdist,
1907  stdv,
1908  confidence,
1909  unique_identifier,
1910  'original'))
1911  self.crosslinks.append(
1912  (r2,
1913  c2,
1914  r1,
1915  c1,
1916  mdist,
1917  stdv,
1918  confidence,
1919  unique_identifier,
1920  'reversed'))
1921 
1922  self.cross_link_frequency_inverted = {}
1923  for xl in self.unique_cross_link_list:
1924  (r1, c1, r2, c2, mdist) = xl
1925  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
1926  if frequency not in self.cross_link_frequency_inverted:
1927  self.cross_link_frequency_inverted[
1928  frequency] = [(r1, c1, r2, c2)]
1929  else:
1930  self.cross_link_frequency_inverted[
1931  frequency].append((r1, c1, r2, c2))
1932 
1933  # -------------
1934 
1935  def median(self, mylist):
1936  sorts = sorted(mylist)
1937  length = len(sorts)
1938  print(length)
1939  if length == 1:
1940  return mylist[0]
1941  if not length % 2:
1942  return (sorts[length / 2] + sorts[length / 2 - 1]) / 2.0
1943  return sorts[length / 2]
1944 
1945  def set_threshold(self,threshold):
1946  self.threshold=threshold
1947 
1948  def set_tolerance(self,tolerance):
1949  self.tolerance=tolerance
1950 
1951  def colormap(self, dist):
1952  if dist < self.threshold - self.tolerance:
1953  return "Green"
1954  elif dist >= self.threshold + self.tolerance:
1955  return "Orange"
1956  else:
1957  return "Red"
1958 
1959  def write_cross_link_database(self, filename, format='csv'):
1960  import csv
1961 
1962  fieldnames = [
1963  "Unique ID", "Protein1", "Residue1", "Protein2", "Residue2",
1964  "Median Distance", "Standard Deviation", "Confidence", "Frequency", "Arrangement"]
1965 
1966  if not self.external_csv_data is None:
1967  keys = list(self.external_csv_data.keys())
1968  innerkeys = list(self.external_csv_data[keys[0]].keys())
1969  innerkeys.sort()
1970  fieldnames += innerkeys
1971 
1972  dw = csv.DictWriter(
1973  open(filename,
1974  "w"),
1975  delimiter=',',
1976  fieldnames=fieldnames)
1977  dw.writeheader()
1978  for xl in self.crosslinks:
1979  (r1, c1, r2, c2, mdist, stdv, confidence,
1980  unique_identifier, descriptor) = xl
1981  if descriptor == 'original':
1982  outdict = {}
1983  outdict["Unique ID"] = unique_identifier
1984  outdict["Protein1"] = c1
1985  outdict["Protein2"] = c2
1986  outdict["Residue1"] = r1
1987  outdict["Residue2"] = r2
1988  outdict["Median Distance"] = mdist
1989  outdict["Standard Deviation"] = stdv
1990  outdict["Confidence"] = confidence
1991  outdict["Frequency"] = self.cross_link_frequency[
1992  (r1, c1, r2, c2)]
1993  if c1 == c2:
1994  arrangement = "Intra"
1995  else:
1996  arrangement = "Inter"
1997  outdict["Arrangement"] = arrangement
1998  if not self.external_csv_data is None:
1999  outdict.update(self.external_csv_data[unique_identifier])
2000 
2001  dw.writerow(outdict)
2002 
2003  def plot(self, prot_listx=None, prot_listy=None, no_dist_info=False,
2004  no_confidence_info=False, filter=None, layout="whole", crosslinkedonly=False,
2005  filename=None, confidence_classes=None, alphablend=0.1, scale_symbol_size=1.0,
2006  gap_between_components=0,
2007  rename_protein_map=None):
2008  # layout can be:
2009  # "lowerdiagonal" print only the lower diagonal plot
2010  # "upperdiagonal" print only the upper diagonal plot
2011  # "whole" print all
2012  # crosslinkedonly: plot only components that have crosslinks
2013  # no_dist_info: if True will plot only the cross-links as grey spots
2014  # filter = tuple the tuple contains a keyword to be search in the database
2015  # a relationship ">","==","<"
2016  # and a value
2017  # example ("ID_Score",">",40)
2018  # scale_symbol_size rescale the symbol for the crosslink
2019  # rename_protein_map is a dictionary to rename proteins
2020 
2021  import matplotlib as mpl
2022  mpl.use('Agg')
2023  import matplotlib.pyplot as plt
2024  import matplotlib.cm as cm
2025 
2026  fig = plt.figure(figsize=(10, 10))
2027  ax = fig.add_subplot(111)
2028 
2029  ax.set_xticks([])
2030  ax.set_yticks([])
2031 
2032  # set the list of proteins on the x axis
2033  if prot_listx is None:
2034  if crosslinkedonly:
2035  prot_listx = list(self.crosslinkedprots)
2036  else:
2037  prot_listx = list(self.prot_length_dict.keys())
2038  prot_listx.sort()
2039 
2040  nresx = gap_between_components + \
2041  sum([self.prot_length_dict[name]
2042  + gap_between_components for name in prot_listx])
2043 
2044  # set the list of proteins on the y axis
2045 
2046  if prot_listy is None:
2047  if crosslinkedonly:
2048  prot_listy = list(self.crosslinkedprots)
2049  else:
2050  prot_listy = list(self.prot_length_dict.keys())
2051  prot_listy.sort()
2052 
2053  nresy = gap_between_components + \
2054  sum([self.prot_length_dict[name]
2055  + gap_between_components for name in prot_listy])
2056 
2057  # this is the residue offset for each protein
2058  resoffsetx = {}
2059  resendx = {}
2060  res = gap_between_components
2061  for prot in prot_listx:
2062  resoffsetx[prot] = res
2063  res += self.prot_length_dict[prot]
2064  resendx[prot] = res
2065  res += gap_between_components
2066 
2067  resoffsety = {}
2068  resendy = {}
2069  res = gap_between_components
2070  for prot in prot_listy:
2071  resoffsety[prot] = res
2072  res += self.prot_length_dict[prot]
2073  resendy[prot] = res
2074  res += gap_between_components
2075 
2076  resoffsetdiagonal = {}
2077  res = gap_between_components
2078  for prot in IMP.pmi.tools.OrderedSet(prot_listx + prot_listy):
2079  resoffsetdiagonal[prot] = res
2080  res += self.prot_length_dict[prot]
2081  res += gap_between_components
2082 
2083  # plot protein boundaries
2084 
2085  xticks = []
2086  xlabels = []
2087  for n, prot in enumerate(prot_listx):
2088  res = resoffsetx[prot]
2089  end = resendx[prot]
2090  for proty in prot_listy:
2091  resy = resoffsety[proty]
2092  endy = resendy[proty]
2093  ax.plot([res, res], [resy, endy], 'k-', lw=0.4)
2094  ax.plot([end, end], [resy, endy], 'k-', lw=0.4)
2095  xticks.append((float(res) + float(end)) / 2)
2096  if rename_protein_map is not None:
2097  if prot in rename_protein_map:
2098  prot=rename_protein_map[prot]
2099  xlabels.append(prot)
2100 
2101  yticks = []
2102  ylabels = []
2103  for n, prot in enumerate(prot_listy):
2104  res = resoffsety[prot]
2105  end = resendy[prot]
2106  for protx in prot_listx:
2107  resx = resoffsetx[protx]
2108  endx = resendx[protx]
2109  ax.plot([resx, endx], [res, res], 'k-', lw=0.4)
2110  ax.plot([resx, endx], [end, end], 'k-', lw=0.4)
2111  yticks.append((float(res) + float(end)) / 2)
2112  if rename_protein_map is not None:
2113  if prot in rename_protein_map:
2114  prot=rename_protein_map[prot]
2115  ylabels.append(prot)
2116 
2117  # plot the contact map
2118  print(prot_listx, prot_listy)
2119 
2120  if not self.contactmap is None:
2121  import matplotlib.cm as cm
2122  tmp_array = np.zeros((nresx, nresy))
2123 
2124  for px in prot_listx:
2125  print(px)
2126  for py in prot_listy:
2127  print(py)
2128  resx = resoffsety[px]
2129  lengx = resendx[px] - 1
2130  resy = resoffsety[py]
2131  lengy = resendy[py] - 1
2132  indexes_x = self.index_dictionary[px]
2133  minx = min(indexes_x)
2134  maxx = max(indexes_x)
2135  indexes_y = self.index_dictionary[py]
2136  miny = min(indexes_y)
2137  maxy = max(indexes_y)
2138 
2139  print(px, py, minx, maxx, miny, maxy)
2140 
2141  try:
2142  tmp_array[
2143  resx:lengx,
2144  resy:lengy] = self.contactmap[
2145  minx:maxx,
2146  miny:maxy]
2147  except:
2148  continue
2149 
2150 
2151  ax.imshow(tmp_array,
2152  cmap=cm.binary,
2153  origin='lower',
2154  interpolation='nearest')
2155 
2156  ax.set_xticks(xticks)
2157  ax.set_xticklabels(xlabels, rotation=90)
2158  ax.set_yticks(yticks)
2159  ax.set_yticklabels(ylabels)
2160  ax.set_xlim(0,nresx)
2161  ax.set_ylim(0,nresy)
2162 
2163 
2164  # set the crosslinks
2165 
2166  already_added_xls = []
2167 
2168  for xl in self.crosslinks:
2169 
2170  (r1, c1, r2, c2, mdist, stdv, confidence,
2171  unique_identifier, descriptor) = xl
2172 
2173  if confidence_classes is not None:
2174  if confidence not in confidence_classes:
2175  continue
2176 
2177  try:
2178  pos1 = r1 + resoffsetx[c1]
2179  except:
2180  continue
2181  try:
2182  pos2 = r2 + resoffsety[c2]
2183  except:
2184  continue
2185 
2186  if not filter is None:
2187  xldb = self.external_csv_data[unique_identifier]
2188  xldb.update({"Distance": mdist})
2189  xldb.update({"Distance_stdv": stdv})
2190 
2191  if filter[1] == ">":
2192  if float(xldb[filter[0]]) <= float(filter[2]):
2193  continue
2194 
2195  if filter[1] == "<":
2196  if float(xldb[filter[0]]) >= float(filter[2]):
2197  continue
2198 
2199  if filter[1] == "==":
2200  if float(xldb[filter[0]]) != float(filter[2]):
2201  continue
2202 
2203  # all that below is used for plotting the diagonal
2204  # when you have a rectangolar plots
2205 
2206  pos_for_diagonal1 = r1 + resoffsetdiagonal[c1]
2207  pos_for_diagonal2 = r2 + resoffsetdiagonal[c2]
2208 
2209  if layout == 'lowerdiagonal':
2210  if pos_for_diagonal1 <= pos_for_diagonal2:
2211  continue
2212  if layout == 'upperdiagonal':
2213  if pos_for_diagonal1 >= pos_for_diagonal2:
2214  continue
2215 
2216  already_added_xls.append((r1, c1, r2, c2))
2217 
2218  if not no_confidence_info:
2219  if confidence == '0.01':
2220  markersize = 14 * scale_symbol_size
2221  elif confidence == '0.05':
2222  markersize = 9 * scale_symbol_size
2223  elif confidence == '0.1':
2224  markersize = 6 * scale_symbol_size
2225  else:
2226  markersize = 15 * scale_symbol_size
2227  else:
2228  markersize = 5 * scale_symbol_size
2229 
2230  if not no_dist_info:
2231  color = self.colormap(mdist)
2232  else:
2233  color = "gray"
2234 
2235  ax.plot(
2236  [pos1],
2237  [pos2],
2238  'o',
2239  c=color,
2240  alpha=alphablend,
2241  markersize=markersize)
2242 
2243 
2244 
2245  fig.set_size_inches(0.004 * nresx, 0.004 * nresy)
2246 
2247  [i.set_linewidth(2.0) for i in ax.spines.values()]
2248 
2249  #plt.tight_layout()
2250 
2251  if filename:
2252  plt.savefig(filename + ".pdf", dpi=300, transparent="False")
2253  else:
2254  plt.show()
2255 
2256  def get_frequency_statistics(self, prot_list,
2257  prot_list2=None):
2258 
2259  violated_histogram = {}
2260  satisfied_histogram = {}
2261  unique_cross_links = []
2262 
2263  for xl in self.unique_cross_link_list:
2264  (r1, c1, r2, c2, mdist) = xl
2265 
2266  # here we filter by the protein
2267  if prot_list2 is None:
2268  if not c1 in prot_list:
2269  continue
2270  if not c2 in prot_list:
2271  continue
2272  else:
2273  if c1 in prot_list and c2 in prot_list2:
2274  pass
2275  elif c1 in prot_list2 and c2 in prot_list:
2276  pass
2277  else:
2278  continue
2279 
2280  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2281 
2282  if (r1, c1, r2, c2) not in unique_cross_links:
2283  if mdist > 35.0:
2284  if frequency not in violated_histogram:
2285  violated_histogram[frequency] = 1
2286  else:
2287  violated_histogram[frequency] += 1
2288  else:
2289  if frequency not in satisfied_histogram:
2290  satisfied_histogram[frequency] = 1
2291  else:
2292  satisfied_histogram[frequency] += 1
2293  unique_cross_links.append((r1, c1, r2, c2))
2294  unique_cross_links.append((r2, c2, r1, c1))
2295 
2296  print("# satisfied")
2297 
2298  total_number_of_crosslinks=0
2299 
2300  for i in satisfied_histogram:
2301  # if i in violated_histogram:
2302  # print i, satisfied_histogram[i]+violated_histogram[i]
2303  # else:
2304  if i in violated_histogram:
2305  print(i, violated_histogram[i]+satisfied_histogram[i])
2306  else:
2307  print(i, satisfied_histogram[i])
2308  total_number_of_crosslinks+=i*satisfied_histogram[i]
2309 
2310  print("# violated")
2311 
2312  for i in violated_histogram:
2313  print(i, violated_histogram[i])
2314  total_number_of_crosslinks+=i*violated_histogram[i]
2315 
2316  print(total_number_of_crosslinks)
2317 
2318 
2319 # ------------
2320  def print_cross_link_binary_symbols(self, prot_list,
2321  prot_list2=None):
2322  tmp_matrix = []
2323  confidence_list = []
2324  for xl in self.crosslinks:
2325  (r1, c1, r2, c2, mdist, stdv, confidence,
2326  unique_identifier, descriptor) = xl
2327 
2328  if prot_list2 is None:
2329  if not c1 in prot_list:
2330  continue
2331  if not c2 in prot_list:
2332  continue
2333  else:
2334  if c1 in prot_list and c2 in prot_list2:
2335  pass
2336  elif c1 in prot_list2 and c2 in prot_list:
2337  pass
2338  else:
2339  continue
2340 
2341  if descriptor != "original":
2342  continue
2343 
2344  confidence_list.append(confidence)
2345 
2346  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2347  tmp_dist_binary = []
2348  for d in dists:
2349  if d < 35:
2350  tmp_dist_binary.append(1)
2351  else:
2352  tmp_dist_binary.append(0)
2353  tmp_matrix.append(tmp_dist_binary)
2354 
2355  matrix = list(zip(*tmp_matrix))
2356 
2357  satisfied_high_sum = 0
2358  satisfied_mid_sum = 0
2359  satisfied_low_sum = 0
2360  total_satisfied_sum = 0
2361  for k, m in enumerate(matrix):
2362  satisfied_high = 0
2363  total_high = 0
2364  satisfied_mid = 0
2365  total_mid = 0
2366  satisfied_low = 0
2367  total_low = 0
2368  total_satisfied = 0
2369  total = 0
2370  for n, b in enumerate(m):
2371  if confidence_list[n] == "0.01":
2372  total_high += 1
2373  if b == 1:
2374  satisfied_high += 1
2375  satisfied_high_sum += 1
2376  elif confidence_list[n] == "0.05":
2377  total_mid += 1
2378  if b == 1:
2379  satisfied_mid += 1
2380  satisfied_mid_sum += 1
2381  elif confidence_list[n] == "0.1":
2382  total_low += 1
2383  if b == 1:
2384  satisfied_low += 1
2385  satisfied_low_sum += 1
2386  if b == 1:
2387  total_satisfied += 1
2388  total_satisfied_sum += 1
2389  total += 1
2390  print(k, satisfied_high, total_high)
2391  print(k, satisfied_mid, total_mid)
2392  print(k, satisfied_low, total_low)
2393  print(k, total_satisfied, total)
2394  print(float(satisfied_high_sum) / len(matrix))
2395  print(float(satisfied_mid_sum) / len(matrix))
2396  print(float(satisfied_low_sum) / len(matrix))
2397 # ------------
2398 
2399  def get_unique_crosslinks_statistics(self, prot_list,
2400  prot_list2=None):
2401 
2402  print(prot_list)
2403  print(prot_list2)
2404  satisfied_high = 0
2405  total_high = 0
2406  satisfied_mid = 0
2407  total_mid = 0
2408  satisfied_low = 0
2409  total_low = 0
2410  total = 0
2411  tmp_matrix = []
2412  satisfied_string = []
2413  for xl in self.crosslinks:
2414  (r1, c1, r2, c2, mdist, stdv, confidence,
2415  unique_identifier, descriptor) = xl
2416 
2417  if prot_list2 is None:
2418  if not c1 in prot_list:
2419  continue
2420  if not c2 in prot_list:
2421  continue
2422  else:
2423  if c1 in prot_list and c2 in prot_list2:
2424  pass
2425  elif c1 in prot_list2 and c2 in prot_list:
2426  pass
2427  else:
2428  continue
2429 
2430  if descriptor != "original":
2431  continue
2432 
2433  total += 1
2434  if confidence == "0.01":
2435  total_high += 1
2436  if mdist <= 35:
2437  satisfied_high += 1
2438  if confidence == "0.05":
2439  total_mid += 1
2440  if mdist <= 35:
2441  satisfied_mid += 1
2442  if confidence == "0.1":
2443  total_low += 1
2444  if mdist <= 35:
2445  satisfied_low += 1
2446  if mdist <= 35:
2447  satisfied_string.append(1)
2448  else:
2449  satisfied_string.append(0)
2450 
2451  dists = self.cross_link_distances_unique[(r1, c1, r2, c2)]
2452  tmp_dist_binary = []
2453  for d in dists:
2454  if d < 35:
2455  tmp_dist_binary.append(1)
2456  else:
2457  tmp_dist_binary.append(0)
2458  tmp_matrix.append(tmp_dist_binary)
2459 
2460  print("unique satisfied_high/total_high", satisfied_high, "/", total_high)
2461  print("unique satisfied_mid/total_mid", satisfied_mid, "/", total_mid)
2462  print("unique satisfied_low/total_low", satisfied_low, "/", total_low)
2463  print("total", total)
2464 
2465  matrix = list(zip(*tmp_matrix))
2466  satisfied_models = 0
2467  satstr = ""
2468  for b in satisfied_string:
2469  if b == 0:
2470  satstr += "-"
2471  if b == 1:
2472  satstr += "*"
2473 
2474  for m in matrix:
2475  all_satisfied = True
2476  string = ""
2477  for n, b in enumerate(m):
2478  if b == 0:
2479  string += "0"
2480  if b == 1:
2481  string += "1"
2482  if b == 1 and satisfied_string[n] == 1:
2483  pass
2484  elif b == 1 and satisfied_string[n] == 0:
2485  pass
2486  elif b == 0 and satisfied_string[n] == 0:
2487  pass
2488  elif b == 0 and satisfied_string[n] == 1:
2489  all_satisfied = False
2490  if all_satisfied:
2491  satisfied_models += 1
2492  print(string)
2493  print(satstr, all_satisfied)
2494  print("models that satisfies the median satisfied crosslinks/total models", satisfied_models, len(matrix))
2495 
2496  def plot_matrix_cross_link_distances_unique(self, figurename, prot_list,
2497  prot_list2=None):
2498 
2499  import matplotlib as mpl
2500  mpl.use('Agg')
2501  import matplotlib.pylab as pl
2502 
2503  tmp_matrix = []
2504  for kw in self.cross_link_distances_unique:
2505  (r1, c1, r2, c2) = kw
2506  dists = self.cross_link_distances_unique[kw]
2507 
2508  if prot_list2 is None:
2509  if not c1 in prot_list:
2510  continue
2511  if not c2 in prot_list:
2512  continue
2513  else:
2514  if c1 in prot_list and c2 in prot_list2:
2515  pass
2516  elif c1 in prot_list2 and c2 in prot_list:
2517  pass
2518  else:
2519  continue
2520  # append the sum of dists to order by that in the matrix plot
2521  dists.append(sum(dists))
2522  tmp_matrix.append(dists)
2523 
2524  tmp_matrix.sort(key=itemgetter(len(tmp_matrix[0]) - 1))
2525 
2526  # print len(tmp_matrix), len(tmp_matrix[0])-1
2527  matrix = np.zeros((len(tmp_matrix), len(tmp_matrix[0]) - 1))
2528 
2529  for i in range(len(tmp_matrix)):
2530  for k in range(len(tmp_matrix[i]) - 1):
2531  matrix[i][k] = tmp_matrix[i][k]
2532 
2533  print(matrix)
2534 
2535  fig = pl.figure()
2536  ax = fig.add_subplot(211)
2537 
2538  cax = ax.imshow(matrix, interpolation='nearest')
2539  # ax.set_yticks(range(len(self.model_list_names)))
2540  #ax.set_yticklabels( [self.model_list_names[i] for i in leaves_order] )
2541  fig.colorbar(cax)
2542  pl.savefig(figurename, dpi=300)
2543  pl.show()
2544 
2545  def plot_bars(
2546  self,
2547  filename,
2548  prots1,
2549  prots2,
2550  nxl_per_row=20,
2551  arrangement="inter",
2552  confidence_input="None"):
2553 
2554  data = []
2555  for xl in self.cross_link_distances:
2556  (r1, c1, r2, c2, mdist, confidence) = xl
2557  if c1 in prots1 and c2 in prots2:
2558  if arrangement == "inter" and c1 == c2:
2559  continue
2560  if arrangement == "intra" and c1 != c2:
2561  continue
2562  if confidence_input == confidence:
2563  label = str(c1) + ":" + str(r1) + \
2564  "-" + str(c2) + ":" + str(r2)
2565  values = self.cross_link_distances[xl]
2566  frequency = self.cross_link_frequency[(r1, c1, r2, c2)]
2567  data.append((label, values, mdist, frequency))
2568 
2569  sort_by_dist = sorted(data, key=lambda tup: tup[2])
2570  sort_by_dist = list(zip(*sort_by_dist))
2571  values = sort_by_dist[1]
2572  positions = list(range(len(values)))
2573  labels = sort_by_dist[0]
2574  frequencies = list(map(float, sort_by_dist[3]))
2575  frequencies = [f * 10.0 for f in frequencies]
2576 
2577  nchunks = int(float(len(values)) / nxl_per_row)
2578  values_chunks = IMP.pmi.tools.chunk_list_into_segments(values, nchunks)
2579  positions_chunks = IMP.pmi.tools.chunk_list_into_segments(
2580  positions,
2581  nchunks)
2582  frequencies_chunks = IMP.pmi.tools.chunk_list_into_segments(
2583  frequencies,
2584  nchunks)
2585  labels_chunks = IMP.pmi.tools.chunk_list_into_segments(labels, nchunks)
2586 
2587  for n, v in enumerate(values_chunks):
2588  p = positions_chunks[n]
2589  f = frequencies_chunks[n]
2590  l = labels_chunks[n]
2592  filename + "." + str(n), v, p, f,
2593  valuename="Distance (Ang)", positionname="Unique " + arrangement + " Crosslinks", xlabels=l)
2594 
2595  def crosslink_distance_histogram(self, filename,
2596  prot_list=None,
2597  prot_list2=None,
2598  confidence_classes=None,
2599  bins=40,
2600  color='#66CCCC',
2601  yplotrange=[0, 1],
2602  format="png",
2603  normalized=False):
2604  if prot_list is None:
2605  prot_list = list(self.prot_length_dict.keys())
2606 
2607  distances = []
2608  for xl in self.crosslinks:
2609  (r1, c1, r2, c2, mdist, stdv, confidence,
2610  unique_identifier, descriptor) = xl
2611 
2612  if not confidence_classes is None:
2613  if confidence not in confidence_classes:
2614  continue
2615 
2616  if prot_list2 is None:
2617  if not c1 in prot_list:
2618  continue
2619  if not c2 in prot_list:
2620  continue
2621  else:
2622  if c1 in prot_list and c2 in prot_list2:
2623  pass
2624  elif c1 in prot_list2 and c2 in prot_list:
2625  pass
2626  else:
2627  continue
2628 
2629  distances.append(mdist)
2630 
2632  filename, distances, valuename="C-alpha C-alpha distance [Ang]",
2633  bins=bins, color=color,
2634  format=format,
2635  reference_xline=35.0,
2636  yplotrange=yplotrange, normalized=normalized)
2637 
2638  def scatter_plot_xl_features(self, filename,
2639  feature1=None,
2640  feature2=None,
2641  prot_list=None,
2642  prot_list2=None,
2643  yplotrange=None,
2644  reference_ylines=None,
2645  distance_color=True,
2646  format="png"):
2647  import matplotlib as mpl
2648  mpl.use('Agg')
2649  import matplotlib.pyplot as plt
2650  import matplotlib.cm as cm
2651 
2652  fig = plt.figure(figsize=(10, 10))
2653  ax = fig.add_subplot(111)
2654 
2655  for xl in self.crosslinks:
2656  (r1, c1, r2, c2, mdist, stdv, confidence,
2657  unique_identifier, arrangement) = xl
2658 
2659  if prot_list2 is None:
2660  if not c1 in prot_list:
2661  continue
2662  if not c2 in prot_list:
2663  continue
2664  else:
2665  if c1 in prot_list and c2 in prot_list2:
2666  pass
2667  elif c1 in prot_list2 and c2 in prot_list:
2668  pass
2669  else:
2670  continue
2671 
2672  xldb = self.external_csv_data[unique_identifier]
2673  xldb.update({"Distance": mdist})
2674  xldb.update({"Distance_stdv": stdv})
2675 
2676  xvalue = float(xldb[feature1])
2677  yvalue = float(xldb[feature2])
2678 
2679  if distance_color:
2680  color = self.colormap(mdist)
2681  else:
2682  color = "gray"
2683 
2684  ax.plot([xvalue], [yvalue], 'o', c=color, alpha=0.1, markersize=7)
2685 
2686  if not yplotrange is None:
2687  ax.set_ylim(yplotrange)
2688  if not reference_ylines is None:
2689  for rl in reference_ylines:
2690  ax.axhline(rl, color='red', linestyle='dashed', linewidth=1)
2691 
2692  if filename:
2693  plt.savefig(filename + "." + format, dpi=150, transparent="False")
2694 
2695  plt.show()
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
def get_output
Returns output for IMP.pmi.output.Output object.
Simple 3D transformation class.
Visualization of crosslinks.
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
double get_drms(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
A class for reading stat files (either rmf or ascii v1 and v2)
Definition: output.py:785
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1593
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1658
double get_drmsd(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1)
Calculate distance the root mean square deviation between two sets of 3D points.
Definition: atom/distance.h:49
def get_molecules
This function returns the parent molecule hierarchies of given objects.
Definition: tools.py:1911
Miscellaneous utilities.
Definition: tools.py:1
def __init__
Constructor.
double get_weighted_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2, const Floats &weights)
Compute the RMSD (without alignment) taking into account the copy ambiguity.
void link_restraints(RMF::FileConstHandle fh, const Restraints &hs)
A class to evaluate the precision of an ensemble.
log
Definition: log.py:1
A class to cluster structures.
def get_rmsd_wrt_reference_structure_with_alignment
First align then calculate RMSD.
static Molecule setup_particle(Model *m, ParticleIndex pi)
Definition: Molecule.h:37
def __init__
Constructor.
Definition: pmi/Analysis.py:32
Class for sampling a density map from particles.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
def add_structure
Read a structure into the ensemble and store (as coordinates).
DensityMap * multiply(const DensityMap *m1, const DensityMap *m2)
Return a density map for which voxel i contains the result of m1[i]*m2[i].
Performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:23
def add_subunits_density
Add a frame to the densities.
DensityMap * create_density_map(const IMP::algebra::GridD< 3, S, V, E > &arg)
Create a density map from an arbitrary IMP::algebra::GridD.
Definition: DensityMap.h:633
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:1111
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Basic utilities for handling cryo-electron microscopy 3D density maps.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
def fill
Add coordinates for a single model.
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Definition: utilities.h:91
def do_cluster
Run K-means clustering.
3D rotation class.
Definition: Rotation3D.h:46
algebra::BoundingBoxD< 3 > get_bounding_box(const DensityMap *m)
Definition: DensityMap.h:464
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
Definition: output.py:1
def set_reference_structure
Read in a structure used for reference computation.
def get_rmsf
Calculate the residue mean square fluctuations (RMSF).
def get_average_distance_wrt_reference_structure
Compare the structure set to the reference structure.
def get_density
Get the current density for some component name.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Definition: exception.h:49
def get_precision
Evaluate the precision of two named structure groups.
VectorD< 3 > Vector3D
Definition: VectorD.h:395
Restraints create_restraints(RMF::FileConstHandle fh, Model *m)
def get_moldict_coorddict
return data structure for the RMSD calculation
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
def get_particles_at_resolution_ten
Get particles at res 10, or any beads, based on the name.
Python classes to represent, score, sample and analyze models.
def add_structures
Read a list of RMFs, supports parallel.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Hierarchies get_leaves(const Selection &h)
double get_drmsd_Q(const Vector3DsOrXYZs0 &m0, const Vector3DsOrXYZs1 &m1, double threshold)
Definition: atom/distance.h:85
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:1061
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27