IMP logo
IMP Reference Guide  2.18.0
The Integrative Modeling Platform
exhaust.py
1 from __future__ import print_function
2 from IMP import ArgumentParser
3 import os
4 
5 __doc__ = "Perform analysis to determine sampling convergence."
6 
7 ############################################################
8 # Scripts written by Shruthi Viswanath and Ilan E. Chemmama#
9 # in Andrej Sali Lab at UCSF. #
10 # Based on Viswanath, Chemmama et al. Biophys. J. (2017) #
11 # #
12 ############################################################
13 
14 
15 def parse_args():
16  parser = ArgumentParser(
17  description="First stages of analysis for assessing sampling "
18  "convergence")
19  parser.add_argument(
20  '--sysname', '-n', dest="sysname",
21  help='name of the system', default="")
22  parser.add_argument(
23  '--path', '-p', dest="path",
24  help='path to the good-scoring models', default="./")
25  parser.add_argument(
26  '--extension', '-e', dest="extension",
27  help='extension of the file', choices=['rmf', 'pdb'], default="rmf")
28  parser.add_argument(
29  '--mode', '-m', dest="mode", help='pyRMSD calculator',
30  choices=['cuda', 'cpu_omp', 'cpu_serial'], default="cuda")
31  parser.add_argument(
32  '--matrix-cores', '-c', dest="cores", type=int,
33  help='number of cores for parallel RMSD matrix calculations; '
34  'only for cpu_omp', default=1)
35  parser.add_argument(
36  '--cluster-cores', '-cc', dest="cores2", type=int,
37  help='number of cores for clustering at different thresholds'
38  ' and parallel IO; only for cpu_omp', default=1)
39  parser.add_argument(
40  '--resolution', '-r', dest="resolution", type=int,
41  help='resolution at which to select proteins in a multiscale system',
42  default=1)
43  parser.add_argument(
44  '--subunit', '-su', dest="subunit",
45  help='calculate RMSD/sampling and cluster precision/densities '
46  'etc over this subunit only', default=None)
47  parser.add_argument(
48  '--align', '-a', dest="align",
49  help='boolean flag to allow superposition of models',
50  default=False, action='store_true')
51  parser.add_argument(
52  '--ambiguity', '-amb', dest="symmetry_groups",
53  help='file containing symmetry groups', default=None)
54  parser.add_argument(
55  '--scoreA', '-sa', dest="scoreA",
56  help='name of the file having the good-scoring scores for sample A',
57  default="scoresA.txt")
58  parser.add_argument(
59  '--scoreB', '-sb', dest="scoreB",
60  help='name of the file having the good-scoring scores for sample B',
61  default="scoresB.txt")
62  parser.add_argument(
63  '--rmfA', '-ra', dest="rmf_A",
64  help='RMF file with conformations from Sample A', default=None)
65  parser.add_argument(
66  '--rmfB', '-rb', dest="rmf_B",
67  help='RMF file with conformations from Sample B', default=None)
68  parser.add_argument(
69  '--gridsize', '-g', dest="gridsize", type=float,
70  help='grid size for calculating sampling precision', default=10.0)
71  parser.add_argument(
72  '--skip', '-s', dest="skip_sampling_precision",
73  help="This option will bypass the calculation of sampling "
74  "precision. This option needs to be used with the clustering "
75  "threshold option. Otherwise by default, sampling precision "
76  "is calculated and the clustering threshold is the "
77  "calculated sampling precision.", default=False,
78  action='store_true')
79  parser.add_argument(
80  '--cluster_threshold', '-ct', dest="cluster_threshold", type=float,
81  help='final clustering threshold to visualize clusters. Assumes '
82  'that the user has previously calculated sampling precision '
83  'and wants clusters defined at a threshold higher than the '
84  'sampling precision for ease of analysis (lesser number of '
85  'clusters).', default=30.0)
86  parser.add_argument(
87  '--voxel', '-v', dest="voxel", type=float,
88  help='voxel size for the localization densities', default=5.0)
89  parser.add_argument(
90  '--density_threshold', '-dt', type=float,
91  dest="density_threshold",
92  help='threshold for localization densities', default=20.0)
93  parser.add_argument(
94  '--density', '-d', dest="density",
95  help='file containing dictionary of density custom ranges',
96  default=None)
97  parser.add_argument(
98  '--gnuplot', '-gp', dest="gnuplot",
99  help="plotting automatically with gnuplot", default=False,
100  action='store_true')
101  parser.add_argument(
102  '--selection', '-sn', dest="selection",
103  help='file containing dictionary'
104  'of selected subunits and residues'
105  'for RMSD and clustering calculation'
106  "each entry in the dictionary takes the form"
107  "'selection name': [(residue_start, residue_end, protein name)",
108  default=None)
109  parser.add_argument(
110  '--prism', '-pr', dest="prism",
111  help="Save input files for PrISM", default=False,
112  action='store_true')
113  return parser.parse_args()
114 
115 
116 def make_cluster_centroid(infname, frame, outfname, cluster_index,
117  cluster_size, precision, density, path):
118 
119  import RMF
120  # If we have new enough IMP/RMF, do our own RMF slicing with provenance
121  if hasattr(RMF.NodeHandle, 'replace_child'):
122  print(infname, outfname)
123  inr = RMF.open_rmf_file_read_only(infname)
124  outr = RMF.create_rmf_file(outfname)
125  cpf = RMF.ClusterProvenanceFactory(outr)
126  RMF.clone_file_info(inr, outr)
127  RMF.clone_hierarchy(inr, outr)
128  RMF.clone_static_frame(inr, outr)
129  inr.set_current_frame(RMF.FrameID(frame))
130  outr.add_frame("f0")
131  RMF.clone_loaded_frame(inr, outr)
132  rn = outr.get_root_node()
133  children = rn.get_children()
134  if len(children) == 0:
135  return
136  rn = children[0] # Should be the top-level IMP node
137  prov = [c for c in rn.get_children() if c.get_type() == RMF.PROVENANCE]
138  if not prov:
139  return
140  prov = prov[0]
141  # Add cluster-provenance info
142  newp = rn.replace_child(
143  prov, "cluster.%d" % cluster_index, RMF.PROVENANCE)
144  cp = cpf.get(newp)
145  cp.set_members(cluster_size)
146  cp.set_precision(precision)
147  cp.set_density(os.path.abspath(density))
148  else:
149  # Otherwise, fall back to RMF's command line tool
150  import subprocess
151  print(infname, frame, outfname)
152  subprocess.call(['rmf_slice', path + infname, '-f', str(frame),
153  outfname])
154 
155 
156 def main():
157  args = parse_args()
158 
159  import os
160  import shutil
161  import numpy
162 
163  import scipy as sp
164 
165  import IMP.sampcon
166  from IMP.sampcon import scores_convergence, clustering_rmsd
167  from IMP.sampcon import rmsd_calculation, precision_rmsd
168 
169  import IMP
170 
171  idfile_A = "Identities_A.txt"
172  idfile_B = "Identities_B.txt"
173 
174  # Step 0: Compute Score convergence
175  score_A = []
176  score_B = []
177 
178  with open(os.path.join(args.path, args.scoreA), 'r') as f:
179  for line in f:
180  score_A.append(float(line.strip("\n")))
181 
182  with open(os.path.join(args.path, args.scoreB), 'r') as f:
183  for line in f:
184  score_B.append(float(line.strip("\n")))
185 
186  scores = score_A + score_B
187 
188  # Get the convergence of the best score
189  scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)
190 
191  # Check if the two score distributions are similar
192  scores_convergence.get_scores_distributions_KS_Stats(
193  score_A, score_B, 100, args.sysname)
194 
195  # Step 1: Compute RMSD matrix
196  if args.extension == "pdb":
197  ps_names = [] # bead names are not stored in PDB files
198  symm_groups = None
199  conforms, masses, radii, models_name = \
200  rmsd_calculation.get_pdbs_coordinates(
201  args.path, idfile_A, idfile_B)
202  else:
203  args.extension = "rmf3"
204  if args.selection is not None:
205  rmsd_custom_ranges = \
206  precision_rmsd.parse_custom_ranges(args.selection)
207  else:
208  rmsd_custom_ranges = None
209  # If we have a single RMF file, read conformations from that
210  if args.rmf_A is not None:
211  (ps_names, masses, radii, conforms, symm_groups, models_name,
212  n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
213  args.path, args.rmf_A, args.rmf_B,
214  args.subunit,
215  args.symmetry_groups,
216  rmsd_custom_ranges,
217  args.resolution,
218  args.cores2)
219 
220  # If not, default to the Identities.txt file
221  else:
222  symm_groups = None
223  (ps_names, masses, radii, conforms,
224  models_name) = rmsd_calculation.get_rmfs_coordinates(
225  args.path, idfile_A, idfile_B, args.subunit,
226  selection=rmsd_custom_ranges,
227  resolution=args.resolution)
228 
229  print("Size of conformation matrix", conforms.shape)
230 
231  if not args.skip_sampling_precision:
232  # get_rmsds_matrix modifies conforms, so save it to a file and restore
233  # afterwards (so that we retain the original IMP orientation)
234  numpy.save("conforms", conforms)
235  inner_data = rmsd_calculation.get_rmsds_matrix(
236  conforms, args.mode, args.align, args.cores, symm_groups)
237  print("Size of RMSD matrix (flattened):", inner_data.shape)
238  del conforms
239  conforms = numpy.load("conforms.npy")
240  os.unlink('conforms.npy')
241 
242  from pyRMSD.matrixHandler import MatrixHandler
243  mHandler = MatrixHandler()
244  mHandler.loadMatrix("Distances_Matrix.data")
245 
246  rmsd_matrix = mHandler.getMatrix()
247  distmat = rmsd_matrix.get_data()
248 
249  distmat_full = sp.spatial.distance.squareform(distmat)
250  print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
251 
252  # Get model lists
253  if args.rmf_A is not None:
254  sampleA_all_models = list(range(n_models[0]))
255  sampleB_all_models = list(range(n_models[0],
256  n_models[1] + n_models[0]))
257  total_num_models = n_models[1] + n_models[0]
258  else:
259  (sampleA_all_models,
260  sampleB_all_models) = clustering_rmsd.get_sample_identity(
261  idfile_A, idfile_B)
262  total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
263  all_models = list(sampleA_all_models) + list(sampleB_all_models)
264  print("Size of Sample A:", len(sampleA_all_models),
265  " ; Size of Sample B: ", len(sampleB_all_models),
266  "; Total", total_num_models)
267 
268  if not args.skip_sampling_precision:
269 
270  print("Calculating sampling precision")
271 
272  # Step 2: Cluster at intervals of grid size to get the
273  # sampling precision
274  gridSize = args.gridsize
275 
276  # Get cutoffs for clustering
277  cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
278  print("Clustering at thresholds:", cutoffs_list)
279 
280  # Do clustering at each cutoff
281  pvals, cvs, percents = clustering_rmsd.get_clusters(
282  cutoffs_list, distmat_full, all_models, total_num_models,
283  sampleA_all_models, sampleB_all_models, args.sysname,
284  args.cores2)
285 
286  # Now apply the rule for selecting the right precision based
287  # on population of contingency table, pvalue and cramersv
288  (sampling_precision, pval_converged, cramersv_converged,
289  percent_converged) = clustering_rmsd.get_sampling_precision(
290  cutoffs_list, pvals, cvs, percents)
291 
292  # Output test statistics
293  with open("%s.Sampling_Precision_Stats.txt"
294  % args.sysname, 'w+') as fpv:
295  print("The sampling precision is defined as the largest allowed "
296  "RMSD between the cluster centroid and a ", args.sysname,
297  "model within any cluster in the finest clustering for "
298  "which each sample contributes models proportionally to "
299  "its size (considering both significance and magnitude of "
300  "the difference) and for which a sufficient proportion of "
301  "all models occur in sufficiently large clusters. The "
302  "sampling precision for our ", args.sysname,
303  " modeling is %.3f" % (sampling_precision), " A.", file=fpv)
304 
305  print("Sampling precision, P-value, Cramer's V and percentage "
306  "of clustered models below:", file=fpv)
307  print("%.3f\t%.3f\t%.3f\t%.3f"
308  % (sampling_precision, pval_converged, cramersv_converged,
309  percent_converged), file=fpv)
310  print("", file=fpv)
311 
312  final_clustering_threshold = sampling_precision
313 
314  else:
315  final_clustering_threshold = args.cluster_threshold
316 
317  # Perform final clustering at the required precision
318  print("Clustering at threshold %.3f" % final_clustering_threshold)
319  (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
320  distmat_full, total_num_models, final_clustering_threshold)
321 
322  (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
323  len(cluster_centers), cluster_members, all_models,
324  sampleA_all_models, sampleB_all_models)
325  print("Contingency table:", ctable)
326  # Output the number of models in each cluster and each sample
327  with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
328  for rows in range(len(ctable)):
329  print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
330 
331  # Obtain the subunits for which we need to calculate densities
332  density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
333 
334  # Output cluster precisions
335  fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')
336 
337  # For each cluster, output the models in the cluster
338  # Also output the densities for the cluster models
339  for i in range(len(retained_clusters)):
340  clus = retained_clusters[i]
341 
342  # The cluster centroid is the first conformation.
343  # We use this as to align and compute RMSD/precision
344  conform_0 = conforms[all_models[cluster_members[clus][0]]]
345 
346  # create a directory for the cluster
347  if not os.path.exists("./cluster.%s" % i):
348  os.mkdir("./cluster.%s" % i)
349  os.mkdir("./cluster.%s/Sample_A/" % i)
350  os.mkdir("./cluster.%s/Sample_B/" % i)
351  else:
352  shutil.rmtree("./cluster.%s" % i)
353  os.mkdir("./cluster.%s" % i)
354  os.mkdir("./cluster.%s/Sample_A/" % i)
355  os.mkdir("./cluster.%s/Sample_B/" % i)
356  # File for saving input to PrISM
357  if args.prism:
358  prism_file = 'cluster.'+str(i)+'.prism.npz'
359  superposed_coords_cluster = []
360  # Create densities for all subunits for both sample A and sample B
361  # as well as separately.
362  gmd1 = precision_rmsd.GetModelDensity(
363  custom_ranges=density_custom_ranges,
364  resolution=args.density_threshold, voxel=args.voxel,
365  bead_names=ps_names)
366  gmd2 = precision_rmsd.GetModelDensity(
367  custom_ranges=density_custom_ranges,
368  resolution=args.density_threshold, voxel=args.voxel,
369  bead_names=ps_names)
370  gmdt = precision_rmsd.GetModelDensity(
371  custom_ranges=density_custom_ranges,
372  resolution=args.density_threshold, voxel=args.voxel,
373  bead_names=ps_names)
374 
375  # Also output the identities of cluster members
376  both_file = open('cluster.'+str(i)+'.all.txt', 'w')
377  sampleA_file = open('cluster.'+str(i)+'.sample_A.txt', 'w')
378  sampleB_file = open('cluster.'+str(i)+'.sample_B.txt', 'w')
379 
380  # Create a model with just the cluster_member particles
381  model = IMP.Model()
382  ps = [] # particle list to be updated by each RMF frame
383  for pi in range(len(conform_0)):
384  p = IMP.Particle(model, "%s" % str(pi))
385  IMP.core.XYZ.setup_particle(p, (0, 0, 0))
386  IMP.core.XYZR.setup_particle(p, float(radii[pi]))
387  IMP.atom.Mass.setup_particle(p, float(masses[pi]))
388  ps.append(p)
389 
390  # Obtain cluster precision by obtaining average RMSD of each model
391  # to the cluster center
392  cluster_precision = 0.0
393 
394  # transformation from internal pyRMSD orientation
395  trans = None
396  # for each model in the cluster
397  for mem in cluster_members[clus]:
398 
399  model_index = all_models[mem]
400 
401  # get superposition of each model to cluster center and the
402  # RMSD between the two
403  rmsd, superposed_ps, trans = \
404  precision_rmsd.get_particles_from_superposed(
405  conforms[model_index], conform_0, args.align,
406  ps, trans, symm_groups)
407 
408  model.update() # why not?
409 
410  cluster_precision += rmsd
411 
412  # Add the superposed particles to the respective density maps
413  gmdt.add_subunits_density(superposed_ps) # total density map
414  print(model_index, file=both_file)
415 
416  if model_index in sampleA_all_models:
417  # density map for sample A
418  gmd1.add_subunits_density(superposed_ps)
419  print(model_index, file=sampleA_file)
420  else:
421  # density map for sample B
422  gmd2.add_subunits_density(superposed_ps)
423  print(model_index, file=sampleB_file)
424  if args.prism:
425  superposed_coords = \
426  [IMP.core.XYZ(s_ps).get_coordinates()
427  for s_ps in superposed_ps]
428  superposed_coords_cluster.append(
429  numpy.array(superposed_coords))
430  if args.prism:
431  mass = \
432  [IMP.atom.Mass(m_p).get_mass() for m_p in superposed_ps]
433  radii = \
434  [IMP.core.XYZR(r_p).get_radius() for r_p in superposed_ps]
435  numpy.savez(
436  prism_file,
437  numpy.array(superposed_coords_cluster),
438  numpy.array(mass),
439  numpy.array(radii),
440  numpy.array(ps_names))
441  cluster_precision /= float(len(cluster_members[clus]) - 1.0)
442  print("Cluster precision (average distance to cluster centroid) "
443  "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
444  file=fpc)
445 
446  both_file.close()
447  sampleA_file.close()
448  sampleB_file.close()
449 
450  # Output density files for the cluster
451  density = gmdt.write_mrc(path="./cluster.%s" % i, file_prefix="LPD")
452  gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i, file_prefix="LPD")
453  gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i, file_prefix="LPD")
454 
455  # Add the cluster center model RMF to the cluster directory
456  cluster_center_index = cluster_members[clus][0]
457  if args.rmf_A is not None:
458  cluster_center_model_id = cluster_center_index
459  if cluster_center_index < n_models[0]:
460  make_cluster_centroid(
461  os.path.join(args.path, args.rmf_A),
462  cluster_center_index,
463  os.path.join("cluster.%d" % i,
464  "cluster_center_model.rmf3"),
465  i, len(cluster_members[clus]),
466  cluster_precision, density, args.path)
467  else:
468  make_cluster_centroid(
469  os.path.join(args.path, args.rmf_B),
470  cluster_center_index - n_models[0],
471  os.path.join("cluster.%d" % i,
472  "cluster_center_model.rmf3"),
473  i, len(cluster_members[clus]),
474  cluster_precision, density, args.path)
475  else:
476  # index to Identities file.
477  cluster_center_model_id = all_models[cluster_center_index]
478  outfname = os.path.join("cluster.%d" % i,
479  "cluster_center_model." + args.extension)
480  if 'rmf' in args.extension:
481  make_cluster_centroid(
482  models_name[cluster_center_model_id], 0, outfname,
483  i, len(cluster_members[clus]),
484  cluster_precision, density, args.path)
485  else:
486  shutil.copy(models_name[cluster_center_model_id], outfname)
487 
488  fpc.close()
489 
490  # generate plots for the score and structure tests
491  if args.gnuplot:
492  import subprocess
493  import glob
494 
495  gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
496  for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
497  cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
498  print(" ".join(cmd))
499  subprocess.check_call(cmd)
500 
501 
502 if __name__ == '__main__':
503  main()
Add mass to a particle.
Definition: Mass.h:23
def get_data_path
Return the full path to one of this module's data files.
static XYZR setup_particle(Model *m, ParticleIndex pi)
Definition: XYZR.h:48
double get_mass(ResidueType c)
Get the mass from the residue type.
static XYZ setup_particle(Model *m, ParticleIndex pi)
Definition: XYZ.h:51
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:73
static Mass setup_particle(Model *m, ParticleIndex pi, Float mass)
Definition: Mass.h:48
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
Class to handle individual particles of a Model object.
Definition: Particle.h:41
Sampling exhaustiveness protocol.
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27