IMP logo
IMP Reference Guide  develop.63b38c487d,2024/12/22
The Integrative Modeling Platform
exhaust.py
1 from __future__ import print_function
2 from IMP import ArgumentParser
3 import os
4 import json
5 
6 __doc__ = "Perform analysis to determine sampling convergence."
7 
8 ############################################################
9 # Scripts written by Shruthi Viswanath and Ilan E. Chemmama#
10 # in Andrej Sali Lab at UCSF. #
11 # Based on Viswanath, Chemmama et al. Biophys. J. (2017) #
12 # #
13 ############################################################
14 
15 
16 def parse_args():
17  parser = ArgumentParser(
18  description="First stages of analysis for assessing sampling "
19  "convergence")
20  parser.add_argument(
21  '--sysname', '-n', dest="sysname",
22  help='name of the system', default="")
23  parser.add_argument(
24  '--path', '-p', dest="path",
25  help='path to the good-scoring models', default="./")
26  parser.add_argument(
27  '--extension', '-e', dest="extension",
28  help='extension of the file', choices=['rmf', 'pdb'], default="rmf")
29  parser.add_argument(
30  '--mode', '-m', dest="mode", help='pyRMSD calculator',
31  choices=['cuda', 'cpu_omp', 'cpu_serial'], default="cuda")
32  parser.add_argument(
33  '--matrix-cores', '-c', dest="cores", type=int,
34  help='number of cores for parallel RMSD matrix calculations; '
35  'only for cpu_omp', default=1)
36  parser.add_argument(
37  '--cluster-cores', '-cc', dest="cores2", type=int,
38  help='number of cores for clustering at different thresholds'
39  ' and parallel IO; only for cpu_omp', default=1)
40  parser.add_argument(
41  '--resolution', '-r', dest="resolution", type=int,
42  help='resolution at which to select proteins in a multiscale system',
43  default=1)
44  parser.add_argument(
45  '--subunit', '-su', dest="subunit",
46  help='calculate RMSD/sampling and cluster precision/densities '
47  'etc over this subunit only', default=None)
48  parser.add_argument(
49  '--align', '-a', dest="align",
50  help='boolean flag to allow superposition of models',
51  default=False, action='store_true')
52  parser.add_argument(
53  '--ambiguity', '-amb', dest="symmetry_groups",
54  help='file containing symmetry groups', default=None)
55  parser.add_argument(
56  '--scoreA', '-sa', dest="scoreA",
57  help='name of the file having the good-scoring scores for sample A',
58  default="scoresA.txt")
59  parser.add_argument(
60  '--scoreB', '-sb', dest="scoreB",
61  help='name of the file having the good-scoring scores for sample B',
62  default="scoresB.txt")
63  parser.add_argument(
64  '--rmfA', '-ra', dest="rmf_A",
65  help='RMF file with conformations from Sample A', default=None)
66  parser.add_argument(
67  '--rmfB', '-rb', dest="rmf_B",
68  help='RMF file with conformations from Sample B', default=None)
69  parser.add_argument(
70  '--gridsize', '-g', dest="gridsize", type=float,
71  help='grid size for calculating sampling precision', default=10.0)
72  parser.add_argument(
73  '--skip', '-s', dest="skip_sampling_precision",
74  help="This option will bypass the calculation of sampling "
75  "precision. This option needs to be used with the clustering "
76  "threshold option. Otherwise by default, sampling precision "
77  "is calculated and the clustering threshold is the "
78  "calculated sampling precision.", default=False,
79  action='store_true')
80  parser.add_argument(
81  '--cluster_threshold', '-ct', dest="cluster_threshold", type=float,
82  help='final clustering threshold to visualize clusters. Assumes '
83  'that the user has previously calculated sampling precision '
84  'and wants clusters defined at a threshold higher than the '
85  'sampling precision for ease of analysis (lesser number of '
86  'clusters).', default=30.0)
87  parser.add_argument(
88  '--voxel', '-v', dest="voxel", type=float,
89  help='voxel size for the localization densities', default=5.0)
90  parser.add_argument(
91  '--density_threshold', '-dt', type=float,
92  dest="density_threshold",
93  help='threshold for localization densities', default=20.0)
94  parser.add_argument(
95  '--density', '-d', dest="density",
96  help='file containing dictionary of density custom ranges',
97  default=None)
98  parser.add_argument(
99  '--gnuplot', '-gp', dest="gnuplot",
100  help="plotting automatically with gnuplot", default=False,
101  action='store_true')
102  parser.add_argument(
103  '--selection', '-sn', dest="selection",
104  help='file containing dictionary '
105  'of selected subunits and residues '
106  'for RMSD and clustering calculation; '
107  "each entry in the dictionary takes the form "
108  "'selection name': [(residue_start, residue_end, protein name)",
109  default=None)
110  parser.add_argument(
111  '--prism', '-pr', dest="prism",
112  help="Save input files for PrISM", default=False,
113  action='store_true')
114  return parser.parse_args()
115 
116 
117 def make_cluster_centroid(infname, frame, outfname, cluster_index,
118  cluster_size, precision, metadata_fname, path):
119 
120  import RMF
121  # If we have new enough IMP/RMF, do our own RMF slicing with provenance
122  if hasattr(RMF.NodeHandle, 'replace_child'):
123  print(infname, outfname)
124  inr = RMF.open_rmf_file_read_only(infname)
125  outr = RMF.create_rmf_file(outfname)
126  cpf = RMF.ClusterProvenanceFactory(outr)
127  RMF.clone_file_info(inr, outr)
128  RMF.clone_hierarchy(inr, outr)
129  RMF.clone_static_frame(inr, outr)
130  inr.set_current_frame(RMF.FrameID(frame))
131  outr.add_frame("f0")
132  RMF.clone_loaded_frame(inr, outr)
133  rn = outr.get_root_node()
134  children = rn.get_children()
135  if len(children) == 0:
136  return
137  rn = children[0] # Should be the top-level IMP node
138  prov = [c for c in rn.get_children() if c.get_type() == RMF.PROVENANCE]
139  if not prov:
140  return
141  prov = prov[0]
142  # Add cluster-provenance info
143  newp = rn.replace_child(
144  prov, "cluster.%d" % cluster_index, RMF.PROVENANCE)
145  cp = cpf.get(newp)
146  cp.set_members(cluster_size)
147  cp.set_precision(precision)
148  cp.set_density(os.path.abspath(metadata_fname))
149  else:
150  # Otherwise, fall back to RMF's command line tool
151  import subprocess
152  print(infname, frame, outfname)
153  subprocess.call(['rmf_slice', path + infname, '-f', str(frame),
154  outfname])
155 
156 
157 def _read_scores(fname, sample_name):
158  scores = []
159  with open(fname, 'r') as f:
160  for line in f:
161  scores.append(float(line.strip("\n")))
162  if len(scores) == 0:
163  raise ValueError("%s (file %s) is empty" % (sample_name, fname))
164  return scores
165 
166 
167 def main():
168  args = parse_args()
169 
170  import os
171  import shutil
172  import numpy
173 
174  import scipy as sp
175 
176  import IMP.sampcon
177  from IMP.sampcon import scores_convergence, clustering_rmsd
178  from IMP.sampcon import rmsd_calculation, precision_rmsd
179 
180  import IMP
181 
182  # Write output metadata in JSON format
183  metadata = {}
184  metadata_fname = "%s.output.json" % args.sysname
185  # Increment file_version whenever incompatible changes are made to
186  # the JSON output
187  metadata['producer'] = {'name': 'IMP.sampcon',
188  'version': IMP.sampcon.__version__,
189  'file_version': 1}
190 
191  idfile_A = "Identities_A.txt"
192  idfile_B = "Identities_B.txt"
193 
194  # Step 0: Compute Score convergence
195  score_A = _read_scores(os.path.join(args.path, args.scoreA), "Sample A")
196  score_B = _read_scores(os.path.join(args.path, args.scoreB), "Sample B")
197  scores = score_A + score_B
198 
199  # Get the convergence of the best score
200  scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)
201 
202  # Check if the two score distributions are similar
203  scores_convergence.get_scores_distributions_KS_Stats(
204  score_A, score_B, 100, args.sysname)
205 
206  # Step 1: Compute RMSD matrix
207  if args.extension == "pdb":
208  ps_names = [] # bead names are not stored in PDB files
209  symm_groups = None
210  conforms, masses, radii, models_name = \
211  rmsd_calculation.get_pdbs_coordinates(
212  args.path, idfile_A, idfile_B)
213  metadata['input_frames'] = models_name
214  else:
215  args.extension = "rmf3"
216  if args.selection is not None:
217  rmsd_custom_ranges = \
218  precision_rmsd.parse_custom_ranges(args.selection)
219  else:
220  rmsd_custom_ranges = None
221  # If we have a single RMF file, read conformations from that
222  if args.rmf_A is not None:
223  metadata['input_files'] = {'A': args.rmf_A, 'B': args.rmf_B}
224  (ps_names, masses, radii, conforms, symm_groups, models_name,
225  n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
226  args.path, args.rmf_A, args.rmf_B,
227  args.subunit,
228  args.symmetry_groups,
229  rmsd_custom_ranges,
230  args.resolution,
231  args.cores2)
232 
233  # If not, default to the Identities.txt file
234  else:
235  symm_groups = None
236  (ps_names, masses, radii, conforms,
237  models_name) = rmsd_calculation.get_rmfs_coordinates(
238  args.path, idfile_A, idfile_B, args.subunit,
239  selection=rmsd_custom_ranges,
240  resolution=args.resolution)
241  metadata['input_frames'] = models_name
242 
243  print("Size of conformation matrix", conforms.shape)
244 
245  if not args.skip_sampling_precision:
246  # get_rmsds_matrix modifies conforms, so save it to a file and restore
247  # afterwards (so that we retain the original IMP orientation)
248  numpy.save("conforms", conforms)
249  inner_data = rmsd_calculation.get_rmsds_matrix(
250  conforms, args.mode, args.align, args.cores, symm_groups)
251  print("Size of RMSD matrix (flattened):", inner_data.shape)
252  del conforms
253  conforms = numpy.load("conforms.npy")
254  os.unlink('conforms.npy')
255 
256  from pyRMSD.matrixHandler import MatrixHandler
257  mHandler = MatrixHandler()
258  mHandler.loadMatrix("Distances_Matrix.data")
259 
260  rmsd_matrix = mHandler.getMatrix()
261  distmat = rmsd_matrix.get_data()
262 
263  distmat_full = sp.spatial.distance.squareform(distmat)
264  print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
265 
266  # Get model lists
267  if args.rmf_A is not None:
268  sampleA_all_models = list(range(n_models[0]))
269  sampleB_all_models = list(range(n_models[0],
270  n_models[1] + n_models[0]))
271  total_num_models = n_models[1] + n_models[0]
272  else:
273  (sampleA_all_models,
274  sampleB_all_models) = clustering_rmsd.get_sample_identity(
275  idfile_A, idfile_B)
276  total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
277  all_models = list(sampleA_all_models) + list(sampleB_all_models)
278  print("Size of Sample A:", len(sampleA_all_models),
279  " ; Size of Sample B: ", len(sampleB_all_models),
280  "; Total", total_num_models)
281 
282  if not args.skip_sampling_precision:
283 
284  print("Calculating sampling precision")
285 
286  # Step 2: Cluster at intervals of grid size to get the
287  # sampling precision
288  gridSize = args.gridsize
289 
290  # Get cutoffs for clustering
291  cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
292  print("Clustering at thresholds:", cutoffs_list)
293 
294  # Do clustering at each cutoff
295  pvals, cvs, percents = clustering_rmsd.get_clusters(
296  cutoffs_list, distmat_full, all_models, total_num_models,
297  sampleA_all_models, sampleB_all_models, args.sysname,
298  args.cores2)
299  metadata['chi_square_grid_stats'] = {
300  'cutoffs': list(cutoffs_list),
301  'p_value': pvals,
302  'cramers_v': cvs,
303  'percent_clustered': percents}
304 
305  # Now apply the rule for selecting the right precision based
306  # on population of contingency table, pvalue and cramersv
307  (sampling_precision, pval_converged, cramersv_converged,
308  percent_converged) = clustering_rmsd.get_sampling_precision(
309  cutoffs_list, pvals, cvs, percents)
310 
311  # Output test statistics
312  with open("%s.Sampling_Precision_Stats.txt"
313  % args.sysname, 'w+') as fpv:
314  print("The sampling precision is defined as the largest allowed "
315  "RMSD between the cluster centroid and a ", args.sysname,
316  "model within any cluster in the finest clustering for "
317  "which each sample contributes models proportionally to "
318  "its size (considering both significance and magnitude of "
319  "the difference) and for which a sufficient proportion of "
320  "all models occur in sufficiently large clusters. The "
321  "sampling precision for our ", args.sysname,
322  " modeling is %.3f" % (sampling_precision), " A.", file=fpv)
323 
324  print("Sampling precision, P-value, Cramer's V and percentage "
325  "of clustered models below:", file=fpv)
326  print("%.3f\t%.3f\t%.3f\t%.3f"
327  % (sampling_precision, pval_converged, cramersv_converged,
328  percent_converged), file=fpv)
329  print("", file=fpv)
330 
331  final_clustering_threshold = sampling_precision
332  metadata['precision'] = {
333  'sampling_precision': sampling_precision,
334  'p_value': pval_converged,
335  'cramers_v': cramersv_converged,
336  'percent_clustered': percent_converged}
337 
338  else:
339  final_clustering_threshold = args.cluster_threshold
340 
341  metadata['clustering_threshold'] = final_clustering_threshold
342 
343  # Perform final clustering at the required precision
344  print("Clustering at threshold %.3f" % final_clustering_threshold)
345  (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
346  distmat_full, total_num_models, final_clustering_threshold)
347 
348  (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
349  len(cluster_centers), cluster_members, all_models,
350  sampleA_all_models, sampleB_all_models)
351  print("Contingency table:", ctable)
352  # metadata['contingency_table'] = ctable
353  # Output the number of models in each cluster and each sample
354  with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
355  for rows in range(len(ctable)):
356  print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
357 
358  # Obtain the subunits for which we need to calculate densities
359  density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
360  metadata['density_custom_ranges'] = density_custom_ranges
361 
362  # Output cluster precisions
363  fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')
364 
365  metadata['clusters'] = []
366  # For each cluster, output the models in the cluster
367  # Also output the densities for the cluster models
368  for i in range(len(retained_clusters)):
369  cmeta = {'name': 'cluster.%d' % i}
370  clus = retained_clusters[i]
371 
372  # The cluster centroid is the first conformation.
373  # We use this as to align and compute RMSD/precision
374  conform_0 = conforms[all_models[cluster_members[clus][0]]]
375 
376  # create a directory for the cluster
377  if not os.path.exists("./cluster.%s" % i):
378  os.mkdir("./cluster.%s" % i)
379  os.mkdir("./cluster.%s/Sample_A/" % i)
380  os.mkdir("./cluster.%s/Sample_B/" % i)
381  else:
382  shutil.rmtree("./cluster.%s" % i)
383  os.mkdir("./cluster.%s" % i)
384  os.mkdir("./cluster.%s/Sample_A/" % i)
385  os.mkdir("./cluster.%s/Sample_B/" % i)
386  # File for saving input to PrISM
387  if args.prism:
388  prism_file = 'cluster.'+str(i)+'.prism.npz'
389  superposed_coords_cluster = []
390  # Create densities for all subunits for both sample A and sample B
391  # as well as separately.
392  gmd1 = precision_rmsd.GetModelDensity(
393  custom_ranges=density_custom_ranges,
394  resolution=args.density_threshold, voxel=args.voxel,
395  bead_names=ps_names)
396  gmd2 = precision_rmsd.GetModelDensity(
397  custom_ranges=density_custom_ranges,
398  resolution=args.density_threshold, voxel=args.voxel,
399  bead_names=ps_names)
400  gmdt = precision_rmsd.GetModelDensity(
401  custom_ranges=density_custom_ranges,
402  resolution=args.density_threshold, voxel=args.voxel,
403  bead_names=ps_names)
404 
405  # Also output the identities of cluster members
406  both_file = open('cluster.'+str(i)+'.all.txt', 'w')
407  sampleA_file = open('cluster.'+str(i)+'.sample_A.txt', 'w')
408  sampleB_file = open('cluster.'+str(i)+'.sample_B.txt', 'w')
409 
410  # Create a model with just the cluster_member particles
411  model = IMP.Model()
412  ps = [] # particle list to be updated by each RMF frame
413  for pi in range(len(conform_0)):
414  p = IMP.Particle(model, "%s" % str(pi))
415  IMP.core.XYZ.setup_particle(p, (0, 0, 0))
416  IMP.core.XYZR.setup_particle(p, float(radii[pi]))
417  IMP.atom.Mass.setup_particle(p, float(masses[pi]))
418  ps.append(p)
419 
420  # Obtain cluster precision by obtaining average RMSD of each model
421  # to the cluster center
422  cluster_precision = 0.0
423 
424  cmeta['members'] = {
425  'A': [int(x) for x in cluster_members[clus]
426  if x in sampleA_all_models],
427  'B': [int(x) for x in cluster_members[clus]
428  if x in sampleB_all_models]}
429 
430  # transformation from internal pyRMSD orientation
431  trans = None
432  # for each model in the cluster
433  for mem in cluster_members[clus]:
434 
435  model_index = all_models[mem]
436 
437  # get superposition of each model to cluster center and the
438  # RMSD between the two
439  rmsd, superposed_ps, trans = \
440  precision_rmsd.get_particles_from_superposed(
441  conforms[model_index], conform_0, args.align,
442  ps, trans, symm_groups)
443 
444  model.update() # why not?
445 
446  cluster_precision += rmsd
447 
448  # Add the superposed particles to the respective density maps
449  gmdt.add_subunits_density(superposed_ps) # total density map
450  print(model_index, file=both_file)
451 
452  if model_index in sampleA_all_models:
453  # density map for sample A
454  gmd1.add_subunits_density(superposed_ps)
455  print(model_index, file=sampleA_file)
456  else:
457  # density map for sample B
458  gmd2.add_subunits_density(superposed_ps)
459  print(model_index, file=sampleB_file)
460  if args.prism:
461  superposed_coords = \
462  [IMP.core.XYZ(s_ps).get_coordinates()
463  for s_ps in superposed_ps]
464  superposed_coords_cluster.append(
465  numpy.array(superposed_coords))
466  if args.prism:
467  mass = \
468  [IMP.atom.Mass(m_p).get_mass() for m_p in superposed_ps]
469  radii = \
470  [IMP.core.XYZR(r_p).get_radius() for r_p in superposed_ps]
471  numpy.savez(
472  prism_file,
473  numpy.array(superposed_coords_cluster),
474  numpy.array(mass),
475  numpy.array(radii),
476  numpy.array(ps_names))
477  cluster_precision /= float(len(cluster_members[clus]) - 1.0)
478  cmeta['precision'] = cluster_precision
479  print("Cluster precision (average distance to cluster centroid) "
480  "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
481  file=fpc)
482 
483  both_file.close()
484  sampleA_file.close()
485  sampleB_file.close()
486 
487  # Output density files for the cluster
488  cmeta['density'] = gmdt.write_mrc(path="./cluster.%s" % i,
489  file_prefix="LPD")
490  cmeta['densityA'] = gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i,
491  file_prefix="LPD")
492  cmeta['densityB'] = gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i,
493  file_prefix="LPD")
494 
495  # Add the cluster center model RMF to the cluster directory
496  cluster_center_index = cluster_members[clus][0]
497  if args.rmf_A is not None:
498  outfname = os.path.join("cluster.%d" % i,
499  "cluster_center_model.rmf3")
500  cluster_center_model_id = cluster_center_index
501  if cluster_center_index < n_models[0]:
502  make_cluster_centroid(
503  os.path.join(args.path, args.rmf_A),
504  cluster_center_index,
505  outfname, i, len(cluster_members[clus]),
506  cluster_precision, metadata_fname, args.path)
507  else:
508  make_cluster_centroid(
509  os.path.join(args.path, args.rmf_B),
510  cluster_center_index - n_models[0],
511  outfname, i, len(cluster_members[clus]),
512  cluster_precision, metadata_fname, args.path)
513  else:
514  # index to Identities file.
515  cluster_center_model_id = all_models[cluster_center_index]
516  outfname = os.path.join("cluster.%d" % i,
517  "cluster_center_model." + args.extension)
518  if 'rmf' in args.extension:
519  make_cluster_centroid(
520  models_name[cluster_center_model_id], 0, outfname,
521  i, len(cluster_members[clus]),
522  cluster_precision, metadata_fname, args.path)
523  else:
524  shutil.copy(models_name[cluster_center_model_id], outfname)
525  cmeta['centroid'] = {'index': cluster_center_index,
526  'file': outfname}
527  metadata['clusters'].append(cmeta)
528  fpc.close()
529  with open(metadata_fname, 'w') as jfh:
530  json.dump(metadata, jfh)
531  # generate plots for the score and structure tests
532  if args.gnuplot:
533  import subprocess
534  import glob
535 
536  gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
537  for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
538  cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
539  print(" ".join(cmd))
540  subprocess.check_call(cmd)
541 
542 
543 if __name__ == '__main__':
544  main()
Add mass to a particle.
Definition: Mass.h:23
def get_data_path
Return the full path to one of this module's data files.
static XYZR setup_particle(Model *m, ParticleIndex pi)
Definition: XYZR.h:48
double get_mass(ResidueType c)
Get the mass from the residue type.
static XYZ setup_particle(Model *m, ParticleIndex pi)
Definition: XYZ.h:51
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static Mass setup_particle(Model *m, ParticleIndex pi, Float mass)
Definition: Mass.h:48
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
Class to handle individual particles of a Model object.
Definition: Particle.h:43
Sampling exhaustiveness protocol.
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27