IMP logo
IMP Reference Guide  2.16.0
The Integrative Modeling Platform
exhaust.py
1 from __future__ import print_function
2 from IMP import ArgumentParser
3 import os
4 
5 __doc__ = "Perform analysis to determine sampling convergence."
6 
7 ############################################################
8 # Scripts written by Shruthi Viswanath and Ilan E. Chemmama#
9 # in Andrej Sali Lab at UCSF. #
10 # Based on Viswanath, Chemmama et al. Biophys. J. (2017) #
11 # #
12 ############################################################
13 
14 
15 def parse_args():
16  parser = ArgumentParser(
17  description="First stages of analysis for assessing sampling "
18  "convergence")
19  parser.add_argument(
20  '--sysname', '-n', dest="sysname",
21  help='name of the system', default="")
22  parser.add_argument(
23  '--path', '-p', dest="path",
24  help='path to the good-scoring models', default="./")
25  parser.add_argument(
26  '--extension', '-e', dest="extension",
27  help='extension of the file', choices=['rmf', 'pdb'], default="rmf")
28  parser.add_argument(
29  '--mode', '-m', dest="mode", help='pyRMSD calculator',
30  choices=['cuda', 'cpu_omp', 'cpu_serial'], default="cuda")
31  parser.add_argument(
32  '--cores', '-c', dest="cores", type=int,
33  help='number of cores for RMSD matrix calculations; '
34  'only for cpu_omp', default=1)
35  parser.add_argument(
36  '--resolution', '-r', dest="resolution", type=int,
37  help='resolution at which to select proteins in a multiscale system',
38  default=1)
39  parser.add_argument(
40  '--subunit', '-su', dest="subunit",
41  help='calculate RMSD/sampling and cluster precision/densities '
42  'etc over this subunit only', default=None)
43  parser.add_argument(
44  '--align', '-a', dest="align",
45  help='boolean flag to allow superposition of models',
46  default=False, action='store_true')
47  parser.add_argument(
48  '--ambiguity', '-amb', dest="symmetry_groups",
49  help='file containing symmetry groups', default=None)
50  parser.add_argument(
51  '--scoreA', '-sa', dest="scoreA",
52  help='name of the file having the good-scoring scores for sample A',
53  default="scoresA.txt")
54  parser.add_argument(
55  '--scoreB', '-sb', dest="scoreB",
56  help='name of the file having the good-scoring scores for sample B',
57  default="scoresB.txt")
58  parser.add_argument(
59  '--rmfA', '-ra', dest="rmf_A",
60  help='RMF file with conformations from Sample A', default=None)
61  parser.add_argument(
62  '--rmfB', '-rb', dest="rmf_B",
63  help='RMF file with conformations from Sample B', default=None)
64  parser.add_argument(
65  '--gridsize', '-g', dest="gridsize", type=float,
66  help='grid size for calculating sampling precision', default=10.0)
67  parser.add_argument(
68  '--skip', '-s', dest="skip_sampling_precision",
69  help="This option will bypass the calculation of sampling "
70  "precision. This option needs to be used with the clustering "
71  "threshold option. Otherwise by default, sampling precision "
72  "is calculated and the clustering threshold is the "
73  "calculated sampling precision.", default=False,
74  action='store_true')
75  parser.add_argument(
76  '--cluster_threshold', '-ct', dest="cluster_threshold", type=float,
77  help='final clustering threshold to visualize clusters. Assumes '
78  'that the user has previously calculated sampling precision '
79  'and wants clusters defined at a threshold higher than the '
80  'sampling precision for ease of analysis (lesser number of '
81  'clusters).', default=30.0)
82  parser.add_argument(
83  '--voxel', '-v', dest="voxel", type=float,
84  help='voxel size for the localization densities', default=5.0)
85  parser.add_argument(
86  '--density_threshold', '-dt', type=float,
87  dest="density_threshold",
88  help='threshold for localization densities', default=20.0)
89  parser.add_argument(
90  '--density', '-d', dest="density",
91  help='file containing dictionary of density custom ranges',
92  default=None)
93  parser.add_argument(
94  '--gnuplot', '-gp', dest="gnuplot",
95  help="plotting automatically with gnuplot", default=False,
96  action='store_true')
97  parser.add_argument(
98  '--selection', '-sn', dest="selection",
99  help='file containing dictionary'
100  'of selected subunits and residues'
101  'for RMSD and clustering calculation'
102  "each entry in the dictionary takes the form"
103  "'selection name': [(residue_start, residue_end, protein name)",
104  default=None)
105  return parser.parse_args()
106 
107 
108 def make_cluster_centroid(infname, frame, outfname, cluster_index,
109  cluster_size, precision, density, path):
110 
111  import RMF
112  # If we have new enough IMP/RMF, do our own RMF slicing with provenance
113  if hasattr(RMF.NodeHandle, 'replace_child'):
114  print(infname, outfname)
115  inr = RMF.open_rmf_file_read_only(infname)
116  outr = RMF.create_rmf_file(outfname)
117  cpf = RMF.ClusterProvenanceFactory(outr)
118  RMF.clone_file_info(inr, outr)
119  RMF.clone_hierarchy(inr, outr)
120  RMF.clone_static_frame(inr, outr)
121  inr.set_current_frame(RMF.FrameID(frame))
122  outr.add_frame("f0")
123  RMF.clone_loaded_frame(inr, outr)
124  rn = outr.get_root_node()
125  children = rn.get_children()
126  if len(children) == 0:
127  return
128  rn = children[0] # Should be the top-level IMP node
129  prov = [c for c in rn.get_children() if c.get_type() == RMF.PROVENANCE]
130  if not prov:
131  return
132  prov = prov[0]
133  # Add cluster-provenance info
134  newp = rn.replace_child(
135  prov, "cluster.%d" % cluster_index, RMF.PROVENANCE)
136  cp = cpf.get(newp)
137  cp.set_members(cluster_size)
138  cp.set_precision(precision)
139  cp.set_density(os.path.abspath(density))
140  else:
141  # Otherwise, fall back to RMF's command line tool
142  import subprocess
143  print(infname, frame, outfname)
144  subprocess.call(['rmf_slice', path + infname, '-f', str(frame),
145  outfname])
146 
147 
148 def main():
149  args = parse_args()
150 
151  import os
152  import shutil
153  import numpy
154 
155  import scipy as sp
156 
157  import IMP.sampcon
158  from IMP.sampcon import scores_convergence, clustering_rmsd
159  from IMP.sampcon import rmsd_calculation, precision_rmsd
160 
161  import IMP
162 
163  idfile_A = "Identities_A.txt"
164  idfile_B = "Identities_B.txt"
165 
166  # Step 0: Compute Score convergence
167  score_A = []
168  score_B = []
169 
170  with open(os.path.join(args.path, args.scoreA), 'r') as f:
171  for line in f:
172  score_A.append(float(line.strip("\n")))
173 
174  with open(os.path.join(args.path, args.scoreB), 'r') as f:
175  for line in f:
176  score_B.append(float(line.strip("\n")))
177 
178  scores = score_A + score_B
179 
180  # Get the convergence of the best score
181  scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)
182 
183  # Check if the two score distributions are similar
184  scores_convergence.get_scores_distributions_KS_Stats(
185  score_A, score_B, 100, args.sysname)
186 
187  # Step 1: Compute RMSD matrix
188  if args.extension == "pdb":
189  ps_names = [] # bead names are not stored in PDB files
190  symm_groups = None
191  conforms, masses, radii, models_name = \
192  rmsd_calculation.get_pdbs_coordinates(
193  args.path, idfile_A, idfile_B)
194  else:
195  args.extension = "rmf3"
196  if args.selection is not None:
197  rmsd_custom_ranges = \
198  precision_rmsd.parse_custom_ranges(args.selection)
199  else:
200  rmsd_custom_ranges = None
201  # If we have a single RMF file, read conformations from that
202  if args.rmf_A is not None:
203  (ps_names, masses, radii, conforms, symm_groups, models_name,
204  n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
205  args.path, args.rmf_A, args.rmf_B, args.subunit,
206  args.symmetry_groups,
207  rmsd_custom_ranges,
208  args.resolution)
209 
210  # If not, default to the Identities.txt file
211  else:
212  symm_groups = None
213  (ps_names, masses, radii, conforms,
214  models_name) = rmsd_calculation.get_rmfs_coordinates(
215  args.path, idfile_A, idfile_B, args.subunit,
216  selection=rmsd_custom_ranges,
217  resolution=args.resolution)
218 
219  print("Size of conformation matrix", conforms.shape)
220 
221  if not args.skip_sampling_precision:
222  # get_rmsds_matrix modifies conforms, so save it to a file and restore
223  # afterwards (so that we retain the original IMP orientation)
224  numpy.save("conforms", conforms)
225  inner_data = rmsd_calculation.get_rmsds_matrix(
226  conforms, args.mode, args.align, args.cores, symm_groups)
227  print("Size of RMSD matrix (flattened):", inner_data.shape)
228  del conforms
229  conforms = numpy.load("conforms.npy")
230  os.unlink('conforms.npy')
231 
232  from pyRMSD.matrixHandler import MatrixHandler
233  mHandler = MatrixHandler()
234  mHandler.loadMatrix("Distances_Matrix.data")
235 
236  rmsd_matrix = mHandler.getMatrix()
237  distmat = rmsd_matrix.get_data()
238 
239  distmat_full = sp.spatial.distance.squareform(distmat)
240  print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
241 
242  # Get model lists
243  if args.rmf_A is not None:
244  sampleA_all_models = list(range(n_models[0]))
245  sampleB_all_models = list(range(n_models[0],
246  n_models[1] + n_models[0]))
247  total_num_models = n_models[1] + n_models[0]
248  else:
249  (sampleA_all_models,
250  sampleB_all_models) = clustering_rmsd.get_sample_identity(
251  idfile_A, idfile_B)
252  total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
253  all_models = list(sampleA_all_models) + list(sampleB_all_models)
254  print("Size of Sample A:", len(sampleA_all_models),
255  " ; Size of Sample B: ", len(sampleB_all_models),
256  "; Total", total_num_models)
257 
258  if not args.skip_sampling_precision:
259 
260  print("Calculating sampling precision")
261 
262  # Step 2: Cluster at intervals of grid size to get the
263  # sampling precision
264  gridSize = args.gridsize
265 
266  # Get cutoffs for clustering
267  cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
268  print("Clustering at thresholds:", cutoffs_list)
269 
270  # Do clustering at each cutoff
271  pvals, cvs, percents = clustering_rmsd.get_clusters(
272  cutoffs_list, distmat_full, all_models, total_num_models,
273  sampleA_all_models, sampleB_all_models, args.sysname)
274 
275  # Now apply the rule for selecting the right precision based
276  # on population of contingency table, pvalue and cramersv
277  (sampling_precision, pval_converged, cramersv_converged,
278  percent_converged) = clustering_rmsd.get_sampling_precision(
279  cutoffs_list, pvals, cvs, percents)
280 
281  # Output test statistics
282  with open("%s.Sampling_Precision_Stats.txt"
283  % args.sysname, 'w+') as fpv:
284  print("The sampling precision is defined as the largest allowed "
285  "RMSD between the cluster centroid and a ", args.sysname,
286  "model within any cluster in the finest clustering for "
287  "which each sample contributes models proportionally to "
288  "its size (considering both significance and magnitude of "
289  "the difference) and for which a sufficient proportion of "
290  "all models occur in sufficiently large clusters. The "
291  "sampling precision for our ", args.sysname,
292  " modeling is %.3f" % (sampling_precision), " A.", file=fpv)
293 
294  print("Sampling precision, P-value, Cramer's V and percentage "
295  "of clustered models below:", file=fpv)
296  print("%.3f\t%.3f\t%.3f\t%.3f"
297  % (sampling_precision, pval_converged, cramersv_converged,
298  percent_converged), file=fpv)
299  print("", file=fpv)
300 
301  final_clustering_threshold = sampling_precision
302 
303  else:
304  final_clustering_threshold = args.cluster_threshold
305 
306  # Perform final clustering at the required precision
307  print("Clustering at threshold %.3f" % final_clustering_threshold)
308  (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
309  distmat_full, total_num_models, final_clustering_threshold)
310 
311  (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
312  len(cluster_centers), cluster_members, all_models,
313  sampleA_all_models, sampleB_all_models)
314  print("Contingency table:", ctable)
315  # Output the number of models in each cluster and each sample
316  with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
317  for rows in range(len(ctable)):
318  print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
319 
320  # Obtain the subunits for which we need to calculate densities
321  density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
322 
323  # Output cluster precisions
324  fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')
325 
326  # For each cluster, output the models in the cluster
327  # Also output the densities for the cluster models
328  for i in range(len(retained_clusters)):
329  clus = retained_clusters[i]
330 
331  # The cluster centroid is the first conformation.
332  # We use this as to align and compute RMSD/precision
333  conform_0 = conforms[all_models[cluster_members[clus][0]]]
334 
335  # create a directory for the cluster
336  if not os.path.exists("./cluster.%s" % i):
337  os.mkdir("./cluster.%s" % i)
338  os.mkdir("./cluster.%s/Sample_A/" % i)
339  os.mkdir("./cluster.%s/Sample_B/" % i)
340  else:
341  shutil.rmtree("./cluster.%s" % i)
342  os.mkdir("./cluster.%s" % i)
343  os.mkdir("./cluster.%s/Sample_A/" % i)
344  os.mkdir("./cluster.%s/Sample_B/" % i)
345 
346  # Create densities for all subunits for both sample A and sample B
347  # as well as separately.
348  gmd1 = precision_rmsd.GetModelDensity(
349  custom_ranges=density_custom_ranges,
350  resolution=args.density_threshold, voxel=args.voxel,
351  bead_names=ps_names)
352  gmd2 = precision_rmsd.GetModelDensity(
353  custom_ranges=density_custom_ranges,
354  resolution=args.density_threshold, voxel=args.voxel,
355  bead_names=ps_names)
356  gmdt = precision_rmsd.GetModelDensity(
357  custom_ranges=density_custom_ranges,
358  resolution=args.density_threshold, voxel=args.voxel,
359  bead_names=ps_names)
360 
361  # Also output the identities of cluster members
362  both_file = open('cluster.'+str(i)+'.all.txt', 'w')
363  sampleA_file = open('cluster.'+str(i)+'.sample_A.txt', 'w')
364  sampleB_file = open('cluster.'+str(i)+'.sample_B.txt', 'w')
365 
366  # Create a model with just the cluster_member particles
367  model = IMP.Model()
368  ps = [] # particle list to be updated by each RMF frame
369  for pi in range(len(conform_0)):
370  p = IMP.Particle(model, "%s" % str(pi))
371  IMP.core.XYZ.setup_particle(p, (0, 0, 0))
372  IMP.core.XYZR.setup_particle(p, float(radii[pi]))
373  IMP.atom.Mass.setup_particle(p, float(masses[pi]))
374  ps.append(p)
375 
376  # Obtain cluster precision by obtaining average RMSD of each model
377  # to the cluster center
378  cluster_precision = 0.0
379 
380  # transformation from internal pyRMSD orientation
381  trans = None
382  # for each model in the cluster
383  for mem in cluster_members[clus]:
384 
385  model_index = all_models[mem]
386 
387  # get superposition of each model to cluster center and the
388  # RMSD between the two
389  if args.symmetry_groups:
390  rmsd, superposed_ps, trans = \
391  precision_rmsd.get_particles_from_superposed_amb(
392  conforms[model_index], conform_0, args.align, ps,
393  trans, symm_groups)
394  else:
395  rmsd, superposed_ps, trans = \
396  precision_rmsd.get_particles_from_superposed(
397  conforms[model_index], conform_0, args.align,
398  ps, trans)
399 
400  model.update() # why not?
401 
402  cluster_precision += rmsd
403 
404  # Add the superposed particles to the respective density maps
405  gmdt.add_subunits_density(superposed_ps) # total density map
406  print(model_index, file=both_file)
407 
408  if model_index in sampleA_all_models:
409  # density map for sample A
410  gmd1.add_subunits_density(superposed_ps)
411  print(model_index, file=sampleA_file)
412  else:
413  # density map for sample B
414  gmd2.add_subunits_density(superposed_ps)
415  print(model_index, file=sampleB_file)
416 
417  cluster_precision /= float(len(cluster_members[clus]) - 1.0)
418 
419  print("Cluster precision (average distance to cluster centroid) "
420  "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
421  file=fpc)
422 
423  both_file.close()
424  sampleA_file.close()
425  sampleB_file.close()
426 
427  # Output density files for the cluster
428  density = gmdt.write_mrc(path="./cluster.%s" % i, file_prefix="LPD")
429  gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i, file_prefix="LPD")
430  gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i, file_prefix="LPD")
431 
432  # Add the cluster center model RMF to the cluster directory
433  cluster_center_index = cluster_members[clus][0]
434  if args.rmf_A is not None:
435  cluster_center_model_id = cluster_center_index
436  if cluster_center_index < n_models[0]:
437  make_cluster_centroid(
438  os.path.join(args.path, args.rmf_A),
439  cluster_center_index,
440  os.path.join("cluster.%d" % i,
441  "cluster_center_model.rmf3"),
442  i, len(cluster_members[clus]),
443  cluster_precision, density, args.path)
444  else:
445  make_cluster_centroid(
446  os.path.join(args.path, args.rmf_B),
447  cluster_center_index - n_models[0],
448  os.path.join("cluster.%d" % i,
449  "cluster_center_model.rmf3"),
450  i, len(cluster_members[clus]),
451  cluster_precision, density, args.path)
452  else:
453  # index to Identities file.
454  cluster_center_model_id = all_models[cluster_center_index]
455  outfname = os.path.join("cluster.%d" % i,
456  "cluster_center_model." + args.extension)
457  if 'rmf' in args.extension:
458  make_cluster_centroid(
459  models_name[cluster_center_model_id], 0, outfname,
460  i, len(cluster_members[clus]),
461  cluster_precision, density, args.path)
462  else:
463  shutil.copy(models_name[cluster_center_model_id], outfname)
464 
465  fpc.close()
466 
467  # generate plots for the score and structure tests
468  if args.gnuplot:
469  import subprocess
470  import glob
471 
472  gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
473  for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
474  cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
475  print(" ".join(cmd))
476  subprocess.check_call(cmd)
477 
478 
479 if __name__ == '__main__':
480  main()
def get_data_path
Return the full path to one of this module's data files.
static XYZR setup_particle(Model *m, ParticleIndex pi)
Definition: XYZR.h:48
static XYZ setup_particle(Model *m, ParticleIndex pi)
Definition: XYZ.h:51
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:73
static Mass setup_particle(Model *m, ParticleIndex pi, Float mass)
Definition: Mass.h:48
Class to handle individual particles of a Model object.
Definition: Particle.h:41
Sampling exhaustiveness protocol.