IMP logo
IMP Reference Guide  2.15.0
The Integrative Modeling Platform
exhaust.py
1 from __future__ import print_function
2 from IMP import ArgumentParser
3 import os
4 
5 __doc__ = "Perform analysis to determine sampling convergence."
6 
7 ############################################################
8 # Scripts written by Shruthi Viswanath and Ilan E. Chemmama#
9 # in Andrej Sali Lab at UCSF. #
10 # Based on Viswanath, Chemmama et al. Biophys. J. (2017) #
11 # #
12 ############################################################
13 
14 
15 def parse_args():
16  parser = ArgumentParser(
17  description="First stages of analysis for assessing sampling "
18  "convergence")
19  parser.add_argument(
20  '--sysname', '-n', dest="sysname",
21  help='name of the system', default="")
22  parser.add_argument(
23  '--path', '-p', dest="path",
24  help='path to the good-scoring models', default="./")
25  parser.add_argument(
26  '--extension', '-e', dest="extension",
27  help='extension of the file', choices=['rmf', 'pdb'], default="rmf")
28  parser.add_argument(
29  '--mode', '-m', dest="mode", help='pyRMSD calculator',
30  choices=['cuda', 'cpu_omp', 'cpu_serial'], default="cuda")
31  parser.add_argument(
32  '--cores', '-c', dest="cores", type=int,
33  help='number of cores for RMSD matrix calculations; '
34  'only for cpu_omp', default=1)
35  parser.add_argument(
36  '--subunit', '-su', dest="subunit",
37  help='calculate RMSD/sampling and cluster precision/densities '
38  'etc over this subunit only', default=None)
39  parser.add_argument(
40  '--align', '-a', dest="align",
41  help='boolean flag to allow superposition of models',
42  default=False, action='store_true')
43  parser.add_argument(
44  '--ambiguity', '-amb', dest="symmetry_groups",
45  help='file containing symmetry groups', default=None)
46  parser.add_argument(
47  '--scoreA', '-sa', dest="scoreA",
48  help='name of the file having the good-scoring scores for sample A',
49  default="scoresA.txt")
50  parser.add_argument(
51  '--scoreB', '-sb', dest="scoreB",
52  help='name of the file having the good-scoring scores for sample B',
53  default="scoresB.txt")
54  parser.add_argument(
55  '--rmfA', '-ra', dest="rmf_A",
56  help='RMF file with conformations from Sample A', default=None)
57  parser.add_argument(
58  '--rmfB', '-rb', dest="rmf_B",
59  help='RMF file with conformations from Sample B', default=None)
60  parser.add_argument(
61  '--gridsize', '-g', dest="gridsize", type=float,
62  help='grid size for calculating sampling precision', default=10.0)
63  parser.add_argument(
64  '--skip', '-s', dest="skip_sampling_precision",
65  help="This option will bypass the calculation of sampling "
66  "precision. This option needs to be used with the clustering "
67  "threshold option. Otherwise by default, sampling precision "
68  "is calculated and the clustering threshold is the "
69  "calculated sampling precision.", default=False,
70  action='store_true')
71  parser.add_argument(
72  '--cluster_threshold', '-ct', dest="cluster_threshold", type=float,
73  help='final clustering threshold to visualize clusters. Assumes '
74  'that the user has previously calculated sampling precision '
75  'and wants clusters defined at a threshold higher than the '
76  'sampling precision for ease of analysis (lesser number of '
77  'clusters).', default=30.0)
78  parser.add_argument(
79  '--voxel', '-v', dest="voxel", type=float,
80  help='voxel size for the localization densities', default=5.0)
81  parser.add_argument(
82  '--density_threshold', '-dt', type=float,
83  dest="density_threshold",
84  help='threshold for localization densities', default=20.0)
85  parser.add_argument(
86  '--density', '-d', dest="density",
87  help='file containing dictionary of density custom ranges',
88  default=None)
89  parser.add_argument(
90  '--gnuplot', '-gp', dest="gnuplot",
91  help="plotting automatically with gnuplot", default=False,
92  action='store_true')
93  parser.add_argument(
94  '--selection', '-sn', dest="selection",
95  help='file containing dictionary'
96  'of selected subunits and residues'
97  'for RMSD and clustering calculation'
98  "each entry in the dictionary takes the form"
99  "'selection name': [(residue_start, residue_end, protein name)",
100  default=None)
101  return parser.parse_args()
102 
103 
104 def make_cluster_centroid(infname, frame, outfname, cluster_index,
105  cluster_size, precision, density, path):
106 
107  import RMF
108  # If we have new enough IMP/RMF, do our own RMF slicing with provenance
109  if hasattr(RMF.NodeHandle, 'replace_child'):
110  print(infname, outfname)
111  inr = RMF.open_rmf_file_read_only(infname)
112  outr = RMF.create_rmf_file(outfname)
113  cpf = RMF.ClusterProvenanceFactory(outr)
114  RMF.clone_file_info(inr, outr)
115  RMF.clone_hierarchy(inr, outr)
116  RMF.clone_static_frame(inr, outr)
117  inr.set_current_frame(RMF.FrameID(frame))
118  outr.add_frame("f0")
119  RMF.clone_loaded_frame(inr, outr)
120  rn = outr.get_root_node()
121  children = rn.get_children()
122  if len(children) == 0:
123  return
124  rn = children[0] # Should be the top-level IMP node
125  prov = [c for c in rn.get_children() if c.get_type() == RMF.PROVENANCE]
126  if not prov:
127  return
128  prov = prov[0]
129  # Add cluster-provenance info
130  newp = rn.replace_child(
131  prov, "cluster.%d" % cluster_index, RMF.PROVENANCE)
132  cp = cpf.get(newp)
133  cp.set_members(cluster_size)
134  cp.set_precision(precision)
135  cp.set_density(os.path.abspath(density))
136  else:
137  # Otherwise, fall back to RMF's command line tool
138  import subprocess
139  print(infname, frame, outfname)
140  subprocess.call(['rmf_slice', path + infname, '-f', str(frame),
141  outfname])
142 
143 
144 def main():
145  args = parse_args()
146 
147  import os
148  import shutil
149  import numpy
150 
151  import scipy as sp
152 
153  import IMP.sampcon
154  from IMP.sampcon import scores_convergence, clustering_rmsd
155  from IMP.sampcon import rmsd_calculation, precision_rmsd
156 
157  import IMP
158 
159  idfile_A = "Identities_A.txt"
160  idfile_B = "Identities_B.txt"
161 
162  # Step 0: Compute Score convergence
163  score_A = []
164  score_B = []
165 
166  with open(os.path.join(args.path, args.scoreA), 'r') as f:
167  for line in f:
168  score_A.append(float(line.strip("\n")))
169 
170  with open(os.path.join(args.path, args.scoreB), 'r') as f:
171  for line in f:
172  score_B.append(float(line.strip("\n")))
173 
174  scores = score_A + score_B
175 
176  # Get the convergence of the best score
177  scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)
178 
179  # Check if the two score distributions are similar
180  scores_convergence.get_scores_distributions_KS_Stats(
181  score_A, score_B, 100, args.sysname)
182 
183  # Step 1: Compute RMSD matrix
184  if args.extension == "pdb":
185  ps_names = [] # bead names are not stored in PDB files
186  symm_groups = None
187  conforms, masses, radii, models_name = \
188  rmsd_calculation.get_pdbs_coordinates(
189  args.path, idfile_A, idfile_B)
190  else:
191  args.extension = "rmf3"
192  if args.selection is not None:
193  rmsd_custom_ranges = \
194  precision_rmsd.parse_custom_ranges(args.selection)
195  else:
196  rmsd_custom_ranges = None
197  # If we have a single RMF file, read conformations from that
198  if args.rmf_A is not None:
199  (ps_names, masses, radii, conforms, symm_groups, models_name,
200  n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
201  args.path, args.rmf_A, args.rmf_B, args.subunit,
202  args.symmetry_groups,
203  rmsd_custom_ranges)
204 
205  # If not, default to the Identities.txt file
206  else:
207  symm_groups = None
208  (ps_names, masses, radii, conforms,
209  models_name) = rmsd_calculation.get_rmfs_coordinates(
210  args.path, idfile_A, idfile_B, args.subunit,
211  selection=rmsd_custom_ranges)
212 
213  print("Size of conformation matrix", conforms.shape)
214 
215  if not args.skip_sampling_precision:
216  # get_rmsds_matrix modifies conforms, so save it to a file and restore
217  # afterwards (so that we retain the original IMP orientation)
218  numpy.save("conforms", conforms)
219  inner_data = rmsd_calculation.get_rmsds_matrix(
220  conforms, args.mode, args.align, args.cores, symm_groups)
221  print("Size of RMSD matrix (flattened):", inner_data.shape)
222  del conforms
223  conforms = numpy.load("conforms.npy")
224  os.unlink('conforms.npy')
225 
226  from pyRMSD.matrixHandler import MatrixHandler
227  mHandler = MatrixHandler()
228  mHandler.loadMatrix("Distances_Matrix.data")
229 
230  rmsd_matrix = mHandler.getMatrix()
231  distmat = rmsd_matrix.get_data()
232 
233  distmat_full = sp.spatial.distance.squareform(distmat)
234  print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
235 
236  # Get model lists
237  if args.rmf_A is not None:
238  sampleA_all_models = list(range(n_models[0]))
239  sampleB_all_models = list(range(n_models[0],
240  n_models[1] + n_models[0]))
241  total_num_models = n_models[1] + n_models[0]
242  else:
243  (sampleA_all_models,
244  sampleB_all_models) = clustering_rmsd.get_sample_identity(
245  idfile_A, idfile_B)
246  total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
247  all_models = list(sampleA_all_models) + list(sampleB_all_models)
248  print("Size of Sample A:", len(sampleA_all_models),
249  " ; Size of Sample B: ", len(sampleB_all_models),
250  "; Total", total_num_models)
251 
252  if not args.skip_sampling_precision:
253 
254  print("Calculating sampling precision")
255 
256  # Step 2: Cluster at intervals of grid size to get the
257  # sampling precision
258  gridSize = args.gridsize
259 
260  # Get cutoffs for clustering
261  cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
262  print("Clustering at thresholds:", cutoffs_list)
263 
264  # Do clustering at each cutoff
265  pvals, cvs, percents = clustering_rmsd.get_clusters(
266  cutoffs_list, distmat_full, all_models, total_num_models,
267  sampleA_all_models, sampleB_all_models, args.sysname)
268 
269  # Now apply the rule for selecting the right precision based
270  # on population of contingency table, pvalue and cramersv
271  (sampling_precision, pval_converged, cramersv_converged,
272  percent_converged) = clustering_rmsd.get_sampling_precision(
273  cutoffs_list, pvals, cvs, percents)
274 
275  # Output test statistics
276  with open("%s.Sampling_Precision_Stats.txt"
277  % args.sysname, 'w+') as fpv:
278  print("The sampling precision is defined as the largest allowed "
279  "RMSD between the cluster centroid and a ", args.sysname,
280  "model within any cluster in the finest clustering for "
281  "which each sample contributes models proportionally to "
282  "its size (considering both significance and magnitude of "
283  "the difference) and for which a sufficient proportion of "
284  "all models occur in sufficiently large clusters. The "
285  "sampling precision for our ", args.sysname,
286  " modeling is %.3f" % (sampling_precision), " A.", file=fpv)
287 
288  print("Sampling precision, P-value, Cramer's V and percentage "
289  "of clustered models below:", file=fpv)
290  print("%.3f\t%.3f\t%.3f\t%.3f"
291  % (sampling_precision, pval_converged, cramersv_converged,
292  percent_converged), file=fpv)
293  print("", file=fpv)
294 
295  final_clustering_threshold = sampling_precision
296 
297  else:
298  final_clustering_threshold = args.cluster_threshold
299 
300  # Perform final clustering at the required precision
301  print("Clustering at threshold %.3f" % final_clustering_threshold)
302  (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
303  distmat_full, total_num_models, final_clustering_threshold)
304 
305  (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
306  len(cluster_centers), cluster_members, all_models,
307  sampleA_all_models, sampleB_all_models)
308  print("Contingency table:", ctable)
309  # Output the number of models in each cluster and each sample
310  with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
311  for rows in range(len(ctable)):
312  print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
313 
314  # Obtain the subunits for which we need to calculate densities
315  density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
316 
317  # Output cluster precisions
318  fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')
319 
320  # For each cluster, output the models in the cluster
321  # Also output the densities for the cluster models
322  for i in range(len(retained_clusters)):
323  clus = retained_clusters[i]
324 
325  # The cluster centroid is the first conformation.
326  # We use this as to align and compute RMSD/precision
327  conform_0 = conforms[all_models[cluster_members[clus][0]]]
328 
329  # create a directory for the cluster
330  if not os.path.exists("./cluster.%s" % i):
331  os.mkdir("./cluster.%s" % i)
332  os.mkdir("./cluster.%s/Sample_A/" % i)
333  os.mkdir("./cluster.%s/Sample_B/" % i)
334  else:
335  shutil.rmtree("./cluster.%s" % i)
336  os.mkdir("./cluster.%s" % i)
337  os.mkdir("./cluster.%s/Sample_A/" % i)
338  os.mkdir("./cluster.%s/Sample_B/" % i)
339 
340  # Create densities for all subunits for both sample A and sample B
341  # as well as separately.
342  gmd1 = precision_rmsd.GetModelDensity(
343  custom_ranges=density_custom_ranges,
344  resolution=args.density_threshold, voxel=args.voxel,
345  bead_names=ps_names)
346  gmd2 = precision_rmsd.GetModelDensity(
347  custom_ranges=density_custom_ranges,
348  resolution=args.density_threshold, voxel=args.voxel,
349  bead_names=ps_names)
350  gmdt = precision_rmsd.GetModelDensity(
351  custom_ranges=density_custom_ranges,
352  resolution=args.density_threshold, voxel=args.voxel,
353  bead_names=ps_names)
354 
355  # Also output the identities of cluster members
356  both_file = open('cluster.'+str(i)+'.all.txt', 'w')
357  sampleA_file = open('cluster.'+str(i)+'.sample_A.txt', 'w')
358  sampleB_file = open('cluster.'+str(i)+'.sample_B.txt', 'w')
359 
360  # Create a model with just the cluster_member particles
361  model = IMP.Model()
362  ps = [] # particle list to be updated by each RMF frame
363  for pi in range(len(conform_0)):
364  p = IMP.Particle(model, "%s" % str(pi))
365  IMP.core.XYZ.setup_particle(p, (0, 0, 0))
366  IMP.core.XYZR.setup_particle(p, float(radii[pi]))
367  IMP.atom.Mass.setup_particle(p, float(masses[pi]))
368  ps.append(p)
369 
370  # Obtain cluster precision by obtaining average RMSD of each model
371  # to the cluster center
372  cluster_precision = 0.0
373 
374  # transformation from internal pyRMSD orientation
375  trans = None
376  # for each model in the cluster
377  for mem in cluster_members[clus]:
378 
379  model_index = all_models[mem]
380 
381  # get superposition of each model to cluster center and the
382  # RMSD between the two
383  if args.symmetry_groups:
384  rmsd, superposed_ps, trans = \
385  precision_rmsd.get_particles_from_superposed_amb(
386  conforms[model_index], conform_0, args.align, ps,
387  trans, symm_groups)
388  else:
389  rmsd, superposed_ps, trans = \
390  precision_rmsd.get_particles_from_superposed(
391  conforms[model_index], conform_0, args.align,
392  ps, trans)
393 
394  model.update() # why not?
395 
396  cluster_precision += rmsd
397 
398  # Add the superposed particles to the respective density maps
399  gmdt.add_subunits_density(superposed_ps) # total density map
400  print(model_index, file=both_file)
401 
402  if model_index in sampleA_all_models:
403  # density map for sample A
404  gmd1.add_subunits_density(superposed_ps)
405  print(model_index, file=sampleA_file)
406  else:
407  # density map for sample B
408  gmd2.add_subunits_density(superposed_ps)
409  print(model_index, file=sampleB_file)
410 
411  cluster_precision /= float(len(cluster_members[clus]) - 1.0)
412 
413  print("Cluster precision (average distance to cluster centroid) "
414  "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
415  file=fpc)
416 
417  both_file.close()
418  sampleA_file.close()
419  sampleB_file.close()
420 
421  # Output density files for the cluster
422  density = gmdt.write_mrc(path="./cluster.%s" % i, file_prefix="LPD")
423  gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i, file_prefix="LPD")
424  gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i, file_prefix="LPD")
425 
426  # Add the cluster center model RMF to the cluster directory
427  cluster_center_index = cluster_members[clus][0]
428  if args.rmf_A is not None:
429  cluster_center_model_id = cluster_center_index
430  if cluster_center_index < n_models[0]:
431  make_cluster_centroid(
432  os.path.join(args.path, args.rmf_A),
433  cluster_center_index,
434  os.path.join("cluster.%d" % i,
435  "cluster_center_model.rmf3"),
436  i, len(cluster_members[clus]),
437  cluster_precision, density, args.path)
438  else:
439  make_cluster_centroid(
440  os.path.join(args.path, args.rmf_B),
441  cluster_center_index - n_models[0],
442  os.path.join("cluster.%d" % i,
443  "cluster_center_model.rmf3"),
444  i, len(cluster_members[clus]),
445  cluster_precision, density, args.path)
446  else:
447  # index to Identities file.
448  cluster_center_model_id = all_models[cluster_center_index]
449  outfname = os.path.join("cluster.%d" % i,
450  "cluster_center_model." + args.extension)
451  if 'rmf' in args.extension:
452  make_cluster_centroid(
453  models_name[cluster_center_model_id], 0, outfname,
454  i, len(cluster_members[clus]),
455  cluster_precision, density, args.path)
456  else:
457  shutil.copy(models_name[cluster_center_model_id], outfname)
458 
459  fpc.close()
460 
461  # generate plots for the score and structure tests
462  if args.gnuplot:
463  import subprocess
464  import glob
465 
466  gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
467  for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
468  cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
469  print(" ".join(cmd))
470  subprocess.check_call(cmd)
471 
472 
473 if __name__ == '__main__':
474  main()
def get_data_path
Return the full path to one of this module's data files.
static XYZR setup_particle(Model *m, ParticleIndex pi)
Definition: XYZR.h:48
static XYZ setup_particle(Model *m, ParticleIndex pi)
Definition: XYZ.h:51
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:72
static Mass setup_particle(Model *m, ParticleIndex pi, Float mass)
Definition: Mass.h:48
Class to handle individual particles of a Model object.
Definition: Particle.h:41
Sampling exhaustiveness protocol.