IMP logo
IMP Reference Guide  2.14.0
The Integrative Modeling Platform
exhaust.py
1 from __future__ import print_function
2 from IMP import ArgumentParser
3 import os
4 
5 __doc__ = "Perform analysis to determine sampling convergence."
6 
7 ###########################################################
8 #Scripts written by Shruthi Viswanath and Ilan E. Chemmama#
9 # in Andrej Sali Lab at UCSF. #
10 # Based on Viswanath, Chemmama et al. Biophys. J. (2017) #
11 # #
12 ###########################################################
13 
14 def parse_args():
15  parser = ArgumentParser(
16  description="First stages of analysis for assessing sampling "
17  "convergence")
18  parser.add_argument('--sysname', '-n', dest="sysname",
19  help='name of the system', default="")
20  parser.add_argument('--path', '-p', dest="path",
21  help='path to the good-scoring models', default="./")
22  parser.add_argument('--extension', '-e', dest="extension",
23  help='extension of the file', choices=['rmf', 'pdb'],
24  default="rmf")
25  parser.add_argument('--mode', '-m', dest="mode", help='pyRMSD calculator',
26  choices=['cuda', 'cpu_omp', 'cpu_serial'], default="cuda")
27  parser.add_argument('--cores', '-c', dest="cores", type=int,
28  help='number of cores for RMSD matrix calculations; '
29  'only for cpu_omp', default=1)
30  parser.add_argument('--subunit', '-su', dest="subunit",
31  help='calculate RMSD/sampling and cluster precision/densities '
32  'etc over this subunit only', default=None)
33  parser.add_argument('--align', '-a', dest="align",
34  help='boolean flag to allow superposition of models',
35  default=False, action='store_true')
36  parser.add_argument('--ambiguity', '-amb', dest="symmetry_groups",
37  help='file containing symmetry groups', default=None)
38  parser.add_argument('--scoreA', '-sa', dest="scoreA",
39  help='name of the file having the good-scoring scores for sample A',
40  default="scoresA.txt")
41  parser.add_argument('--scoreB', '-sb', dest="scoreB",
42  help='name of the file having the good-scoring scores for sample B',
43  default="scoresB.txt")
44  parser.add_argument('--rmfA', '-ra', dest="rmf_A",
45  help='RMF file with conformations from Sample A', default=None)
46  parser.add_argument('--rmfB', '-rb', dest="rmf_B",
47  help='RMF file with conformations from Sample B', default=None)
48  parser.add_argument('--gridsize', '-g', dest="gridsize", type=float,
49  help='grid size for calculating sampling precision', default=10.0)
50  parser.add_argument('--skip', '-s', dest="skip_sampling_precision",
51  help="This option will bypass the calculation of sampling "
52  "precision. This option needs to be used with the clustering "
53  "threshold option. Otherwise by default, sampling precision "
54  "is calculated and the clustering threshold is the "
55  "calculated sampling precision.", default=False,
56  action='store_true')
57  parser.add_argument('--cluster_threshold', '-ct', dest="cluster_threshold",
58  type=float,
59  help='final clustering threshold to visualize clusters. Assumes '
60  'that the user has previously calculated sampling precision '
61  'and wants clusters defined at a threshold higher than the '
62  'sampling precision for ease of analysis (lesser number of '
63  'clusters).', default=30.0)
64  parser.add_argument('--voxel', '-v', dest="voxel", type=float,
65  help='voxel size for the localization densities', default=5.0)
66  parser.add_argument('--density_threshold', '-dt', type=float,
67  dest="density_threshold",
68  help='threshold for localization densities', default=20.0)
69  parser.add_argument('--density', '-d', dest="density",
70  help='file containing dictionary of density custom ranges',
71  default=None)
72  parser.add_argument('--gnuplot', '-gp', dest="gnuplot",
73  help="plotting automatically with gnuplot", default=False,
74  action='store_true')
75  return parser.parse_args()
76 
77 
78 def make_cluster_centroid(infname, frame, outfname, cluster_index,
79  cluster_size, precision, density, path):
80 
81  import RMF
82  # If we have new enough IMP/RMF, do our own RMF slicing with provenance
83  if hasattr(RMF.NodeHandle, 'replace_child'):
84  print(infname, outfname)
85  inr = RMF.open_rmf_file_read_only(infname)
86  outr = RMF.create_rmf_file(outfname)
87  cpf = RMF.ClusterProvenanceFactory(outr)
88  RMF.clone_file_info(inr, outr)
89  RMF.clone_hierarchy(inr, outr)
90  RMF.clone_static_frame(inr, outr)
91  inr.set_current_frame(RMF.FrameID(frame))
92  outr.add_frame("f0")
93  RMF.clone_loaded_frame(inr, outr)
94  rn = outr.get_root_node()
95  children = rn.get_children()
96  if len(children) == 0:
97  return
98  rn = children[0] # Should be the top-level IMP node
99  prov = [c for c in rn.get_children() if c.get_type() == RMF.PROVENANCE]
100  if not prov:
101  return
102  prov = prov[0]
103  # Add cluster-provenance info
104  newp = rn.replace_child(prov, "cluster.%d" % cluster_index,
105  RMF.PROVENANCE)
106  cp = cpf.get(newp)
107  cp.set_members(cluster_size)
108  cp.set_precision(precision)
109  cp.set_density(os.path.abspath(density))
110  else:
111  # Otherwise, fall back to RMF's command line tool
112  import subprocess
113  print(infname, frame, outfname)
114  subprocess.call(['rmf_slice', path + infname, '-f', str(frame),
115  outfname])
116 
117 
118 def main():
119  args = parse_args()
120 
121  import os
122  import shutil
123  import numpy
124 
125  import scipy as sp
126  from scipy import spatial
127 
128  import IMP.sampcon
129  from IMP.sampcon import scores_convergence, clustering_rmsd
130  from IMP.sampcon import rmsd_calculation, precision_rmsd
131 
132  import IMP
133 
134  idfile_A = "Identities_A.txt"
135  idfile_B = "Identities_B.txt"
136 
137  #Step 0: Compute Score convergence
138  score_A = []
139  score_B = []
140 
141  with open(os.path.join(args.path, args.scoreA), 'r') as f:
142  for line in f:
143  score_A.append(float(line.strip("\n")))
144 
145  with open(os.path.join(args.path, args.scoreB), 'r') as f:
146  for line in f:
147  score_B.append(float(line.strip("\n")))
148 
149  scores = score_A + score_B
150 
151  # Get the convergence of the best score
152  scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)
153 
154  # Check if the two score distributions are similar
155  scores_convergence.get_scores_distributions_KS_Stats(
156  score_A, score_B, 100, args.sysname)
157 
158  #Step 1: Compute RMSD matrix
159  if args.extension == "pdb":
160  ps_names = [] # bead names are not stored in PDB files
161  symm_groups = None
162  conforms, masses, radii, models_name = \
163  rmsd_calculation.get_pdbs_coordinates(
164  args.path, idfile_A, idfile_B)
165  else:
166  args.extension = "rmf3"
167  # If we have a single RMF file, read conformations from that
168  if args.rmf_A is not None:
169  (ps_names, masses, radii, conforms, symm_groups, models_name,
170  n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
171  args.path, args.rmf_A, args.rmf_B, args.subunit,
172  args.symmetry_groups)
173 
174  # If not, default to the Identities.txt file
175  else:
176  symm_groups = None
177  (ps_names, masses, radii, conforms,
178  models_name) = rmsd_calculation.get_rmfs_coordinates(
179  args.path, idfile_A, idfile_B, args.subunit)
180 
181  print("Size of conformation matrix", conforms.shape)
182 
183  if not args.skip_sampling_precision:
184  # get_rmsds_matrix modifies conforms, so save it to a file and restore
185  # afterwards (so that we retain the original IMP orientation)
186  numpy.save("conforms", conforms)
187  inner_data = rmsd_calculation.get_rmsds_matrix(
188  conforms, args.mode, args.align, args.cores, symm_groups)
189  print("Size of RMSD matrix (flattened):", inner_data.shape)
190  del conforms
191  conforms = numpy.load("conforms.npy")
192  os.unlink('conforms.npy')
193 
194  import pyRMSD.RMSDCalculator
195  from pyRMSD.matrixHandler import MatrixHandler
196  mHandler = MatrixHandler()
197  mHandler.loadMatrix("Distances_Matrix.data")
198 
199  rmsd_matrix = mHandler.getMatrix()
200  distmat = rmsd_matrix.get_data()
201 
202  distmat_full = sp.spatial.distance.squareform(distmat)
203  print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
204 
205  # Get model lists
206  if args.rmf_A is not None:
207  sampleA_all_models = list(range(n_models[0]))
208  sampleB_all_models = list(range(n_models[0], n_models[1] + n_models[0]))
209  total_num_models = n_models[1] + n_models[0]
210  else:
211  (sampleA_all_models,
212  sampleB_all_models) = clustering_rmsd.get_sample_identity(
213  idfile_A, idfile_B)
214  total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
215  all_models = list(sampleA_all_models) + list(sampleB_all_models)
216  print("Size of Sample A:", len(sampleA_all_models),
217  " ; Size of Sample B: ", len(sampleB_all_models),
218  "; Total", total_num_models)
219 
220  if not args.skip_sampling_precision:
221 
222  print("Calculating sampling precision")
223 
224  # Step 2: Cluster at intervals of grid size to get the
225  # sampling precision
226  gridSize = args.gridsize
227 
228  # Get cutoffs for clustering
229  cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
230  print("Clustering at thresholds:", cutoffs_list)
231 
232  # Do clustering at each cutoff
233  pvals, cvs, percents = clustering_rmsd.get_clusters(
234  cutoffs_list, distmat_full, all_models, total_num_models,
235  sampleA_all_models, sampleB_all_models, args.sysname)
236 
237  # Now apply the rule for selecting the right precision based
238  # on population of contingency table, pvalue and cramersv
239  (sampling_precision, pval_converged, cramersv_converged,
240  percent_converged) = clustering_rmsd.get_sampling_precision(
241  cutoffs_list, pvals, cvs, percents)
242 
243  # Output test statistics
244  with open("%s.Sampling_Precision_Stats.txt"
245  % args.sysname, 'w+') as fpv:
246  print("The sampling precision is defined as the largest allowed "
247  "RMSD between the cluster centroid and a ", args.sysname,
248  "model within any cluster in the finest clustering for which "
249  "each sample contributes models proportionally to its size "
250  "(considering both significance and magnitude of the "
251  "difference) and for which a sufficient proportion of all "
252  "models occur in sufficiently large clusters. The sampling "
253  "precision for our ", args.sysname,
254  " modeling is %.3f" % (sampling_precision), " A.", file=fpv)
255 
256  print("Sampling precision, P-value, Cramer's V and percentage "
257  "of clustered models below:", file=fpv)
258  print("%.3f\t%.3f\t%.3f\t%.3f"
259  % (sampling_precision, pval_converged, cramersv_converged,
260  percent_converged), file=fpv)
261  print("", file=fpv)
262 
263  final_clustering_threshold = sampling_precision
264 
265  else:
266  final_clustering_threshold = args.cluster_threshold
267 
268  # Perform final clustering at the required precision
269  print("Clustering at threshold %.3f" % final_clustering_threshold)
270  (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
271  distmat_full, total_num_models, final_clustering_threshold)
272 
273  (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
274  len(cluster_centers), cluster_members, all_models,
275  sampleA_all_models, sampleB_all_models)
276  print("Contingency table:", ctable)
277  # Output the number of models in each cluster and each sample
278  with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
279  for rows in range(len(ctable)):
280  print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
281 
282  # Obtain the subunits for which we need to calculate densities
283  density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
284 
285  # Output cluster precisions
286  fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')
287 
288  # For each cluster, output the models in the cluster
289  # Also output the densities for the cluster models
290  for i in range(len(retained_clusters)):
291  clus = retained_clusters[i]
292 
293  # The cluster centroid is the first conformation.
294  # We use this as to align and compute RMSD/precision
295  conform_0 = conforms[all_models[cluster_members[clus][0]]]
296 
297  # create a directory for the cluster
298  if not os.path.exists("./cluster.%s" %i):
299  os.mkdir("./cluster.%s" %i)
300  os.mkdir("./cluster.%s/Sample_A/" % i)
301  os.mkdir("./cluster.%s/Sample_B/" % i)
302  else:
303  shutil.rmtree("./cluster.%s" %i)
304  os.mkdir("./cluster.%s" %i)
305  os.mkdir("./cluster.%s/Sample_A/" % i)
306  os.mkdir("./cluster.%s/Sample_B/" % i)
307 
308  # Create densities for all subunits for both sample A and sample B
309  # as well as separately.
310  gmd1 = precision_rmsd.GetModelDensity(
311  custom_ranges=density_custom_ranges,
312  resolution=args.density_threshold, voxel=args.voxel,
313  bead_names=ps_names)
314  gmd2 = precision_rmsd.GetModelDensity(
315  custom_ranges=density_custom_ranges,
316  resolution=args.density_threshold, voxel=args.voxel,
317  bead_names=ps_names)
318  gmdt = precision_rmsd.GetModelDensity(
319  custom_ranges=density_custom_ranges,
320  resolution=args.density_threshold, voxel=args.voxel,
321  bead_names=ps_names)
322 
323  # Also output the identities of cluster members
324  both_file = open('cluster.'+str(i)+'.all.txt', 'w')
325  sampleA_file = open('cluster.'+str(i)+'.sample_A.txt', 'w')
326  sampleB_file = open('cluster.'+str(i)+'.sample_B.txt', 'w')
327 
328  # Create a model with just the cluster_member particles
329  model = IMP.Model()
330  ps = [] # particle list to be updated by each RMF frame
331  for pi in range(len(conform_0)):
332  p = IMP.Particle(model, "%s" % str(pi))
333  IMP.core.XYZ.setup_particle(p, (0,0,0))
334  IMP.core.XYZR.setup_particle(p, float(radii[pi]))
335  IMP.atom.Mass.setup_particle(p, float(masses[pi]))
336  ps.append(p)
337 
338  # Obtain cluster precision by obtaining average RMSD of each model
339  # to the cluster center
340  cluster_precision = 0.0
341 
342  # transformation from internal pyRMSD orientation
343  trans = None
344  # for each model in the cluster
345  for mem in cluster_members[clus]:
346 
347  model_index = all_models[mem]
348 
349  # get superposition of each model to cluster center and the
350  # RMSD between the two
351  if args.symmetry_groups:
352  rmsd, superposed_ps, trans = \
353  precision_rmsd.get_particles_from_superposed_amb(
354  conforms[model_index], conform_0, args.align, ps, trans,
355  symm_groups)
356  else:
357  rmsd, superposed_ps, trans = \
358  precision_rmsd.get_particles_from_superposed(
359  conforms[model_index], conform_0, args.align, ps, trans)
360 
361  model.update() # why not?
362 
363  cluster_precision += rmsd
364 
365  # Add the superposed particles to the respective density maps
366  gmdt.add_subunits_density(superposed_ps) # total density map
367  print(model_index, file=both_file)
368 
369  if model_index in sampleA_all_models:
370  # density map for sample A
371  gmd1.add_subunits_density(superposed_ps)
372  print(model_index, file=sampleA_file)
373  else:
374  # density map for sample B
375  gmd2.add_subunits_density(superposed_ps)
376  print(model_index, file=sampleB_file)
377 
378  cluster_precision /= float(len(cluster_members[clus]) - 1.0)
379 
380  print("Cluster precision (average distance to cluster centroid) "
381  "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
382  file=fpc)
383 
384  both_file.close()
385  sampleA_file.close()
386  sampleB_file.close()
387 
388  # Output density files for the cluster
389  density = gmdt.write_mrc(path="./cluster.%s" % i, file_prefix = "LPD")
390  gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i, file_prefix = "LPD")
391  gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i, file_prefix = "LPD")
392 
393  # Add the cluster center model RMF to the cluster directory
394  cluster_center_index = cluster_members[clus][0]
395  if args.rmf_A is not None:
396  cluster_center_model_id = cluster_center_index
397  if cluster_center_index < n_models[0]:
398  make_cluster_centroid(os.path.join(args.path, args.rmf_A),
399  cluster_center_index,
400  os.path.join("cluster.%d" % i,
401  "cluster_center_model.rmf3"),
402  i, len(cluster_members[clus]),
403  cluster_precision, density, args.path)
404  else:
405  make_cluster_centroid(os.path.join(args.path, args.rmf_B),
406  cluster_center_index - n_models[0],
407  os.path.join("cluster.%d" % i,
408  "cluster_center_model.rmf3"),
409  i, len(cluster_members[clus]),
410  cluster_precision, density, args.path)
411  else:
412  # index to Identities file.
413  cluster_center_model_id = all_models[cluster_center_index]
414  outfname = os.path.join("cluster.%d" % i,
415  "cluster_center_model." + args.extension)
416  if 'rmf' in args.extension:
417  make_cluster_centroid(
418  models_name[cluster_center_model_id], 0, outfname,
419  i, len(cluster_members[clus]),
420  cluster_precision, density, args.path)
421  else:
422  shutil.copy(models_name[cluster_center_model_id], outfname)
423 
424  fpc.close()
425 
426  # generate plots for the score and structure tests
427  if args.gnuplot:
428  import subprocess
429  import glob
430 
431  thisdir = os.path.dirname(__file__)
432  gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
433  for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
434  cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
435  print(" ".join(cmd))
436  subprocess.check_call(cmd)
437 
438 if __name__ == '__main__':
439  main()
def get_data_path
Return the full path to one of this module's data files.
static XYZR setup_particle(Model *m, ParticleIndex pi)
Definition: XYZR.h:48
static XYZ setup_particle(Model *m, ParticleIndex pi)
Definition: XYZ.h:51
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:72
static Mass setup_particle(Model *m, ParticleIndex pi, Float mass)
Definition: Mass.h:48
Class to handle individual particles of a Model object.
Definition: Particle.h:41
Sampling exhaustiveness protocol.