IMP Reference Guide  2.13.0
The Integrative Modeling Platform
exhaust.py
from __future__ import print_function
from IMP import ArgumentParser

__doc__ = "Perform analysis to determine sampling convergence."

#############################################################
# Scripts written by Shruthi Viswanath and Ilan E. Chemmama #
# in Andrej Sali Lab at UCSF.                               #
# Based on Viswanath, Chemmama et al. Biophys. J. (2017)    #
#                                                           #
#############################################################

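# A sketch of one possible invocation of this analysis (the system name,
# paths and file names below are hypothetical; in recent IMP releases the
# script is usually run through the imp_sampcon command-line wrapper):
#
#   imp_sampcon exhaust -n mysystem -p ./good_scoring_models \
#       -m cpu_omp -c 8 -a -g 2.0 -gp -d density_ranges.txt
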
def parse_args():
    parser = ArgumentParser(
        description="First stages of analysis for assessing sampling "
                    "convergence")
    parser.add_argument('--sysname', '-n', dest="sysname",
                        help='name of the system', default="")
    parser.add_argument('--path', '-p', dest="path",
                        help='path to the good-scoring models', default="./")
    parser.add_argument('--extension', '-e', dest="extension",
                        help='extension of the file', choices=['rmf', 'pdb'],
                        default="rmf")
    parser.add_argument('--mode', '-m', dest="mode", help='pyRMSD calculator',
                        choices=['cuda', 'cpu_omp', 'cpu_serial'],
                        default="cuda")
    parser.add_argument('--cores', '-c', dest="cores", type=int,
                        help='number of cores for RMSD matrix calculations; '
                             'only used with cpu_omp', default=1)
    parser.add_argument('--subunit', '-su', dest="subunit",
                        help='calculate RMSD/sampling and cluster precision/'
                             'densities etc. over this subunit only',
                        default=None)
    parser.add_argument('--align', '-a', dest="align",
                        help='boolean flag to allow superposition of models',
                        default=False, action='store_true')
    parser.add_argument('--scoreA', '-sa', dest="scoreA",
                        help='name of the file containing the good-scoring '
                             'scores for sample A', default="scoresA.txt")
    parser.add_argument('--scoreB', '-sb', dest="scoreB",
                        help='name of the file containing the good-scoring '
                             'scores for sample B', default="scoresB.txt")
    parser.add_argument('--rmfA', '-ra', dest="rmf_A",
                        help='RMF file with conformations from sample A',
                        default=None)
    parser.add_argument('--rmfB', '-rb', dest="rmf_B",
                        help='RMF file with conformations from sample B',
                        default=None)
    parser.add_argument('--gridsize', '-g', dest="gridsize", type=float,
                        help='grid size for calculating sampling precision',
                        default=10.0)
    parser.add_argument('--skip', '-s', dest="skip_sampling_precision",
                        help="Bypass the calculation of sampling precision. "
                             "This option must be used together with the "
                             "clustering threshold option. Otherwise, by "
                             "default, the sampling precision is calculated "
                             "and used as the clustering threshold.",
                        default=False, action='store_true')
    parser.add_argument('--cluster_threshold', '-ct',
                        dest="cluster_threshold", type=float,
                        help='final clustering threshold to visualize '
                             'clusters. Assumes that the user has previously '
                             'calculated the sampling precision and wants '
                             'clusters defined at a threshold higher than '
                             'the sampling precision for ease of analysis '
                             '(fewer clusters).', default=30.0)
    parser.add_argument('--voxel', '-v', dest="voxel", type=float,
                        help='voxel size for the localization densities',
                        default=5.0)
    parser.add_argument('--density_threshold', '-dt', type=float,
                        dest="density_threshold",
                        help='threshold for localization densities',
                        default=20.0)
    parser.add_argument('--density', '-d', dest="density",
                        help='file containing a dictionary of density custom '
                             'ranges', default=None)
    parser.add_argument('--gnuplot', '-gp', dest="gnuplot",
                        help="plot automatically with gnuplot",
                        default=False, action='store_true')
    return parser.parse_args()

def make_cluster_centroid(infname, frame, outfname, cluster_index,
                          cluster_size):
    import RMF
    # If we have a new enough IMP/RMF, do our own RMF slicing with provenance
    if hasattr(RMF.NodeHandle, 'replace_child'):
        print(infname, outfname)
        inr = RMF.open_rmf_file_read_only(infname)
        outr = RMF.create_rmf_file(outfname)
        cpf = RMF.ClusterProvenanceFactory(outr)
        RMF.clone_file_info(inr, outr)
        RMF.clone_hierarchy(inr, outr)
        RMF.clone_static_frame(inr, outr)
        inr.set_current_frame(RMF.FrameID(frame))
        outr.add_frame("f0")
        RMF.clone_loaded_frame(inr, outr)
        rn = outr.get_root_node()
        children = rn.get_children()
        if len(children) == 0:
            return
        rn = children[0]  # Should be the top-level IMP node
        prov = [c for c in rn.get_children()
                if c.get_type() == RMF.PROVENANCE]
        if not prov:
            return
        prov = prov[0]
        # Add cluster-provenance info
        newp = rn.replace_child(prov, "cluster.%d" % cluster_index,
                                RMF.PROVENANCE)
        cp = cpf.get(newp)
        cp.set_members(cluster_size)
    else:
        # Otherwise, fall back to RMF's command-line tool
        import subprocess
        print(infname, frame, outfname)
        subprocess.call(['rmf_slice', infname, '-f', str(frame), outfname])


def main():
    args = parse_args()

    import os
    import shutil
    import numpy

    import scipy as sp
    from scipy import spatial  # noqa: F401 (makes sp.spatial available)

    import IMP.sampcon
    from IMP.sampcon import scores_convergence, clustering_rmsd
    from IMP.sampcon import rmsd_calculation, precision_rmsd

    import IMP
    import IMP.atom   # for the Mass decorator used below
    import IMP.core   # for the XYZ/XYZR decorators used below

    idfile_A = "Identities_A.txt"
    idfile_B = "Identities_B.txt"

    # Step 0: compute score convergence
    score_A = []
    score_B = []

    with open(os.path.join(args.path, args.scoreA), 'r') as f:
        for line in f:
            score_A.append(float(line.strip("\n")))

    with open(os.path.join(args.path, args.scoreB), 'r') as f:
        for line in f:
            score_B.append(float(line.strip("\n")))

    scores = score_A + score_B

    # Get the convergence of the best score
    scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)

    # Check if the two score distributions are similar
    scores_convergence.get_scores_distributions_KS_Stats(
        score_A, score_B, 100, args.sysname)

    # Step 1: compute the RMSD matrix
    if args.extension == "pdb":
        ps_names = []  # bead names are not stored in PDB files
        conforms, masses, radii, models_name = \
            rmsd_calculation.get_pdbs_coordinates(
                args.path, idfile_A, idfile_B)
    else:
        args.extension = "rmf3"
        # If we have a single RMF file, read conformations from that
        if args.rmf_A is not None:
            (ps_names, masses, radii, conforms, models_name, n_models) = \
                rmsd_calculation.get_rmfs_coordinates_one_rmf(
                    args.path, args.rmf_A, args.rmf_B, args.subunit)
        # If not, default to the Identities.txt file
        else:
            (ps_names, masses, radii, conforms,
             models_name) = rmsd_calculation.get_rmfs_coordinates(
                args.path, idfile_A, idfile_B, args.subunit)

    print("Size of conformation matrix", conforms.shape)
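    # conforms is expected to hold one set of bead coordinates per model
    # (number of models x number of beads x 3), covering the good-scoring
    # models from both samples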

    if not args.skip_sampling_precision:
        # get_rmsds_matrix modifies conforms, so save it to a file and restore
        # it afterwards (so that we retain the original IMP orientation)
        numpy.save("conforms", conforms)
        inner_data = rmsd_calculation.get_rmsds_matrix(
            conforms, args.mode, args.align, args.cores)
        print("Size of RMSD matrix (flattened):", inner_data.shape)
        del conforms
        conforms = numpy.load("conforms.npy")
        os.unlink('conforms.npy')

    import pyRMSD.RMSDCalculator
    from pyRMSD.matrixHandler import MatrixHandler
    mHandler = MatrixHandler()
    mHandler.loadMatrix("Distances_Matrix.data")

    rmsd_matrix = mHandler.getMatrix()
    distmat = rmsd_matrix.get_data()

    distmat_full = sp.spatial.distance.squareform(distmat)
    print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
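    # The pairwise RMSDs are stored in condensed form (the flattened upper
    # triangle of the matrix); scipy's squareform expands that vector into
    # the full symmetric N x N matrix used for the clustering steps below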

    # Get model lists
    if args.rmf_A is not None:
        # wrap in list() so the two samples can be concatenated below
        sampleA_all_models = list(range(n_models[0]))
        sampleB_all_models = list(range(n_models[0],
                                        n_models[1] + n_models[0]))
        total_num_models = n_models[1] + n_models[0]
    else:
        (sampleA_all_models,
         sampleB_all_models) = clustering_rmsd.get_sample_identity(
            idfile_A, idfile_B)
        total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
    all_models = sampleA_all_models + sampleB_all_models
    print("Size of Sample A:", len(sampleA_all_models),
          "; Size of Sample B:", len(sampleB_all_models),
          "; Total:", total_num_models)

    if not args.skip_sampling_precision:

        print("Calculating sampling precision")

        # Step 2: cluster at intervals of the grid size to get the
        # sampling precision
        gridSize = args.gridsize

        # Get cutoffs for clustering
        cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
        print("Clustering at thresholds:", cutoffs_list)

        # Do clustering at each cutoff
        pvals, cvs, percents = clustering_rmsd.get_clusters(
            cutoffs_list, distmat_full, all_models, total_num_models,
            sampleA_all_models, sampleB_all_models, args.sysname)

        # Now apply the rule for selecting the right precision, based on
        # the population of the contingency table, the p-value and Cramer's V
        (sampling_precision, pval_converged, cramersv_converged,
         percent_converged) = clustering_rmsd.get_sampling_precision(
            cutoffs_list, pvals, cvs, percents)

        # Output test statistics
        with open("%s.Sampling_Precision_Stats.txt"
                  % args.sysname, 'w+') as fpv:
            print("The sampling precision is defined as the largest allowed "
                  "RMSD between the cluster centroid and a ", args.sysname,
                  "model within any cluster in the finest clustering for "
                  "which each sample contributes models proportionally to "
                  "its size (considering both significance and magnitude of "
                  "the difference) and for which a sufficient proportion of "
                  "all models occur in sufficiently large clusters. The "
                  "sampling precision for our ", args.sysname,
                  " modeling is %.3f" % sampling_precision, " A.", file=fpv)

            print("Sampling precision, P-value, Cramer's V and percentage "
                  "of clustered models below:", file=fpv)
            print("%.3f\t%.3f\t%.3f\t%.3f"
                  % (sampling_precision, pval_converged, cramersv_converged,
                     percent_converged), file=fpv)
            print("", file=fpv)

        final_clustering_threshold = sampling_precision

    else:
        final_clustering_threshold = args.cluster_threshold

    # Perform the final clustering at the required precision
    print("Clustering at threshold %.3f" % final_clustering_threshold)
    (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
        distmat_full, total_num_models, final_clustering_threshold)

    (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
        len(cluster_centers), cluster_members, all_models,
        sampleA_all_models, sampleB_all_models)
    print("Contingency table:", ctable)

    # Output the number of models in each cluster and each sample
    with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
        for rows in range(len(ctable)):
            print(rows, ctable[rows][0], ctable[rows][1], file=fcp)

    # Obtain the subunits for which we need to calculate densities
    density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
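    # Note: when --density is given, the file is expected to define a
    # dictionary mapping each output density name to a list of selections,
    # for example (hypothetical component names; see the IMP.sampcon
    # documentation for the exact syntax):
    #   density_custom_ranges = {"ProtA": ["ProtA"], "ProtB": ["ProtB"]}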

    # Output cluster precisions
    fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')

    # For each cluster, output the models in the cluster.
    # Also output the densities for the cluster models.
    for i in range(len(retained_clusters)):
        clus = retained_clusters[i]

        # The cluster centroid is the first conformation.
        # We use it to align the other members and compute RMSD/precision.
        conform_0 = conforms[all_models[cluster_members[clus][0]]]

        # Create a fresh directory for the cluster
        if not os.path.exists("./cluster.%s" % i):
            os.mkdir("./cluster.%s" % i)
            os.mkdir("./cluster.%s/Sample_A/" % i)
            os.mkdir("./cluster.%s/Sample_B/" % i)
        else:
            shutil.rmtree("./cluster.%s" % i)
            os.mkdir("./cluster.%s" % i)
            os.mkdir("./cluster.%s/Sample_A/" % i)
            os.mkdir("./cluster.%s/Sample_B/" % i)

        # Create localization densities for all subunits: one for sample A,
        # one for sample B, and one for both samples combined
        gmd1 = precision_rmsd.GetModelDensity(
            custom_ranges=density_custom_ranges,
            resolution=args.density_threshold, voxel=args.voxel,
            bead_names=ps_names)
        gmd2 = precision_rmsd.GetModelDensity(
            custom_ranges=density_custom_ranges,
            resolution=args.density_threshold, voxel=args.voxel,
            bead_names=ps_names)
        gmdt = precision_rmsd.GetModelDensity(
            custom_ranges=density_custom_ranges,
            resolution=args.density_threshold, voxel=args.voxel,
            bead_names=ps_names)

        # Also output the identities of the cluster members
        both_file = open('cluster.' + str(i) + '.all.txt', 'w')
        sampleA_file = open('cluster.' + str(i) + '.sample_A.txt', 'w')
        sampleB_file = open('cluster.' + str(i) + '.sample_B.txt', 'w')

        # Add the cluster center model RMF to the cluster directory
        cluster_center_index = cluster_members[clus][0]
        if args.rmf_A is not None:
            cluster_center_model_id = cluster_center_index
            if cluster_center_index < n_models[0]:
                make_cluster_centroid(
                    args.rmf_A, cluster_center_index,
                    os.path.join("cluster.%d" % i,
                                 "cluster_center_model.rmf"),
                    i, len(cluster_members[clus]))
            else:
                make_cluster_centroid(
                    args.rmf_B, cluster_center_index - n_models[0],
                    os.path.join("cluster.%d" % i,
                                 "cluster_center_model.rmf"),
                    i, len(cluster_members[clus]))
        else:
            # Index into the Identities file
            cluster_center_model_id = all_models[cluster_center_index]
            outfname = os.path.join("cluster.%d" % i,
                                    "cluster_center_model." + args.extension)
            if 'rmf' in args.extension:
                make_cluster_centroid(
                    models_name[cluster_center_model_id], 0, outfname,
                    i, len(cluster_members[clus]))
            else:
                shutil.copy(models_name[cluster_center_model_id], outfname)

        # Create a model with just the cluster_member particles
        model = IMP.Model()
        ps = []  # particle list, updated by each RMF frame
        for pi in range(len(conform_0)):
            p = IMP.Particle(model, "%s" % str(pi))
            IMP.core.XYZ.setup_particle(p, (0, 0, 0))
            IMP.core.XYZR.setup_particle(p, float(radii[pi]))
            IMP.atom.Mass.setup_particle(p, float(masses[pi]))
            ps.append(p)

        # Obtain the cluster precision as the average RMSD of each model
        # to the cluster center
        cluster_precision = 0.0

        # transformation from the internal pyRMSD orientation
        trans = None
        # for each model in the cluster
        for mem in cluster_members[clus]:

            model_index = all_models[mem]

            # Superpose each model onto the cluster center and get the
            # RMSD between the two
            rmsd, superposed_ps, trans = \
                precision_rmsd.get_particles_from_superposed(
                    conforms[model_index], conform_0, args.align, ps, trans)

            # propagate the updated coordinates through the model
            model.update()

            cluster_precision += rmsd

            # Add the superposed particles to the respective density maps
            gmdt.add_subunits_density(superposed_ps)  # total density map
            print(model_index, file=both_file)

            if model_index in sampleA_all_models:
                # density map for sample A
                gmd1.add_subunits_density(superposed_ps)
                print(model_index, file=sampleA_file)
            else:
                # density map for sample B
                gmd2.add_subunits_density(superposed_ps)
                print(model_index, file=sampleB_file)

        # Average over all members other than the centroid itself
        cluster_precision /= float(len(cluster_members[clus]) - 1.0)

        print("Cluster precision (average distance to cluster centroid) "
              "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
              file=fpc)

        both_file.close()
        sampleA_file.close()
        sampleB_file.close()

        # Finally, output the density files for the cluster
        gmdt.write_mrc(path="./cluster.%s" % i, file_prefix="LPD")
        gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i, file_prefix="LPD")
        gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i, file_prefix="LPD")
    fpc.close()

    # Generate plots for the score and structure tests
    if args.gnuplot:
        import subprocess
        import glob

        thisdir = os.path.dirname(__file__)
        gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
        for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
            cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
            print(" ".join(cmd))
            subprocess.check_call(cmd)


if __name__ == '__main__':
    main()