from __future__ import print_function
from IMP import ArgumentParser
import subprocess
import os

__doc__ = "Perform analysis to determine sampling convergence."
def parse_args():
    parser = ArgumentParser(
        description="First stages of analysis for assessing sampling "
                    "convergence")
    parser.add_argument(
        '--sysname', '-n', dest="sysname",
        help='name of the system', default="")
    parser.add_argument(
        '--path', '-p', dest="path",
        help='path to the good-scoring models', default="./")
    parser.add_argument(
        '--extension', '-e', dest="extension",
        help='extension of the file', choices=['rmf', 'pdb'], default="rmf")
    parser.add_argument(
        '--mode', '-m', dest="mode", help='pyRMSD calculator',
        choices=['cuda', 'cpu_omp', 'cpu_serial'], default="cuda")
    parser.add_argument(
        '--matrix-cores', '-c', dest="cores", type=int,
        help='number of cores for parallel RMSD matrix calculations; '
             'only for cpu_omp', default=1)
    parser.add_argument(
        '--cluster-cores', '-cc', dest="cores2", type=int,
        help='number of cores for clustering at different thresholds'
             ' and parallel IO; only for cpu_omp', default=1)
    parser.add_argument(
        '--resolution', '-r', dest="resolution", type=int,
        help='resolution at which to select proteins in a multiscale system',
        default=1)
    parser.add_argument(
        '--subunit', '-su', dest="subunit",
        help='calculate RMSD/sampling and cluster precision/densities '
             'etc over this subunit only', default=None)
    parser.add_argument(
        '--align', '-a', dest="align",
        help='boolean flag to allow superposition of models',
        default=False, action='store_true')
    parser.add_argument(
        '--ambiguity', '-amb', dest="symmetry_groups",
        help='file containing symmetry groups', default=None)
    parser.add_argument(
        '--scoreA', '-sa', dest="scoreA",
        help='name of the file having the good-scoring scores for sample A',
        default="scoresA.txt")
    parser.add_argument(
        '--scoreB', '-sb', dest="scoreB",
        help='name of the file having the good-scoring scores for sample B',
        default="scoresB.txt")
    parser.add_argument(
        '--rmfA', '-ra', dest="rmf_A",
        help='RMF file with conformations from Sample A', default=None)
    parser.add_argument(
        '--rmfB', '-rb', dest="rmf_B",
        help='RMF file with conformations from Sample B', default=None)
    parser.add_argument(
        '--gridsize', '-g', dest="gridsize", type=float,
        help='grid size for calculating sampling precision', default=10.0)
    parser.add_argument(
        '--skip', '-s', dest="skip_sampling_precision",
        help="This option will bypass the calculation of sampling "
             "precision. This option needs to be used with the clustering "
             "threshold option. Otherwise by default, sampling precision "
             "is calculated and the clustering threshold is the "
             "calculated sampling precision.", default=False,
        action='store_true')
    parser.add_argument(
        '--cluster_threshold', '-ct', dest="cluster_threshold", type=float,
        help='final clustering threshold to visualize clusters. Assumes '
             'that the user has previously calculated sampling precision '
             'and wants clusters defined at a threshold higher than the '
             'sampling precision for ease of analysis (lesser number of '
             'clusters).', default=30.0)
    parser.add_argument(
        '--voxel', '-v', dest="voxel", type=float,
        help='voxel size for the localization densities', default=5.0)
    parser.add_argument(
        '--density_threshold', '-dt', type=float,
        dest="density_threshold",
        help='threshold for localization densities', default=20.0)
    parser.add_argument(
        '--density', '-d', dest="density",
        help='file containing dictionary of density custom ranges',
        default=None)
    parser.add_argument(
        '--gnuplot', '-gp', dest="gnuplot",
        help="plotting automatically with gnuplot", default=False,
        action='store_true')
    parser.add_argument(
        '--selection', '-sn', dest="selection",
        help='file containing dictionary '
             'of selected subunits and residues '
             'for RMSD and clustering calculation; '
             "each entry in the dictionary takes the form "
             "'selection name': [(residue_start, residue_end, protein name)]",
        default=None)
    parser.add_argument(
        '--prism', '-pr', dest="prism",
        help="Save input files for PrISM", default=False,
        action='store_true')
    return parser.parse_args()
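
# Note on the --density and --selection options above: both expect a text
# file containing a dictionary of custom ranges, one entry per selection, as
# described in their help strings. A minimal sketch of such a file is shown
# below; the key names, protein names and residue ranges are illustrative
# only, and the exact syntax accepted is whatever
# precision_rmsd.parse_custom_ranges() understands:
#
#   density_custom_ranges = {"Rpb4": [(1, 221, "Rpb4")],
#                            "Rpb4_Rpb7": [(1, 221, "Rpb4"),
#                                          (1, 171, "Rpb7")]}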
def make_cluster_centroid(infname, frame, outfname, cluster_index,
                          cluster_size, precision, metadata_fname, path):

    import RMF
    # If RMF is new enough, do our own slicing of the input file so that
    # cluster provenance can be added to the centroid model
    if hasattr(RMF.NodeHandle, 'replace_child'):
        print(infname, outfname)
        inr = RMF.open_rmf_file_read_only(infname)
        outr = RMF.create_rmf_file(outfname)
        cpf = RMF.ClusterProvenanceFactory(outr)
        RMF.clone_file_info(inr, outr)
        RMF.clone_hierarchy(inr, outr)
        RMF.clone_static_frame(inr, outr)
        inr.set_current_frame(RMF.FrameID(frame))
        outr.add_frame("f0", RMF.FRAME)
        RMF.clone_loaded_frame(inr, outr)
        rn = outr.get_root_node()
        children = rn.get_children()
        if len(children) == 0:
            return
        rn = children[0]  # the top-level node containing the hierarchy
        # Replace any existing provenance with cluster provenance
        prov = [c for c in rn.get_children()
                if c.get_type() == RMF.PROVENANCE]
        if not prov:
            return
        prov = prov[0]
        newp = rn.replace_child(
            prov, "cluster.%d" % cluster_index, RMF.PROVENANCE)
        cp = cpf.get(newp)
        cp.set_members(cluster_size)
        cp.set_precision(precision)
        cp.set_density(os.path.abspath(metadata_fname))
    else:
        # Otherwise, fall back to RMF's command-line rmf_slice tool
        # (no provenance information is added in this case)
        print(infname, frame, outfname)
        subprocess.call(['rmf_slice', path + infname, '-f', str(frame),
                         outfname])
def _read_scores(fname, sample_name):
    # Read one score per line from the given file into a list
    scores = []
    with open(fname, 'r') as f:
        for line in f:
            scores.append(float(line.strip("\n")))
    if len(scores) == 0:
        raise ValueError("%s (file %s) is empty" % (sample_name, fname))
    return scores
def main():
    args = parse_args()

    import shutil
    import json
    import numpy

    import scipy as sp
    import scipy.spatial.distance

    import IMP
    import IMP.algebra
    import IMP.atom
    import IMP.core
    import IMP.sampcon
    from IMP.sampcon import scores_convergence, clustering_rmsd
    from IMP.sampcon import rmsd_calculation, precision_rmsd

    # Metadata describing this analysis, written out as JSON at the end
    metadata_fname = "%s.output.json" % args.sysname
    metadata = {}
    metadata['producer'] = {'name': 'IMP.sampcon',
                            'version': IMP.sampcon.__version__}

    idfile_A = "Identities_A.txt"
    idfile_B = "Identities_B.txt"

    # Step 0: score convergence; read the good-scoring model scores
    score_A = _read_scores(os.path.join(args.path, args.scoreA), "Sample A")
    score_B = _read_scores(os.path.join(args.path, args.scoreB), "Sample B")
    scores = score_A + score_B
    # Check convergence of the top score as more models are considered
    scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)

    # Kolmogorov-Smirnov test to check whether the two score
    # distributions are similar
    scores_convergence.get_scores_distributions_KS_Stats(
        score_A, score_B, 100, args.sysname)

    # Step 1: get the coordinates of the good-scoring models
    if args.extension == "pdb":
        ps_names = []  # bead names are not stored in PDB files
        symm_groups = None
        conforms, masses, radii, models_name = \
            rmsd_calculation.get_pdbs_coordinates(
                args.path, idfile_A, idfile_B)
        metadata['input_frames'] = models_name
    else:
        args.extension = "rmf3"
        if args.selection is not None:
            rmsd_custom_ranges = \
                precision_rmsd.parse_custom_ranges(args.selection)
        else:
            rmsd_custom_ranges = None
        if args.rmf_A is not None:
            metadata['input_files'] = {'A': args.rmf_A, 'B': args.rmf_B}
            (ps_names, masses, radii, conforms, symm_groups, models_name,
             n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
                args.path, args.rmf_A, args.rmf_B,
                args.subunit,
                args.symmetry_groups,
                selection=rmsd_custom_ranges,
                resolution=args.resolution)
        else:
            symm_groups = None
            (ps_names, masses, radii, conforms,
             models_name) = rmsd_calculation.get_rmfs_coordinates(
                args.path, idfile_A, idfile_B, args.subunit,
                selection=rmsd_custom_ranges,
                resolution=args.resolution)
        metadata['input_frames'] = models_name

    print("Size of conformation matrix", conforms.shape)
    if not args.skip_sampling_precision:
        # get_rmsds_matrix modifies conforms, so save it to a file and
        # restore it afterwards
        numpy.save("conforms", conforms)
        inner_data = rmsd_calculation.get_rmsds_matrix(
            conforms, args.mode, args.align, args.cores, symm_groups)
        print("Size of RMSD matrix (flattened):", inner_data.shape)
        del conforms
        conforms = numpy.load("conforms.npy")
        os.unlink('conforms.npy')

    # Load the RMSD matrix written above (or by a previous run, if the
    # sampling precision calculation is being skipped)
    from pyRMSD.matrixHandler import MatrixHandler
    mHandler = MatrixHandler()
    mHandler.loadMatrix("Distances_Matrix.data")

    rmsd_matrix = mHandler.getMatrix()
    distmat = rmsd_matrix.get_data()

    distmat_full = sp.spatial.distance.squareform(distmat)
    print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
    # Get the indices of the models that belong to each sample
    if args.rmf_A is not None:
        sampleA_all_models = list(range(n_models[0]))
        sampleB_all_models = list(range(n_models[0],
                                        n_models[1] + n_models[0]))
        total_num_models = n_models[1] + n_models[0]
    else:
        (sampleA_all_models,
         sampleB_all_models) = clustering_rmsd.get_sample_identity(
            idfile_A, idfile_B)
        total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
    all_models = list(sampleA_all_models) + list(sampleB_all_models)
    print("Size of Sample A:", len(sampleA_all_models),
          " ; Size of Sample B: ", len(sampleB_all_models),
          "; Total", total_num_models)
    if not args.skip_sampling_precision:

        print("Calculating sampling precision")

        # Step 2: cluster at intervals of the grid size to determine
        # the sampling precision
        gridSize = args.gridsize

        # Get the cutoffs (clustering thresholds) to scan
        cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
        print("Clustering at thresholds:", cutoffs_list)

        # Cluster at each cutoff and get the chi-square test statistics
        # comparing the two samples
        pvals, cvs, percents = clustering_rmsd.get_clusters(
            cutoffs_list, distmat_full, all_models, total_num_models,
            sampleA_all_models, sampleB_all_models, args.sysname,
            args.cores2)
        metadata['chi_square_grid_stats'] = {
            'cutoffs': list(cutoffs_list),
            'p_values': pvals,
            'cramers_v': cvs,
            'percent_clustered': percents}

        # Determine the sampling precision: the finest clustering threshold
        # at which the convergence criteria are satisfied
        (sampling_precision, pval_converged, cramersv_converged,
         percent_converged) = clustering_rmsd.get_sampling_precision(
            cutoffs_list, pvals, cvs, percents)

        # Output the sampling precision
        with open("%s.Sampling_Precision_Stats.txt"
                  % args.sysname, 'w+') as fpv:
            print("The sampling precision is defined as the largest allowed "
                  "RMSD between the cluster centroid and a ", args.sysname,
                  "model within any cluster in the finest clustering for "
                  "which each sample contributes models proportionally to "
                  "its size (considering both significance and magnitude of "
                  "the difference) and for which a sufficient proportion of "
                  "all models occur in sufficiently large clusters. The "
                  "sampling precision for our ", args.sysname,
                  " modeling is %.3f" % (sampling_precision), " A.", file=fpv)

            print("Sampling precision, P-value, Cramer's V and percentage "
                  "of clustered models below:", file=fpv)
            print("%.3f\t%.3f\t%.3f\t%.3f"
                  % (sampling_precision, pval_converged, cramersv_converged,
                     percent_converged), file=fpv)

        final_clustering_threshold = sampling_precision
        metadata['precision'] = {
            'sampling_precision': sampling_precision,
            'p_value': pval_converged,
            'cramers_v': cramersv_converged,
            'percent_clustered': percent_converged}
    else:
        final_clustering_threshold = args.cluster_threshold

    metadata['clustering_threshold'] = final_clustering_threshold

    # Step 3: cluster the models at the final clustering threshold
    print("Clustering at threshold %.3f" % final_clustering_threshold)
    (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
        distmat_full, total_num_models, final_clustering_threshold)

    (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
        len(cluster_centers), cluster_members, all_models,
        sampleA_all_models, sampleB_all_models)
    print("Contingency table:", ctable)

    # Output the number of models from each sample in each cluster
    with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
        for rows in range(len(ctable)):
            print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
    # Obtain the subunits for which localization densities are calculated
    density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
    metadata['density_custom_ranges'] = density_custom_ranges

    # Output cluster precisions
    fpc = open("%s.Cluster_Precision.txt" % args.sysname, 'w+')

    metadata['clusters'] = []
    # Step 4: for each retained cluster, output the member models, the
    # cluster precision and the localization densities
    for i in range(len(retained_clusters)):
        cmeta = {'name': 'cluster.%d' % i}
        clus = retained_clusters[i]

        # The cluster centroid is the first member; use it as the reference
        # onto which the other members are superposed
        conform_0 = conforms[all_models[cluster_members[clus][0]]]

        # Create (or recreate) the output directories for this cluster
        if not os.path.exists("./cluster.%s" % i):
            os.mkdir("./cluster.%s" % i)
            os.mkdir("./cluster.%s/Sample_A/" % i)
            os.mkdir("./cluster.%s/Sample_B/" % i)
        else:
            shutil.rmtree("./cluster.%s" % i)
            os.mkdir("./cluster.%s" % i)
            os.mkdir("./cluster.%s/Sample_A/" % i)
            os.mkdir("./cluster.%s/Sample_B/" % i)

        # Superposed coordinates of the cluster members, saved for PrISM
        if args.prism:
            prism_file = 'cluster.'+str(i)+'.prism.npz'
            superposed_coords_cluster = []

        # Densities over all members of the cluster (gmdt) and over the
        # members from sample A (gmd1) and sample B (gmd2) separately
        gmd1 = precision_rmsd.GetModelDensity(
            custom_ranges=density_custom_ranges,
            resolution=args.density_threshold, voxel=args.voxel,
            bead_names=ps_names)
        gmd2 = precision_rmsd.GetModelDensity(
            custom_ranges=density_custom_ranges,
            resolution=args.density_threshold, voxel=args.voxel,
            bead_names=ps_names)
        gmdt = precision_rmsd.GetModelDensity(
            custom_ranges=density_custom_ranges,
            resolution=args.density_threshold, voxel=args.voxel,
            bead_names=ps_names)

        # Files listing the identities of the models in this cluster
        both_file = open('cluster.'+str(i)+'.all.txt', 'w')
        sampleA_file = open('cluster.'+str(i)+'.sample_A.txt', 'w')
        sampleB_file = open('cluster.'+str(i)+'.sample_B.txt', 'w')
        # Create dummy particles that will hold the superposed coordinates
        # of each cluster member
        model = IMP.Model()
        ps = []
        for pi in range(len(conform_0)):
            p = IMP.Particle(model, "%s" % str(pi))
            IMP.core.XYZ.setup_particle(p, IMP.algebra.Vector3D(0, 0, 0))
            IMP.core.XYZR.setup_particle(p, float(radii[pi]))
            IMP.atom.Mass.setup_particle(p, float(masses[pi]))
            ps.append(p)

        # Cluster precision: average RMSD of each member to the centroid
        cluster_precision = 0.0

        cmeta['members'] = {
            'A': [int(x) for x in cluster_members[clus]
                  if x in sampleA_all_models],
            'B': [int(x) for x in cluster_members[clus]
                  if x in sampleB_all_models]}
        # Transformation from the current model to the centroid
        trans = None

        # For each model in the cluster
        for mem in cluster_members[clus]:

            model_index = all_models[mem]

            # Superpose the model on the cluster centroid and get its RMSD
            rmsd, superposed_ps, trans = \
                precision_rmsd.get_particles_from_superposed(
                    conforms[model_index], conform_0, args.align,
                    ps, trans, symm_groups)

            model.update()

            cluster_precision += rmsd

            # Add the superposed model to the cluster densities and record
            # its identity
            gmdt.add_subunits_density(superposed_ps)
            print(model_index, file=both_file)

            if model_index in sampleA_all_models:
                gmd1.add_subunits_density(superposed_ps)
                print(model_index, file=sampleA_file)
            else:
                gmd2.add_subunits_density(superposed_ps)
                print(model_index, file=sampleB_file)

            if args.prism:
                superposed_coords = \
                    [IMP.core.XYZ(s_ps).get_coordinates()
                     for s_ps in superposed_ps]
                superposed_coords_cluster.append(
                    numpy.array(superposed_coords))
        # Save the superposed coordinates of this cluster for input to PrISM
        if args.prism:
            numpy.savez(
                prism_file,
                numpy.array(superposed_coords_cluster),
                numpy.array(masses),
                numpy.array(radii),
                numpy.array(ps_names))
        # Average over the non-centroid members (the centroid itself
        # contributes zero RMSD)
        cluster_precision /= float(len(cluster_members[clus]) - 1.0)
        cmeta['precision'] = cluster_precision
        print("Cluster precision (average distance to cluster centroid) "
              "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
              file=fpc)

        both_file.close()
        sampleA_file.close()
        sampleB_file.close()

        # Output the localization densities of the cluster
        cmeta['density'] = gmdt.write_mrc(path="./cluster.%s" % i,
                                          file_prefix="LPD")
        cmeta['densityA'] = gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i,
                                           file_prefix="LPD")
        cmeta['densityB'] = gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i,
                                           file_prefix="LPD")
        # Add the cluster center model RMF to the cluster directory
        cluster_center_index = cluster_members[clus][0]
        if args.rmf_A is not None:
            outfname = os.path.join("cluster.%d" % i,
                                    "cluster_center_model.rmf3")
            cluster_center_model_id = cluster_center_index
            if cluster_center_index < n_models[0]:
                make_cluster_centroid(
                    os.path.join(args.path, args.rmf_A),
                    cluster_center_index,
                    outfname, i, len(cluster_members[clus]),
                    cluster_precision, metadata_fname, args.path)
            else:
                make_cluster_centroid(
                    os.path.join(args.path, args.rmf_B),
                    cluster_center_index - n_models[0],
                    outfname, i, len(cluster_members[clus]),
                    cluster_precision, metadata_fname, args.path)
        else:
            cluster_center_model_id = all_models[cluster_center_index]
            outfname = os.path.join("cluster.%d" % i,
                                    "cluster_center_model." + args.extension)
            if 'rmf' in args.extension:
                make_cluster_centroid(
                    models_name[cluster_center_model_id], 0, outfname,
                    i, len(cluster_members[clus]),
                    cluster_precision, metadata_fname, args.path)
            else:
                shutil.copy(models_name[cluster_center_model_id], outfname)
        cmeta['centroid'] = {'index': cluster_center_index,
                             'file': outfname}
        metadata['clusters'].append(cmeta)

    fpc.close()
    # Write out the metadata collected during the analysis as JSON
    with open(metadata_fname, 'w') as jfh:
        json.dump(metadata, jfh)

    # Generate plots of the analysis using the gnuplot scripts shipped
    # with IMP.sampcon
    if args.gnuplot:
        import glob

        gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
        for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
            cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
            print(" ".join(cmd))
            subprocess.check_call(cmd)
if __name__ == '__main__':
    main()
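
# Example invocation (a sketch only; the option values are illustrative, and
# this script is normally driven through the `imp_sampcon exhaust` command
# line tool rather than run directly):
#
#   imp_sampcon exhaust -n mysystem -p ./good_scoring_models \
#       -m cpu_omp -c 8 -a -gp -d density_ranges.txt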