1 from __future__
import print_function
2 from IMP
import ArgumentParser
4 __doc__ =
"Perform analysis to determine sampling convergence."
14 parser = ArgumentParser(
15 description=
"First stages of analysis for assessing sampling "
17 parser.add_argument(
'--sysname',
'-n', dest=
"sysname",
18 help=
'name of the system', default=
"")
19 parser.add_argument(
'--path',
'-p', dest=
"path",
20 help=
'path to the good-scoring models', default=
"./")
21 parser.add_argument(
'--extension',
'-e', dest=
"extension",
22 help=
'extension of the file', choices=[
'rmf',
'pdb'],
24 parser.add_argument(
'--mode',
'-m', dest=
"mode", help=
'pyRMSD calculator',
25 choices=[
'cuda',
'cpu_omp',
'cpu_serial'], default=
"cuda")
26 parser.add_argument(
'--cores',
'-c', dest=
"cores", type=int,
27 help=
'number of cores for RMSD matrix calculations; '
28 'only for cpu_omp', default=1)
29 parser.add_argument(
'--subunit',
'-su', dest=
"subunit",
30 help=
'calculate RMSD/sampling and cluster precision/densities '
31 'etc over this subunit only', default=
None)
32 parser.add_argument(
'--align',
'-a', dest=
"align",
33 help=
'boolean flag to allow superposition of models',
34 default=
False, action=
'store_true')
35 parser.add_argument(
'--scoreA',
'-sa', dest=
"scoreA",
36 help=
'name of the file having the good-scoring scores for sample A',
37 default=
"scoresA.txt")
38 parser.add_argument(
'--scoreB',
'-sb', dest=
"scoreB",
39 help=
'name of the file having the good-scoring scores for sample B',
40 default=
"scoresB.txt")
41 parser.add_argument(
'--rmfA',
'-ra', dest=
"rmf_A",
42 help=
'RMF file with conformations from Sample A', default=
None)
43 parser.add_argument(
'--rmfB',
'-rb', dest=
"rmf_B",
44 help=
'RMF file with conformations from Sample B', default=
None)
45 parser.add_argument(
'--gridsize',
'-g', dest=
"gridsize", type=float,
46 help=
'grid size for calculating sampling precision', default=10.0)
47 parser.add_argument(
'--skip',
'-s', dest=
"skip_sampling_precision",
48 help=
"This option will bypass the calculation of sampling "
49 "precision. This option needs to be used with the clustering "
50 "threshold option. Otherwise by default, sampling precision "
51 "is calculated and the clustering threshold is the "
52 "calculated sampling precision.", default=
False,
54 parser.add_argument(
'--cluster_threshold',
'-ct', dest=
"cluster_threshold",
56 help=
'final clustering threshold to visualize clusters. Assumes '
57 'that the user has previously calculated sampling precision '
58 'and wants clusters defined at a threshold higher than the '
59 'sampling precision for ease of analysis (lesser number of '
60 'clusters).', default=30.0)
61 parser.add_argument(
'--voxel',
'-v', dest=
"voxel", type=float,
62 help=
'voxel size for the localization densities', default=5.0)
63 parser.add_argument(
'--density_threshold',
'-dt', type=float,
64 dest=
"density_threshold",
65 help=
'threshold for localization densities', default=20.0)
66 parser.add_argument(
'--density',
'-d', dest=
"density",
67 help=
'file containing dictionary of density custom ranges',
69 parser.add_argument(
'--gnuplot',
'-gp', dest=
"gnuplot",
70 help=
"plotting automatically with gnuplot", default=
False,
72 return parser.parse_args()
75 def make_cluster_centroid(infname, frame, outfname, cluster_index,
79 if hasattr(RMF.NodeHandle,
'replace_child'):
80 print(infname, outfname)
81 inr = RMF.open_rmf_file_read_only(infname)
82 outr = RMF.create_rmf_file(outfname)
83 cpf = RMF.ClusterProvenanceFactory(outr)
84 RMF.clone_file_info(inr, outr)
85 RMF.clone_hierarchy(inr, outr)
86 RMF.clone_static_frame(inr, outr)
87 inr.set_current_frame(RMF.FrameID(frame))
89 RMF.clone_loaded_frame(inr, outr)
90 rn = outr.get_root_node()
91 children = rn.get_children()
92 if len(children) == 0:
95 prov = [c
for c
in rn.get_children()
if c.get_type() == RMF.PROVENANCE]
100 newp = rn.replace_child(prov,
"cluster.%d" % cluster_index,
103 cp.set_members(cluster_size)
107 print(infname, frame, outfname)
108 subprocess.call([
'rmf_slice', infname,
'-f', str(frame),
120 from scipy
import spatial
123 from IMP.sampcon import scores_convergence, clustering_rmsd
124 from IMP.sampcon import rmsd_calculation, precision_rmsd
128 idfile_A =
"Identities_A.txt"
129 idfile_B =
"Identities_B.txt"
135 with open(os.path.join(args.path, args.scoreA),
'r') as f:
137 score_A.append(float(line.strip(
"\n")))
139 with open(os.path.join(args.path, args.scoreB),
'r') as f:
141 score_B.append(float(line.strip(
"\n")))
143 scores = score_A + score_B
146 scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)
149 scores_convergence.get_scores_distributions_KS_Stats(
150 score_A, score_B, 100, args.sysname)
153 if args.extension ==
"pdb":
155 conforms, masses, radii, models_name = \
156 rmsd_calculation.get_pdbs_coordinates(
157 args.path, idfile_A, idfile_B)
159 args.extension =
"rmf3"
161 if args.rmf_A
is not None:
162 (ps_names, masses, radii, conforms, models_name, n_models) = \
163 rmsd_calculation.get_rmfs_coordinates_one_rmf(
164 args.path, args.rmf_A, args.rmf_B, args.subunit)
167 (ps_names, masses, radii, conforms,
168 models_name) = rmsd_calculation.get_rmfs_coordinates(
169 args.path, idfile_A, idfile_B, args.subunit)
171 print(
"Size of conformation matrix", conforms.shape)
173 if not args.skip_sampling_precision:
176 numpy.save(
"conforms", conforms)
177 inner_data = rmsd_calculation.get_rmsds_matrix(
178 conforms, args.mode, args.align, args.cores)
179 print(
"Size of RMSD matrix (flattened):", inner_data.shape)
181 conforms = numpy.load(
"conforms.npy")
182 os.unlink(
'conforms.npy')
184 import pyRMSD.RMSDCalculator
185 from pyRMSD.matrixHandler
import MatrixHandler
186 mHandler = MatrixHandler()
187 mHandler.loadMatrix(
"Distances_Matrix.data")
189 rmsd_matrix = mHandler.getMatrix()
190 distmat = rmsd_matrix.get_data()
192 distmat_full = sp.spatial.distance.squareform(distmat)
193 print(
"Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)
196 if args.rmf_A
is not None:
197 sampleA_all_models = range(n_models[0])
198 sampleB_all_models = range(n_models[0], n_models[1] + n_models[0])
199 total_num_models = n_models[1] + n_models[0]
202 sampleB_all_models) = clustering_rmsd.get_sample_identity(
204 total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
205 all_models = sampleA_all_models + sampleB_all_models
206 print(
"Size of Sample A:", len(sampleA_all_models),
207 " ; Size of Sample B: ", len(sampleB_all_models),
208 "; Total", total_num_models)
210 if not args.skip_sampling_precision:
212 print(
"Calculating sampling precision")
216 gridSize = args.gridsize
219 cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
220 print(
"Clustering at thresholds:", cutoffs_list)
223 pvals, cvs, percents = clustering_rmsd.get_clusters(
224 cutoffs_list, distmat_full, all_models, total_num_models,
225 sampleA_all_models, sampleB_all_models, args.sysname)
229 (sampling_precision, pval_converged, cramersv_converged,
230 percent_converged) = clustering_rmsd.get_sampling_precision(
231 cutoffs_list, pvals, cvs, percents)
234 with open(
"%s.Sampling_Precision_Stats.txt"
235 % args.sysname,
'w+')
as fpv:
236 print(
"The sampling precision is defined as the largest allowed "
237 "RMSD between the cluster centroid and a ", args.sysname,
238 "model within any cluster in the finest clustering for which "
239 "each sample contributes models proportionally to its size "
240 "(considering both significance and magnitude of the "
241 "difference) and for which a sufficient proportion of all "
242 "models occur in sufficiently large clusters. The sampling "
243 "precision for our ", args.sysname,
244 " modeling is %.3f" % (sampling_precision),
" A.", file=fpv)
246 print(
"Sampling precision, P-value, Cramer's V and percentage "
247 "of clustered models below:", file=fpv)
248 print(
"%.3f\t%.3f\t%.3f\t%.3f"
249 % (sampling_precision, pval_converged, cramersv_converged,
250 percent_converged), file=fpv)
253 final_clustering_threshold = sampling_precision
256 final_clustering_threshold = args.cluster_threshold
259 print(
"Clustering at threshold %.3f" % final_clustering_threshold)
260 (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
261 distmat_full, total_num_models, final_clustering_threshold)
263 (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
264 len(cluster_centers), cluster_members, all_models,
265 sampleA_all_models, sampleB_all_models)
266 print(
"Contingency table:", ctable)
268 with open(
"%s.Cluster_Population.txt" % args.sysname,
'w+')
as fcp:
269 for rows
in range(len(ctable)):
270 print(rows, ctable[rows][0], ctable[rows][1], file=fcp)
273 density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)
276 fpc = open(
"%s.Cluster_Precision.txt" % args.sysname,
'w+')
280 for i
in range(len(retained_clusters)):
281 clus = retained_clusters[i]
285 conform_0 = conforms[all_models[cluster_members[clus][0]]]
288 if not os.path.exists(
"./cluster.%s" %i):
289 os.mkdir(
"./cluster.%s" %i)
290 os.mkdir(
"./cluster.%s/Sample_A/" % i)
291 os.mkdir(
"./cluster.%s/Sample_B/" % i)
293 shutil.rmtree(
"./cluster.%s" %i)
294 os.mkdir(
"./cluster.%s" %i)
295 os.mkdir(
"./cluster.%s/Sample_A/" % i)
296 os.mkdir(
"./cluster.%s/Sample_B/" % i)
300 gmd1 = precision_rmsd.GetModelDensity(
301 custom_ranges=density_custom_ranges,
302 resolution=args.density_threshold, voxel=args.voxel,
304 gmd2 = precision_rmsd.GetModelDensity(
305 custom_ranges=density_custom_ranges,
306 resolution=args.density_threshold, voxel=args.voxel,
308 gmdt = precision_rmsd.GetModelDensity(
309 custom_ranges=density_custom_ranges,
310 resolution=args.density_threshold, voxel=args.voxel,
314 both_file = open(
'cluster.'+str(i)+
'.all.txt',
'w')
315 sampleA_file = open(
'cluster.'+str(i)+
'.sample_A.txt',
'w')
316 sampleB_file = open(
'cluster.'+str(i)+
'.sample_B.txt',
'w')
319 cluster_center_index = cluster_members[clus][0]
320 if args.rmf_A
is not None:
321 cluster_center_model_id = cluster_center_index
322 if cluster_center_index < n_models[0]:
323 make_cluster_centroid(args.rmf_A, cluster_center_index,
324 os.path.join(
"cluster.%d" % i,
325 "cluster_center_model.rmf"),
326 i, len(cluster_members[clus]))
328 make_cluster_centroid(args.rmf_B,
329 cluster_center_index - n_models[0],
330 os.path.join(
"cluster.%d" % i,
331 "cluster_center_model.rmf"),
332 i, len(cluster_members[clus]))
335 cluster_center_model_id = all_models[cluster_center_index]
336 outfname = os.path.join(
"cluster.%d" % i,
337 "cluster_center_model." + args.extension)
338 if 'rmf' in args.extension:
339 make_cluster_centroid(
340 models_name[cluster_center_model_id], 0, outfname,
341 i, len(cluster_members[clus]))
343 shutil.copy(models_name[cluster_center_model_id], outfname)
348 for pi
in range(len(conform_0)):
357 cluster_precision = 0.0
362 for mem
in cluster_members[clus]:
364 model_index = all_models[mem]
368 rmsd, superposed_ps, trans = \
369 precision_rmsd.get_particles_from_superposed(
370 conforms[model_index], conform_0, args.align, ps, trans)
374 cluster_precision += rmsd
377 gmdt.add_subunits_density(superposed_ps)
378 print(model_index, file=both_file)
380 if model_index
in sampleA_all_models:
382 gmd1.add_subunits_density(superposed_ps)
383 print(model_index, file=sampleA_file)
386 gmd2.add_subunits_density(superposed_ps)
387 print(model_index, file=sampleB_file)
389 cluster_precision /= float(len(cluster_members[clus]) - 1.0)
391 print(
"Cluster precision (average distance to cluster centroid) "
392 "of cluster ", str(i),
" is %.3f" % cluster_precision,
"A",
400 gmdt.write_mrc(path=
"./cluster.%s" % i, file_prefix =
"LPD")
401 gmd1.write_mrc(path=
"./cluster.%s/Sample_A/" % i, file_prefix =
"LPD")
402 gmd2.write_mrc(path=
"./cluster.%s/Sample_B/" % i, file_prefix =
"LPD")
410 thisdir = os.path.dirname(__file__)
412 for filename
in sorted(glob.glob(os.path.join(gnuplotdir,
"*.plt"))):
413 cmd = [
'gnuplot',
'-e',
'sysname="%s"' % args.sysname, filename]
415 subprocess.check_call(cmd)
417 if __name__ ==
'__main__':
def get_data_path
Return the full path to one of this module's data files.
static XYZR setup_particle(Model *m, ParticleIndex pi)
static XYZ setup_particle(Model *m, ParticleIndex pi)
Class for storing model, its restraints, constraints, and particles.
static Mass setup_particle(Model *m, ParticleIndex pi, Float mass)
Class to handle individual particles of a Model object.
Sampling exhaustiveness protocol.