1 """@namespace IMP.sampcon.clustering_rmsd
2 Utilities to cluster and calculate RMSD and precision"""
4 from __future__
import print_function
8 from multiprocessing
import Pool
11 def get_sample_identity(idfile_A, idfile_B):
16 with open(idfile_A,
'r') as f:
18 mod = line.split(
"/")[-1]
19 sampleA_models.append(int(mod.strip(
"\n").split(
" ")[1]))
22 with open(idfile_B,
'r') as f:
24 mod = line.split(
"/")[-1]
25 sampleB_models.append(int(mod.strip(
"\n").split(
" ")[1]))
26 return sampleA_models, sampleB_models
29 def get_cutoffs_list(distmat, gridSize):
31 mindist = distmat.min()
32 maxdist = distmat.max()
34 print(
"Minimum and maximum pairwise model distances:", mindist, maxdist)
35 cutoffs = numpy.arange(mindist+gridSize, maxdist, gridSize)
39 def precision_cluster(distmat, numModels, rmsd_cutoff):
42 for count
in range(numModels):
43 neighbors.append([count])
46 inds = numpy.argwhere(distmat <= rmsd_cutoff)
52 neighbors[i].append(j)
53 neighbors[j].append(i)
58 for i
in range(numModels):
60 boolUnclustered.append(
True)
65 while len(unclustered) > 0:
69 for eachu
in unclustered:
72 if len(neighbors[eachu]) > max_neighbors:
73 max_neighbors = len(neighbors[eachu])
77 cluster_centers.append(currcenter)
78 cluster_members.append([n
for n
in neighbors[currcenter]])
81 for n
in neighbors[currcenter]:
84 boolUnclustered[n] =
False
86 for n
in neighbors[currcenter]:
87 for unn
in neighbors[n]:
88 if not boolUnclustered[unn]:
90 neighbors[unn].remove(n)
92 return cluster_centers, cluster_members
95 def get_contingency_table(num_clusters, cluster_members, all_models,
96 run1_models, run2_models):
97 full_ctable = numpy.zeros((num_clusters, 2))
99 for ic, cluster
in enumerate(cluster_members):
100 for member
in cluster:
101 model_index = all_models[member]
103 if model_index
in run1_models:
105 full_ctable[ic][0] += 1.0
106 elif model_index
in run2_models:
108 full_ctable[ic][1] += 1.0
112 retained_clusters = []
114 for i
in range(num_clusters):
115 if full_ctable[i][0] <= 10.0
or full_ctable[i][1] <= 10.0:
117 reduced_ctable.append([full_ctable[i][0], full_ctable[i][1]])
118 retained_clusters.append(i)
119 return numpy.array(reduced_ctable), retained_clusters
122 def test_sampling_convergence(contingency_table, total_num_models):
123 if len(contingency_table) == 0:
126 ct = numpy.transpose(contingency_table)
127 [chisquare, pvalue, dof, expected] = scipy.stats.chi2_contingency(ct)
131 cramersv = math.sqrt(chisquare/float(total_num_models))
133 return pvalue, cramersv
136 def percent_ensemble_explained(ctable, total_num_models):
139 percent_clustered = \
140 float(numpy.sum(ctable, axis=0)[0] + numpy.sum(ctable, axis=0)[1]) \
141 * 100.0 / float(total_num_models)
142 return percent_clustered
145 def unpacking_wrapper(arg_tuple):
147 all_models, run1_all_models, run2_all_models, total_num_models = x
148 x = precision_cluster(*((distmat_full, ) + arg_tuple[0:2]))
149 cluster_centers, cluster_members = x
150 ctable, retained_clusters = get_contingency_table(
151 len(cluster_centers), cluster_members, all_models,
152 run1_all_models, run2_all_models)
153 pval, cramersv = test_sampling_convergence(ctable,
155 percent_explained = percent_ensemble_explained(ctable,
157 return pval, cramersv, percent_explained
160 def init_foo(distmat):
162 distmat_full = distmat
165 def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models,
166 run1_all_models, run2_all_models, sysname, cores):
171 with open(
"%s.ChiSquare_Grid_Stats.txt" % sysname,
'w+')
as f1:
172 p = Pool(cores, init_foo, initargs=(distmat_full, ))
173 args_list = [(total_num_models, c, all_models, run1_all_models,
174 run2_all_models, total_num_models)
175 for c
in cutoffs_list]
176 results = p.map(unpacking_wrapper, args_list)
177 for i, x
in enumerate(results):
180 percents.append(x[2])
182 print(cutoffs_list[i], x[0], x[1], x[2], file=f1)
184 return pvals, cvs, percents
187 def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
188 sampling_precision = max(cutoffs_list)
190 cramersv_converged = 1.0
191 percent_converged = 0.0
193 for i
in range(len(cutoffs_list)):
194 if percents[i] > 80.0:
195 if pvals[i] > 0.05
or cvs[i] < 0.10:
196 if sampling_precision > cutoffs_list[i]:
197 sampling_precision = cutoffs_list[i]
198 pval_converged = pvals[i]
199 cramersv_converged = cvs[i]
200 percent_converged = percents[i]
202 sampling_precision = max(cutoffs_list)
204 return (sampling_precision, pval_converged, cramersv_converged,