1 from __future__
import print_function
5 from multiprocessing
import Pool
8 def get_sample_identity(idfile_A, idfile_B):
13 with open(idfile_A,
'r') as f:
15 mod = line.split(
"/")[-1]
16 sampleA_models.append(int(mod.strip(
"\n").split(
" ")[1]))
19 with open(idfile_B,
'r') as f:
21 mod = line.split(
"/")[-1]
22 sampleB_models.append(int(mod.strip(
"\n").split(
" ")[1]))
23 return sampleA_models, sampleB_models
26 def get_cutoffs_list(distmat, gridSize):
28 mindist = distmat.min()
29 maxdist = distmat.max()
31 print(
"Minimum and maximum pairwise model distances:", mindist, maxdist)
32 cutoffs = numpy.arange(mindist+gridSize, maxdist, gridSize)
36 def precision_cluster(distmat, numModels, rmsd_cutoff):
39 for count
in range(numModels):
40 neighbors.append([count])
43 inds = numpy.argwhere(distmat <= rmsd_cutoff)
49 neighbors[i].append(j)
50 neighbors[j].append(i)
55 for i
in range(numModels):
57 boolUnclustered.append(
True)
62 while len(unclustered) > 0:
66 for eachu
in unclustered:
69 if len(neighbors[eachu]) > max_neighbors:
70 max_neighbors = len(neighbors[eachu])
74 cluster_centers.append(currcenter)
75 cluster_members.append([n
for n
in neighbors[currcenter]])
78 for n
in neighbors[currcenter]:
81 boolUnclustered[n] =
False
83 for n
in neighbors[currcenter]:
84 for unn
in neighbors[n]:
85 if not boolUnclustered[unn]:
87 neighbors[unn].remove(n)
89 return cluster_centers, cluster_members
92 def get_contingency_table(num_clusters, cluster_members, all_models,
93 run1_models, run2_models):
94 full_ctable = numpy.zeros((num_clusters, 2))
96 for ic, cluster
in enumerate(cluster_members):
97 for member
in cluster:
98 model_index = all_models[member]
100 if model_index
in run1_models:
102 full_ctable[ic][0] += 1.0
103 elif model_index
in run2_models:
105 full_ctable[ic][1] += 1.0
109 retained_clusters = []
111 for i
in range(num_clusters):
112 if full_ctable[i][0] <= 10.0
or full_ctable[i][1] <= 10.0:
114 reduced_ctable.append([full_ctable[i][0], full_ctable[i][1]])
115 retained_clusters.append(i)
116 return numpy.array(reduced_ctable), retained_clusters
119 def test_sampling_convergence(contingency_table, total_num_models):
120 if len(contingency_table) == 0:
123 ct = numpy.transpose(contingency_table)
124 [chisquare, pvalue, dof, expected] = scipy.stats.chi2_contingency(ct)
128 cramersv = math.sqrt(chisquare/float(total_num_models))
130 return pvalue, cramersv
133 def percent_ensemble_explained(ctable, total_num_models):
136 percent_clustered = \
137 float(numpy.sum(ctable, axis=0)[0] + numpy.sum(ctable, axis=0)[1]) \
138 * 100.0 / float(total_num_models)
139 return percent_clustered
142 def unpacking_wrapper(arg_tuple):
144 all_models, run1_all_models, run2_all_models, total_num_models = x
145 x = precision_cluster(*((distmat_full, ) + arg_tuple[0:2]))
146 cluster_centers, cluster_members = x
147 ctable, retained_clusters = get_contingency_table(
148 len(cluster_centers), cluster_members, all_models,
149 run1_all_models, run2_all_models)
150 pval, cramersv = test_sampling_convergence(ctable,
152 percent_explained = percent_ensemble_explained(ctable,
154 return pval, cramersv, percent_explained
157 def init_foo(distmat):
159 distmat_full = distmat
162 def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models,
163 run1_all_models, run2_all_models, sysname, cores):
168 with open(
"%s.ChiSquare_Grid_Stats.txt" % sysname,
'w+')
as f1:
169 p = Pool(cores, init_foo, initargs=(distmat_full, ))
170 args_list = [(total_num_models, c, all_models, run1_all_models,
171 run2_all_models, total_num_models)
172 for c
in cutoffs_list]
173 results = p.map(unpacking_wrapper, args_list)
174 for i, x
in enumerate(results):
177 percents.append(x[2])
179 print(cutoffs_list[i], x[0], x[1], x[2], file=f1)
181 return pvals, cvs, percents
184 def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
185 sampling_precision = max(cutoffs_list)
187 cramersv_converged = 1.0
188 percent_converged = 0.0
190 for i
in range(len(cutoffs_list)):
191 if percents[i] > 80.0:
192 if pvals[i] > 0.05
or cvs[i] < 0.10:
193 if sampling_precision > cutoffs_list[i]:
194 sampling_precision = cutoffs_list[i]
195 pval_converged = pvals[i]
196 cramersv_converged = cvs[i]
197 percent_converged = percents[i]
199 sampling_precision = max(cutoffs_list)
201 return (sampling_precision, pval_converged, cramersv_converged,