1 from __future__
import print_function
7 def get_sample_identity(idfile_A, idfile_B):
12 with open(idfile_A,
'r') as f:
14 mod = line.split(
"/")[-1]
15 sampleA_models.append(int(mod.strip(
"\n").split(
" ")[1]))
18 with open(idfile_B,
'r') as f:
20 mod = line.split(
"/")[-1]
21 sampleB_models.append(int(mod.strip(
"\n").split(
" ")[1]))
22 return sampleA_models, sampleB_models
25 def get_cutoffs_list(distmat, gridSize):
27 mindist = distmat.min()
28 maxdist = distmat.max()
30 print(
"Minimum and maximum pairwise model distances:", mindist, maxdist)
31 cutoffs = numpy.arange(mindist+gridSize, maxdist, gridSize)
35 def precision_cluster(distmat, numModels, rmsd_cutoff):
38 for count
in range(numModels):
39 neighbors.append([count])
42 inds = numpy.argwhere(distmat <= rmsd_cutoff)
48 neighbors[i].append(j)
49 neighbors[j].append(i)
54 for i
in range(numModels):
56 boolUnclustered.append(
True)
61 while len(unclustered) > 0:
65 for eachu
in unclustered:
68 if len(neighbors[eachu]) > max_neighbors:
69 max_neighbors = len(neighbors[eachu])
73 cluster_centers.append(currcenter)
74 cluster_members.append([n
for n
in neighbors[currcenter]])
77 for n
in neighbors[currcenter]:
80 boolUnclustered[n] =
False
82 for n
in neighbors[currcenter]:
83 for unn
in neighbors[n]:
84 if not boolUnclustered[unn]:
86 neighbors[unn].remove(n)
88 return cluster_centers, cluster_members
91 def get_contingency_table(num_clusters, cluster_members, all_models,
92 run1_models, run2_models):
93 full_ctable = numpy.zeros((num_clusters, 2))
95 for ic, cluster
in enumerate(cluster_members):
96 for member
in cluster:
97 model_index = all_models[member]
99 if model_index
in run1_models:
101 full_ctable[ic][0] += 1.0
102 elif model_index
in run2_models:
104 full_ctable[ic][1] += 1.0
108 retained_clusters = []
110 for i
in range(num_clusters):
111 if full_ctable[i][0] <= 10.0
or full_ctable[i][1] <= 10.0:
113 reduced_ctable.append([full_ctable[i][0], full_ctable[i][1]])
114 retained_clusters.append(i)
115 return numpy.array(reduced_ctable), retained_clusters
118 def test_sampling_convergence(contingency_table, total_num_models):
119 if len(contingency_table) == 0:
122 ct = numpy.transpose(contingency_table)
123 [chisquare, pvalue, dof, expected] = scipy.stats.chi2_contingency(ct)
127 cramersv = math.sqrt(chisquare/float(total_num_models))
129 return pvalue, cramersv
132 def percent_ensemble_explained(ctable, total_num_models):
135 percent_clustered = \
136 float(numpy.sum(ctable, axis=0)[0] + numpy.sum(ctable, axis=0)[1]) \
137 * 100.0 / float(total_num_models)
138 return percent_clustered
141 def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models,
142 run1_all_models, run2_all_models, sysname):
147 with open(
"%s.ChiSquare_Grid_Stats.txt" % sysname,
'w+')
as f1:
148 for c
in cutoffs_list:
149 cluster_centers, cluster_members = precision_cluster(
150 distmat_full, total_num_models, c)
151 ctable, retained_clusters = get_contingency_table(
152 len(cluster_centers), cluster_members, all_models,
153 run1_all_models, run2_all_models)
154 pval, cramersv = test_sampling_convergence(ctable,
156 percent_explained = percent_ensemble_explained(ctable,
161 percents.append(percent_explained)
163 print(c, pval, cramersv, percent_explained, file=f1)
165 return pvals, cvs, percents
168 def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
169 sampling_precision = max(cutoffs_list)
171 cramersv_converged = 1.0
172 percent_converged = 0.0
174 for i
in range(len(cutoffs_list)):
175 if percents[i] > 80.0:
176 if pvals[i] > 0.05
or cvs[i] < 0.10:
177 if sampling_precision > cutoffs_list[i]:
178 sampling_precision = cutoffs_list[i]
179 pval_converged = pvals[i]
180 cramersv_converged = cvs[i]
181 percent_converged = percents[i]
183 sampling_precision = max(cutoffs_list)
185 return (sampling_precision, pval_converged, cramersv_converged,