IMP logo
IMP Reference Guide  2.17.0
The Integrative Modeling Platform
clustering_rmsd.py
1 from __future__ import print_function
2 import numpy
3 import math
4 import scipy.stats
5 from multiprocessing import Pool
6 
7 
def get_sample_identity(idfile_A, idfile_B):
    """Read the model indices belonging to each of the two samples.

    Each identity file lists one model per line; the last "/"-separated
    component of the line is expected to look like "<name> <index>", and
    the integer index is collected.

    @param idfile_A file listing the models of the first sample
    @param idfile_B file listing the models of the second sample
    @return (sampleA_models, sampleB_models) lists of integer model indices
    """
    def _read_model_ids(fname):
        # Parse the integer model index out of each line's final path
        # component ("<name> <index>").
        ids = []
        with open(fname, 'r') as fh:
            for line in fh:
                mod = line.split("/")[-1]
                ids.append(int(mod.strip("\n").split(" ")[1]))
        # no explicit close(): the with-statement already closed the file
        return ids

    return _read_model_ids(idfile_A), _read_model_ids(idfile_B)
24 
25 
def get_cutoffs_list(distmat, gridSize):
    """Build the grid of clustering cutoffs spanning the observed distances.

    @param distmat pairwise model distance matrix (numpy array)
    @param gridSize spacing between successive cutoffs
    @return numpy array of cutoffs from min+gridSize up to (excluding) max
    """
    lo = distmat.min()
    hi = distmat.max()
    print("Minimum and maximum pairwise model distances:", lo, hi)
    return numpy.arange(lo + gridSize, hi, gridSize)
34 
35 
def precision_cluster(distmat, numModels, rmsd_cutoff):
    """Greedy centroid-based clustering at a given cutoff.

    Repeatedly picks the unclustered model with the most unclustered
    neighbors (models within rmsd_cutoff of it) as a cluster center,
    assigns all of its neighbors to that cluster, and removes them from
    the pool until no models remain.

    @param distmat symmetric pairwise distance matrix (numModels x numModels)
    @param numModels number of models (rows/columns of distmat)
    @param rmsd_cutoff two models are neighbors if their distance <= cutoff
    @return (cluster_centers, cluster_members): center index per cluster,
            and one member-index list per cluster
    """
    # STEP 2. Populate the neighbors of a given model
    neighbors = []
    for count in range(numModels):
        neighbors.append([count])  # model is a neighbor of itself

    # set of i,j indices that pass the cutoff
    inds = numpy.argwhere(distmat <= rmsd_cutoff)
    for x in inds:
        i = x[0]
        j = x[1]
        # Only count each contribution once (argwhere returns both (i,j)
        # and (j,i) for a symmetric matrix, plus the diagonal)
        if i > j:
            neighbors[i].append(j)
            neighbors[j].append(i)

    # STEP 3. Get the weightiest cluster, and iterate
    unclustered = []
    boolUnclustered = []
    for i in range(numModels):
        unclustered.append(i)
        boolUnclustered.append(True)

    cluster_members = []  # list of lists : one list per cluster
    cluster_centers = []

    while len(unclustered) > 0:
        # get cluster with maximum weight (= most remaining neighbors)
        max_neighbors = 0
        currcenter = -1
        for eachu in unclustered:
            # if multiple clusters have same maxweight this tie is
            # broken arbitrarily!
            if len(neighbors[eachu]) > max_neighbors:
                max_neighbors = len(neighbors[eachu])
                currcenter = eachu

        # form a new cluster with u and its neighbors
        cluster_centers.append(currcenter)
        cluster_members.append([n for n in neighbors[currcenter]])

        # update neighbors
        for n in neighbors[currcenter]:
            # removes the neighbor from the pool
            unclustered.remove(n)  # first occurrence of n is removed.
            boolUnclustered[n] = False  # clustered

        # drop the newly-clustered models from the neighbor lists of all
        # still-unclustered models, so later weight counts stay correct
        for n in neighbors[currcenter]:
            for unn in neighbors[n]:  # unclustered neighbor
                if not boolUnclustered[unn]:
                    continue
                neighbors[unn].remove(n)

    return cluster_centers, cluster_members
90 
91 
def get_contingency_table(num_clusters, cluster_members, all_models,
                          run1_models, run2_models):
    """Tabulate, per cluster, how many members come from each sample.

    Builds a (num_clusters x 2) count table, then keeps only clusters
    populated by more than 10 models from BOTH samples.

    @param num_clusters number of clusters
    @param cluster_members one member-index list per cluster
    @param all_models maps a member index to its global model index
    @param run1_models model indices belonging to sample 1
    @param run2_models model indices belonging to sample 2
    @return (reduced_ctable, retained_clusters): numpy array of per-cluster
            [sample1, sample2] counts, and the indices of kept clusters
    """
    full_ctable = numpy.zeros((num_clusters, 2))

    for ic, cluster in enumerate(cluster_members):
        for member in cluster:
            model_index = all_models[member]
            if model_index in run1_models:
                full_ctable[ic][0] += 1.0
            elif model_index in run2_models:
                full_ctable[ic][1] += 1.0

    reduced_ctable = []
    retained_clusters = []
    for ic in range(num_clusters):
        count1 = full_ctable[ic][0]
        count2 = full_ctable[ic][1]
        # keep only clusters that both samples populate substantially
        if count1 > 10.0 and count2 > 10.0:
            reduced_ctable.append([count1, count2])
            retained_clusters.append(ic)
    return numpy.array(reduced_ctable), retained_clusters
117 
118 
119 def test_sampling_convergence(contingency_table, total_num_models):
120  if len(contingency_table) == 0:
121  return 0.0, 1.0
122 
123  ct = numpy.transpose(contingency_table)
124  [chisquare, pvalue, dof, expected] = scipy.stats.chi2_contingency(ct)
125  if dof == 0.0:
126  cramersv = 0.0
127  else:
128  cramersv = math.sqrt(chisquare/float(total_num_models))
129 
130  return pvalue, cramersv
131 
132 
def percent_ensemble_explained(ctable, total_num_models):
    """Return the percentage of all models that fall in retained clusters.

    @param ctable per-cluster [sample1, sample2] counts (may be empty)
    @param total_num_models total number of models across both samples
    @return percentage of models accounted for by the table; 0.0 if empty
    """
    if not len(ctable):
        return 0.0
    column_totals = numpy.sum(ctable, axis=0)
    clustered = float(column_totals[0] + column_totals[1])
    return clustered * 100.0 / float(total_num_models)
140 
141 
def unpacking_wrapper(arg_tuple):
    """Worker entry point for Pool.map: cluster at one cutoff and score it.

    Expects arg_tuple = (numModels, cutoff, all_models, run1_all_models,
    run2_all_models, total_num_models) and reads the distance matrix from
    the module-level global ``distmat_full`` installed by init_foo().

    @return (pvalue, cramersv, percent_explained) for this cutoff
    """
    num_models, cutoff = arg_tuple[0], arg_tuple[1]
    all_models, run1_all_models, run2_all_models, total_num_models = \
        arg_tuple[2:]

    cluster_centers, cluster_members = precision_cluster(
        distmat_full, num_models, cutoff)
    ctable, retained_clusters = get_contingency_table(
        len(cluster_centers), cluster_members, all_models,
        run1_all_models, run2_all_models)

    pval, cramersv = test_sampling_convergence(ctable, total_num_models)
    percent_explained = percent_ensemble_explained(ctable, total_num_models)
    return pval, cramersv, percent_explained
155 
156 
def init_foo(distmat):
    """Pool initializer: publish the distance matrix as a module global.

    Workers read ``distmat_full`` directly instead of receiving the large
    matrix with every task.
    """
    global distmat_full
    distmat_full = distmat
160 
161 
def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models,
                 run1_all_models, run2_all_models, sysname, cores):
    """Cluster at every cutoff on the grid, in parallel, and score each.

    Writes one line per cutoff (cutoff, pvalue, cramersv, percent) to
    "<sysname>.ChiSquare_Grid_Stats.txt".

    @param cutoffs_list grid of RMSD cutoffs to try
    @param distmat_full pairwise model distance matrix
    @param all_models maps member index to global model index
    @param total_num_models total number of models in both samples
    @param run1_all_models model indices of sample 1
    @param run2_all_models model indices of sample 2
    @param sysname prefix for the output statistics file
    @param cores number of worker processes
    @return (pvals, cvs, percents) lists, one entry per cutoff
    """
    pvals = []
    cvs = []
    percents = []
    with open("%s.ChiSquare_Grid_Stats.txt" % sysname, 'w+') as f1:
        # The initializer hands each worker the (large) distance matrix
        # once, rather than pickling it with every task.
        p = Pool(cores, init_foo, initargs=(distmat_full, ))
        try:
            args_list = [(total_num_models, c, all_models, run1_all_models,
                          run2_all_models, total_num_models)
                         for c in cutoffs_list]
            results = p.map(unpacking_wrapper, args_list)
        finally:
            # Fix: the pool was never shut down, leaking worker processes.
            p.close()
            p.join()
        for i, x in enumerate(results):
            pvals.append(x[0])
            cvs.append(x[1])
            percents.append(x[2])
            print(cutoffs_list[i], x[0], x[1], x[2], file=f1)

    return pvals, cvs, percents
182 
183 
def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
    """Pick the sampling precision from the per-cutoff statistics.

    A cutoff "converges" when it explains more than 80% of the ensemble
    and either the chi-square p-value exceeds 0.05 or Cramer's V is below
    0.10. The precision is the smallest converged cutoff not followed by
    a well-populated but non-converged one.

    @param cutoffs_list grid of cutoffs (ascending)
    @param pvals chi-square p-value per cutoff
    @param cvs Cramer's V per cutoff
    @param percents percent of ensemble explained per cutoff
    @return (sampling_precision, pval_converged, cramersv_converged,
             percent_converged)
    """
    coarsest = max(cutoffs_list)
    sampling_precision = coarsest
    pval_converged = 0.0
    cramersv_converged = 1.0
    percent_converged = 0.0

    for cutoff, pval, cv, pct in zip(cutoffs_list, pvals, cvs, percents):
        if pct <= 80.0:
            continue  # cutoff explains too little of the ensemble
        if pval > 0.05 or cv < 0.10:
            if cutoff < sampling_precision:
                sampling_precision = cutoff
                pval_converged = pval
                cramersv_converged = cv
                percent_converged = pct
        else:
            # A well-populated but non-converged cutoff invalidates any
            # finer precision found so far; only the precision is reset
            # (the stored statistics are kept, as in the original code).
            sampling_precision = coarsest

    return (sampling_precision, pval_converged, cramersv_converged,
            percent_converged)