IMP Reference Guide 2.16.0
The Integrative Modeling Platform
clustering_rmsd.py
from __future__ import print_function
import numpy
import math
import scipy.stats


def get_sample_identity(idfile_A, idfile_B):
    # whether a model belongs to run1 or run2
    sampleA_models = []
    sampleB_models = []

    # each id-file line is of the form "<path> <model index>";
    # keep only the integer model index
    with open(idfile_A, 'r') as f:
        for line in f:
            mod = line.split("/")[-1]
            sampleA_models.append(int(mod.strip("\n").split(" ")[1]))

    with open(idfile_B, 'r') as f:
        for line in f:
            mod = line.split("/")[-1]
            sampleB_models.append(int(mod.strip("\n").split(" ")[1]))
    return sampleA_models, sampleB_models


def get_cutoffs_list(distmat, gridSize):

    mindist = distmat.min()
    maxdist = distmat.max()

    print("Minimum and maximum pairwise model distances:", mindist, maxdist)
    cutoffs = numpy.arange(mindist + gridSize, maxdist, gridSize)
    return cutoffs


def precision_cluster(distmat, numModels, rmsd_cutoff):
    # STEP 2. Populate the neighbors of a given model
    neighbors = []
    for count in range(numModels):
        neighbors.append([count])  # model is a neighbor of itself

    # set of i,j indices that pass the cutoff
    inds = numpy.argwhere(distmat <= rmsd_cutoff)
    for x in inds:
        i = x[0]
        j = x[1]
        # only count each contribution once
        if i > j:
            neighbors[i].append(j)
            neighbors[j].append(i)

    # STEP 3. Get the weightiest cluster, and iterate
    unclustered = []
    boolUnclustered = []
    for i in range(numModels):
        unclustered.append(i)
        boolUnclustered.append(True)

    cluster_members = []  # list of lists: one list per cluster
    cluster_centers = []

    while len(unclustered) > 0:
        # get cluster with maximum weight
        max_neighbors = 0
        currcenter = -1
        for eachu in unclustered:
            # if multiple clusters have the same maximum weight,
            # the tie is broken arbitrarily!
            if len(neighbors[eachu]) > max_neighbors:
                max_neighbors = len(neighbors[eachu])
                currcenter = eachu

        # form a new cluster with the current center and its neighbors
        cluster_centers.append(currcenter)
        cluster_members.append([n for n in neighbors[currcenter]])

        # update neighbors
        for n in neighbors[currcenter]:
            # remove the neighbor from the pool
            unclustered.remove(n)  # first occurrence of n is removed
            boolUnclustered[n] = False  # clustered

        for n in neighbors[currcenter]:
            for unn in neighbors[n]:  # unclustered neighbor
                if not boolUnclustered[unn]:
                    continue
                neighbors[unn].remove(n)

    return cluster_centers, cluster_members
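For illustration, here is a hypothetical run of precision_cluster on a toy 4-model distance matrix (not part of the file): models 0-2 lie within 1.0 of one another while model 3 is far from all of them, so the greedy pass picks the cluster {0, 1, 2} first and model 3 is left as a singleton.

# hypothetical toy input: 4 models, models 0-2 mutually close
toydist = numpy.array([[0.0, 0.8, 0.9, 5.0],
                       [0.8, 0.0, 0.7, 5.2],
                       [0.9, 0.7, 0.0, 5.1],
                       [5.0, 5.2, 5.1, 0.0]])
centers, members = precision_cluster(toydist, 4, rmsd_cutoff=1.0)
# centers == [0, 3]; members == [[0, 1, 2], [3]]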


def get_contingency_table(num_clusters, cluster_members, all_models,
                          run1_models, run2_models):
    full_ctable = numpy.zeros((num_clusters, 2))

    for ic, cluster in enumerate(cluster_members):
        for member in cluster:
            model_index = all_models[member]

            if model_index in run1_models:
                full_ctable[ic][0] += 1.0
            elif model_index in run2_models:
                full_ctable[ic][1] += 1.0

    # keep only clusters populated by both runs
    # (more than 10 models from each)
    reduced_ctable = []
    retained_clusters = []

    for i in range(num_clusters):
        if full_ctable[i][0] <= 10.0 or full_ctable[i][1] <= 10.0:
            continue
        reduced_ctable.append([full_ctable[i][0], full_ctable[i][1]])
        retained_clusters.append(i)
    return numpy.array(reduced_ctable), retained_clusters


def test_sampling_convergence(contingency_table, total_num_models):
    if len(contingency_table) == 0:
        return 0.0, 1.0

    ct = numpy.transpose(contingency_table)
    chisquare, pvalue, dof, expected = scipy.stats.chi2_contingency(ct)
    if dof == 0.0:
        cramersv = 0.0
    else:
        # Cramer's V, the effect size of the chi-square test
        cramersv = math.sqrt(chisquare / float(total_num_models))

    return pvalue, cramersv
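As a hypothetical example (numbers invented for illustration), a contingency table whose clusters are populated in nearly equal proportion by both runs gives an insignificant p-value and a Cramér's V near zero:

# hypothetical table: two clusters, each split almost evenly
# between the two runs, out of 200 models in total
ctable = numpy.array([[52.0, 48.0], [49.0, 51.0]])
pval, cramersv = test_sampling_convergence(ctable, 200)
# pval is large (about 0.8) and cramersv is small (0.02),
# i.e. the two samples are statistically indistinguishable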


def percent_ensemble_explained(ctable, total_num_models):
    if len(ctable) == 0:
        return 0.0
    percent_clustered = (
        float(numpy.sum(ctable, axis=0)[0] + numpy.sum(ctable, axis=0)[1])
        * 100.0 / float(total_num_models))
    return percent_clustered


def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models,
                 run1_all_models, run2_all_models, sysname):
    # Do Clustering on a Grid
    pvals = []
    cvs = []
    percents = []
    with open("%s.ChiSquare_Grid_Stats.txt" % sysname, 'w+') as f1:
        for c in cutoffs_list:
            cluster_centers, cluster_members = precision_cluster(
                distmat_full, total_num_models, c)
            ctable, retained_clusters = get_contingency_table(
                len(cluster_centers), cluster_members, all_models,
                run1_all_models, run2_all_models)
            pval, cramersv = test_sampling_convergence(ctable,
                                                       total_num_models)
            percent_explained = percent_ensemble_explained(ctable,
                                                           total_num_models)

            pvals.append(pval)
            cvs.append(cramersv)
            percents.append(percent_explained)

            print(c, pval, cramersv, percent_explained, file=f1)

    return pvals, cvs, percents


def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
    # find the smallest cutoff at which the convergence criteria hold:
    # more than 80% of the models clustered, and the two samples
    # statistically indistinguishable (p-value > 0.05 or Cramer's V < 0.10)
    sampling_precision = max(cutoffs_list)
    pval_converged = 0.0
    cramersv_converged = 1.0
    percent_converged = 0.0

    for i in range(len(cutoffs_list)):
        if percents[i] > 80.0:
            if pvals[i] > 0.05 or cvs[i] < 0.10:
                if sampling_precision > cutoffs_list[i]:
                    sampling_precision = cutoffs_list[i]
                    pval_converged = pvals[i]
                    cramersv_converged = cvs[i]
                    percent_converged = percents[i]
        else:
            # percent clustered dipped below 80%; start the search over
            sampling_precision = max(cutoffs_list)

    return (sampling_precision, pval_converged, cramersv_converged,
            percent_converged)
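Putting it together: the sketch below shows how the functions above are typically chained, assuming (hypothetically) that distmat is a square all-vs-all RMSD matrix whose rows are ordered as all sample-A models followed by all sample-B models, that sampleA.id and sampleB.id hold one "<path> <model index>" line per model, and that the grid size and system name are arbitrary choices.

run1_models, run2_models = get_sample_identity("sampleA.id", "sampleB.id")
all_models = run1_models + run2_models  # row order of distmat
total_num_models = len(all_models)

cutoffs = get_cutoffs_list(distmat, gridSize=1.0)
pvals, cvs, percents = get_clusters(cutoffs, distmat, all_models,
                                    total_num_models, run1_models,
                                    run2_models, "mysystem")
(precision, pval_conv, cramersv_conv,
 percent_conv) = get_sampling_precision(cutoffs, pvals, cvs, percents)
print("Sampling precision (same units as the distance matrix):", precision)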