IMP Reference Guide develop.63b38c487d, 2024/12/22
The Integrative Modeling Platform
clustering_rmsd.py
"""@namespace IMP.sampcon.clustering_rmsd
   Utilities to cluster and calculate RMSD and precision"""

from __future__ import print_function
import numpy
import math
import scipy.stats
from multiprocessing import Pool


def get_sample_identity(idfile_A, idfile_B):
    # record which model indices belong to sample A (run 1) and which to
    # sample B (run 2)
    sampleA_models = []
    sampleB_models = []

    with open(idfile_A, 'r') as f:
        for line in f:
            mod = line.split("/")[-1]
            sampleA_models.append(int(mod.strip("\n").split(" ")[1]))

    with open(idfile_B, 'r') as f:
        for line in f:
            mod = line.split("/")[-1]
            sampleB_models.append(int(mod.strip("\n").split(" ")[1]))
    return sampleA_models, sampleB_models
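# The identity files read by get_sample_identity() are expected to contain one
# model per line, ending in "<model_name> <index>" (the path shown here is
# hypothetical), e.g.
#
#     /some/path/model.0 42
#
# The last "/"-separated component is split on the space and the integer after
# it (42 above) is stored as the model index.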


def get_cutoffs_list(distmat, gridSize):

    mindist = distmat.min()
    maxdist = distmat.max()

    print("Minimum and maximum pairwise model distances:", mindist, maxdist)
    cutoffs = numpy.arange(mindist + gridSize, maxdist, gridSize)
    return cutoffs
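# Worked example for get_cutoffs_list(): with mindist = 0.0, maxdist = 30.0
# and gridSize = 5.0, numpy.arange(5.0, 30.0, 5.0) yields the cutoff grid
# [5.0, 10.0, 15.0, 20.0, 25.0]; the grid starts one gridSize above the
# minimum distance and stops below the maximum.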


def precision_cluster(distmat, numModels, rmsd_cutoff):
    # STEP 2. Populate the neighbors of a given model
    neighbors = []
    for count in range(numModels):
        neighbors.append([count])  # model is a neighbor of itself

    # set of i,j indices whose pairwise distance passes the rmsd_cutoff
    inds = numpy.argwhere(distmat <= rmsd_cutoff)
    for x in inds:
        i = x[0]
        j = x[1]
        # Only count each contribution once
        if i > j:
            neighbors[i].append(j)
            neighbors[j].append(i)

    # STEP 3. Get the weightiest cluster, and iterate
    unclustered = []
    boolUnclustered = []
    for i in range(numModels):
        unclustered.append(i)
        boolUnclustered.append(True)

    cluster_members = []  # list of lists: one list per cluster
    cluster_centers = []

    while len(unclustered) > 0:
        # get the cluster with maximum weight
        max_neighbors = 0
        currcenter = -1
        for eachu in unclustered:
            # if multiple clusters have the same maximum weight, the tie is
            # broken arbitrarily!
            if len(neighbors[eachu]) > max_neighbors:
                max_neighbors = len(neighbors[eachu])
                currcenter = eachu

        # form a new cluster with currcenter and its neighbors
        cluster_centers.append(currcenter)
        cluster_members.append([n for n in neighbors[currcenter]])

        # update neighbors
        for n in neighbors[currcenter]:
            # remove the neighbor from the pool
            unclustered.remove(n)  # first occurrence of n is removed
            boolUnclustered[n] = False  # clustered

        for n in neighbors[currcenter]:
            for unn in neighbors[n]:  # unclustered neighbor
                if not boolUnclustered[unn]:
                    continue
                neighbors[unn].remove(n)

    return cluster_centers, cluster_members
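# precision_cluster() performs greedy center-based clustering: the model with
# the most neighbors within rmsd_cutoff becomes a cluster center, it and all
# of its neighbors form one cluster and are removed from the pool, and the
# procedure repeats until every model has been assigned to a cluster.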


def get_contingency_table(num_clusters, cluster_members, all_models,
                          run1_models, run2_models):
    full_ctable = numpy.zeros((num_clusters, 2))

    for ic, cluster in enumerate(cluster_members):
        for member in cluster:
            model_index = all_models[member]

            if model_index in run1_models:
                full_ctable[ic][0] += 1.0
            elif model_index in run2_models:
                full_ctable[ic][1] += 1.0

    # retain only clusters populated by more than 10 models from each sample
    reduced_ctable = []
    retained_clusters = []

    for i in range(num_clusters):
        if full_ctable[i][0] <= 10.0 or full_ctable[i][1] <= 10.0:
            continue
        reduced_ctable.append([full_ctable[i][0], full_ctable[i][1]])
        retained_clusters.append(i)
    return numpy.array(reduced_ctable), retained_clusters
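# The returned contingency table has one row per retained cluster and two
# columns: column 0 counts models from sample A (run1_models) and column 1
# counts models from sample B (run2_models). Clusters with 10 or fewer models
# from either sample are excluded before the chi-squared test.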


def test_sampling_convergence(contingency_table, total_num_models):
    if len(contingency_table) == 0:
        return 0.0, 1.0

    ct = numpy.transpose(contingency_table)
    chisquare, pvalue, dof, expected = scipy.stats.chi2_contingency(ct)
    if dof == 0.0:
        cramersv = 0.0
    else:
        cramersv = math.sqrt(chisquare / float(total_num_models))

    return pvalue, cramersv
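# For a 2 x k contingency table, Cramer's V reduces to sqrt(chi2 / N), where N
# is the total number of models. A large p-value and/or a small V indicates
# that the two samples populate the clusters in statistically similar
# proportions at this cutoff.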


def percent_ensemble_explained(ctable, total_num_models):
    if len(ctable) == 0:
        return 0.0
    percent_clustered = \
        float(numpy.sum(ctable, axis=0)[0] + numpy.sum(ctable, axis=0)[1]) \
        * 100.0 / float(total_num_models)
    return percent_clustered


def unpacking_wrapper(arg_tuple):
    x = arg_tuple[2:]
    all_models, run1_all_models, run2_all_models, total_num_models = x
    x = precision_cluster(*((distmat_full, ) + arg_tuple[0:2]))
    cluster_centers, cluster_members = x
    ctable, retained_clusters = get_contingency_table(
        len(cluster_centers), cluster_members, all_models,
        run1_all_models, run2_all_models)
    pval, cramersv = test_sampling_convergence(ctable, total_num_models)
    percent_explained = percent_ensemble_explained(ctable, total_num_models)
    return pval, cramersv, percent_explained


def init_foo(distmat):
    global distmat_full
    distmat_full = distmat
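# init_foo() is used as the multiprocessing.Pool initializer: each worker
# process stores the distance matrix in the module-level global distmat_full,
# so unpacking_wrapper() can read it there rather than receiving the
# (potentially large) matrix in every task's argument tuple.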


def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models,
                 run1_all_models, run2_all_models, sysname, cores):
    # Do clustering on a grid of cutoffs
    pvals = []
    cvs = []
    percents = []
    with open("%s.ChiSquare_Grid_Stats.txt" % sysname, 'w+') as f1:
        p = Pool(cores, init_foo, initargs=(distmat_full, ))
        args_list = [(total_num_models, c, all_models, run1_all_models,
                      run2_all_models, total_num_models)
                     for c in cutoffs_list]
        results = p.map(unpacking_wrapper, args_list)
        for i, x in enumerate(results):
            pvals.append(x[0])
            cvs.append(x[1])
            percents.append(x[2])
            print(cutoffs_list[i], x[0], x[1], x[2], file=f1)

    return pvals, cvs, percents
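# Each line of <sysname>.ChiSquare_Grid_Stats.txt written above holds four
# columns: the RMSD cutoff, the chi-squared test p-value, Cramer's V, and the
# percentage of models belonging to the retained clusters at that cutoff.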


def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
    sampling_precision = max(cutoffs_list)
    pval_converged = 0.0
    cramersv_converged = 1.0
    percent_converged = 0.0

    for i in range(len(cutoffs_list)):
        if percents[i] > 80.0:
            if pvals[i] > 0.05 or cvs[i] < 0.10:
                if sampling_precision > cutoffs_list[i]:
                    sampling_precision = cutoffs_list[i]
                    pval_converged = pvals[i]
                    cramersv_converged = cvs[i]
                    percent_converged = percents[i]
        else:
            sampling_precision = max(cutoffs_list)

    return (sampling_precision, pval_converged, cramersv_converged,
            percent_converged)
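# Minimal usage sketch (illustrative only): the identity-file names, gridSize
# value and `distmat` array below are hypothetical, and distmat is assumed to
# be an all-vs-all RMSD matrix whose rows and columns follow the order of
# all_models.
#
#     sampleA, sampleB = get_sample_identity("Identities_A.txt",
#                                             "Identities_B.txt")
#     all_models = sampleA + sampleB
#     total_num_models = len(all_models)
#     cutoffs = get_cutoffs_list(distmat, 1.0)
#     pvals, cvs, percents = get_clusters(cutoffs, distmat, all_models,
#                                          total_num_models, sampleA, sampleB,
#                                          "mysystem", cores=2)
#     print(get_sampling_precision(cutoffs, pvals, cvs, percents))
#
# The reported sampling precision is the smallest cutoff on the grid at which
# more than 80% of the models are clustered and the two samples are
# statistically indistinguishable (p-value > 0.05 or Cramer's V < 0.10).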