IMP logo
IMP Reference Guide 2.13.0
The Integrative Modeling Platform
clustering_rmsd.py
1 from __future__ import print_function
2 import os,sys,random,numpy,math
3 import pickle
4 
5 import scipy as sp
6 from scipy import spatial
7 import scipy.stats
8 import time
9 import pyRMSD.RMSDCalculator
10 from pyRMSD.matrixHandler import MatrixHandler
11 from pyRMSD.condensedMatrix import CondensedMatrix
12 
13 
def get_sample_identity(idfile_A, idfile_B):
    """Read the two model-identity files and return the model indices per sample.

    Each line of an identity file is expected to end in "<name> <model_index>"
    after its last "/" separator; only the integer model index is kept.

    @param idfile_A  path to the identity file for sample A
    @param idfile_B  path to the identity file for sample B
    @return tuple (sampleA_models, sampleB_models) of lists of int model indices
    """
    def _read_model_ids(idfile):
        # One model index per line; the "with" block closes the file, so no
        # explicit f.close() is needed (the original had a redundant one).
        model_ids = []
        with open(idfile, 'r') as f:
            for line in f:
                mod = line.split("/")[-1]
                model_ids.append(int(mod.strip("\n").split(" ")[1]))
        return model_ids

    return _read_model_ids(idfile_A), _read_model_ids(idfile_B)
30 
31 
def get_cutoffs_list(distmat, gridSize):
    """Build the grid of clustering cutoffs spanned by the distance matrix.

    Cutoffs run from (min distance + gridSize) up to, but excluding, the
    maximum distance, in steps of gridSize.

    @param distmat   numpy array of pairwise model distances
    @param gridSize  spacing between successive cutoffs
    @return numpy array of cutoff values
    """
    lowest = distmat.min()
    highest = distmat.max()

    print("Minimum and maximum pairwise model distances:", lowest, highest)
    return numpy.arange(lowest + gridSize, highest, gridSize)
40 
41 
def precision_cluster(distmat,numModels,rmsd_cutoff):
    """Greedy neighbor-count clustering of models at a given RMSD cutoff.

    Two models are neighbors if their entry in distmat is <= rmsd_cutoff.
    Repeatedly picks the unclustered model with the most remaining neighbors
    as a cluster center, assigns its neighbors to that cluster, and removes
    them from further consideration.

    NOTE(review): the loop below mutates `neighbors` and `unclustered` in
    place and relies on list.remove() dropping the first occurrence, so the
    exact statement order matters; ties for the largest cluster are broken
    by iteration order (first one found wins).

    @param distmat      square numpy array of pairwise model distances
    @param numModels    number of models (rows/columns of distmat)
    @param rmsd_cutoff  distance threshold defining the neighbor relation
    @return (cluster_centers, cluster_members): list of center indices and a
            parallel list of member-index lists, one per cluster
    """
    #STEP 2. Populate the neighbors of a given model
    neighbors=[]
    for count in range(numModels):
        neighbors.append([count]) # model is a neighbor of itself

    inds = numpy.argwhere(distmat <= rmsd_cutoff) # set of i,j indices that pass
    for x in inds:
        i = x[0]
        j = x[1]
        # Only count each contribution once (argwhere yields both (i,j) and
        # (j,i) for a symmetric matrix, plus the i==j diagonal)
        if i>j:
            neighbors[i].append(j)
            neighbors[j].append(i)

    #STEP 3. Get the weightiest cluster, and iterate
    unclustered=[]
    boolUnclustered=[]
    for i in range(numModels):
        unclustered.append(i)
        boolUnclustered.append(True)

    cluster_members=[] # list of lists : one list per cluster
    cluster_centers=[]

    while len(unclustered)>0:
        # get cluster with maximum weight
        max_neighbors=0
        currcenter=-1
        for eachu in unclustered: # if multiple clusters have same maxweight this tie is broken arbitrarily!
            if len(neighbors[eachu])>max_neighbors:
                max_neighbors=len(neighbors[eachu])
                currcenter=eachu

        #form a new cluster with u and its neighbors
        cluster_centers.append(currcenter)
        cluster_members.append([n for n in neighbors[currcenter]])

        #update neighbors
        for n in neighbors[currcenter]:
            #removes the neighbor from the pool
            unclustered.remove(n) #first occurence of n is removed.
            boolUnclustered[n]=False # clustered

        # drop the just-clustered models from the neighbor lists of models
        # that are still unclustered, so later weights are counted correctly
        for n in neighbors[currcenter]:
            for unn in neighbors[n]: #unclustered neighbor
                if not boolUnclustered[unn]:
                    continue
                neighbors[unn].remove(n)

    return cluster_centers, cluster_members
93 
def get_contingency_table(num_clusters,cluster_members,all_models,run1_models,run2_models):
    """Count, per cluster, how many member models came from each sample.

    @param num_clusters     number of clusters (rows of the table)
    @param cluster_members  list of lists of member indices, one per cluster
    @param all_models       maps a member index to its global model index
    @param run1_models      model indices belonging to sample 1
    @param run2_models      model indices belonging to sample 2
    @return (reduced_ctable, retained_clusters): numpy array of [run1, run2]
            counts for the retained clusters, and their original cluster ids
    """
    full_ctable=numpy.zeros((num_clusters,2))

    # Sets give O(1) membership tests; run1_models/run2_models may be lists.
    run1_set = set(run1_models)
    run2_set = set(run2_models)

    for ic,cluster in enumerate(cluster_members):
        for member in cluster:
            model_index=all_models[member]
            if model_index in run1_set:
                full_ctable[ic][0]+=1.0
            elif model_index in run2_set:
                full_ctable[ic][1]+=1.0

    # Discard sparsely populated clusters: a cluster must have more than 10
    # models from EACH sample to contribute to the chi-square test.
    reduced_ctable=[]
    retained_clusters=[]
    for i in range(num_clusters):
        if full_ctable[i][0]<=10.0 or full_ctable[i][1]<=10.0:
            continue
        reduced_ctable.append([full_ctable[i][0],full_ctable[i][1]])
        retained_clusters.append(i)
    return numpy.array(reduced_ctable),retained_clusters
122 
123 def test_sampling_convergence(contingency_table,total_num_models):
124  if len(contingency_table)==0:
125  return 0.0,1.0
126 
127  ct = numpy.transpose(contingency_table)
128  [chisquare,pvalue,dof,expected]=scipy.stats.chi2_contingency(ct)
129  if dof==0.0:
130  cramersv=0.0
131  else:
132  cramersv=math.sqrt(chisquare/float(total_num_models))
133 
134  return(pvalue,cramersv)
135 
def percent_ensemble_explained(ctable,total_num_models):
    """Percentage of all models that landed in the retained clusters.

    @param ctable            per-cluster [run1, run2] counts (numpy array)
    @param total_num_models  total number of models across both samples
    @return percentage in [0, 100]; 0.0 for an empty table
    """
    if len(ctable)==0:
        return 0.0
    column_totals = numpy.sum(ctable, axis=0)
    clustered = float(column_totals[0] + column_totals[1])
    return clustered * 100.0 / float(total_num_models)
141 
def get_clusters(cutoffs_list, distmat_full, all_models, total_num_models, run1_all_models, run2_all_models, sysname):
    """Cluster the models at every cutoff on the grid and score convergence.

    For each cutoff: cluster, build the contingency table, run the chi-square
    test, and record the percent of the ensemble explained. One line per
    cutoff is appended to "<sysname>.ChiSquare_Grid_Stats.txt".

    @return (pvals, cvs, percents): lists parallel to cutoffs_list
    """
    pvals = []
    cvs = []
    percents = []
    with open("%s.ChiSquare_Grid_Stats.txt" % sysname, 'w+') as statfile:
        for cutoff in cutoffs_list:
            cluster_centers, cluster_members = precision_cluster(
                distmat_full, total_num_models, cutoff)
            ctable, retained_clusters = get_contingency_table(
                len(cluster_centers), cluster_members, all_models,
                run1_all_models, run2_all_models)
            pval, cramersv = test_sampling_convergence(ctable,
                                                       total_num_models)
            explained = percent_ensemble_explained(ctable, total_num_models)

            pvals.append(pval)
            cvs.append(cramersv)
            percents.append(explained)

            print(cutoff, pval, cramersv, explained, file=statfile)

    return pvals, cvs, percents
165 
def get_sampling_precision(cutoffs_list, pvals, cvs, percents):
    """Pick the sampling precision from the per-cutoff convergence stats.

    The sampling precision is the smallest cutoff at which the samples are
    statistically indistinguishable (p-value > 0.05 or Cramer's V < 0.10)
    while more than 80% of the ensemble is clustered. Any later cutoff whose
    clusters explain <= 80% resets the precision to the largest cutoff.

    @return (sampling_precision, pval_converged, cramersv_converged,
             percent_converged)
    """
    sampling_precision = max(cutoffs_list)
    pval_converged = 0.0
    cramersv_converged = 1.0
    percent_converged = 0.0

    for i, cutoff in enumerate(cutoffs_list):
        if percents[i] > 80.0:
            converged = pvals[i] > 0.05 or cvs[i] < 0.10
            if converged and sampling_precision > cutoff:
                sampling_precision = cutoff
                pval_converged = pvals[i]
                cramersv_converged = cvs[i]
                percent_converged = percents[i]
        else:
            # Not enough of the ensemble explained at this cutoff: discard
            # any earlier (smaller) precision and fall back to the maximum.
            sampling_precision = max(cutoffs_list)

    return sampling_precision, pval_converged, cramersv_converged, percent_converged