IMP  2.0.0
The Integrative Modeling Platform
cluster.py
1 #!/usr/bin/env python
2 
3 __doc__ = "Cluster assembly solutions."
4 
5 from optparse import OptionParser
6 import itertools
7 import math
8 import IMP.multifit
9 import IMP.container
10 
11 def get_uniques(seq):
12  # Not order preserving
13  keys = {}
14  counter=[]
15  for e in seq:
16  keys[e] = 1
17  num_keys=len(keys.keys())
18  for i in range(num_keys+1):
19  counter.append([0,i])
20  for e in seq:
21  counter[e][0]=counter[e][0]+1
22  counter=sorted(counter,reverse=True)
23  indexes=[]
24  for c,i in counter[:-1]:
25  indexes.append(i)
26  return indexes
27 
28 
29 class ClusterData:
30  def __init__(self,cluster_ind,cluster_size,rmsd_calculated):
31  self.cluster_ind=cluster_ind
32  self.cluster_size=cluster_size
33  self.rmsd_calculated=rmsd_calculated
34  def set_distance_stats(self,avg,std):
35  self.distance_avg=avg
36  self.distance_std=std
37  def set_angle_stats(self,avg,std):
38  self.angle_avg=avg
39  self.angle_std=std
40  def set_rmsd_stats(self,avg,std):
41  self.rmsd_avg=avg
42  self.rmsd_std=std
43  def set_best_sampled_data(self,ind,rmsd,cc,distance,angle):
44  self.best_sampled_ind=ind
45  self.best_sampled_rmsd=rmsd
46  self.best_sampled_cc=cc
47  self.best_sampled_distance=distance
48  self.best_sampled_angle=angle
49  def set_best_scored_data(self,ind,rmsd,cc,distance,angle):
50  self.best_scored_ind=ind
51  self.best_scored_rmsd=rmsd
52  self.best_scored_cc=cc
53  self.best_scored_distance=distance
54  self.best_scored_angle=angle
55 
57  """
58  Clusters assembly models
59  - The solutions are chosen by sorting the database according to the
60  parameter orderby
61  - The models are aligned and clustered by RMSD
62  """
63  def __init__(self, asmb_fn, prot_fn, map_fn, align_fn, combs_fn):
64  self.asmb_fn = asmb_fn
65  self.prot_fn = prot_fn
66  self.map_fn = map_fn
67  self.align_fn = align_fn
68  self.combs_fn = combs_fn
69 
70  self.asmb=IMP.multifit.read_settings(self.asmb_fn)
71  self.asmb.set_was_used(True)
72  self.prot_data=IMP.multifit.read_proteomics_data(self.prot_fn)
73  self.alignment_params = IMP.multifit.AlignmentParams(self.align_fn)
74  self.mapping_data=IMP.multifit.read_protein_anchors_mapping(self.prot_data,self.map_fn)
75 
76  self.align=IMP.multifit.ProteomicsEMAlignmentAtomic(self.mapping_data,self.asmb,
77  self.alignment_params)
78  self.align.set_was_used(True)
79 
80  self.combs=IMP.multifit.read_paths(self.combs_fn)
81  self.ensmb=IMP.multifit.Ensemble(self.asmb,self.mapping_data)
82  self.ensmb.set_was_used(True)
83  self.mhs=self.align.get_molecules()
84  for i,mh in enumerate(self.mhs):
85  self.ensmb.add_component_and_fits(mh,
86  IMP.multifit.read_fitting_solutions(self.asmb.get_component_header(i).get_transformations_fn()))
87  #load the density map
88  self.dmap=IMP.em.read_map(self.asmb.get_assembly_header().get_dens_fn())
89  self.dmap.set_was_used(True)
90  self.dmap.get_header().set_resolution(
91  self.asmb.get_assembly_header().get_resolution())
92  threshold=self.asmb.get_assembly_header().get_threshold()
93  self.dmap.update_voxel_size(self.asmb.get_assembly_header().get_spacing())
94  self.dmap.set_origin(self.asmb.get_assembly_header().get_origin())
95  self.dmap.calcRMS()
96  def do_clustering(self,max_comb_ind,max_rmsd):
97  """
98  Cluster configurations for a model based on RMSD.
99  An IMP.ConfigurationSet is built using the reference frames for
100  all of the components of the assembly for each solution
101  @param max_comb_ind Maximum number of components to consider
102  @param max_rmsd Maximum RMSD tolerated when clustering
103  """
104  import fastcluster
105  import scipy.cluster.hierarchy
106 
107  self.mdl = self.align.get_model()
108  self.all_ca=[]
109  for mh in self.mhs:
110  mh_res=IMP.atom.get_by_type(mh,IMP.atom.RESIDUE_TYPE)
111  s1=IMP.atom.Selection(mh_res);
112  s1.set_atom_types([IMP.atom.AtomType("CA")])
113  self.all_ca.append(s1.get_selected_particles())
114  #load configurations
115  self.coords=[]
116  print "load configurations"
117  for combi,comb in enumerate(self.combs[:max_comb_ind]):
118  self.ensmb.load_combination(comb)
119  c1=[]
120  for mol_ca in self.all_ca:
121  mol_xyz=[]
122  for ca in mol_ca:
123  mol_xyz.append(IMP.core.XYZ(ca).get_coordinates())
124  c1.append(mol_xyz)
125  self.coords.append(c1)
126  self.ensmb.unload_combination(comb)
127  self.distances=[]
128  print "calculate distances"
129  for i in range(len(self.coords)):
130  for j in range(i+1,len(self.coords)):
131  self.distances.append(IMP.atom.get_rmsd(list(itertools.chain.from_iterable(self.coords[i])),
132  list(itertools.chain.from_iterable(self.coords[j]))))
133  print "cluster"
134  Z=fastcluster.linkage(self.distances)
135  self.cluster_inds=scipy.cluster.hierarchy.fcluster(Z,max_rmsd,criterion='distance')
136  self.uniques=get_uniques(self.cluster_inds)
137  print "number of clusters",len(self.uniques)
138 
139  #return clusters by their size
140  return self.uniques
141 
142 
143  def get_placement_score_from_coordinates(self,model_coords, native_coords):
144  """
145  Computes the position error (placement distance) and the orientation error (placement angle) of the coordinates in model_coords respect to the coordinates in native_coords.
146  placement distance - translation between the centroids of the
147  coordinates placement angle - Angle in the axis-angle formulation of the rotation
148  aligning the two rigid bodies.
149  """
150  native_centroid = IMP.algebra.get_centroid(native_coords)
151  model_centroid = IMP.algebra.get_centroid(model_coords)
152  translation_vector = native_centroid - model_centroid
153  distance = translation_vector.get_magnitude()
154  if(len(model_coords) != len(native_coords) ):
155  raise ValueError(
156  "Mismatch in the number of members %d %d " % (
157  len(model_coords),
158  len(native_coords)) )
160  native_coords)
161  P = IMP.algebra.get_axis_and_angle( TT.get_rotation() )
162  angle = P.second*180./math.pi
163  return distance, angle
164 
165 
166  def get_cc(self,ps):
167  '''
168  bb_native = self.dmap.get_bounding_box()
169  bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps))
170  # bounding box enclosing both the particles of the native assembly
171  # and the particles of the model
172  bb_union = IMP.algebra.get_union(bb_native, bb_solution)
173  # add border of 4 voxels
174  border = 4*voxel_size
175  bottom = bb_union.get_corner(0)
176  bottom += IMP.algebra.Vector3D(-border, -border, -border)
177  top = bb_union.get_corner(1)
178  top += IMP.algebra.Vector3D(border, border, border)
179  bb_union = IMP.algebra.BoundingBox3D(bottom, top)
180  '''
181 
182  resolution = self.dmap.get_header().get_resolution()
183  voxel_size = self.dmap.get_spacing()
184 
185  map_solution = IMP.em.SampledDensityMap(self.dmap.get_header())
186  map_solution.set_particles(ps)
187  map_solution.resample()
188  map_solution.set_was_used(True)
189 
190  map_solution.calcRMS()
191  coarse_cc = IMP.em.CoarseCC()
192  coarse_cc.set_was_used(True)
193  # base the calculation of the cross_correlation coefficient on the threshold
194  # for the native map, because the threshold for the map of the model changes
195  # with each model
196  #map_solution.get_header().show()
197  threshold = 0.01 # threshold AFTER normalization using calcRMS()
198  ccc = coarse_cc.cross_correlation_coefficient(map_solution,
199  self.dmap, threshold)
200  return ccc
201 
202 
203  def get_cluster_representative_combination(self,query_cluster_ind):
204  return self.combs[self.clusters_data[query_cluster_ind].cluster_ind]
205 
206  def get_cluster_stats(self,query_cluster_ind):
207  return self.clusters_data[query_cluster_ind]
208 
209  def do_analysis(self,max_comb_ind):
210  self.clusters_data={}
211  for cluster_ind in self.uniques:
212  self.clusters_data[cluster_ind]=self.analyze_cluster(cluster_ind,max_comb_ind)
213 
214  def analyze_cluster(self,query_cluster_ind,max_comb_ind):
215  #load reference
216  mhs_native=[]
217  mhs_native_ca=[]
218  mhs_native_ca_ps=[]
219  calc_rmsd=True
220  for i in range(len(self.mhs)):
221  if self.asmb.get_component_header(i).get_reference_fn() == "":
222  calc_rmsd=False
223  continue
224  mhs_native.append(IMP.atom.read_pdb(self.asmb.get_component_header(i).get_reference_fn(),self.mdl))
225  s1=IMP.atom.Selection(mhs_native[-1]);
226  s1.set_atom_types([IMP.atom.AtomType("CA")])
227  mhs_native_ca.append([])
228  mhs_native_ca_ps.append([])
229  for p in s1.get_selected_particles():
230  mhs_native_ca[-1].append(IMP.core.XYZ(p).get_coordinates())
231  mhs_native_ca_ps[-1].append(IMP.core.XYZ(p))
232 
233  rmsds=[]
234  distances=[]
235  angles=[]
236  for i in range(len(self.mhs)):
237  distances.append([])
238  angles.append([])
239  best_sampled_ind=-1
240  best_scored_ind=-1
241  voxel_size=3 #check with javi
242  resolution=20 #check with javi
243  counter=-1
244  for elem_ind1,cluster_ind1 in enumerate(self.cluster_inds):
245  if cluster_ind1 != query_cluster_ind:
246  continue
247  counter=counter+1
248  if calc_rmsd:
249  rmsds.append(IMP.atom.get_rmsd(list(itertools.chain.from_iterable(mhs_native_ca)),
250  list(itertools.chain.from_iterable(self.coords[elem_ind1]))))
251  if best_scored_ind==-1:
252  self.ensmb.load_combination(self.combs[elem_ind1])
253  best_scored_ind=counter
254  best_scored_cc=self.get_cc(
255  list(itertools.chain.from_iterable(self.all_ca)))
256  if calc_rmsd:
257  best_scored_rmsd = rmsds[-1]
258  best_sampled_ind=counter
259  best_sampled_cc=best_scored_cc
260  best_sampled_rmsd=rmsds[-1]
261  self.ensmb.unload_combination(self.combs[elem_ind1])
262  #print rmsds[-1],best_scored_rmsd
263  if calc_rmsd:
264  if rmsds[-1]<best_scored_rmsd:
265  self.ensmb.load_combination(self.combs[elem_ind1])
266  best_sampled_ind=counter
267  best_sampled_cc=self.get_cc(
268  list(itertools.chain.from_iterable(self.all_ca)))
269  best_sampled_rmsd=rmsds[-1]
270  self.ensmb.unload_combination(self.combs[elem_ind1])
271  sum_d=0;
272  sum_a=0
273  for i in range(len(self.mhs)):
274  [d,a]=self.get_placement_score_from_coordinates(self.coords[elem_ind1][i], \
275  mhs_native_ca[i])
276  sum_d=sum_d+d
277  sum_a=sum_a+a
278  distances[i].append(sum_d/len(self.mhs))
279  angles[i].append(sum_a/len(self.mhs))
280  cd=ClusterData(query_cluster_ind,counter+1,calc_rmsd)
281  if calc_rmsd:
282  import numpy
283  d = numpy.array(list(itertools.chain.from_iterable(distances)))
284  a = numpy.array(list(itertools.chain.from_iterable(angles)))
285  r = numpy.array(rmsds)
286  cd.set_distance_stats(d.mean(),d.std())
287  cd.set_angle_stats(a.mean(),a.std())
288  cd.set_best_scored_data(best_scored_ind, best_scored_rmsd,best_scored_cc,d[0],a[0])
289  cd.set_rmsd_stats(r.mean(),r.std())
290  cd.set_best_sampled_data(best_sampled_ind,best_sampled_rmsd,best_sampled_cc,d[best_sampled_ind],a[best_sampled_ind])
291  else:
292  cd.set_best_scored_data(best_scored_ind,-1,best_scored_cc,-1,-1)
293  return cd
294 
295 def usage():
296  usage = """%prog [options] <asmb> <asmb.proteomics> <asmb.mapping>
297  <alignment.params> <combinations> <output: clustered combinations>
298 
299 Clustering of assembly solutions.
300 
301 This program uses the Python 'fastcluster' module, which can be obtained from
302 http://math.stanford.edu/~muellner/fastcluster.html
303 """
304  parser = OptionParser(usage)
305  parser.add_option("-m", "--max", type="int", dest="max", default=999999999,
306  help="maximum solutions to consider")
307  parser.add_option("-r", "--rmsd", type="float", dest="rmsd", default=5,
308  help="maximum rmsd within a cluster")
309  options, args = parser.parse_args()
310  if len(args) !=6:
311  parser.error("incorrect number of arguments")
312  return options,args
313 
314 def main():
315  IMP.base.set_log_level(IMP.WARNING)
316  options,args = usage()
317  asmb_fn = args[0]
318  prot_fn = args[1]
319  map_fn = args[2]
320  align_fn = args[3]
321  combs_fn = args[4]
322  output_fn = args[5]
323 
324  clust_engine = AlignmentClustering(asmb_fn,prot_fn,map_fn,align_fn,combs_fn)
325  clusters=clust_engine.do_clustering(options.max,options.rmsd)
326  cluster_representatives=[]
327  print "clustering completed"
328  print "start analysis"
329  clust_engine.do_analysis(options.max)
330  repr_combs=[]
331  for cluster_ind in clust_engine.uniques:
332  repr_combs.append(clust_engine.get_cluster_representative_combination(cluster_ind))
333  IMP.multifit.write_paths(repr_combs,output_fn)
334  #print the clusters data
335  for cluster_ind in clust_engine.uniques:
336  info=clust_engine.get_cluster_stats(cluster_ind)
337  repr_combs.append(clust_engine.get_cluster_representative_combination(cluster_ind))
338  print "==========Cluster index:",info.cluster_ind,"size:",info.cluster_size
339  if info.rmsd_calculated:
340  print "best sampled in cluster (index,cc,distance,angle,rmsd):",info.best_sampled_ind,info.best_sampled_cc,info.best_sampled_distance,info.best_sampled_angle,info.best_sampled_rmsd
341  if info.rmsd_calculated:
342  print "cluster representative (index,cc,distance,angle,rmsd):",info.best_scored_ind,info.best_scored_cc,info.best_scored_distance,info.best_scored_angle,info.best_scored_rmsd
343  else:
344  print "cluster representative (index,cc):",info.best_scored_ind,info.best_scored_cc
345 
346 
347 if __name__ == "__main__":
348  main()