IMP logo
IMP Reference Guide  2.20.0
The Integrative Modeling Platform
cluster.py
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 from IMP import ArgumentParser
5 import itertools
6 import math
7 import IMP.multifit
8 import IMP.container
9 
10 __doc__ = "Cluster assembly solutions."
11 
12 def get_uniques(seq):
13  # Not order preserving
14  keys = {}
15  counter = []
16  for e in seq:
17  keys[e] = 1
18  num_keys = len(list(keys.keys()))
19  for i in range(num_keys + 1):
20  counter.append([0, i])
21  for e in seq:
22  counter[e][0] = counter[e][0] + 1
23  counter = sorted(counter, reverse=True)
24  indexes = []
25  for c, i in counter[:-1]:
26  indexes.append(i)
27  return indexes
28 
29 
30 class ClusterData:
31 
32  def __init__(self, cluster_ind, cluster_size, rmsd_calculated):
33  self.cluster_ind = cluster_ind
34  self.cluster_size = cluster_size
35  self.rmsd_calculated = rmsd_calculated
36 
37  def set_distance_stats(self, avg, std):
38  self.distance_avg = avg
39  self.distance_std = std
40 
41  def set_angle_stats(self, avg, std):
42  self.angle_avg = avg
43  self.angle_std = std
44 
45  def set_rmsd_stats(self, avg, std):
46  self.rmsd_avg = avg
47  self.rmsd_std = std
48 
49  def set_best_sampled_data(self, ind, rmsd, cc, distance, angle):
50  self.best_sampled_ind = ind
51  self.best_sampled_rmsd = rmsd
52  self.best_sampled_cc = cc
53  self.best_sampled_distance = distance
54  self.best_sampled_angle = angle
55 
56  def set_best_scored_data(self, ind, rmsd, cc, distance, angle):
57  self.best_scored_ind = ind
58  self.best_scored_rmsd = rmsd
59  self.best_scored_cc = cc
60  self.best_scored_distance = distance
61  self.best_scored_angle = angle
62 
63 
65 
66  """
67  Clusters assembly models
68  - The solutions are chosen by sorting the database according to the
69  parameter orderby
70  - The models are aligned and clustered by RMSD
71  """
72 
73  def __init__(self, asmb_fn, prot_fn, map_fn, align_fn, combs_fn):
74  self.asmb_fn = asmb_fn
75  self.prot_fn = prot_fn
76  self.map_fn = map_fn
77  self.align_fn = align_fn
78  self.combs_fn = combs_fn
79 
80  self.asmb = IMP.multifit.read_settings(self.asmb_fn)
81  self.asmb.set_was_used(True)
82  self.prot_data = IMP.multifit.read_proteomics_data(self.prot_fn)
83  self.alignment_params = IMP.multifit.AlignmentParams(self.align_fn)
85  self.prot_data, self.map_fn)
86 
88  self.mapping_data, self.asmb,
89  self.alignment_params)
90  self.align.set_was_used(True)
91 
92  self.combs = IMP.multifit.read_paths(self.combs_fn)
93  self.ensmb = IMP.multifit.Ensemble(self.asmb, self.mapping_data)
94  self.ensmb.set_was_used(True)
95  self.mhs = self.align.get_molecules()
96  for i, mh in enumerate(self.mhs):
97  self.ensmb.add_component_and_fits(mh,
98  IMP.multifit.read_fitting_solutions(self.asmb.get_component_header(i).get_transformations_fn()))
99  # load the density map
100  self.dmap = IMP.em.read_map(
101  self.asmb.get_assembly_header().get_dens_fn())
102  self.dmap.set_was_used(True)
103  self.dmap.get_header().set_resolution(
104  self.asmb.get_assembly_header().get_resolution())
105  threshold = self.asmb.get_assembly_header().get_threshold()
106  self.dmap.update_voxel_size(
107  self.asmb.get_assembly_header().get_spacing())
108  self.dmap.set_origin(self.asmb.get_assembly_header().get_origin())
109  self.dmap.calcRMS()
110 
111  def do_clustering(self, max_comb_ind, max_rmsd):
112  """
113  Cluster configurations for a model based on RMSD.
114  An IMP.ConfigurationSet is built using the reference frames for
115  all of the components of the assembly for each solution
116  @param max_comb_ind Maximum number of components to consider
117  @param max_rmsd Maximum RMSD tolerated when clustering
118  """
119  import fastcluster
120  import scipy.cluster.hierarchy
121 
122  self.mdl = self.align.get_model()
123  self.all_ca = []
124  for mh in self.mhs:
125  mh_res = IMP.atom.get_by_type(mh, IMP.atom.RESIDUE_TYPE)
126  s1 = IMP.atom.Selection(mh_res)
127  s1.set_atom_types([IMP.atom.AtomType("CA")])
128  self.all_ca.append(s1.get_selected_particles())
129  # load configurations
130  self.coords = []
131  print("load configurations")
132  for combi, comb in enumerate(self.combs[:max_comb_ind]):
133  self.ensmb.load_combination(comb)
134  c1 = []
135  for mol_ca in self.all_ca:
136  mol_xyz = []
137  for ca in mol_ca:
138  mol_xyz.append(IMP.core.XYZ(ca).get_coordinates())
139  c1.append(mol_xyz)
140  self.coords.append(c1)
141  self.ensmb.unload_combination(comb)
142  self.distances = []
143  print("calculate distances")
144  for i in range(len(self.coords)):
145  for j in range(i + 1, len(self.coords)):
146  self.distances.append(
148  list(itertools.chain.from_iterable(self.coords[i])),
149  list(itertools.chain.from_iterable(self.coords[j]))))
150  print("cluster")
151  Z = fastcluster.linkage(self.distances)
152  self.cluster_inds = scipy.cluster.hierarchy.fcluster(
153  Z, max_rmsd, criterion='distance')
154  self.uniques = get_uniques(self.cluster_inds)
155  print("number of clusters", len(self.uniques))
156 
157  # return clusters by their size
158  return self.uniques
159 
161  self, model_coords, native_coords):
162  """
163  Computes the position error (placement distance) and the orientation error (placement angle) of the coordinates in model_coords respect to the coordinates in native_coords.
164  placement distance - translation between the centroids of the
165  coordinates placement angle - Angle in the axis-angle formulation of the rotation
166  aligning the two rigid bodies.
167  """
168  native_centroid = IMP.algebra.get_centroid(native_coords)
169  model_centroid = IMP.algebra.get_centroid(model_coords)
170  translation_vector = native_centroid - model_centroid
171  distance = translation_vector.get_magnitude()
172  if(len(model_coords) != len(native_coords)):
173  raise ValueError(
174  "Mismatch in the number of members %d %d " % (
175  len(model_coords),
176  len(native_coords)))
178  model_coords,
179  native_coords)
180  P = IMP.algebra.get_axis_and_angle(TT.get_rotation())
181  angle = P.second * 180. / math.pi
182  return distance, angle
183 
184  def get_cc(self, ps):
185  '''
186  bb_native = self.dmap.get_bounding_box()
187  bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps))
188  # bounding box enclosing both the particles of the native assembly
189  # and the particles of the model
190  bb_union = IMP.algebra.get_union(bb_native, bb_solution)
191  # add border of 4 voxels
192  border = 4*voxel_size
193  bottom = bb_union.get_corner(0)
194  bottom += IMP.algebra.Vector3D(-border, -border, -border)
195  top = bb_union.get_corner(1)
196  top += IMP.algebra.Vector3D(border, border, border)
197  bb_union = IMP.algebra.BoundingBox3D(bottom, top)
198  '''
199 
200  resolution = self.dmap.get_header().get_resolution()
201  voxel_size = self.dmap.get_spacing()
202 
203  map_solution = IMP.em.SampledDensityMap(self.dmap.get_header())
204  map_solution.set_particles(ps)
205  map_solution.resample()
206  map_solution.set_was_used(True)
207 
208  map_solution.calcRMS()
209  # base the calculation of the cross_correlation coefficient on the threshold
210  # for the native map, because the threshold for the map of the model changes
211  # with each model
212  # map_solution.get_header().show()
213  threshold = 0.01 # threshold AFTER normalization using calcRMS()
214  ccc = IMP.em.get_coarse_cc_coefficient(map_solution,
215  self.dmap, threshold)
216  return ccc
217 
218  def get_cluster_representative_combination(self, query_cluster_ind):
219  return self.combs[self.clusters_data[query_cluster_ind].cluster_ind]
220 
221  def get_cluster_stats(self, query_cluster_ind):
222  return self.clusters_data[query_cluster_ind]
223 
224  def do_analysis(self, max_comb_ind):
225  self.clusters_data = {}
226  for cluster_ind in self.uniques:
227  self.clusters_data[cluster_ind] = self.analyze_cluster(
228  cluster_ind, max_comb_ind)
229 
230  def analyze_cluster(self, query_cluster_ind, max_comb_ind):
231  # load reference
232  mhs_native = []
233  mhs_native_ca = []
234  mhs_native_ca_ps = []
235  calc_rmsd = True
236  for i in range(len(self.mhs)):
237  if self.asmb.get_component_header(i).get_reference_fn() == "":
238  calc_rmsd = False
239  continue
240  mhs_native.append(
242  self.asmb.get_component_header(
243  i).get_reference_fn(
244  ),
245  self.mdl))
246  s1 = IMP.atom.Selection(mhs_native[-1])
247  s1.set_atom_types([IMP.atom.AtomType("CA")])
248  mhs_native_ca.append([])
249  mhs_native_ca_ps.append([])
250  for p in s1.get_selected_particles():
251  mhs_native_ca[-1].append(IMP.core.XYZ(p).get_coordinates())
252  mhs_native_ca_ps[-1].append(IMP.core.XYZ(p))
253 
254  rmsds = []
255  distances = []
256  angles = []
257  for i in range(len(self.mhs)):
258  distances.append([])
259  angles.append([])
260  best_sampled_ind = -1
261  best_scored_ind = -1
262  voxel_size = 3 # check with javi
263  resolution = 20 # check with javi
264  counter = -1
265  for elem_ind1, cluster_ind1 in enumerate(self.cluster_inds):
266  if cluster_ind1 != query_cluster_ind:
267  continue
268  counter = counter + 1
269  if calc_rmsd:
270  rmsds.append(
272  list(itertools.chain.from_iterable(mhs_native_ca)),
273  list(itertools.chain.from_iterable(self.coords[elem_ind1]))))
274  if best_scored_ind == -1:
275  self.ensmb.load_combination(self.combs[elem_ind1])
276  best_scored_ind = counter
277  best_scored_cc = self.get_cc(
278  list(itertools.chain.from_iterable(self.all_ca)))
279  if calc_rmsd:
280  best_scored_rmsd = rmsds[-1]
281  best_sampled_ind = counter
282  best_sampled_cc = best_scored_cc
283  best_sampled_rmsd = rmsds[-1]
284  self.ensmb.unload_combination(self.combs[elem_ind1])
285  # print rmsds[-1],best_scored_rmsd
286  if calc_rmsd:
287  if rmsds[-1] < best_scored_rmsd:
288  self.ensmb.load_combination(self.combs[elem_ind1])
289  best_sampled_ind = counter
290  best_sampled_cc = self.get_cc(
291  list(itertools.chain.from_iterable(self.all_ca)))
292  best_sampled_rmsd = rmsds[-1]
293  self.ensmb.unload_combination(self.combs[elem_ind1])
294  sum_d = 0
295  sum_a = 0
296  for i in range(len(self.mhs)):
298  self.coords[elem_ind1][i],
299  mhs_native_ca[i])
300  sum_d = sum_d + d
301  sum_a = sum_a + a
302  distances[i].append(sum_d / len(self.mhs))
303  angles[i].append(sum_a / len(self.mhs))
304  cd = ClusterData(query_cluster_ind, counter + 1, calc_rmsd)
305  if calc_rmsd:
306  import numpy
307  d = numpy.array(list(itertools.chain.from_iterable(distances)))
308  a = numpy.array(list(itertools.chain.from_iterable(angles)))
309  r = numpy.array(rmsds)
310  cd.set_distance_stats(d.mean(), d.std())
311  cd.set_angle_stats(a.mean(), a.std())
312  cd.set_best_scored_data(
313  best_scored_ind,
314  best_scored_rmsd,
315  best_scored_cc,
316  d[0],
317  a[0])
318  cd.set_rmsd_stats(r.mean(), r.std())
319  cd.set_best_sampled_data(
320  best_sampled_ind,
321  best_sampled_rmsd,
322  best_sampled_cc,
323  d[best_sampled_ind],
324  a[best_sampled_ind])
325  else:
326  cd.set_best_scored_data(
327  best_scored_ind,
328  -1,
329  best_scored_cc,
330  -1,
331  -1)
332  return cd
333 
334 
335 def usage():
336  desc = """
337 Clustering of assembly solutions.
338 
339 This program uses the Python 'fastcluster' module, which can be obtained from
340 http://math.stanford.edu/~muellner/fastcluster.html
341 """
342  p = ArgumentParser(description=desc)
343  p.add_argument("-m", "--max", type=int, dest="max", default=999999999,
344  help="maximum solutions to consider")
345  p.add_argument("-r", "--rmsd", type=float, dest="rmsd", default=5,
346  help="maximum rmsd within a cluster")
347  p.add_argument("assembly_file", help="assembly file name")
348  p.add_argument("proteomics_file", help="proteomics file name")
349  p.add_argument("mapping_file", help="mapping file name")
350  p.add_argument("param_file", help="parameter file name")
351  p.add_argument("combinations_file", help="combinations file name")
352  p.add_argument("cluster_file", help="output clusters file name")
353 
354  return p.parse_args()
355 
356 
357 def main():
358  IMP.set_log_level(IMP.WARNING)
359  args = usage()
360 
361  clust_engine = AlignmentClustering(
362  args.assembly_file,
363  args.proteomics_file,
364  args.mapping_file,
365  args.param_file,
366  args.combinations_file)
367  clusters = clust_engine.do_clustering(args.max, args.rmsd)
368  cluster_representatives = []
369  print("clustering completed")
370  print("start analysis")
371  clust_engine.do_analysis(args.max)
372  repr_combs = []
373  for cluster_ind in clust_engine.uniques:
374  repr_combs.append(
375  clust_engine.get_cluster_representative_combination(cluster_ind))
376  IMP.multifit.write_paths(repr_combs, args.cluster_file)
377  # print the clusters data
378  for cluster_ind in clust_engine.uniques:
379  info = clust_engine.get_cluster_stats(cluster_ind)
380  repr_combs.append(
381  clust_engine.get_cluster_representative_combination(cluster_ind))
382  print("==========Cluster index:", info.cluster_ind, "size:", info.cluster_size)
383  if info.rmsd_calculated:
384  print("best sampled in cluster (index,cc,distance,angle,rmsd):", info.best_sampled_ind, info.best_sampled_cc, info.best_sampled_distance, info.best_sampled_angle, info.best_sampled_rmsd)
385  if info.rmsd_calculated:
386  print("cluster representative (index,cc,distance,angle,rmsd):", info.best_scored_ind, info.best_scored_cc, info.best_scored_distance, info.best_scored_angle, info.best_scored_rmsd)
387  else:
388  print("cluster representative (index,cc):", info.best_scored_ind, info.best_scored_cc)
389 
390 
391 if __name__ == "__main__":
392  main()
An ensemble of fitting solutions.
double get_coarse_cc_coefficient(const DensityMap *grid1, const DensityMap *grid2, double grid2_voxel_data_threshold, bool allow_padding=false, FloatPair norm_factors=FloatPair(0., 0.))
Calculates the cross correlation coefficient between two maps.
Various classes to hold sets of particles.
Clusters assembly models.
Definition: cluster.py:64
void write_paths(const IntsList &paths, const std::string &txt_filename)
The type of an atom.
SettingsData * read_settings(const char *filename)
void read_pdb(TextInput input, int model, Hierarchy h)
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
Class for sampling a density map from particles.
double get_rmsd(const Selection &s0, const Selection &s1)
Fitting atomic structures into a cryo-electron microscopy density map.
std::pair< Vector3D, double > get_axis_and_angle(const Rotation3D &rot)
Decompose a Rotation3D object into a rotation around an axis.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
def get_placement_score_from_coordinates
Computes the position error (placement distance) and the orientation error (placement angle) of the c...
Definition: cluster.py:160
def do_clustering
Cluster configurations for a model based on RMSD.
Definition: cluster.py:111
IMP-specific subclass of argparse.ArgumentParser.
Definition: __init__.py:9596
Vector3D get_centroid(const Vector3Ds &ps)
Return the centroid of a set of vectors.
Definition: Vector3D.h:68
void set_log_level(LogLevel l)
Set the current global log level.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
def get_cc
bb_native = self.dmap.get_bounding_box() bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps)) bounding box enclosing both the particles of the native assemblyand the particles of the modelbb_union = IMP.algebra.get_union(bb_native, bb_solution) add border of 4 voxelsborder = 4*voxel_size bottom = bb_union.get_corner(0) bottom += IMP.algebra.Vector3D(-border, -border, -border) top = bb_union.get_corner(1) top += IMP.algebra.Vector3D(border, border, border) bb_union = IMP.algebra.BoundingBox3D(bottom, top)
Definition: cluster.py:184
double get_resolution(Model *m, ParticleIndex pi)
Estimate the resolution of the hierarchy as used by Representation.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70