IMP  2.0.0
The Integrative Modeling Platform
emagefit_cluster.py
1 #!/usr/bin/env python
2 
3 import IMP
4 import IMP.em2d as em2d
5 import IMP.em2d.utility as utility
6 import IMP.em2d.imp_general.io as io
7 import IMP.em2d.imp_general.representation as representation
8 import IMP.em2d.solutions_io as solutions_io
9 import IMP.em2d.Database as Database
10 
11 import IMP.statistics as stats
12 import IMP.container as container
13 import IMP.atom as atom
14 import IMP.core as core
15 import IMP.algebra as alg
16 
17 import sys
18 import os
19 import time
20 import logging
21 log = logging.getLogger("cluster_solutions")
22 
23 
24 class AlignmentClustering:
25  """
26  Clusters solutions present in a database.
27  - The solutions are chosen by sorting the database according to the
28  parameter orderby
29  - The models are aligned and clustered by RMSD
30  """
31  def __init__(self, exp):
32  """
33  @param exp an Experiment class containing the names of the pdb files
34  """
35  self.exp = exp
36 
37  def cluster(self, fn_database, n_solutions, orderby, max_rmsd):
38  """
39  @param fn_database Database of results
40  @param n_solutions Number of solutions to use for clustering
41  @param orderby Measure used to order solutions
42  @param max_rmsd See do_clustering()
43  """
44  log.debug("Call to cluster()")
45 # sys.exit()
46  db = solutions_io.ResultsDB()
47  db.connect(fn_database)
48  fields = ["reference_frames","solution_id" ]
49  data = db.get_solutions( fields, n_solutions,orderby)
50  db.close()
51  self.solution_ids = [row[1] for row in data]
52  # reference frames for each configuration
53  confs_RFs = []
54  for row in data:
55  rs = row[0].split("/")
56  RFs = [io.TextToReferenceFrame(r).get_reference_frame() for r in rs]
57  confs_RFs.append(RFs)
58  self.do_clustering(confs_RFs, max_rmsd)
59 
60  def do_clustering(self, confs_RFs, max_rmsd):
61  """
62  Cluster configurations for a model based on RMSD.
63  An IMP.ConfigurationSet is built using the reference frames for
64  of the components of the assembly for each solution
65  @param confs_RFs A lsit containing a tuples of reference frames.
66  Each tuple contains the reference frame for the rigid body
67  of one component of the assembly
68  @param max_rmsd Maximum RMSD tolerated when clustering
69  """
70  model = IMP.Model()
71  assembly = representation.create_assembly(model, self.exp.fn_pdbs)
72  rbs = representation.create_rigid_bodies(assembly)
73  configuration_set = IMP.ConfigurationSet(model)
74  for RFs in confs_RFs:
75  representation.set_reference_frames(rbs, RFs)
76  configuration_set.save_configuration()
77  particles_container = container.ListSingletonContainer(model)
78  particles_container.add_particles(atom.get_leaves(assembly))
79  metric = stats.ConfigurationSetRMSDMetric(
80  configuration_set,particles_container, True)
81  log.info("Clustering ... ")
82  maximum_centrality = 10
83  self.pclus = stats.create_centrality_clustering( metric, max_rmsd,
84  maximum_centrality)
85  n = self.pclus.get_number_of_clusters()
86  log.info("Number of clusters found: %s", n)
87 
88 
89  def store_clusters(self, fn_database, tbl="clusters"):
90  """
91  Store the clusters in the database.
92  The database does not necessarily has to be the same database
93  used to read the solutions
94  @param fn_database Database where the clusters are written
95  @param tbl Table of the database where the clusters are written
96  """
97  if not hasattr(self, "pclus"):
98  raise ValueError("Clustering not performed")
99  db = solutions_io.ResultsDB()
100  if not os.path.exists(fn_database):
101  db.create(fn_database)
102  db.connect(fn_database)
103  db.add_clusters_table(tbl)
104  n_clusters = self.pclus.get_number_of_clusters()
105  clusters_data = []
106  for i in range(n_clusters):
107  # IDs of the solutions *according* to the clustering algorithm
108  elements = self.pclus.get_cluster(i)
109  r = self.pclus.get_cluster_representative(i)
110  n_elements = len(elements)
111  # IDs of the solutions as stored in the database
112  solution_ids = [self.solution_ids[k] for k in elements]
113  elements = "|".join( map(str, elements) )
114  solution_ids = "|".join( map(str, solution_ids) )
115  db.add_cluster_record( i, n_elements, r, elements, solution_ids)
116  db.store_cluster_data()
117  db.close()
118 
119 if __name__ == "__main__":
120 
121  parser = IMP.OptionParser(imp_module=em2d,
122  description= \
123  "Clusters the best solutions contained in the database, and writes a "
124  " new table in the database containing the clusters ids and members")
125  parser.add_option("--exp",
126  dest="experiment",
127  default=None,
128  help="File describing an experiment ")
129  parser.add_option("--db",
130  dest="fn_database",
131  help="Database of results")
132  parser.add_option("--o",
133  dest="fn_output_db",
134  default="clusters.db",
135  help="Database file to store the clusters obtained. It "\
136  "can be the same one containing the solutions")
137  parser.add_option("--n",
138  dest="n_solutions",
139  type=int,
140  default = 10,
141  help="Number of solutions to cluster")
142  parser.add_option("--orderby",
143  dest="orderby",
144  default = None,
145  help="Sor the solutions according to this measure before "\
146  "clustering")
147  parser.add_option("--log",
148  dest="log",
149  default=None,
150  help="File for logging")
151  parser.add_option("--rmsd",
152  type=float,
153  dest="max_rmsd",
154  default=10,
155  help="Maximum rmsd centroids to define clusters")
156 
157  args = parser.parse_args()
158  args = args[0]
159  if(len(sys.argv) == 1):
160  parser.print_help()
161  sys.exit()
162  if(args.log):
163  logging.basicConfig(filename=args.log, filemode="w")
164  else:
165  logging.basicConfig(stream=sys.stdout)
166  logging.root.setLevel(logging.DEBUG)
167 
168  if(args.fn_database):
169  if(not args.n_solutions or not args.orderby):
170  raise ValueError("parameters --n and --orderby required")
171  exp = utility.get_experiment_params(args.experiment)
172  tc = AlignmentClustering(exp)
173  tc.cluster(args.fn_database, args.n_solutions, args.orderby,
174  args.max_rmsd)
175  tc.store_clusters(args.fn_output_db, "clusters")