IMP logo
IMP Reference Guide  2.6.2
The Integrative Modeling Platform
cluster_coarse.py
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 import operator
5 import IMP.multifit
6 from IMP import OptionParser
7 
8 # analyse the ensemble, first we will do the rmsd stuff
9 
def parse_args():
    """Parse the command line of the clustering script.

    Exactly seven positional arguments are expected; any other count is
    reported through ``parser.error``.

    Returns:
        A two-element list ``[options, args]``: the parsed option values
        (``options.max`` is the maximum number of combinations to
        consider) and the list of positional arguments.
    """
    usage = ("usage %prog [options] <asmb.input> <proteomics.input> "
             "<mapping.input> <alignment params> <combinations> "
             "<diameter> <output combinations>\n")
    usage += "A script for clustering an ensemble of solutions"
    parser = OptionParser(usage)
    parser.add_option("-m", "--max", type="int", dest="max",
                      default=999999999,
                      help="maximum number of combinations to consider")
    options, args = parser.parse_args()
    if len(args) != 7:
        parser.error("incorrect number of arguments")
    # Returned as a list (not a tuple) to keep the original return shape.
    return [options, args]
20 
21 
def run(asmb_fn, proteomics_fn, mapping_fn, align_param_fn,
        comb_fn, diameter, output_comb_fn, max_combs):
    """Cluster an ensemble of combinations and write one per cluster.

    For every molecule, the centroid of its leaves is recorded for each of
    its paths.  Each combination is then embedded as the concatenation of
    the centroids it selects (one per molecule), the embedded points are
    grouped with a bin-based clustering, and the first member of every
    cluster (largest cluster first) is written to ``output_comb_fn``.

    Args:
        asmb_fn: assembly settings file, read with ``read_settings``.
        proteomics_fn: proteomics data file.
        mapping_fn: protein-anchors mapping file.
        align_param_fn: alignment parameters file.
        comb_fn: file holding the combinations (paths) to cluster.
        diameter: passed as the ``side`` argument of
            ``create_bin_based_clustering``.
        output_comb_fn: file the cluster representatives are written to.
        max_combs: only the first ``max_combs`` combinations are clustered.
    """
    asmb_data = IMP.multifit.read_settings(asmb_fn)
    prot_data = IMP.multifit.read_proteomics_data(proteomics_fn)
    mapping_data = IMP.multifit.read_protein_anchors_mapping(
        prot_data, mapping_fn)
    alignment_params = IMP.multifit.AlignmentParams(align_param_fn)

    # load all proteomics restraints
    align = IMP.multifit.ProteomicsEMAlignmentAtomic(
        mapping_data, asmb_data, alignment_params)
    # NOTE(review): mdl is not used below; kept as in the original --
    # confirm that the get_model() call can be dropped.
    mdl = align.get_model()
    mhs = align.get_molecules()
    ensb = IMP.multifit.Ensemble(asmb_data, mapping_data)
    for i, mh in enumerate(mhs):
        fits_fn = asmb_data.get_component_header(i).get_transformations_fn()
        ensb.add_component_and_fits(
            mh, IMP.multifit.read_fitting_solutions(fits_fn))

    num_comps = asmb_data.get_number_of_component_headers()
    mol_path_centers = []  # save the molecule centers for each path
    # iterate over the molecules
    print("NUMBER OF COMPS:", num_comps)
    for i in range(num_comps):
        mol_centers = []  # all the centers of a specific molecule
        mh_leaves = IMP.core.get_leaves(mhs[i])
        # iterate over the paths and add the center of the path
        mh_paths = mapping_data.get_paths_for_protein(
            prot_data.get_protein_name(i))
        # Every other molecule stays on path 0 while molecule i is moved
        # through each of its paths in turn.
        dummy_comb = [0] * num_comps
        for j in range(len(mh_paths)):
            dummy_comb[i] = j
            ensb.load_combination(dummy_comb)
            mol_centers.append(
                IMP.core.get_centroid(IMP.core.XYZs(mh_leaves)))
            ensb.unload_combination(dummy_comb)
        mol_path_centers.append(mol_centers)
    for i, p in enumerate(mol_path_centers):
        print("number of paths for mol:", i, "is", len(p))

    # load combinations
    combs = IMP.multifit.read_paths(comb_fn)
    comb_centroids = []
    for comb in combs[:max_combs]:
        # Concatenate the coordinates of the centroid selected for each
        # molecule into one flat list describing the combination.
        mh_c = []
        for i in range(len(mhs)):
            mh_c += mol_path_centers[i][comb[i]]
        comb_centroids.append(IMP.algebra.VectorKD(mh_c))
    embed = IMP.statistics.VectorDEmbedding(comb_centroids)
    # TODO - use your RMSD clustering
    bin_cluster = IMP.statistics.create_bin_based_clustering(embed, diameter)
    num_clusters = bin_cluster.get_number_of_clusters()
    print("number of clusters:", num_clusters)

    # One [size, cluster index, members] record per cluster, largest first.
    cluster_stat = []
    for k in range(num_clusters):
        bc = bin_cluster.get_cluster(k)
        cluster_stat.append([len(bc), k, bc])
    cluster_stat = sorted(
        cluster_stat,
        key=operator.itemgetter(0),
        reverse=True)

    cluster_reps = []
    for ind, (cluster_size, _cluster_ind, cluster_elems) in enumerate(
            cluster_stat):
        print("cluster index:", ind, "with", cluster_size, "combinations")
        # The first member of the cluster is used as its representative.
        cluster_reps.append(combs[cluster_elems[0]])
    print("============clustering============")
    print("Number of clusters found " + str(len(cluster_reps)))
    print("==================================")
    IMP.multifit.write_paths(cluster_reps, output_comb_fn)
89 
if __name__ == "__main__":
    # Script entry point: parse the command line, echo the parsed options,
    # then cluster.  The sixth positional argument (the diameter) is
    # converted to float; the others are passed through as strings.
    options, args = parse_args()
    print(options)
    run(args[0], args[1], args[2], args[3],
        args[4], float(args[5]), args[6], options.max)
An ensemble of fitting solutions.
void write_paths(const IntsList &paths, const std::string &txt_filename)
algebra::Vector3D get_centroid(const XYZs &ps)
Get the centroid.
SettingsData * read_settings(const char *filename)
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
PartitionalClusteringWithCenter * create_bin_based_clustering(Embedding *embed, double side)
Fitting atomic structures into a cryo-electron microscopy density map.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
VectorD<-1 > VectorKD
Definition: VectorD.h:411
Simply return the coordinates of a VectorD.
Definition: embeddings.h:79