IMP  2.3.0
The Integrative Modeling Platform
cluster_coarse.py
1 #!/usr/bin/env python
2 
3 # analyse the ensemble, first we will do the rmsd stuff
4 import operator
5 import IMP.multifit
6 from IMP import OptionParser
7 
8 
9 def parse_args():
10  usage = "usage %prog [options] <asmb.input> <proteomics.input> <mapping.input> <alignment params> <combinatins> <diameter> <output combinations>\n"
11  usage += "A script for clustering an ensemble of solutions"
12  parser = OptionParser(usage)
13  parser.add_option("-m", "--max", type="int", dest="max", default=999999999,
14  help="maximum number of combinations to consider")
15  (options, args) = parser.parse_args()
16  if len(args) != 7:
17  parser.error("incorrect number of arguments")
18  return [options, args]
19 
20 
21 def run(asmb_fn, proteomics_fn, mapping_fn, align_param_fn,
22  comb_fn, diameter, output_comb_fn, max_combs):
23  asmb_data = IMP.multifit.read_settings(asmb_fn)
24  prot_data = IMP.multifit.read_proteomics_data(proteomics_fn)
26  prot_data, mapping_fn)
27  alignment_params = IMP.multifit.AlignmentParams(align_param_fn)
28 
29  # load all proteomics restraints
31  mapping_data, asmb_data, alignment_params)
32  mdl = align.get_model()
33  mhs = align.get_molecules()
34  ensb = IMP.multifit.Ensemble(asmb_data, mapping_data)
35  for i, mh in enumerate(mhs):
36  ensb.add_component_and_fits(mh,
37  IMP.multifit.read_fitting_solutions(asmb_data.get_component_header(i).get_transformations_fn()))
38 
39  mol_path_centers = [] # save the molecule centers for each path
40  # iterate over the molecules
41  print "NUMBER OF COMPS:", asmb_data.get_number_of_component_headers()
42  for i in range(asmb_data.get_number_of_component_headers()):
43  mol_centers = [] # all the centers of a specific molecule
44  mh_leaves = IMP.core.get_leaves(mhs[i])
45  # iterate over the paths and add the center of the path
46  mh_paths = mapping_data.get_paths_for_protein(
47  prot_data.get_protein_name(i))
48  dummy_comb = []
49  for j in range(asmb_data.get_number_of_component_headers()):
50  dummy_comb.append(0)
51  for j in range(len(mh_paths)):
52  dummy_comb[i] = j
53  ensb.load_combination(dummy_comb)
54  # print IMP.core.XYZs(mh_leaves)
55  mol_centers.append(IMP.core.get_centroid(IMP.core.XYZs(mh_leaves)))
56  ensb.unload_combination(dummy_comb)
57  mol_path_centers.append(mol_centers)
58  for i, p in enumerate(mol_path_centers):
59  print "number of paths for mol:", i, "is", len(p)
60  # load combinations
61  combs = IMP.multifit.read_paths(comb_fn)
62  comb_centroids = []
63  for comb in combs[:max_combs]:
64  mh_c = []
65  for i in range(len(mhs)):
66  mh_c += mol_path_centers[i][comb[i]]
67  comb_centroids.append(IMP.algebra.VectorKD(mh_c))
68  embed = IMP.statistics.VectorDEmbedding(comb_centroids)
69  # TODO - use your RMSD clustering
70  bin_cluster = IMP.statistics.create_bin_based_clustering(embed, diameter)
71  print "number of clusters:", bin_cluster.get_number_of_clusters()
72  cluster_stat = []
73  for k in range(bin_cluster.get_number_of_clusters()):
74  bc = bin_cluster.get_cluster(k)
75  cluster_stat.append([len(bc), k, bc])
76  cluster_stat = sorted(
77  cluster_stat,
78  key=operator.itemgetter(0),
79  reverse=True)
80  cluster_reps = []
81  for ind, [cluster_size, cluster_ind, cluster_elems] in enumerate(cluster_stat):
82  print "cluster index:", ind, "with", cluster_size, "combinations"
83  cluster_reps.append(combs[cluster_elems[0]])
84  print "============clustering============"
85  print "Number of clusters found " + str(len(cluster_reps))
86  print "=================================="
87  IMP.multifit.write_paths(cluster_reps, output_comb_fn)
88 
89 if __name__ == "__main__":
90  options, args = parse_args()
91  print options
92  run(args[0], args[1], args[2], args[3],
93  args[4], float(args[5]), args[6], options.max)
An ensemble of fitting solutions.
void write_paths(const IntsList &paths, const std::string &txt_filename)
algebra::Vector3D get_centroid(const XYZs &ps)
Get the centroid.
SettingsData * read_settings(const char *filename)
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
PartitionalClusteringWithCenter * create_bin_based_clustering(Embedding *embed, double side)
Fitting atomic structures into a cryo-electron microscopy density map.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
IMP::kernel::OptionParser OptionParser
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
VectorD<-1 > VectorKD
Definition: VectorD.h:411
Simply return the coordinates of a VectorD.
Definition: embeddings.h:79