IMP logo
IMP Reference Guide  2.20.0
The Integrative Modeling Platform
cluster_coarse.py
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 import operator
5 import IMP.multifit
6 from IMP import ArgumentParser
7 
8 # analyse the ensemble, first we will do the rmsd stuff
9 
def parse_args():
    """Parse the command line of the clustering script.

    Registers one optional flag (``-m``/``--max``) followed by seven
    positional arguments and returns the namespace produced by the parser.
    """
    parser = ArgumentParser(
        description="A script for clustering an ensemble of solutions")
    parser.add_argument(
        "-m", "--max", dest="max", type=int, default=999999999,
        help="maximum number of combinations to consider")

    # Positional arguments, listed in the order they must appear on the
    # command line; each entry is (name, keyword options for add_argument).
    positionals = (
        ("assembly_file", {"help": "assembly file name"}),
        ("proteomics_file", {"help": "proteomics file name"}),
        ("mapping_file", {"help": "mapping file name"}),
        ("param_file", {"help": "parameter file name"}),
        ("combinations_file", {"help": "combinations file name"}),
        ("diameter", {"type": float, "help": "cluster diameter"}),
        ("cluster_file", {"help": "output clusters file name"}),
    )
    for arg_name, options in positionals:
        parser.add_argument(arg_name, **options)

    return parser.parse_args()
23 
24 
def run(asmb_fn, proteomics_fn, mapping_fn, align_param_fn,
        comb_fn, diameter, output_comb_fn, max_combs):
    """Cluster an ensemble of combinations and write one per cluster.

    Every combination read from ``comb_fn`` is embedded as a single vector
    built from the centroids of its components. The vectors are grouped
    with IMP's bin-based clustering, and the first member of each cluster
    (largest cluster first) is written to ``output_comb_fn``.

    Args:
        asmb_fn: assembly settings file, read with ``read_settings``.
        proteomics_fn: proteomics file, read with ``read_proteomics_data``.
        mapping_fn: protein-anchors mapping file.
        align_param_fn: file used to build ``AlignmentParams``.
        comb_fn: file holding the combinations (paths) to cluster.
        diameter: passed to ``create_bin_based_clustering`` as the bin side.
        output_comb_fn: file the cluster representatives are written to.
        max_combs: only the first ``max_combs`` combinations are embedded.
    """
    asmb_data = IMP.multifit.read_settings(asmb_fn)
    prot_data = IMP.multifit.read_proteomics_data(proteomics_fn)
    mapping_data = IMP.multifit.read_protein_anchors_mapping(
        prot_data, mapping_fn)
    alignment_params = IMP.multifit.AlignmentParams(align_param_fn)

    # load all proteomics restraints
    align = IMP.multifit.ProteomicsEMAlignmentAtomic(
        mapping_data, asmb_data, alignment_params)
    # The returned model is not used below; the call itself is retained
    # from the original script.
    align.get_model()
    mhs = align.get_molecules()

    ensb = IMP.multifit.Ensemble(asmb_data, mapping_data)
    for i, mh in enumerate(mhs):
        fits_fn = asmb_data.get_component_header(i).get_transformations_fn()
        ensb.add_component_and_fits(
            mh, IMP.multifit.read_fitting_solutions(fits_fn))

    n_components = asmb_data.get_number_of_component_headers()
    print("NUMBER OF COMPS:", n_components)

    # mol_path_centers[i][j] is the centroid of molecule i when path j is
    # loaded for it.
    mol_path_centers = []
    for i in range(n_components):
        mh_leaves = IMP.core.get_leaves(mhs[i])
        mh_paths = mapping_data.get_paths_for_protein(
            prot_data.get_protein_name(i))
        # Every other component stays on index 0; only entry i is varied.
        dummy_comb = [0] * n_components
        mol_centers = []
        for j in range(len(mh_paths)):
            dummy_comb[i] = j
            ensb.load_combination(dummy_comb)
            mol_centers.append(
                IMP.core.get_centroid(IMP.core.XYZs(mh_leaves)))
            ensb.unload_combination(dummy_comb)
        mol_path_centers.append(mol_centers)
    for i, centers in enumerate(mol_path_centers):
        print("number of paths for mol:", i, "is", len(centers))

    # load combinations
    combs = IMP.multifit.read_paths(comb_fn)
    comb_centroids = []
    for comb in combs[:max_combs]:
        coords = []
        for i in range(len(mhs)):
            # In-place += extends the list with whatever iterating the
            # centroid yields (presumably its x, y, z values - confirm).
            coords += mol_path_centers[i][comb[i]]
        comb_centroids.append(IMP.algebra.VectorKD(coords))
    embed = IMP.statistics.VectorDEmbedding(comb_centroids)

    # TODO - use your RMSD clustering
    bin_cluster = IMP.statistics.create_bin_based_clustering(embed, diameter)
    n_clusters = bin_cluster.get_number_of_clusters()
    print("number of clusters:", n_clusters)

    # Largest clusters first; sorted() is stable, so clusters of equal size
    # keep their original relative order.
    clusters = sorted(
        (bin_cluster.get_cluster(k) for k in range(n_clusters)),
        key=len, reverse=True)

    cluster_reps = []
    for ind, cluster_elems in enumerate(clusters):
        print("cluster index:", ind, "with", len(cluster_elems),
              "combinations")
        # The first element of the cluster is used as its representative;
        # cluster elements index into the embedded prefix of combs.
        cluster_reps.append(combs[cluster_elems[0]])
    print("============clustering============")
    print("Number of clusters found " + str(len(cluster_reps)))
    print("==================================")
    IMP.multifit.write_paths(cluster_reps, output_comb_fn)
92 
if __name__ == "__main__":
    # Script entry point: forward the parsed command-line options to run().
    opts = parse_args()
    run(
        asmb_fn=opts.assembly_file,
        proteomics_fn=opts.proteomics_file,
        mapping_fn=opts.mapping_file,
        align_param_fn=opts.param_file,
        comb_fn=opts.combinations_file,
        diameter=opts.diameter,
        output_comb_fn=opts.cluster_file,
        max_combs=opts.max,
    )
An ensemble of fitting solutions.
void write_paths(const IntsList &paths, const std::string &txt_filename)
algebra::Vector3D get_centroid(const XYZs &ps)
Get the centroid.
SettingsData * read_settings(const char *filename)
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
PartitionalClusteringWithCenter * create_bin_based_clustering(Embedding *embed, double side)
Fitting atomic structures into a cryo-electron microscopy density map.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
VectorD<-1 > VectorKD
Definition: VectorD.h:441
Simply return the coordinates of a VectorD.
Definition: embeddings.h:79