IMP logo
IMP Reference Guide  develop.78018a392b,2024/05/07
The Integrative Modeling Platform
cluster_coarse.py
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 import operator
5 import IMP.multifit
6 from IMP import ArgumentParser
7 
8 # analyse the ensemble, first we will do the rmsd stuff
9 
10 
def parse_args():
    """Parse the command line of the clustering script.

    Returns:
        The namespace produced by the parser, with attributes
        ``assembly_file``, ``proteomics_file``, ``mapping_file``,
        ``param_file``, ``combinations_file``, ``diameter`` (float),
        ``cluster_file`` and ``max`` (int, defaults to 999999999).
    """
    desc = "A script for clustering an ensemble of solutions"
    p = ArgumentParser(description=desc)
    p.add_argument("-m", "--max", type=int, dest="max", default=999999999,
                   help="maximum number of combinations to consider")
    # Positional arguments, in the order they must appear on the command line.
    p.add_argument("assembly_file", help="assembly file name")
    p.add_argument("proteomics_file", help="proteomics file name")
    p.add_argument("mapping_file", help="mapping file name")
    p.add_argument("param_file", help="parameter file name")
    p.add_argument("combinations_file", help="combinations file name")
    p.add_argument("diameter", type=float, help="cluster diameter")
    p.add_argument("cluster_file", help="output clusters file name")
    return p.parse_args()
24 
25 
def run(asmb_fn, proteomics_fn, mapping_fn, align_param_fn,
        comb_fn, diameter, output_comb_fn, max_combs):
    """Cluster assembly combinations by the centroids of their components.

    Every combination read from ``comb_fn`` is embedded as one vector made
    of the centroid coordinates of each molecule placed on the path that
    the combination selects for it. The embedded points are bin-clustered,
    clusters are ordered by decreasing size, and the first member of each
    cluster is written to ``output_comb_fn``.

    Args:
        asmb_fn: assembly settings file name.
        proteomics_fn: proteomics data file name.
        mapping_fn: protein-anchors mapping file name.
        align_param_fn: alignment parameter file name.
        comb_fn: file holding the combinations (paths) to cluster.
        diameter: bin side handed to the bin-based clustering.
        output_comb_fn: file the cluster representatives are written to.
        max_combs: only the first ``max_combs`` combinations are clustered.
    """
    asmb_data = IMP.multifit.read_settings(asmb_fn)
    prot_data = IMP.multifit.read_proteomics_data(proteomics_fn)
    mapping_data = IMP.multifit.read_protein_anchors_mapping(
        prot_data, mapping_fn)
    alignment_params = IMP.multifit.AlignmentParams(align_param_fn)

    # load all proteomics restraints
    align = IMP.multifit.ProteomicsEMAlignmentAtomic(
        mapping_data, asmb_data, alignment_params)
    # Result is unused; the call is kept as in the original script.
    # NOTE(review): presumably needed for its side effects - confirm.
    _ = align.get_model()
    mhs = align.get_molecules()
    ensb = IMP.multifit.Ensemble(asmb_data, mapping_data)
    for i, mh in enumerate(mhs):
        ensb.add_component_and_fits(
            mh,
            IMP.multifit.read_fitting_solutions(
                asmb_data.get_component_header(i).get_transformations_fn()))

    num_comps = asmb_data.get_number_of_component_headers()
    print("NUMBER OF COMPS:", num_comps)

    # mol_path_centers[i][j] is the centroid of molecule i on its j-th path
    mol_path_centers = []
    for i in range(num_comps):
        mh_leaves = IMP.core.get_leaves(mhs[i])
        mh_paths = mapping_data.get_paths_for_protein(
            prot_data.get_protein_name(i))
        # Only entry i is varied; every other component stays on path 0.
        dummy_comb = [0] * num_comps
        mol_centers = []  # all the centers of a specific molecule
        for j in range(len(mh_paths)):
            dummy_comb[i] = j
            ensb.load_combination(dummy_comb)
            mol_centers.append(
                IMP.core.get_centroid(IMP.core.XYZs(mh_leaves)))
            ensb.unload_combination(dummy_comb)
        mol_path_centers.append(mol_centers)
    for i, p in enumerate(mol_path_centers):
        print("number of paths for mol:", i, "is", len(p))

    # load combinations
    combs = IMP.multifit.read_paths(comb_fn)
    comb_centroids = []
    for comb in combs[:max_combs]:
        mh_c = []
        for i in range(len(mhs)):
            # list += iterates the centroid, appending its coordinates
            mh_c += mol_path_centers[i][comb[i]]
        comb_centroids.append(IMP.algebra.VectorKD(mh_c))
    embed = IMP.statistics.VectorDEmbedding(comb_centroids)

    # TODO - use your RMSD clustering
    bin_cluster = IMP.statistics.create_bin_based_clustering(embed, diameter)
    print("number of clusters:", bin_cluster.get_number_of_clusters())
    cluster_stat = []
    for k in range(bin_cluster.get_number_of_clusters()):
        bc = bin_cluster.get_cluster(k)
        cluster_stat.append([len(bc), k, bc])
    # Largest clusters first; the sort is stable, so ties keep cluster order.
    cluster_stat = sorted(
        cluster_stat,
        key=operator.itemgetter(0),
        reverse=True)

    cluster_reps = []
    for ind, (cluster_size, _, cluster_elems) in enumerate(cluster_stat):
        print("cluster index:", ind, "with", cluster_size, "combinations")
        # The first member of the cluster is used as its representative.
        cluster_reps.append(combs[cluster_elems[0]])
    print("============clustering============")
    print("Number of clusters found " + str(len(cluster_reps)))
    print("==================================")
    IMP.multifit.write_paths(cluster_reps, output_comb_fn)
96 
97 
if __name__ == "__main__":
    # Script entry point: parse the command line and forward each option
    # to run() in the positional order run() declares.
    args = parse_args()
    run(args.assembly_file, args.proteomics_file, args.mapping_file,
        args.param_file, args.combinations_file, args.diameter,
        args.cluster_file, args.max)
An ensemble of fitting solutions.
void write_paths(const IntsList &paths, const std::string &txt_filename)
algebra::Vector3D get_centroid(const XYZs &ps)
Get the centroid.
SettingsData * read_settings(const char *filename)
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
PartitionalClusteringWithCenter * create_bin_based_clustering(Embedding *embed, double side)
Fitting atomic structures into a cryo-electron microscopy density map.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
VectorD<-1 > VectorKD
Definition: VectorD.h:424
Simply return the coordinates of a VectorD.
Definition: embeddings.h:79