IMP logo
IMP Reference Guide  develop.63b38c487d,2024/12/22
The Integrative Modeling Platform
cluster_coarse.py
1 #!/usr/bin/env python
2 
3 import operator
4 import IMP.multifit
5 from IMP import ArgumentParser
6 
7 # analyse the ensemble, first we will do the rmsd stuff
8 
9 
def parse_args():
    """Build the command-line parser and parse the process arguments.

    The command line takes, in order: the assembly, proteomics, mapping,
    parameter and combinations file names, the cluster diameter (float),
    and the output clusters file name.  The optional ``-m/--max`` integer
    caps the number of combinations to consider.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    parser = ArgumentParser(
        description="A script for clustering an ensemble of solutions")
    parser.add_argument(
        "-m", "--max", dest="max", type=int, default=999999999,
        help="maximum number of combinations to consider")

    # Plain string positionals; registration order defines command-line
    # order, so this sequence must not be rearranged.
    input_files = (
        ("assembly_file", "assembly file name"),
        ("proteomics_file", "proteomics file name"),
        ("mapping_file", "mapping file name"),
        ("param_file", "parameter file name"),
        ("combinations_file", "combinations file name"),
    )
    for arg_name, help_text in input_files:
        parser.add_argument(arg_name, help=help_text)

    parser.add_argument("diameter", type=float, help="cluster diameter")
    parser.add_argument("cluster_file", help="output clusters file name")
    return parser.parse_args()
23 
24 
def run(asmb_fn, proteomics_fn, mapping_fn, align_param_fn,
        comb_fn, diameter, output_comb_fn, max_combs):
    """Cluster fitting combinations and write one representative per cluster.

    For every component, the centroid of its leaves is recorded for each of
    its paths.  Each combination read from ``comb_fn`` is then embedded as
    the concatenation of the centroids it selects, the embedded points are
    bin-clustered with side ``diameter``, and the first member of every
    cluster (largest cluster first) is written to ``output_comb_fn``.

    Args:
        asmb_fn: assembly (settings) file name.
        proteomics_fn: proteomics file name.
        mapping_fn: protein-anchors mapping file name.
        align_param_fn: alignment parameter file name.
        comb_fn: combinations (paths) file name.
        diameter: bin side passed to the bin-based clustering.
        output_comb_fn: file name the cluster representatives are written to.
        max_combs: only the first ``max_combs`` combinations are clustered.

    NOTE(review): the opening lines of three statements (the assignments of
    ``mapping_data`` and ``align``, and the ``read_fitting_solutions`` call)
    were missing from the listing this was recovered from.  They are
    restored here from the IMP.multifit API; confirm against upstream.
    """
    asmb_data = IMP.multifit.read_settings(asmb_fn)
    prot_data = IMP.multifit.read_proteomics_data(proteomics_fn)
    mapping_data = IMP.multifit.read_protein_anchors_mapping(
        prot_data, mapping_fn)
    alignment_params = IMP.multifit.AlignmentParams(align_param_fn)

    # load all proteomics restraints
    align = IMP.multifit.ProteomicsEMAlignmentAtomic(
        mapping_data, asmb_data, alignment_params)
    # The returned model is not used; the call is kept for whatever setup
    # it performs inside the aligner (presumably required - verify).
    _ = align.get_model()
    mhs = align.get_molecules()
    ensb = IMP.multifit.Ensemble(asmb_data, mapping_data)
    for i, mh in enumerate(mhs):
        ensb.add_component_and_fits(
            mh,
            IMP.multifit.read_fitting_solutions(
                asmb_data.get_component_header(i).get_transformations_fn()))

    n_comps = asmb_data.get_number_of_component_headers()
    mol_path_centers = []  # save the molecule centers for each path
    # iterate over the molecules
    print("NUMBER OF COMPS:", n_comps)
    for i in range(n_comps):
        mol_centers = []  # all the centers of a specific molecule
        mh_leaves = IMP.core.get_leaves(mhs[i])
        # iterate over the paths and add the center of the path
        mh_paths = mapping_data.get_paths_for_protein(
            prot_data.get_protein_name(i))
        # Every component sits on path 0 except component i, which is
        # moved through each of its own paths in turn.
        dummy_comb = [0] * n_comps
        for j in range(len(mh_paths)):
            dummy_comb[i] = j
            ensb.load_combination(dummy_comb)
            mol_centers.append(
                IMP.core.get_centroid(IMP.core.XYZs(mh_leaves)))
            ensb.unload_combination(dummy_comb)
        mol_path_centers.append(mol_centers)
    for i, p in enumerate(mol_path_centers):
        print("number of paths for mol:", i, "is", len(p))

    # load combinations
    combs = IMP.multifit.read_paths(comb_fn)
    comb_centroids = []
    for comb in combs[:max_combs]:
        mh_c = []
        for i in range(len(mhs)):
            # Extending by the centroid appends its individual coordinates
            # (assumes iterating a Vector3D yields x, y, z - confirm).
            mh_c.extend(mol_path_centers[i][comb[i]])
        comb_centroids.append(IMP.algebra.VectorKD(mh_c))
    embed = IMP.statistics.VectorDEmbedding(comb_centroids)
    # TODO - use your RMSD clustering
    bin_cluster = IMP.statistics.create_bin_based_clustering(embed, diameter)
    n_clusters = bin_cluster.get_number_of_clusters()
    print("number of clusters:", n_clusters)

    # Largest clusters first; the sort is stable, so clusters of equal size
    # keep their original relative order.
    clusters = [bin_cluster.get_cluster(k) for k in range(n_clusters)]
    clusters.sort(key=len, reverse=True)

    cluster_reps = []
    for ind, cluster_elems in enumerate(clusters):
        print("cluster index:", ind, "with", len(cluster_elems),
              "combinations")
        # The first member of the cluster serves as its representative.
        cluster_reps.append(combs[cluster_elems[0]])
    print("============clustering============")
    print("Number of clusters found " + str(len(cluster_reps)))
    print("==================================")
    IMP.multifit.write_paths(cluster_reps, output_comb_fn)
95 
96 
if __name__ == "__main__":
    # Script entry point: parse the command line and hand each option to
    # run() by name.
    args = parse_args()
    run(
        asmb_fn=args.assembly_file,
        proteomics_fn=args.proteomics_file,
        mapping_fn=args.mapping_file,
        align_param_fn=args.param_file,
        comb_fn=args.combinations_file,
        diameter=args.diameter,
        output_comb_fn=args.cluster_file,
        max_combs=args.max,
    )
An ensemble of fitting solutions.
void write_paths(const IntsList &paths, const std::string &txt_filename)
algebra::Vector3D get_centroid(const XYZs &ps)
Get the centroid.
SettingsData * read_settings(const char *filename)
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
PartitionalClusteringWithCenter * create_bin_based_clustering(Embedding *embed, double side)
Fitting atomic structures into a cryo-electron microscopy density map.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
VectorD<-1 > VectorKD
Definition: VectorD.h:424
Simply return the coordinates of a VectorD.
Definition: embeddings.h:79