IMP  2.1.1
The Integrative Modeling Platform
cluster_coarse.py
1 #!/usr/bin/env python
2 
3 #analyse the ensemble, first we will do the rmsd stuff
4 import operator
5 import IMP.multifit
6 from IMP import OptionParser
7 
8 def parse_args():
9  usage = "usage %prog [options] <asmb.input> <proteomics.input> <mapping.input> <alignment params> <combinatins> <diameter> <output combinations>\n"
10  usage+="A script for clustering an ensemble of solutions"
11  parser = OptionParser(usage)
12  parser.add_option("-m", "--max", type="int", dest="max", default=999999999,
13  help="maximum number of combinations to consider")
14  (options, args) = parser.parse_args()
15  if len(args) != 7:
16  parser.error("incorrect number of arguments")
17  return [options,args]
18 
19 def run(asmb_fn,proteomics_fn,mapping_fn,align_param_fn,
20  comb_fn,diameter,output_comb_fn,max_combs):
21  asmb_data=IMP.multifit.read_settings(asmb_fn)
22  prot_data=IMP.multifit.read_proteomics_data(proteomics_fn)
23  mapping_data=IMP.multifit.read_protein_anchors_mapping(prot_data,mapping_fn)
24  alignment_params = IMP.multifit.AlignmentParams(align_param_fn)
25 
26  #load all proteomics restraints
27  align=IMP.multifit.ProteomicsEMAlignmentAtomic(mapping_data,asmb_data,alignment_params)
28  mdl=align.get_model()
29  mhs=align.get_molecules()
30  ensb=IMP.multifit.Ensemble(asmb_data,mapping_data)
31  for i,mh in enumerate(mhs):
32  ensb.add_component_and_fits(mh,
33  IMP.multifit.read_fitting_solutions(asmb_data.get_component_header(i).get_transformations_fn()))
34 
35  mol_path_centers=[] #save the molecule centers for each path
36  #iterate over the molecules
37  print "NUMBER OF COMPS:",asmb_data.get_number_of_component_headers()
38  for i in range(asmb_data.get_number_of_component_headers()):
39  mol_centers=[] #all the centers of a specific molecule
40  mh_leaves=IMP.core.get_leaves(mhs[i])
41  #iterate over the paths and add the center of the path
42  mh_paths=mapping_data.get_paths_for_protein(prot_data.get_protein_name(i))
43  dummy_comb=[]
44  for j in range(asmb_data.get_number_of_component_headers()):
45  dummy_comb.append(0)
46  for j in range(len(mh_paths)):
47  dummy_comb[i]=j
48  ensb.load_combination(dummy_comb)
49  #print IMP.core.XYZs(mh_leaves)
50  mol_centers.append(IMP.core.get_centroid(IMP.core.XYZs(mh_leaves)))
51  ensb.unload_combination(dummy_comb)
52  mol_path_centers.append(mol_centers)
53  for i,p in enumerate(mol_path_centers):
54  print "number of paths for mol:",i,"is",len(p)
55  #load combinations
56  combs=IMP.multifit.read_paths(comb_fn)
57  comb_centroids=[]
58  for comb in combs[:max_combs]:
59  mh_c=[]
60  for i in range(len(mhs)):
61  mh_c+=mol_path_centers[i][comb[i]]
62  comb_centroids.append(IMP.algebra.VectorKD(mh_c))
63  embed=IMP.statistics.VectorDEmbedding(comb_centroids)
64  #TODO - use your RMSD clustering
65  bin_cluster= IMP.statistics.create_bin_based_clustering(embed,diameter)
66  print "number of clusters:",bin_cluster.get_number_of_clusters()
67  cluster_stat=[]
68  for k in range(bin_cluster.get_number_of_clusters()):
69  bc= bin_cluster.get_cluster(k)
70  cluster_stat.append([len(bc),k,bc])
71  cluster_stat=sorted(cluster_stat,key=operator.itemgetter(0),reverse=True)
72  cluster_reps=[]
73  for ind,[cluster_size,cluster_ind,cluster_elems] in enumerate(cluster_stat):
74  print "cluster index:",ind,"with",cluster_size,"combinations"
75  cluster_reps.append(combs[cluster_elems[0]])
76  print "============clustering============"
77  print "Number of clusters found "+str(len(cluster_reps))
78  print "=================================="
79  IMP.multifit.write_paths(cluster_reps,output_comb_fn)
80 
81 if __name__=="__main__":
82  options,args = parse_args()
83  print options
84  run(args[0],args[1],args[2],args[3],args[4],float(args[5]),args[6],options.max)
An ensemble of fitting solutions.
void write_paths(const IntsList &paths, const std::string &txt_filename)
algebra::Vector3D get_centroid(const XYZs &ps)
Get the centroid.
SettingsData * read_settings(const char *filename)
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
A Cartesian vector in D-dimensions.
Definition: VectorD.h:48
PartitionalClusteringWithCenter * create_bin_based_clustering(Embedding *embed, double side)
See IMP.multifit for more information.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
IMP::kernel::OptionParser OptionParser
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
Simply return the coordinates of a VectorD.
Definition: embeddings.h:76