3 from __future__
import print_function
4 from IMP
import ArgumentParser
10 __doc__ =
"Cluster assembly solutions."
18 num_keys = len(list(keys.keys()))
19 for i
in range(num_keys + 1):
20 counter.append([0, i])
22 counter[e][0] = counter[e][0] + 1
23 counter = sorted(counter, reverse=
True)
25 for c, i
in counter[:-1]:
32 def __init__(self, cluster_ind, cluster_size, rmsd_calculated):
33 self.cluster_ind = cluster_ind
34 self.cluster_size = cluster_size
35 self.rmsd_calculated = rmsd_calculated
37 def set_distance_stats(self, avg, std):
38 self.distance_avg = avg
39 self.distance_std = std
41 def set_angle_stats(self, avg, std):
45 def set_rmsd_stats(self, avg, std):
49 def set_best_sampled_data(self, ind, rmsd, cc, distance, angle):
50 self.best_sampled_ind = ind
51 self.best_sampled_rmsd = rmsd
52 self.best_sampled_cc = cc
53 self.best_sampled_distance = distance
54 self.best_sampled_angle = angle
56 def set_best_scored_data(self, ind, rmsd, cc, distance, angle):
57 self.best_scored_ind = ind
58 self.best_scored_rmsd = rmsd
59 self.best_scored_cc = cc
60 self.best_scored_distance = distance
61 self.best_scored_angle = angle
67 Clusters assembly models
68 - The solutions are chosen by sorting the database according to the
70 - The models are aligned and clustered by RMSD
73 def __init__(self, asmb_fn, prot_fn, map_fn, align_fn, combs_fn):
74 self.asmb_fn = asmb_fn
75 self.prot_fn = prot_fn
77 self.align_fn = align_fn
78 self.combs_fn = combs_fn
81 self.asmb.set_was_used(
True)
83 self.alignment_params = IMP.multifit.AlignmentParams(self.align_fn)
85 self.prot_data, self.map_fn)
88 self.mapping_data, self.asmb,
89 self.alignment_params)
90 self.align.set_was_used(
True)
94 self.ensmb.set_was_used(
True)
95 self.mhs = self.align.get_molecules()
96 for i, mh
in enumerate(self.mhs):
97 self.ensmb.add_component_and_fits(mh,
100 self.dmap = IMP.em.read_map(
101 self.asmb.get_assembly_header().get_dens_fn())
102 self.dmap.set_was_used(
True)
103 self.dmap.get_header().set_resolution(
105 threshold = self.asmb.get_assembly_header().get_threshold()
106 self.dmap.update_voxel_size(
107 self.asmb.get_assembly_header().get_spacing())
108 self.dmap.set_origin(self.asmb.get_assembly_header().get_origin())
113 Cluster configurations for a model based on RMSD.
114 An IMP.ConfigurationSet is built using the reference frames for
115 all of the components of the assembly for each solution
116 @param max_comb_ind Maximum number of components to consider
117 @param max_rmsd Maximum RMSD tolerated when clustering
120 import scipy.cluster.hierarchy
122 self.mdl = self.align.get_model()
125 mh_res = IMP.atom.get_by_type(mh, IMP.atom.RESIDUE_TYPE)
128 self.all_ca.append(s1.get_selected_particles())
131 print(
"load configurations")
132 for combi, comb
in enumerate(self.combs[:max_comb_ind]):
133 self.ensmb.load_combination(comb)
135 for mol_ca
in self.all_ca:
140 self.coords.append(c1)
141 self.ensmb.unload_combination(comb)
143 print(
"calculate distances")
144 for i
in range(len(self.coords)):
145 for j
in range(i + 1, len(self.coords)):
146 self.distances.append(
148 list(itertools.chain.from_iterable(self.coords[i])),
149 list(itertools.chain.from_iterable(self.coords[j]))))
151 Z = fastcluster.linkage(self.distances)
152 self.cluster_inds = scipy.cluster.hierarchy.fcluster(
153 Z, max_rmsd, criterion=
'distance')
154 self.uniques = get_uniques(self.cluster_inds)
155 print(
"number of clusters", len(self.uniques))
161 self, model_coords, native_coords):
163 Computes the position error (placement distance) and the orientation error (placement angle) of the coordinates in model_coords respect to the coordinates in native_coords.
164 placement distance - translation between the centroids of the
165 coordinates placement angle - Angle in the axis-angle formulation of the rotation
166 aligning the two rigid bodies.
170 translation_vector = native_centroid - model_centroid
171 distance = translation_vector.get_magnitude()
172 if(len(model_coords) != len(native_coords)):
174 "Mismatch in the number of members %d %d " % (
181 angle = P.second * 180. / math.pi
182 return distance, angle
186 bb_native = self.dmap.get_bounding_box()
187 bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps))
188 # bounding box enclosing both the particles of the native assembly
189 # and the particles of the model
190 bb_union = IMP.algebra.get_union(bb_native, bb_solution)
191 # add border of 4 voxels
192 border = 4*voxel_size
193 bottom = bb_union.get_corner(0)
194 bottom += IMP.algebra.Vector3D(-border, -border, -border)
195 top = bb_union.get_corner(1)
196 top += IMP.algebra.Vector3D(border, border, border)
197 bb_union = IMP.algebra.BoundingBox3D(bottom, top)
201 voxel_size = self.dmap.get_spacing()
204 map_solution.set_particles(ps)
205 map_solution.resample()
206 map_solution.set_was_used(
True)
208 map_solution.calcRMS()
210 coarse_cc.set_was_used(
True)
216 ccc = coarse_cc.cross_correlation_coefficient(map_solution,
217 self.dmap, threshold)
220 def get_cluster_representative_combination(self, query_cluster_ind):
221 return self.combs[self.clusters_data[query_cluster_ind].cluster_ind]
223 def get_cluster_stats(self, query_cluster_ind):
224 return self.clusters_data[query_cluster_ind]
226 def do_analysis(self, max_comb_ind):
227 self.clusters_data = {}
228 for cluster_ind
in self.uniques:
229 self.clusters_data[cluster_ind] = self.analyze_cluster(
230 cluster_ind, max_comb_ind)
232 def analyze_cluster(self, query_cluster_ind, max_comb_ind):
236 mhs_native_ca_ps = []
238 for i
in range(len(self.mhs)):
239 if self.asmb.get_component_header(i).get_reference_fn() ==
"":
244 self.asmb.get_component_header(
250 mhs_native_ca.append([])
251 mhs_native_ca_ps.append([])
252 for p
in s1.get_selected_particles():
253 mhs_native_ca[-1].append(
IMP.core.XYZ(p).get_coordinates())
259 for i
in range(len(self.mhs)):
262 best_sampled_ind = -1
267 for elem_ind1, cluster_ind1
in enumerate(self.cluster_inds):
268 if cluster_ind1 != query_cluster_ind:
270 counter = counter + 1
274 list(itertools.chain.from_iterable(mhs_native_ca)),
275 list(itertools.chain.from_iterable(self.coords[elem_ind1]))))
276 if best_scored_ind == -1:
277 self.ensmb.load_combination(self.combs[elem_ind1])
278 best_scored_ind = counter
279 best_scored_cc = self.
get_cc(
280 list(itertools.chain.from_iterable(self.all_ca)))
282 best_scored_rmsd = rmsds[-1]
283 best_sampled_ind = counter
284 best_sampled_cc = best_scored_cc
285 best_sampled_rmsd = rmsds[-1]
286 self.ensmb.unload_combination(self.combs[elem_ind1])
289 if rmsds[-1] < best_scored_rmsd:
290 self.ensmb.load_combination(self.combs[elem_ind1])
291 best_sampled_ind = counter
292 best_sampled_cc = self.
get_cc(
293 list(itertools.chain.from_iterable(self.all_ca)))
294 best_sampled_rmsd = rmsds[-1]
295 self.ensmb.unload_combination(self.combs[elem_ind1])
298 for i
in range(len(self.mhs)):
300 self.coords[elem_ind1][i],
304 distances[i].append(sum_d / len(self.mhs))
305 angles[i].append(sum_a / len(self.mhs))
306 cd = ClusterData(query_cluster_ind, counter + 1, calc_rmsd)
309 d = numpy.array(list(itertools.chain.from_iterable(distances)))
310 a = numpy.array(list(itertools.chain.from_iterable(angles)))
311 r = numpy.array(rmsds)
312 cd.set_distance_stats(d.mean(), d.std())
313 cd.set_angle_stats(a.mean(), a.std())
314 cd.set_best_scored_data(
320 cd.set_rmsd_stats(r.mean(), r.std())
321 cd.set_best_sampled_data(
328 cd.set_best_scored_data(
339 Clustering of assembly solutions.
341 This program uses the Python 'fastcluster' module, which can be obtained from
342 http://math.stanford.edu/~muellner/fastcluster.html
344 p = ArgumentParser(description=desc)
345 p.add_argument(
"-m",
"--max", type=int, dest=
"max", default=999999999,
346 help=
"maximum solutions to consider")
347 p.add_argument(
"-r",
"--rmsd", type=float, dest=
"rmsd", default=5,
348 help=
"maximum rmsd within a cluster")
349 p.add_argument(
"assembly_file", help=
"assembly file name")
350 p.add_argument(
"proteomics_file", help=
"proteomics file name")
351 p.add_argument(
"mapping_file", help=
"mapping file name")
352 p.add_argument(
"param_file", help=
"parameter file name")
353 p.add_argument(
"combinations_file", help=
"combinations file name")
354 p.add_argument(
"cluster_file", help=
"output clusters file name")
356 return p.parse_args()
365 args.proteomics_file,
368 args.combinations_file)
369 clusters = clust_engine.do_clustering(args.max, args.rmsd)
370 cluster_representatives = []
371 print(
"clustering completed")
372 print(
"start analysis")
373 clust_engine.do_analysis(args.max)
375 for cluster_ind
in clust_engine.uniques:
377 clust_engine.get_cluster_representative_combination(cluster_ind))
380 for cluster_ind
in clust_engine.uniques:
381 info = clust_engine.get_cluster_stats(cluster_ind)
383 clust_engine.get_cluster_representative_combination(cluster_ind))
384 print(
"==========Cluster index:", info.cluster_ind,
"size:", info.cluster_size)
385 if info.rmsd_calculated:
386 print(
"best sampled in cluster (index,cc,distance,angle,rmsd):", info.best_sampled_ind, info.best_sampled_cc, info.best_sampled_distance, info.best_sampled_angle, info.best_sampled_rmsd)
387 if info.rmsd_calculated:
388 print(
"cluster representative (index,cc,distance,angle,rmsd):", info.best_scored_ind, info.best_scored_cc, info.best_scored_distance, info.best_scored_angle, info.best_scored_rmsd)
390 print(
"cluster representative (index,cc):", info.best_scored_ind, info.best_scored_cc)
393 if __name__ ==
"__main__":
An ensemble of fitting solutions.
Various classes to hold sets of particles.
Clusters assembly models.
void write_paths(const IntsList &paths, const std::string &txt_filename)
SettingsData * read_settings(const char *filename)
void read_pdb(TextInput input, int model, Hierarchy h)
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Responsible for performing coarse fitting between two density objects.
Align proteomics graph to EM density map.
Class for sampling a density map from particles.
double get_rmsd(const Selection &s0, const Selection &s1)
Fitting atomic structures into a cryo-electron microscopy density map.
std::pair< Vector3D, double > get_axis_and_angle(const Rotation3D &rot)
Decompose a Rotation3D object into a rotation around an axis.
A decorator for a particle with x,y,z coordinates.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
def get_placement_score_from_coordinates
Computes the position error (placement distance) and the orientation error (placement angle) of the c...
def do_clustering
Cluster configurations for a model based on RMSD.
Vector3D get_centroid(const Vector3Ds &ps)
Return the centroid of a set of vectors.
void set_log_level(LogLevel l)
Set the current global log level.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
def get_cc
bb_native = self.dmap.get_bounding_box() bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps)) bounding box enclosing both the particles of the native assemblyand the particles of the modelbb_union = IMP.algebra.get_union(bb_native, bb_solution) add border of 4 voxelsborder = 4*voxel_size bottom = bb_union.get_corner(0) bottom += IMP.algebra.Vector3D(-border, -border, -border) top = bb_union.get_corner(1) top += IMP.algebra.Vector3D(border, border, border) bb_union = IMP.algebra.BoundingBox3D(bottom, top)
double get_resolution(Model *m, ParticleIndex pi)
Estimate the resolution of the hierarchy as used by Representation.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Select hierarchy particles identified by the biological name.