3 from __future__
import print_function
4 from IMP
import ArgumentParser
10 __doc__ =
"Cluster assembly solutions."
19 num_keys = len(list(keys.keys()))
20 for i
in range(num_keys + 1):
21 counter.append([0, i])
23 counter[e][0] = counter[e][0] + 1
24 counter = sorted(counter, reverse=
True)
26 for c, i
in counter[:-1]:
33 def __init__(self, cluster_ind, cluster_size, rmsd_calculated):
34 self.cluster_ind = cluster_ind
35 self.cluster_size = cluster_size
36 self.rmsd_calculated = rmsd_calculated
38 def set_distance_stats(self, avg, std):
39 self.distance_avg = avg
40 self.distance_std = std
42 def set_angle_stats(self, avg, std):
46 def set_rmsd_stats(self, avg, std):
50 def set_best_sampled_data(self, ind, rmsd, cc, distance, angle):
51 self.best_sampled_ind = ind
52 self.best_sampled_rmsd = rmsd
53 self.best_sampled_cc = cc
54 self.best_sampled_distance = distance
55 self.best_sampled_angle = angle
57 def set_best_scored_data(self, ind, rmsd, cc, distance, angle):
58 self.best_scored_ind = ind
59 self.best_scored_rmsd = rmsd
60 self.best_scored_cc = cc
61 self.best_scored_distance = distance
62 self.best_scored_angle = angle
68 Clusters assembly models
69 - The solutions are chosen by sorting the database according to the
71 - The models are aligned and clustered by RMSD
74 def __init__(self, asmb_fn, prot_fn, map_fn, align_fn, combs_fn):
75 self.asmb_fn = asmb_fn
76 self.prot_fn = prot_fn
78 self.align_fn = align_fn
79 self.combs_fn = combs_fn
82 self.asmb.set_was_used(
True)
84 self.alignment_params = IMP.multifit.AlignmentParams(self.align_fn)
86 self.prot_data, self.map_fn)
89 self.mapping_data, self.asmb,
90 self.alignment_params)
91 self.align.set_was_used(
True)
95 self.ensmb.set_was_used(
True)
96 self.mhs = self.align.get_molecules()
97 for i, mh
in enumerate(self.mhs):
99 self.asmb.get_component_header(i).get_transformations_fn()
100 self.ensmb.add_component_and_fits(
103 self.dmap = IMP.em.read_map(
104 self.asmb.get_assembly_header().get_dens_fn())
105 self.dmap.set_was_used(
True)
106 self.dmap.get_header().set_resolution(
108 _ = self.asmb.get_assembly_header().get_threshold()
109 self.dmap.update_voxel_size(
110 self.asmb.get_assembly_header().get_spacing())
111 self.dmap.set_origin(self.asmb.get_assembly_header().get_origin())
116 Cluster configurations for a model based on RMSD.
117 An IMP.ConfigurationSet is built using the reference frames for
118 all of the components of the assembly for each solution
119 @param max_comb_ind Maximum number of components to consider
120 @param max_rmsd Maximum RMSD tolerated when clustering
123 import scipy.cluster.hierarchy
125 self.mdl = self.align.get_model()
128 mh_res = IMP.atom.get_by_type(mh, IMP.atom.RESIDUE_TYPE)
131 self.all_ca.append(s1.get_selected_particles())
134 print(
"load configurations")
135 for combi, comb
in enumerate(self.combs[:max_comb_ind]):
136 self.ensmb.load_combination(comb)
138 for mol_ca
in self.all_ca:
143 self.coords.append(c1)
144 self.ensmb.unload_combination(comb)
146 print(
"calculate distances")
147 for i
in range(len(self.coords)):
148 for j
in range(i + 1, len(self.coords)):
149 self.distances.append(
151 list(itertools.chain.from_iterable(self.coords[i])),
152 list(itertools.chain.from_iterable(self.coords[j]))))
154 Z = fastcluster.linkage(self.distances)
155 self.cluster_inds = scipy.cluster.hierarchy.fcluster(
156 Z, max_rmsd, criterion=
'distance')
157 self.uniques = get_uniques(self.cluster_inds)
158 print(
"number of clusters", len(self.uniques))
164 self, model_coords, native_coords):
166 Computes the position error (placement distance) and the orientation
167 error (placement angle) of the coordinates in model_coords with
168 respect to the coordinates in native_coords.
169 placement distance - translation between the centroids of the
171 placement angle - Angle in the axis-angle formulation of the rotation
172 aligning the two rigid bodies.
176 translation_vector = native_centroid - model_centroid
177 distance = translation_vector.get_magnitude()
178 if (len(model_coords) != len(native_coords)):
180 "Mismatch in the number of members %d %d " % (
187 angle = P.second * 180. / math.pi
188 return distance, angle
192 bb_native = self.dmap.get_bounding_box()
193 bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps))
194 # bounding box enclosing both the particles of the native assembly
195 # and the particles of the model
196 bb_union = IMP.algebra.get_union(bb_native, bb_solution)
197 # add border of 4 voxels
198 border = 4*voxel_size
199 bottom = bb_union.get_corner(0)
200 bottom += IMP.algebra.Vector3D(-border, -border, -border)
201 top = bb_union.get_corner(1)
202 top += IMP.algebra.Vector3D(border, border, border)
203 bb_union = IMP.algebra.BoundingBox3D(bottom, top)
207 map_solution.set_particles(ps)
208 map_solution.resample()
209 map_solution.set_was_used(
True)
211 map_solution.calcRMS()
218 self.dmap, threshold)
221 def get_cluster_representative_combination(self, query_cluster_ind):
222 return self.combs[self.clusters_data[query_cluster_ind].cluster_ind]
224 def get_cluster_stats(self, query_cluster_ind):
225 return self.clusters_data[query_cluster_ind]
227 def do_analysis(self, max_comb_ind):
228 self.clusters_data = {}
229 for cluster_ind
in self.uniques:
230 self.clusters_data[cluster_ind] = self.analyze_cluster(
231 cluster_ind, max_comb_ind)
233 def analyze_cluster(self, query_cluster_ind, max_comb_ind):
237 mhs_native_ca_ps = []
239 for i
in range(len(self.mhs)):
240 if self.asmb.get_component_header(i).get_reference_fn() ==
"":
245 self.asmb.get_component_header(
251 mhs_native_ca.append([])
252 mhs_native_ca_ps.append([])
253 for p
in s1.get_selected_particles():
254 mhs_native_ca[-1].append(
IMP.core.XYZ(p).get_coordinates())
260 for i
in range(len(self.mhs)):
263 best_sampled_ind = -1
266 for elem_ind1, cluster_ind1
in enumerate(self.cluster_inds):
267 if cluster_ind1 != query_cluster_ind:
269 counter = counter + 1
273 list(itertools.chain.from_iterable(mhs_native_ca)),
274 list(itertools.chain.from_iterable(
275 self.coords[elem_ind1]))))
276 if best_scored_ind == -1:
277 self.ensmb.load_combination(self.combs[elem_ind1])
278 best_scored_ind = counter
279 best_scored_cc = self.
get_cc(
280 list(itertools.chain.from_iterable(self.all_ca)))
282 best_scored_rmsd = rmsds[-1]
283 best_sampled_ind = counter
284 best_sampled_cc = best_scored_cc
285 best_sampled_rmsd = rmsds[-1]
286 self.ensmb.unload_combination(self.combs[elem_ind1])
289 if rmsds[-1] < best_scored_rmsd:
290 self.ensmb.load_combination(self.combs[elem_ind1])
291 best_sampled_ind = counter
292 best_sampled_cc = self.
get_cc(
293 list(itertools.chain.from_iterable(self.all_ca)))
294 best_sampled_rmsd = rmsds[-1]
295 self.ensmb.unload_combination(self.combs[elem_ind1])
298 for i
in range(len(self.mhs)):
300 self.coords[elem_ind1][i],
304 distances[i].append(sum_d / len(self.mhs))
305 angles[i].append(sum_a / len(self.mhs))
306 cd = ClusterData(query_cluster_ind, counter + 1, calc_rmsd)
309 d = numpy.array(list(itertools.chain.from_iterable(distances)))
310 a = numpy.array(list(itertools.chain.from_iterable(angles)))
311 r = numpy.array(rmsds)
312 cd.set_distance_stats(d.mean(), d.std())
313 cd.set_angle_stats(a.mean(), a.std())
314 cd.set_best_scored_data(
320 cd.set_rmsd_stats(r.mean(), r.std())
321 cd.set_best_sampled_data(
328 cd.set_best_scored_data(
339 Clustering of assembly solutions.
341 This program uses the Python 'fastcluster' module, which can be obtained from
342 http://math.stanford.edu/~muellner/fastcluster.html
345 p.add_argument(
"-m",
"--max", type=int, dest=
"max", default=999999999,
346 help=
"maximum solutions to consider")
347 p.add_argument(
"-r",
"--rmsd", type=float, dest=
"rmsd", default=5,
348 help=
"maximum rmsd within a cluster")
349 p.add_argument(
"assembly_file", help=
"assembly file name")
350 p.add_argument(
"proteomics_file", help=
"proteomics file name")
351 p.add_argument(
"mapping_file", help=
"mapping file name")
352 p.add_argument(
"param_file", help=
"parameter file name")
353 p.add_argument(
"combinations_file", help=
"combinations file name")
354 p.add_argument(
"cluster_file", help=
"output clusters file name")
356 return p.parse_args()
365 args.proteomics_file,
368 args.combinations_file)
369 _ = clust_engine.do_clustering(args.max, args.rmsd)
370 print(
"clustering completed")
371 print(
"start analysis")
372 clust_engine.do_analysis(args.max)
374 for cluster_ind
in clust_engine.uniques:
376 clust_engine.get_cluster_representative_combination(cluster_ind))
379 for cluster_ind
in clust_engine.uniques:
380 info = clust_engine.get_cluster_stats(cluster_ind)
382 clust_engine.get_cluster_representative_combination(cluster_ind))
383 print(
"==========Cluster index:", info.cluster_ind,
"size:",
385 if info.rmsd_calculated:
386 print(
"best sampled in cluster (index,cc,distance,angle,rmsd):",
387 info.best_sampled_ind, info.best_sampled_cc,
388 info.best_sampled_distance, info.best_sampled_angle,
389 info.best_sampled_rmsd)
390 if info.rmsd_calculated:
391 print(
"cluster representative (index,cc,distance,angle,rmsd):",
392 info.best_scored_ind, info.best_scored_cc,
393 info.best_scored_distance, info.best_scored_angle,
394 info.best_scored_rmsd)
396 print(
"cluster representative (index,cc):", info.best_scored_ind,
400 if __name__ ==
"__main__":
An ensemble of fitting solutions.
double get_coarse_cc_coefficient(const DensityMap *grid1, const DensityMap *grid2, double grid2_voxel_data_threshold, bool allow_padding=false, FloatPair norm_factors=FloatPair(0., 0.))
Calculates the cross correlation coefficient between two maps.
Various classes to hold sets of particles.
Clusters assembly models.
void write_paths(const IntsList &paths, const std::string &txt_filename)
SettingsData * read_settings(const char *filename)
void read_pdb(TextInput input, int model, Hierarchy h)
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
Class for sampling a density map from particles.
double get_rmsd(const Selection &s0, const Selection &s1)
Fitting atomic structures into a cryo-electron microscopy density map.
std::pair< Vector3D, double > get_axis_and_angle(const Rotation3D &rot)
Decompose a Rotation3D object into a rotation around an axis.
A decorator for a particle with x,y,z coordinates.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
def get_placement_score_from_coordinates
Computes the position error (placement distance) and the orientation error (placement angle) of the c...
def do_clustering
Cluster configurations for a model based on RMSD.
IMP-specific subclass of argparse.ArgumentParser.
Vector3D get_centroid(const Vector3Ds &ps)
Return the centroid of a set of vectors.
void set_log_level(LogLevel l)
Set the current global log level.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
def get_cc
bb_native = self.dmap.get_bounding_box() bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps)) bounding box enclosing both the particles of the native assemblyand the particles of the modelbb_union = IMP.algebra.get_union(bb_native, bb_solution) add border of 4 voxelsborder = 4*voxel_size bottom = bb_union.get_corner(0) bottom += IMP.algebra.Vector3D(-border, -border, -border) top = bb_union.get_corner(1) top += IMP.algebra.Vector3D(border, border, border) bb_union = IMP.algebra.BoundingBox3D(bottom, top)
double get_resolution(Model *m, ParticleIndex pi)
Estimate the resolution of the hierarchy as used by Representation.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Select hierarchy particles identified by the biological name.