3 from IMP
import ArgumentParser
9 __doc__ =
"Cluster assembly solutions."
18 num_keys = len(list(keys.keys()))
19 for i
in range(num_keys + 1):
20 counter.append([0, i])
22 counter[e][0] = counter[e][0] + 1
23 counter = sorted(counter, reverse=
True)
25 for c, i
in counter[:-1]:
32 def __init__(self, cluster_ind, cluster_size, rmsd_calculated):
33 self.cluster_ind = cluster_ind
34 self.cluster_size = cluster_size
35 self.rmsd_calculated = rmsd_calculated
37 def set_distance_stats(self, avg, std):
38 self.distance_avg = avg
39 self.distance_std = std
41 def set_angle_stats(self, avg, std):
45 def set_rmsd_stats(self, avg, std):
49 def set_best_sampled_data(self, ind, rmsd, cc, distance, angle):
50 self.best_sampled_ind = ind
51 self.best_sampled_rmsd = rmsd
52 self.best_sampled_cc = cc
53 self.best_sampled_distance = distance
54 self.best_sampled_angle = angle
56 def set_best_scored_data(self, ind, rmsd, cc, distance, angle):
57 self.best_scored_ind = ind
58 self.best_scored_rmsd = rmsd
59 self.best_scored_cc = cc
60 self.best_scored_distance = distance
61 self.best_scored_angle = angle
67 Clusters assembly models
68 - The solutions are chosen by sorting the database according to the
70 - The models are aligned and clustered by RMSD
73 def __init__(self, asmb_fn, prot_fn, map_fn, align_fn, combs_fn):
74 self.asmb_fn = asmb_fn
75 self.prot_fn = prot_fn
77 self.align_fn = align_fn
78 self.combs_fn = combs_fn
81 self.asmb.set_was_used(
True)
83 self.alignment_params = IMP.multifit.AlignmentParams(self.align_fn)
85 self.prot_data, self.map_fn)
88 self.mapping_data, self.asmb,
89 self.alignment_params)
90 self.align.set_was_used(
True)
94 self.ensmb.set_was_used(
True)
95 self.mhs = self.align.get_molecules()
96 for i, mh
in enumerate(self.mhs):
98 self.asmb.get_component_header(i).get_transformations_fn()
99 self.ensmb.add_component_and_fits(
102 self.dmap = IMP.em.read_map(
103 self.asmb.get_assembly_header().get_dens_fn())
104 self.dmap.set_was_used(
True)
105 self.dmap.get_header().set_resolution(
107 _ = self.asmb.get_assembly_header().get_threshold()
108 self.dmap.update_voxel_size(
109 self.asmb.get_assembly_header().get_spacing())
110 self.dmap.set_origin(self.asmb.get_assembly_header().get_origin())
115 Cluster configurations for a model based on RMSD.
116 An IMP.ConfigurationSet is built using the reference frames for
117 all of the components of the assembly for each solution
118 @param max_comb_ind Maximum number of components to consider
119 @param max_rmsd Maximum RMSD tolerated when clustering
122 import scipy.cluster.hierarchy
124 self.mdl = self.align.get_model()
127 mh_res = IMP.atom.get_by_type(mh, IMP.atom.RESIDUE_TYPE)
130 self.all_ca.append(s1.get_selected_particles())
133 print(
"load configurations")
134 for combi, comb
in enumerate(self.combs[:max_comb_ind]):
135 self.ensmb.load_combination(comb)
137 for mol_ca
in self.all_ca:
142 self.coords.append(c1)
143 self.ensmb.unload_combination(comb)
145 print(
"calculate distances")
146 for i
in range(len(self.coords)):
147 for j
in range(i + 1, len(self.coords)):
148 self.distances.append(
150 list(itertools.chain.from_iterable(self.coords[i])),
151 list(itertools.chain.from_iterable(self.coords[j]))))
153 Z = fastcluster.linkage(self.distances)
154 self.cluster_inds = scipy.cluster.hierarchy.fcluster(
155 Z, max_rmsd, criterion=
'distance')
156 self.uniques = get_uniques(self.cluster_inds)
157 print(
"number of clusters", len(self.uniques))
163 self, model_coords, native_coords):
165 Computes the position error (placement distance) and the orientation
166 error (placement angle) of the coordinates in model_coords with
167 respect to the coordinates in native_coords.
168 placement distance - translation between the centroids of the
170 placement angle - Angle in the axis-angle formulation of the rotation
171 aligning the two rigid bodies.
175 translation_vector = native_centroid - model_centroid
176 distance = translation_vector.get_magnitude()
177 if (len(model_coords) != len(native_coords)):
179 "Mismatch in the number of members %d %d " % (
186 angle = P.second * 180. / math.pi
187 return distance, angle
191 bb_native = self.dmap.get_bounding_box()
192 bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps))
193 # bounding box enclosing both the particles of the native assembly
194 # and the particles of the model
195 bb_union = IMP.algebra.get_union(bb_native, bb_solution)
196 # add border of 4 voxels
197 border = 4*voxel_size
198 bottom = bb_union.get_corner(0)
199 bottom += IMP.algebra.Vector3D(-border, -border, -border)
200 top = bb_union.get_corner(1)
201 top += IMP.algebra.Vector3D(border, border, border)
202 bb_union = IMP.algebra.BoundingBox3D(bottom, top)
206 map_solution.set_particles(ps)
207 map_solution.resample()
208 map_solution.set_was_used(
True)
210 map_solution.calcRMS()
217 self.dmap, threshold)
220 def get_cluster_representative_combination(self, query_cluster_ind):
221 return self.combs[self.clusters_data[query_cluster_ind].cluster_ind]
223 def get_cluster_stats(self, query_cluster_ind):
224 return self.clusters_data[query_cluster_ind]
226 def do_analysis(self, max_comb_ind):
227 self.clusters_data = {}
228 for cluster_ind
in self.uniques:
229 self.clusters_data[cluster_ind] = self.analyze_cluster(
230 cluster_ind, max_comb_ind)
232 def analyze_cluster(self, query_cluster_ind, max_comb_ind):
236 mhs_native_ca_ps = []
238 for i
in range(len(self.mhs)):
239 if self.asmb.get_component_header(i).get_reference_fn() ==
"":
244 self.asmb.get_component_header(
250 mhs_native_ca.append([])
251 mhs_native_ca_ps.append([])
252 for p
in s1.get_selected_particles():
253 mhs_native_ca[-1].append(
IMP.core.XYZ(p).get_coordinates())
259 for i
in range(len(self.mhs)):
262 best_sampled_ind = -1
265 for elem_ind1, cluster_ind1
in enumerate(self.cluster_inds):
266 if cluster_ind1 != query_cluster_ind:
268 counter = counter + 1
272 list(itertools.chain.from_iterable(mhs_native_ca)),
273 list(itertools.chain.from_iterable(
274 self.coords[elem_ind1]))))
275 if best_scored_ind == -1:
276 self.ensmb.load_combination(self.combs[elem_ind1])
277 best_scored_ind = counter
278 best_scored_cc = self.
get_cc(
279 list(itertools.chain.from_iterable(self.all_ca)))
281 best_scored_rmsd = rmsds[-1]
282 best_sampled_ind = counter
283 best_sampled_cc = best_scored_cc
284 best_sampled_rmsd = rmsds[-1]
285 self.ensmb.unload_combination(self.combs[elem_ind1])
288 if rmsds[-1] < best_scored_rmsd:
289 self.ensmb.load_combination(self.combs[elem_ind1])
290 best_sampled_ind = counter
291 best_sampled_cc = self.
get_cc(
292 list(itertools.chain.from_iterable(self.all_ca)))
293 best_sampled_rmsd = rmsds[-1]
294 self.ensmb.unload_combination(self.combs[elem_ind1])
297 for i
in range(len(self.mhs)):
299 self.coords[elem_ind1][i],
303 distances[i].append(sum_d / len(self.mhs))
304 angles[i].append(sum_a / len(self.mhs))
305 cd = ClusterData(query_cluster_ind, counter + 1, calc_rmsd)
308 d = numpy.array(list(itertools.chain.from_iterable(distances)))
309 a = numpy.array(list(itertools.chain.from_iterable(angles)))
310 r = numpy.array(rmsds)
311 cd.set_distance_stats(d.mean(), d.std())
312 cd.set_angle_stats(a.mean(), a.std())
313 cd.set_best_scored_data(
319 cd.set_rmsd_stats(r.mean(), r.std())
320 cd.set_best_sampled_data(
327 cd.set_best_scored_data(
338 Clustering of assembly solutions.
340 This program uses the Python 'fastcluster' module, which can be obtained from
341 http://math.stanford.edu/~muellner/fastcluster.html
344 p.add_argument(
"-m",
"--max", type=int, dest=
"max", default=999999999,
345 help=
"maximum solutions to consider")
346 p.add_argument(
"-r",
"--rmsd", type=float, dest=
"rmsd", default=5,
347 help=
"maximum rmsd within a cluster")
348 p.add_argument(
"assembly_file", help=
"assembly file name")
349 p.add_argument(
"proteomics_file", help=
"proteomics file name")
350 p.add_argument(
"mapping_file", help=
"mapping file name")
351 p.add_argument(
"param_file", help=
"parameter file name")
352 p.add_argument(
"combinations_file", help=
"combinations file name")
353 p.add_argument(
"cluster_file", help=
"output clusters file name")
355 return p.parse_args()
364 args.proteomics_file,
367 args.combinations_file)
368 _ = clust_engine.do_clustering(args.max, args.rmsd)
369 print(
"clustering completed")
370 print(
"start analysis")
371 clust_engine.do_analysis(args.max)
373 for cluster_ind
in clust_engine.uniques:
375 clust_engine.get_cluster_representative_combination(cluster_ind))
378 for cluster_ind
in clust_engine.uniques:
379 info = clust_engine.get_cluster_stats(cluster_ind)
381 clust_engine.get_cluster_representative_combination(cluster_ind))
382 print(
"==========Cluster index:", info.cluster_ind,
"size:",
384 if info.rmsd_calculated:
385 print(
"best sampled in cluster (index,cc,distance,angle,rmsd):",
386 info.best_sampled_ind, info.best_sampled_cc,
387 info.best_sampled_distance, info.best_sampled_angle,
388 info.best_sampled_rmsd)
389 if info.rmsd_calculated:
390 print(
"cluster representative (index,cc,distance,angle,rmsd):",
391 info.best_scored_ind, info.best_scored_cc,
392 info.best_scored_distance, info.best_scored_angle,
393 info.best_scored_rmsd)
395 print(
"cluster representative (index,cc):", info.best_scored_ind,
399 if __name__ ==
"__main__":
An ensemble of fitting solutions.
double get_coarse_cc_coefficient(const DensityMap *grid1, const DensityMap *grid2, double grid2_voxel_data_threshold, bool allow_padding=false, FloatPair norm_factors=FloatPair(0., 0.))
Calculates the cross correlation coefficient between two maps.
Various classes to hold sets of particles.
Clusters assembly models.
void write_paths(const IntsList &paths, const std::string &txt_filename)
SettingsData * read_settings(const char *filename)
void read_pdb(TextInput input, int model, Hierarchy h)
ProteinsAnchorsSamplingSpace read_protein_anchors_mapping(multifit::ProteomicsData *prots, const std::string &anchors_prot_map_fn, int max_paths=INT_MAX)
Align proteomics graph to EM density map.
Class for sampling a density map from particles.
double get_rmsd(const Selection &s0, const Selection &s1)
Fitting atomic structures into a cryo-electron microscopy density map.
std::pair< Vector3D, double > get_axis_and_angle(const Rotation3D &rot)
Decompose a Rotation3D object into a rotation around an axis.
A decorator for a particle with x,y,z coordinates.
ProteomicsData * read_proteomics_data(const char *proteomics_fn)
Proteomics reader.
def get_placement_score_from_coordinates
Computes the position error (placement distance) and the orientation error (placement angle) of the c...
def do_clustering
Cluster configurations for a model based on RMSD.
IMP-specific subclass of argparse.ArgumentParser.
Vector3D get_centroid(const Vector3Ds &ps)
Return the centroid of a set of vectors.
void set_log_level(LogLevel l)
Set the current global log level.
IntsList read_paths(const char *txt_filename, int max_paths=INT_MAX)
Read paths.
FittingSolutionRecords read_fitting_solutions(const char *fitting_fn)
Fitting solutions reader.
def get_cc
bb_native = self.dmap.get_bounding_box() bb_solution = IMP.core.get_bounding_box(IMP.core.XYZs(ps)) bounding box enclosing both the particles of the native assemblyand the particles of the modelbb_union = IMP.algebra.get_union(bb_native, bb_solution) add border of 4 voxelsborder = 4*voxel_size bottom = bb_union.get_corner(0) bottom += IMP.algebra.Vector3D(-border, -border, -border) top = bb_union.get_corner(1) top += IMP.algebra.Vector3D(border, border, border) bb_union = IMP.algebra.BoundingBox3D(bottom, top)
double get_resolution(Model *m, ParticleIndex pi)
Estimate the resolution of the hierarchy as used by Representation.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Select hierarchy particles identified by the biological name.