doc/html/emagefit__cluster_8py_source.html

#!/usr/bin/env python


import IMP

import IMP.em2d as em2d

import IMP.em2d.utility as utility

import IMP.em2d.imp_general.io as io

import IMP.em2d.imp_general.representation as representation

import IMP.em2d.solutions_io as solutions_io

import IMP.em2d.Database as Database


import IMP.statistics as stats

import IMP.container as container

import IMP.atom as atom

import IMP.core as core

import IMP.algebra as alg


import sys

import os

import time

import logging

log = logging.getLogger("cluster_solutions")


class AlignmentClustering:

    """

        Clusters solutions present in a database.

        - The solutions are chosen by sorting the database according to the

          parameter orderby

        - The models are aligned and clustered by RMSD

    """

    def __init__(self, exp):

        """

            @param exp an Experiment class containing the names of the pdb files

        """

        self.exp = exp


    def cluster(self, fn_database, n_solutions, orderby, max_rmsd):

        """

            @param fn_database Database of results

            @param n_solutions Number of solutions to use for clustering

            @param orderby Measure used to order solutions

            @param max_rmsd See do_clustering()

        """

        log.debug("Call to cluster()")

#        sys.exit()

        db = solutions_io.ResultsDB()

        db.connect(fn_database)

        fields = ["reference_frames","solution_id" ]

        data =  db.get_solutions( fields, n_solutions,orderby)

        db.close()

        self.solution_ids = [row[1] for row in data]

        # reference frames for each configuration

        confs_RFs = []

        for row in data:

            rs = row[0].split("/")

            RFs = [io.TextToReferenceFrame(r).get_reference_frame() for r in rs]

            confs_RFs.append(RFs)

        self.do_clustering(confs_RFs, max_rmsd)


    def do_clustering(self, confs_RFs, max_rmsd):

        """

            Cluster configurations for a model based on RMSD.

            An IMP.ConfigurationSet is built using the reference frames for

            of the components of the assembly for each solution

            @param confs_RFs A lsit containing a tuples of reference frames.

                Each tuple contains the reference frame for the rigid body

                of one component of the assembly

            @param max_rmsd Maximum RMSD tolerated when clustering

        """

        model = IMP.Model()

        assembly = representation.create_assembly(model, self.exp.fn_pdbs)

        rbs = representation.create_rigid_bodies(assembly)

        configuration_set = IMP.ConfigurationSet(model)

        for RFs in confs_RFs:

            representation.set_reference_frames(rbs, RFs)

            configuration_set.save_configuration()

        particles_container = container.ListSingletonContainer(model)

        particles_container.add_particles(atom.get_leaves(assembly))

        metric = stats.ConfigurationSetRMSDMetric(

                                configuration_set,particles_container, True)

        log.info("Clustering ... ")

        maximum_centrality = 10

        self.pclus = stats.create_centrality_clustering( metric, max_rmsd,

                                                maximum_centrality)

        n = self.pclus.get_number_of_clusters()

        log.info("Number of clusters found: %s", n)


    def store_clusters(self, fn_database, tbl="clusters"):

        """

            Store the clusters in the database.

            The database does not necessarily has to be the same database

            used to read the solutions

            @param fn_database Database where the clusters are written

            @param tbl Table of the database where the clusters are written

        """

        if not hasattr(self, "pclus"):

            raise ValueError("Clustering not performed")

        db = solutions_io.ResultsDB()

        if not os.path.exists(fn_database):

            db.create(fn_database)

        db.connect(fn_database)

        db.add_clusters_table(tbl)

        n_clusters = self.pclus.get_number_of_clusters()

        clusters_data = []

        for i in range(n_clusters):

            # IDs of the solutions *according* to the clustering algorithm

            elements = self.pclus.get_cluster(i)

            r = self.pclus.get_cluster_representative(i)

            n_elements = len(elements)

            # IDs of the solutions as stored in the database

            solution_ids = [self.solution_ids[k] for k in elements]

            elements  = "|".join( map(str, elements) )

            solution_ids  = "|".join( map(str, solution_ids) )

            db.add_cluster_record( i, n_elements, r, elements, solution_ids)

        db.store_cluster_data()

        db.close()


if __name__ == "__main__":


    parser = IMP.OptionParser(imp_module=em2d,

                              description= \

        "Clusters the best solutions contained in the database, and writes a "

        " new table in the database containing the clusters ids and members")

    parser.add_option("--exp",

                     dest="experiment",

                     default=None,

                     help="File describing an experiment ")

    parser.add_option("--db",

                    dest="fn_database",

                    help="Database of results")

    parser.add_option("--o",

                      dest="fn_output_db",

                      default="clusters.db",

                      help="Database file to store the clusters obtained. It "\

                           "can be the same one containing the solutions")

    parser.add_option("--n",

                    dest="n_solutions",

                    type=int,

                    default = 10,

                    help="Number of solutions to cluster")

    parser.add_option("--orderby",

                     dest="orderby",

                     default = None,

                     help="Sor the solutions according to this measure before "\

                            "clustering")

    parser.add_option("--log",

                     dest="log",

                     default=None,

                     help="File for logging")

    parser.add_option("--rmsd",

                     type=float,

                     dest="max_rmsd",

                     default=10,

                     help="Maximum rmsd centroids to define clusters")


    args = parser.parse_args()

    args = args[0]

    if(len(sys.argv) == 1):

        parser.print_help()

        sys.exit()

    if(args.log):

        logging.basicConfig(filename=args.log, filemode="w")

    else:

        logging.basicConfig(stream=sys.stdout)

    logging.root.setLevel(logging.DEBUG)


    if(args.fn_database):

        if(not args.n_solutions or not args.orderby):

            raise ValueError("parameters --n and --orderby required")

        exp = utility.get_experiment_params(args.experiment)

        tc = AlignmentClustering(exp)

        tc.cluster(args.fn_database, args.n_solutions, args.orderby,

                                                                args.max_rmsd)

        tc.store_clusters(args.fn_output_db, "clusters")