doc/html/macros_8py_source.html

 """@namespace IMP.pmi.macros

 Protocols for sampling structures and analyzing them.

 """


 import IMP.pmi.representation

 import IMP.pmi.tools

 import IMP.pmi.samplers

 import IMP.pmi.output

 import IMP.pmi.analysis

 import IMP.pmi.io.input

 import IMP.rmf

 import RMF

 import os

 import glob

 from operator import itemgetter

 from collections import defaultdict

 import numpy as np


 class ReplicaExchange0(object):
     """A macro to help set up and run replica exchange.

     Supports Monte Carlo and molecular dynamics sampling. Produces
     trajectory RMF files, best-scoring PDB structures and output stat files.
     """

     def __init__(self, model,
                  representation=None,
                  root_hier=None,
                  sample_objects=None, # DEPRECATED
                  monte_carlo_sample_objects=None,
                  molecular_dynamics_sample_objects=None,
                  output_objects=None,
                  crosslink_restraints=None,
                  monte_carlo_temperature=1.0,
                  replica_exchange_minimum_temperature=1.0,
                  replica_exchange_maximum_temperature=2.5,
                  num_sample_rounds=1,
                  number_of_best_scoring_models=500,
                  monte_carlo_steps=10,
                  molecular_dynamics_steps=10,
                  number_of_frames=1000,
                  nframes_write_coordinates=1,
                  write_initial_rmf=True,
                  initial_rmf_name_suffix="initial",
                  stat_file_name_suffix="stat",
                  best_pdb_name_suffix="model",
                  do_clean_first=True,
                  do_create_directories=True,
                  global_output_directory="./",
                  rmf_dir="rmfs/",
                  best_pdb_dir="pdbs/",
                  replica_stat_file_suffix="stat_replica",
                  em_object_for_rmf=None,
                  atomistic=False,
                  replica_exchange_object=None):
         """
         Setup replica exchange.

         @param model                    The IMP model
         @param representation           PMI.Representation() (or list of them, for multi-state modeling)
         @param root_hier                Instead of passing Representation, just pass a hierarchy
         @param sample_objects           DEPRECATED; appended to monte_carlo_sample_objects
         @param monte_carlo_sample_objects Objects for MC sampling
         @param molecular_dynamics_sample_objects Objects for MD sampling
         @param output_objects           Objects with get_output() for packing into stat files
                                         (None is treated as an empty list)
         @param crosslink_restraints     Harmonic restraints to go in output RMF files
         @param monte_carlo_temperature  MC temp (also handed to the MD sampler)
         @param replica_exchange_minimum_temperature Low temp for REX
         @param replica_exchange_maximum_temperature High temp for REX
         @param num_sample_rounds        Number of rounds of MC/MD per cycle
         @param number_of_best_scoring_models Number of top-scoring PDB models to keep around
         @param monte_carlo_steps        Number of MC steps per round
         @param molecular_dynamics_steps  Number of MD steps per round
         @param number_of_frames         Number of REX frames to run
         @param nframes_write_coordinates How often to write the coordinates of a frame
         @param write_initial_rmf        Write the initial configuration
                                         (stored only; execute_macro() always writes it)
         @param initial_rmf_name_suffix  File name stem of the initial RMF file
         @param stat_file_name_suffix    File name stem of the low temperature stat file
         @param best_pdb_name_suffix     File name stem of the best scoring PDB files
         @param do_clean_first           Stored only; execute_macro() takes no action on it
         @param do_create_directories    Create the output directories before running
         @param global_output_directory  Directory under which all output is written
         @param rmf_dir                  Sub-directory for the production RMF files
         @param best_pdb_dir             Sub-directory for the best scoring PDB files
         @param replica_stat_file_suffix File name stem of the per-replica stat file
         @param em_object_for_rmf        Object providing get_density_as_hierarchy(); its
                                         hierarchy is added to the RMF output
         @param atomistic                Passed on to IMP.pmi.output.Output
         @param replica_exchange_object  Passed on to IMP.pmi.samplers.ReplicaExchange

         Raises ValueError if neither representation nor root_hier is given.
         """

         self.model = model
         self.vars = {}

         # Work out whether one state or several states are being modeled.
         if representation:
             if isinstance(representation, list):
                 self.is_multi_state = True
                 self.root_hiers = [r.prot for r in representation]
                 self.vars["number_of_states"] = len(representation)
             else:
                 self.is_multi_state = False
                 self.root_hier = representation.prot
                 self.vars["number_of_states"] = 1
         elif root_hier:
             states = IMP.atom.get_by_type(root_hier, IMP.atom.STATE_TYPE)
             self.vars["number_of_states"] = len(states)
             if len(states) > 1:
                 self.root_hiers = states
                 self.is_multi_state = True
             else:
                 self.root_hier = root_hier
                 self.is_multi_state = False
         else:
             # Without a hierarchy the object would be unusable, so fail
             # loudly instead of returning a half-initialised instance.
             raise ValueError("Must provide representation or root_hier")

         self.crosslink_restraints = crosslink_restraints
         self.em_object_for_rmf = em_object_for_rmf

         if sample_objects is not None:
             # Deprecated alias: merge into the MC objects. A new list is
             # built so that a missing MC list is tolerated and the caller's
             # list is left untouched.
             monte_carlo_sample_objects = (
                 list(monte_carlo_sample_objects or []) + list(sample_objects))
         self.monte_carlo_sample_objects = monte_carlo_sample_objects
         self.molecular_dynamics_sample_objects = molecular_dynamics_sample_objects

         # execute_macro() appends samplers to this list, so it must exist.
         self.output_objects = [] if output_objects is None else output_objects
         self.replica_exchange_object = replica_exchange_object

         self.vars.update({
             "monte_carlo_temperature": monte_carlo_temperature,
             "replica_exchange_minimum_temperature":
                 replica_exchange_minimum_temperature,
             "replica_exchange_maximum_temperature":
                 replica_exchange_maximum_temperature,
             "num_sample_rounds": num_sample_rounds,
             "number_of_best_scoring_models": number_of_best_scoring_models,
             "monte_carlo_steps": monte_carlo_steps,
             "molecular_dynamics_steps": molecular_dynamics_steps,
             "number_of_frames": number_of_frames,
             "nframes_write_coordinates": nframes_write_coordinates,
             "write_initial_rmf": write_initial_rmf,
             "initial_rmf_name_suffix": initial_rmf_name_suffix,
             "best_pdb_name_suffix": best_pdb_name_suffix,
             "stat_file_name_suffix": stat_file_name_suffix,
             "do_clean_first": do_clean_first,
             "do_create_directories": do_create_directories,
             "global_output_directory": global_output_directory,
             "rmf_dir": rmf_dir,
             "best_pdb_dir": best_pdb_dir,
             "atomistic": atomistic,
             "replica_stat_file_suffix": replica_stat_file_suffix,
         })

     def show_info(self):
         """Print a short description of the outputs and all stored variables."""
         print("ReplicaExchange0: it generates initial.*.rmf3, stat.*.out, rmfs/*.rmf3 for each replica ")
         print("--- it stores the best scoring pdb models in pdbs/")
         print("--- the stat.*.out and rmfs/*.rmf3 are saved only at the lowest temperature")
         print("--- variables:")
         for key in sorted(self.vars):
             # Single formatted string so the output is the same whether
             # print is a statement or a function.
             print("------ %s %s" % (key.ljust(30), self.vars[key]))

     def get_replica_exchange_object(self):
         """Return the replica exchange object (set by execute_macro() if not given)."""
         return self.replica_exchange_object

     @staticmethod
     def _make_directory(path):
         """Create `path` (with parents), tolerating that it already exists."""
         try:
             os.makedirs(path)
         except OSError:
             # Another replica may have created it first; only re-raise when
             # the directory is really missing (e.g. permission problems).
             if not os.path.isdir(path):
                 raise

     def execute_macro(self):
         """Set up the samplers and output files, then run all frames.

         For each frame the configured MC and/or MD samplers are run
         num_sample_rounds times, the model is scored, and a temperature
         swap is attempted. The low temperature stat file, the production
         RMF file and the best scoring PDBs are only updated while this
         replica holds the lowest temperature; the replica stat file is
         written every frame.
         """

         # Temperatures are compared as scaled integers (see below).
         temp_index_factor = 100000.0
         samplers = []
         sampler_mc = None
         sampler_md = None

         if self.monte_carlo_sample_objects is not None:
             print("Setting up MonteCarlo")
             sampler_mc = IMP.pmi.samplers.MonteCarlo(
                 self.model,
                 self.monte_carlo_sample_objects,
                 self.vars["monte_carlo_temperature"])
             self.output_objects.append(sampler_mc)
             samplers.append(sampler_mc)

         if self.molecular_dynamics_sample_objects is not None:
             print("Setting up MolecularDynamics")
             sampler_md = IMP.pmi.samplers.MolecularDynamics(
                 self.model,
                 self.molecular_dynamics_sample_objects,
                 self.vars["monte_carlo_temperature"])
             self.output_objects.append(sampler_md)
             samplers.append(sampler_md)

 # -------------------------------------------------------------------------

         print("Setting up ReplicaExchange")
         rex = IMP.pmi.samplers.ReplicaExchange(
             self.model,
             self.vars["replica_exchange_minimum_temperature"],
             self.vars["replica_exchange_maximum_temperature"],
             samplers,
             replica_exchange_object=self.replica_exchange_object)
         self.replica_exchange_object = rex.rem

         myindex = rex.get_my_index()
         self.output_objects.append(rex)

         # must reset the minimum temperature due to the
         # different binary length of rem.get_my_parameter double and python
         # float
         min_temp_index = int(min(rex.get_temperatures()) * temp_index_factor)

 # -------------------------------------------------------------------------

         globaldir = self.vars["global_output_directory"] + "/"
         rmf_dir = globaldir + self.vars["rmf_dir"]
         pdb_dir = globaldir + self.vars["best_pdb_dir"]

         if self.vars["do_clean_first"]:
             # Cleaning is not implemented; the flag is accepted but ignored.
             pass

         if self.vars["do_create_directories"]:
             self._make_directory(globaldir)
             self._make_directory(rmf_dir)
             if not self.is_multi_state:
                 self._make_directory(pdb_dir)
             else:
                 # one PDB sub-directory per state
                 for n in range(self.vars["number_of_states"]):
                     self._make_directory(pdb_dir + "/" + str(n))

 # -------------------------------------------------------------------------

         sw = IMP.pmi.tools.Stopwatch()
         self.output_objects.append(sw)

         print("Setting up stat file")
         output = IMP.pmi.output.Output(atomistic=self.vars["atomistic"])
         low_temp_stat_file = globaldir + \
             self.vars["stat_file_name_suffix"] + "." + str(myindex) + ".out"
         output.init_stat2(low_temp_stat_file,
                           self.output_objects,
                           extralabels=["rmf_file", "rmf_frame_index"])

         print("Setting up replica stat file")
         replica_stat_file = globaldir + \
             self.vars["replica_stat_file_suffix"] + "." + str(myindex) + ".out"
         output.init_stat2(replica_stat_file, [rex], extralabels=["score"])

         print("Setting up best pdb files")
         if not self.is_multi_state:
             output.init_pdb_best_scoring(
                 pdb_dir + "/" + self.vars["best_pdb_name_suffix"],
                 self.root_hier,
                 self.vars["number_of_best_scoring_models"],
                 replica_exchange=True)
         else:
             for n in range(self.vars["number_of_states"]):
                 output.init_pdb_best_scoring(
                     pdb_dir + "/" + str(n) + "/" +
                     self.vars["best_pdb_name_suffix"],
                     self.root_hiers[n],
                     self.vars["number_of_best_scoring_models"],
                     replica_exchange=True)

 # ---------------------------------------------

         # Collect the hierarchies written to the RMF files. A fresh list is
         # always built so that adding the EM density never modifies
         # self.root_hiers.
         if not self.is_multi_state:
             output_hierarchies = [self.root_hier]
         else:
             output_hierarchies = list(self.root_hiers)
         if self.em_object_for_rmf is not None:
             output_hierarchies.append(
                 self.em_object_for_rmf.get_density_as_hierarchy())

 #----------------------------------------------
         print("Setting up and writing initial rmf coordinate file")
         init_suffix = globaldir + self.vars["initial_rmf_name_suffix"]
         init_rmf_name = init_suffix + "." + str(myindex) + ".rmf3"
         output.init_rmf(init_rmf_name, output_hierarchies)
         if self.crosslink_restraints:
             output.add_restraints_to_rmf(
                 init_rmf_name,
                 self.crosslink_restraints)
         output.write_rmf(init_rmf_name)
         output.close_rmf(init_rmf_name)

 #----------------------------------------------

         print("Setting up production rmf files")

         rmfname = rmf_dir + "/" + str(myindex) + ".rmf3"
         output.init_rmf(rmfname, output_hierarchies)

         if self.crosslink_restraints:
             output.add_restraints_to_rmf(rmfname, self.crosslink_restraints)

         ntimes_at_low_temp = 0

         if myindex == 0:
             self.show_info()

         for i in range(self.vars["number_of_frames"]):
             for _ in range(self.vars["num_sample_rounds"]):
                 if sampler_mc is not None:
                     sampler_mc.optimize(self.vars["monte_carlo_steps"])
                 if sampler_md is not None:
                     sampler_md.optimize(self.vars["molecular_dynamics_steps"])
             score = self.model.evaluate(False)
             output.set_output_entry("score", score)

             my_temp_index = int(rex.get_my_temp() * temp_index_factor)

             # Only the replica currently at the lowest temperature records
             # coordinates and the full stat line.
             if min_temp_index == my_temp_index:
                 print("--- frame %s score %s " % (str(i), str(score)))

                 if i % self.vars["nframes_write_coordinates"] == 0:
                     print('--- writing coordinates')
                     output.write_pdb_best_scoring(score)
                     output.write_rmf(rmfname)
                     output.set_output_entry("rmf_file", rmfname)
                     output.set_output_entry("rmf_frame_index", ntimes_at_low_temp)
                 else:
                     # no coordinates saved for this frame
                     output.set_output_entry("rmf_file", rmfname)
                     output.set_output_entry("rmf_frame_index", '-1')
                 output.write_stat2(low_temp_stat_file)
                 ntimes_at_low_temp += 1

             output.write_stat2(replica_stat_file)
             rex.swap_temp(i, score)


 # -----------------------------------------------------------------------


 def BuildModel0(
     m,
     data,
     resolutions=None,
     missing_bead_size=20,
     residue_per_gaussian=None):
     '''
     Construct one component for each subunit (no splitting, nothing fancy).

     You can pass the resolutions and the bead size for the missing residue regions.
     To use this macro, you must provide the following data structure:

     Component  pdbfile    chainid  rgb color     fastafile     sequence id
                                                                       in fastafile

 data = [("Rpb1",     pdbfile,   "A",     0.00000000,  (fastafile,    0)),
       ("Rpb2",     pdbfile,   "B",     0.09090909,  (fastafile,    1)),
       ("Rpb3",     pdbfile,   "C",     0.18181818,  (fastafile,    2)),
       ("Rpb4",     pdbfile,   "D",     0.27272727,  (fastafile,    3)),
       ("Rpb5",     pdbfile,   "E",     0.36363636,  (fastafile,    4)),
       ("Rpb6",     pdbfile,   "F",     0.45454545,  (fastafile,    5)),
       ("Rpb7",     pdbfile,   "G",     0.54545455,  (fastafile,    6)),
       ("Rpb8",     pdbfile,   "H",     0.63636364,  (fastafile,    7)),
       ("Rpb9",     pdbfile,   "I",     0.72727273,  (fastafile,    8)),
       ("Rpb10",    pdbfile,   "L",     0.81818182,  (fastafile,    9)),
       ("Rpb11",    pdbfile,   "J",     0.90909091,  (fastafile,   10)),
       ("Rpb12",    pdbfile,   "K",     1.00000000,  (fastafile,   11))]

     @param m                    The IMP model, passed to Representation
     @param data                 The data structure described above
     @param resolutions          Resolutions passed to autobuild_model
                                 (defaults to [1, 10])
     @param missing_bead_size    Bead size passed to autobuild_model as
                                 missingbeadsize
     @param residue_per_gaussian Currently unused
     @return the IMP.pmi.representation.Representation that was built
     '''

     # A fresh list per call, so the default is never shared between calls.
     if resolutions is None:
         resolutions = [1, 10]

     r = IMP.pmi.representation.Representation(m)

     # Components usually share one FASTA file; read its ids only once.
     fasta_ids_by_file = {}

     for d in data:
         # retrieve the information from the data structure
         component_name = d[0]
         pdb_file = d[1]
         chain_id = d[2]
         color_id = d[3]
         fasta_file = d[4][0]
         fasta_file_id = d[4][1]

         if fasta_file not in fasta_ids_by_file:
             fasta_ids_by_file[fasta_file] = \
                 IMP.pmi.tools.get_ids_from_fasta_file(fasta_file)
         fastids = fasta_ids_by_file[fasta_file]

         # NOTE: no check is made here for an already existing component
         # with the same name.
         r.create_component(component_name,
                            color=color_id)

         r.add_component_sequence(component_name,
                                  fasta_file,
                                  id=fastids[fasta_file_id])

         hierarchies = r.autobuild_model(component_name,
                                         pdb_file,
                                         chain_id,
                                         resolutions=resolutions,
                                         missingbeadsize=missing_bead_size)

         r.show_component_table(component_name)

         r.set_rigid_bodies([component_name])

         r.set_chain_of_super_rigid_bodies(
             hierarchies,
             min_length=2,
             max_length=2)

         r.setup_component_sequence_connectivity(component_name, resolution=1)
         r.setup_component_geometry(component_name)

     r.setup_bonds()
     # put it at the end of rigid bodies
     r.set_floppy_bodies()

     # set current coordinates as reference for RMSD calculation
     r.set_current_coordinates_as_reference_for_rmsd("Reference")

     return r


 # ----------------------------------------------------------------------


 class BuildModel1(object):
     ''' this building scheme needs a data structure whose entries hold the
     following fields, in this order

           comp_name
           hier_name
           color
           fasta_file
           fasta_id
           pdb_name
           chain_id
           res_range
           read_em_files
           bead_size
           rb
           super_rb
           em_num_components
           em_txt_file_name
           em_mrc_file_name
           chain_of_super_rb

     res_range may carry a third element, which is used as the residue
     offset (0 when absent). pdb_name may also be "IDEAL_HELIX", "BEADS"
     or None; see autobuild().
     '''

     def __init__(self, representation):
         """Store the representation on which the model is built."""
         self.simo = representation
         self.gmm_models_directory = "."

     def set_gmm_models_directory(self, directory_name):
         """Set the directory used for default GMM .txt/.mrc file names."""
         self.gmm_models_directory = directory_name

     def build_model(self, data_structure, sequence_connectivity_scale=4.0):
         """Build every domain listed in data_structure and group the results.

         Fills self.domain_dict and self.resdensities (both keyed by
         hier_name) and self.rigid_bodies (keyed by the rb field), then sets
         up connectivity, geometry, rigid bodies, super rigid bodies, chains
         of super rigid bodies, floppy bodies and bonds on the representation.

         @param data_structure               Entries as described on the class
         @param sequence_connectivity_scale  Passed as scale to
                                             setup_component_sequence_connectivity
         """
         self.domain_dict = {}
         self.resdensities = {}
         super_rigid_bodies = {}
         chain_super_rigid_bodies = {}
         rigid_bodies = {}

         for d in data_structure:
             comp_name = d[0]
             hier_name = d[1]
             color = d[2]
             fasta_file = d[3]
             fasta_id = d[4]
             pdb_name = d[5]
             chain_id = d[6]
             res_range = d[7][0:2]
             try:
                 offset = d[7][2]
             except IndexError:
                 # no third element: residues are not shifted
                 offset = 0
             read_em_files = d[8]
             bead_size = d[9]
             rb = d[10]
             super_rb = d[11]
             em_num_components = d[12]
             em_txt_file_name = d[13]
             em_mrc_file_name = d[14]
             chain_of_super_rb = d[15]

             if comp_name not in self.simo.get_component_names():
                 self.simo.create_component(comp_name, color=0.0)
                 self.simo.add_component_sequence(comp_name, fasta_file, fasta_id)
             outhier = self.autobuild(self.simo, comp_name, pdb_name, chain_id,
                                      res_range, read=read_em_files,
                                      beadsize=bead_size, color=color,
                                      offset=offset)

             if read_em_files is not None:
                 # A single blank selects the default file name. Compare by
                 # value: identity of string literals is not guaranteed.
                 if em_txt_file_name == " ":
                     em_txt_file_name = self.gmm_models_directory + "/" + hier_name + ".txt"
                 if em_mrc_file_name == " ":
                     em_mrc_file_name = self.gmm_models_directory + "/" + hier_name + ".mrc"

                 dens_hier, beads = self.create_density(
                     self.simo, comp_name, outhier, em_txt_file_name,
                     em_mrc_file_name, em_num_components, read_em_files)
                 self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
                 dens_hier += beads
             else:
                 dens_hier = []

             self.resdensities[hier_name] = dens_hier
             self.domain_dict[hier_name] = outhier + dens_hier
             domain = self.domain_dict[hier_name]

             if rb is not None:
                 rigid_bodies.setdefault(rb, []).extend(domain)

             if super_rb is not None:
                 for k in super_rb:
                     super_rigid_bodies.setdefault(k, []).extend(domain)

             if chain_of_super_rb is not None:
                 for k in chain_of_super_rb:
                     chain_super_rigid_bodies.setdefault(k, []).extend(domain)

         self.rigid_bodies = rigid_bodies

         for c in self.simo.get_component_names():
             self.simo.setup_component_sequence_connectivity(
                 c, scale=sequence_connectivity_scale)
             self.simo.setup_component_geometry(c)

         for rb in rigid_bodies:
             self.simo.set_rigid_body_from_hierarchies(rigid_bodies[rb])

         for k in super_rigid_bodies:
             self.simo.set_super_rigid_body_from_hierarchies(super_rigid_bodies[k])

         for k in chain_super_rigid_bodies:
             self.simo.set_chain_of_super_rigid_bodies(chain_super_rigid_bodies[k], 2, 3)

         self.simo.set_floppy_bodies()
         self.simo.setup_bonds()

     def get_density_hierarchies(self, hier_name_list):
         """Return the concatenated density hierarchies of the named domains.

         Each hierarchy name is printed as it is processed.
         """
         dens_hier_list = []
         for hn in hier_name_list:
             print(hn)
             dens_hier_list += self.resdensities[hn]
         return dens_hier_list

     def get_pdb_bead_bits(self, hierarchy):
         """Split hierarchies by name into (pdbbits, beadbits, helixbits).

         An element whose name contains "_pdb", "_bead" or "_helix" goes into
         the matching list; a name matching several tags lands in several
         lists.
         """
         pdbbits = []
         beadbits = []
         helixbits = []
         for h in hierarchy:
             name = h.get_name()
             if "_pdb" in name:
                 pdbbits.append(h)
             if "_bead" in name:
                 beadbits.append(h)
             if "_helix" in name:
                 helixbits.append(h)
         return (pdbbits, beadbits, helixbits)

     def scale_bead_radii(self, nresidues, scale):
         """Rescale the radius of every bead in self.domain_dict once.

         The factor is linear in the number of residues of the bead: it is
         1.0 for a bead of zero residues and `scale` for `nresidues` residues.
         Progress is printed for every bead that is rescaled.
         """
         scaled_beads = set()
         for h in self.domain_dict:
             (pdbbits, beadbits, helixbits) = self.get_pdb_bead_bits(self.domain_dict[h])
             slope = (1.0 - scale) / (1.0 - float(nresidues))

             for b in beadbits:
                 # a bead can show up in several domains; scale it only once
                 if b in scaled_beads:
                     continue
                 scaled_beads.add(b)
                 radius = IMP.core.XYZR(b).get_radius()
                 num_residues = len(IMP.pmi.tools.get_residue_indexes(b))
                 scale_factor = slope * float(num_residues) + 1.0
                 print(scale_factor)
                 new_radius = scale_factor * radius
                 IMP.core.XYZR(b).set_radius(new_radius)
                 print(b.get_name())
                 print("particle with radius "+str(radius)+" and "+str(num_residues)+" residues scaled to a new radius "+str(new_radius))

     def create_density(self, simo, compname, comphier, txtfilename, mrcfilename, num_components, read=True):
         """Add density for the PDB and helix parts of a component (EM restraint).

         When `read` is true the densities are read from `txtfilename`;
         otherwise they are calculated and written to `txtfilename` and
         `mrcfilename`. PDB parts use resolution 0, helix parts resolution 1.

         @return (density hierarchies, bead hierarchies of comphier)
         """
         (pdbbits, beadbits, helixbits) = self.get_pdb_bead_bits(comphier)

         if read:
             # read what was calculated before
             io_kwargs = {"inputfile": txtfilename}
         else:
             # do the calculation and output the txt and mrc files
             io_kwargs = {"outputfile": txtfilename,
                          "outputmap": mrcfilename,
                          "multiply_by_total_mass": True}

         outhier = []
         for bits, resolution in ((pdbbits, 0), (helixbits, 1)):
             if len(bits) != 0:
                 outhier += simo.add_component_density(
                     compname,
                     bits,
                     # number of gaussians approximating the simulated density
                     num_components=num_components,
                     # resolution at which the simulated density is calculated
                     resolution=resolution,
                     **io_kwargs)

         return outhier, beadbits

     def autobuild(self, simo, comname, pdbname, chain, resrange, read=True, beadsize=5, color=0.0, offset=0):
         """Build one domain and return its hierarchies.

         The kind of domain is chosen by `pdbname`:
           None           necklace of beads over the whole sequence
           "IDEAL_HELIX"  ideal helix over resrange
           "BEADS"        necklace of beads over resrange
           anything else  taken as a PDB file name for autobuild_model; a last
                          residue of -1 stands for the end of the sequence
         """
         if pdbname is None:
             seq_len = len(simo.sequence_dict[comname])
             return simo.add_component_necklace(comname,
                                                begin=1,
                                                end=seq_len,
                                                length=beadsize)

         # Keywords are compared by value; `is` on strings only works by
         # accident of interning.
         if pdbname == "IDEAL_HELIX":
             return simo.add_component_ideal_helix(comname,
                                                   resolutions=[1, 10],
                                                   resrange=resrange,
                                                   color=color,
                                                   show=False)

         if pdbname == "BEADS":
             return simo.add_component_necklace(comname, resrange[0], resrange[1], beadsize, color=color)

         if resrange[-1] == -1:
             resrange = (resrange[0], len(simo.sequence_dict[comname]))

         # Deliberately `== False`: read=None (no EM files) must use the
         # coarse resolutions, which `not read` would get wrong.
         if read == False:  # noqa: E712
             resolutions = [0, 1, 10]
         else:
             resolutions = [1, 10]

         return simo.autobuild_model(comname,
                                     pdbname=pdbname,
                                     chain=chain,
                                     resrange=resrange,
                                     resolutions=resolutions,
                                     offset=offset,
                                     color=color,
                                     missingbeadsize=beadsize)


 # ----------------------------------------------------------------------


 class AnalysisReplicaExchange0(object):

     """A macro for running all the basic operations of analysis.

     Including clustering, precision analysis, and making ensemble density maps.

     A number of plots are also supported.

     """

     def __init__(self, model,

                  stat_file_name_suffix="stat",

                  # if you want to merge two calculation directories

                  merge_directories=["./"],

                  best_pdb_name_suffix="model",

                  do_clean_first=True,

                  do_create_directories=True,

                  global_output_directory="./",

                  replica_stat_file_suffix="stat_replica",

                  global_analysis_result_directory="./analysis/",

                  rmf_dir='', #NOT USED

                  ):


         """ Setup analysis.

         @param model                           The IMP model

         @param stat_file_name_suffix

         @param merge_directories               The directories containing output files

         @param best_pdb_name_suffix

         @param do_clean_first

         @param do_create_directories

         @param global_output_directory          Where everything is

         @param replica_stat_file_suffix

         @param global_analysis_result_directory

         """


         self.model = model

         stat_dir = global_output_directory

         self.stat_files = []

         # it contains the position of the root directories

         for rd in merge_directories:

             stat_files = glob.glob(rd + "/" + stat_dir + "/stat.*.out")

             self.stat_files += stat_files


     def clustering(self,

                    score_key="SimplifiedModel_Total_Score_None",

                    rmf_file_key="rmf_file",

                    rmf_file_frame_key="rmf_frame_index",

                    prefiltervalue=None,

                    feature_keys=[],

                    outputdir="./",

                    alignment_components=None,

                    number_of_best_scoring_models=10,

                    rmsd_calculation_components=None,

                    distance_matrix_file=None,

                    load_distance_matrix_file=False,

                    is_mpi=False,

                    skip_clustering=False,

                    number_of_clusters=1,

                    display_plot=False,

                    exit_after_display=True,

                    get_every=1,

                    first_and_last_frames=None,

                    density_custom_ranges=None,

                    write_pdb_with_centered_coordinates=False,

                    voxel_size=5.0):

         """ Get the best scoring models, compute a distance matrix, cluster them, and create density maps

         @param score_key                           The score for ranking models

         @param rmf_file_key                        Key pointing to RMF filename

         @param rmf_file_frame_key                  Key pointing to RMF frame number

         @param prefiltervalue                      Only include frames where the score key is below this value

         @param feature_keys                        Keywords for which you want to calculate average,

                                                     medians, etc,

         @param outputdir                           The local output directory used in the run

         @param alignment_components                List of tuples for aligning the structures

                                                    e.g. ["Rpb1", (20,100,"Rpb2"), .....]

         @param number_of_best_scoring_models       Num models to keep per run

         @param rmsd_calculation_components         List of tuples for calculating RMSD

                                                    e.g. ["Rpb1", (20,100,"Rpb2"), .....]

         @param distance_matrix_file                Where to store/read the distance matrix

         @param load_distance_matrix_file           Try to load the distance matrix file

         @param is_mpi                              Enable MPI

         @param skip_clustering                     Just extract the best scoring models and save the pdbs

         @param number_of_clusters                  Number of k-means clusters

         @param display_plot                        Display the distance matrix

         @param exit_after_display                  Exit after displaying distance matrix

         @param get_every                           Extract every nth frame

         @param first_and_last_frames               A tuple with the first and last frames to be

                                                    analyzed. Values are percentages!

                                                    Default: get all frames

         @param density_custom_ranges               List of tuples or strings for density calculation

                                                    e.g. ["Rpb1", (20,100,"Rpb2"), .....]

         @param write_pdb_with_centered_coordinates

         @param voxel_size                          Used for the density output

         """

         if is_mpi:

             from mpi4py import MPI

             comm = MPI.COMM_WORLD

             rank = comm.Get_rank()

             number_of_processes = comm.size

         else:

             rank = 0

             number_of_processes = 1


         if not load_distance_matrix_file:

             if len(self.stat_files)==0: print "ERROR: no stat file found in the given path"; return

             my_stat_files=IMP.pmi.tools.chunk_list_into_segments(self.stat_files,number_of_processes)[rank]

             best_models = IMP.pmi.io.input.get_best_models(my_stat_files,

                                                           score_key,

                                                           feature_keys,

                                                           rmf_file_key,

                                                           rmf_file_frame_key,

                                                           prefiltervalue,

                                                           get_every)

             rmf_file_list=best_models[0]

             rmf_file_frame_list=best_models[1]

             score_list=best_models[2]

             feature_keyword_list_dict=best_models[3]


 # ------------------------------------------------------------------------

 # collect all the files and scores

 # ------------------------------------------------------------------------


             if number_of_processes > 1:

                 score_list = IMP.pmi.tools.scatter_and_gather(score_list)

                 rmf_file_list = IMP.pmi.tools.scatter_and_gather(rmf_file_list)

                 rmf_file_frame_list = IMP.pmi.tools.scatter_and_gather(

                     rmf_file_frame_list)

                 for k in feature_keyword_list_dict:

                     feature_keyword_list_dict[k] = IMP.pmi.tools.scatter_and_gather(

                         feature_keyword_list_dict[k])


             # sort by score and get the best scoring ones

             score_rmf_tuples = zip(score_list,

                                    rmf_file_list,

                                    rmf_file_frame_list,

                                    range(len(score_list)))


             # keep subset of frames if requested

             if first_and_last_frames is not None:

                 nframes = len(score_rmf_tuples)

                 first_frame = int(first_and_last_frames[0] * nframes)

                 last_frame = int(first_and_last_frames[1] * nframes)

                 if last_frame > len(score_rmf_tuples):

                     last_frame = -1

                 score_rmf_tuples = score_rmf_tuples[first_frame:last_frame]


             # sort RMFs by the score_key in ascending order, and store the rank

             best_score_rmf_tuples = sorted(score_rmf_tuples,

                                            key=lambda x: float(x[0]))[:number_of_best_scoring_models]

             best_score_rmf_tuples=[t+(n,) for n,t in enumerate(best_score_rmf_tuples)]


             # sort the feature scores in the same way

             best_score_feature_keyword_list_dict = defaultdict(list)

             for tpl in best_score_rmf_tuples:

                 index = tpl[3]

                 for f in feature_keyword_list_dict:

                     best_score_feature_keyword_list_dict[f].append(

                         feature_keyword_list_dict[f][index])


             my_best_score_rmf_tuples = IMP.pmi.tools.chunk_list_into_segments(

                 best_score_rmf_tuples,

                 number_of_processes)[rank]


 # ------------------------------------------------------------------------

 # optionally don't compute distance matrix or cluster, just write top files

 # ------------------------------------------------------------------------

             if skip_clustering:

                 dircluster=os.path.join(outputdir,"all_models."+str(n))

                 try:

                     os.mkdir(outputdir)

                 except:

                     pass

                 try:

                     os.mkdir(dircluster)

                 except:

                     pass

                 clusstat=open(os.path.join(dircluster,"stat."+str(rank)+".out"),"w")

                 for cnt,tpl in enumerate(my_best_score_rmf_tuples):

                     rmf_name=tpl[1]

                     rmf_frame_number=tpl[2]

                     tmp_dict={}

                     index=tpl[4]

                     for key in best_score_feature_keyword_list_dict:

                         tmp_dict[key]=best_score_feature_keyword_list_dict[key][index]


                     prot=IMP.pmi.analysis.get_hier_from_rmf(self.model,rmf_frame_number,rmf_name)

                     if not prot:

                         continue


                     o=IMP.pmi.output.Output()

                     out_pdb_fn=os.path.join(dircluster,str(cnt)+"."+str(rank)+".pdb")

                     out_rmf_fn=os.path.join(dircluster,str(cnt)+"."+str(rank)+".rmf")

                     o.init_pdb(out_pdb_fn,prot)

                     o.write_pdb(out_pdb_fn,

                                 translate_to_geometric_center=write_pdb_with_centered_coordinates)


                     tmp_dict["local_pdb_file_name"]=os.path.basename(out_pdb_fn)

                     tmp_dict["rmf_file_full_path"]=rmf_name

                     tmp_dict["local_rmf_file_name"]=os.path.basename(out_rmf_fn)

                     tmp_dict["local_rmf_frame_number"]=0


                     clusstat.write(str(tmp_dict)+"\n")

                     o.init_rmf(out_rmf_fn,[prot])

                     o.write_rmf(out_rmf_fn)

                     o.close_rmf(out_rmf_fn)

                 return


 #-------------------------------------------------------------

 # read the coordinates

 # ------------------------------------------------------------

             rmsd_weights = IMP.pmi.io.input.get_bead_sizes(self.model,

                                                          my_best_score_rmf_tuples[0],

                                                          rmsd_calculation_components)

             got_coords = IMP.pmi.io.input.read_coordinates_of_rmfs(self.model,

                                                               my_best_score_rmf_tuples,

                                                               alignment_components,

                                                               rmsd_calculation_components)

             all_coordinates=got_coords[0]          # dict:key=component name,val=coords per hit

             alignment_coordinates=got_coords[1]    # same as above, limited to alignment bits

             rmsd_coordinates=got_coords[2]         # same as above, limited to RMSD bits

             rmf_file_name_index_dict=got_coords[3] # dictionary with key=RMF, value=score rank

             all_rmf_file_names=got_coords[4]       # RMF file per hit


             # broadcast the coordinates

             if number_of_processes > 1:

                 all_coordinates = IMP.pmi.tools.scatter_and_gather(

                     all_coordinates)

                 all_rmf_file_names = IMP.pmi.tools.scatter_and_gather(

                     all_rmf_file_names)

                 rmf_file_name_index_dict = IMP.pmi.tools.scatter_and_gather(

                     rmf_file_name_index_dict)

                 alignment_coordinates=IMP.pmi.tools.scatter_and_gather(

                     alignment_coordinates)

                 rmsd_coordinates=IMP.pmi.tools.scatter_and_gather(

                     rmsd_coordinates)


             if rank == 0:

                 # save needed informations in external files

                 self.save_objects(

                     [best_score_feature_keyword_list_dict,

                      rmf_file_name_index_dict],

                     ".macro.pkl")


 # ------------------------------------------------------------------------

 # Calculate distance matrix and cluster

 # ------------------------------------------------------------------------

             print "setup clustering class"

             Clusters = IMP.pmi.analysis.Clustering(rmsd_weights)


             for n, model_coordinate_dict in enumerate(all_coordinates):

                 template_coordinate_dict = {}

                 # let's try to align

                 if alignment_components is not None and len(Clusters.all_coords) == 0:

                     # set the first model as template coordinates

                     Clusters.set_template(alignment_coordinates[n])

                 Clusters.fill(all_rmf_file_names[n], rmsd_coordinates[n])


             print "Global calculating the distance matrix"


             # calculate distance matrix, all against all

             Clusters.dist_matrix(is_mpi=is_mpi)


             # perform clustering and optionally display

             if rank == 0:

                 Clusters.do_cluster(number_of_clusters)

                 if display_plot:

                     if rank == 0:

                         Clusters.plot_matrix()

                     if number_of_processes > 1:

                         comm.Barrier()

                     if exit_after_display:

                         exit()

                 Clusters.save_distance_matrix_file(file_name=distance_matrix_file)


 # ------------------------------------------------------------------------

 # Alteratively, load the distance matrix from file and cluster that

 # ------------------------------------------------------------------------

         else:

             if rank==0:

                 print "setup clustering class"

                 Clusters = IMP.pmi.analysis.Clustering()

                 Clusters.load_distance_matrix_file(file_name=distance_matrix_file)

                 print "clustering with %s clusters" % str(number_of_clusters)

                 Clusters.do_cluster(number_of_clusters)

                 [best_score_feature_keyword_list_dict,

                  rmf_file_name_index_dict] = self.load_objects(".macro.pkl")

                 if display_plot:

                     if rank == 0:

                         Clusters.plot_matrix()

                     if number_of_processes > 1:

                         comm.Barrier()

                     if exit_after_display:

                         exit()


 # ------------------------------------------------------------------------

 # now save all informations about the clusters

 # ------------------------------------------------------------------------


         if rank == 0:

             print Clusters.get_cluster_labels()

             for n, cl in enumerate(Clusters.get_cluster_labels()):

                 print "rank %s " % str(rank)

                 print "cluster %s " % str(n)

                 print "cluster label %s " % str(cl)

                 print Clusters.get_cluster_label_names(cl)


                 # first initialize the Density class if requested


                 if density_custom_ranges:

                     DensModule = IMP.pmi.analysis.GetModelDensity(

                         density_custom_ranges,

                         voxel=voxel_size)


                 dircluster = outputdir + "/cluster." + str(n) + "/"

                 try:

                     os.mkdir(outputdir)

                 except:

                     pass

                 try:

                     os.mkdir(dircluster)

                 except:

                     pass


                 rmsd_dict = {"AVERAGE_RMSD":

                              str(Clusters.get_cluster_label_average_rmsd(cl))}

                 clusstat = open(dircluster + "stat.out", "w")

                 for k, structure_name in enumerate(Clusters.get_cluster_label_names(cl)):


                     # extract the features

                     tmp_dict = {}

                     tmp_dict.update(rmsd_dict)

                     index = rmf_file_name_index_dict[structure_name]

                     for key in best_score_feature_keyword_list_dict:

                         tmp_dict[

                             key] = best_score_feature_keyword_list_dict[

                             key][

                             index]


                     # get the rmf name and the frame number from the list of

                     # frame names

                     rmf_name = structure_name.split("|")[0]

                     rmf_frame_number = int(structure_name.split("|")[1])


                     clusstat.write(str(tmp_dict) + "\n")

                     prot,rs = IMP.pmi.analysis.get_hier_and_restraints_from_rmf(

                         self.model,

                         rmf_frame_number,

                         rmf_name)

                     if not prot:

                         continue


                     if k > 0:

                         model_index = Clusters.get_model_index_from_name(

                             structure_name)

                         transformation = Clusters.get_transformation_to_first_member(

                             cl,

                             model_index)


                         rbs = set()

                         for p in IMP.atom.get_leaves(prot):

                             if not IMP.core.XYZR.get_is_setup(p):

                                 IMP.core.XYZR.setup_particle(p)

                                 IMP.core.XYZR(p).set_radius(0.0001)

                                 IMP.core.XYZR(p).set_coordinates((0, 0, 0))


                             if IMP.core.RigidBodyMember.get_is_setup(p):

                                 rb = IMP.core.RigidBodyMember(p).get_rigid_body()

                                 rbs.add(rb)

                             else:

                                 IMP.core.transform(IMP.core.XYZ(p),

                                                    transformation)

                         for rb in rbs:

                             IMP.core.transform(rb,transformation)


                     # add the density

                     if density_custom_ranges:

                         DensModule.add_subunits_density(prot)


                     o = IMP.pmi.output.Output()

                     o.init_pdb(dircluster + str(k) + ".pdb", prot)

                     o.write_pdb(dircluster + str(k) + ".pdb")


                     o.init_rmf(dircluster + str(k) + ".rmf3", [prot],rs)

                     # IMP.rmf.add_restraints(o.dictionary_rmfs[dircluster+str(n)+".rmf3"],restraints)

                     o.write_rmf(dircluster + str(k) + ".rmf3")

                     o.close_rmf(dircluster + str(k) + ".rmf3")


                     del o

                     # IMP.atom.destroy(prot)


                 if density_custom_ranges:

                     DensModule.write_mrc(path=dircluster)

                     del DensModule


         if is_mpi:

             comm.Barrier()


     def save_objects(self, objects, file_name):
         """ Serialize objects to a file with pickle.

         @param objects    Any picklable object (the clustering macro stores
                           a list of two dictionaries)
         @param file_name  Path of the file to write; an existing file is
                           overwritten
         """
         import pickle
         # pickle produces bytes, so the file must be opened in binary mode;
         # the with-block closes the handle even if dump() raises
         with open(file_name, 'wb') as outf:
             pickle.dump(objects, outf)


     def load_objects(self, file_name):
         """ Read back objects stored in a pickle file.

         @param file_name  Path of the pickle file to read
         @return The unpickled object
         """
         import pickle
         # NOTE: unpickling can execute arbitrary code; only load files that
         # were produced by a trusted run (e.g. by save_objects).
         # pickle data is binary, so the file must be opened in binary mode;
         # the with-block closes the handle even if load() raises
         with open(file_name, 'rb') as inputf:
             return pickle.load(inputf)

IMP::pmi.macros.AnalysisReplicaExchange0
A macro for running all the basic operations of analysis.
Definition: macros.py:666

IMP::core::RigidBodyMember
A member of a rigid body, it has internal (local) coordinates.
Definition: rigid_bodies.h:368

IMP::pmi.tools
Miscellaneous utilities.
Definition: tools.py:1

IMP::pmi.macros.AnalysisReplicaExchange0.clustering
def clustering
Get the best scoring models, compute a distance matrix, cluster them, and create density maps...
Definition: macros.py:707

IMP::pmi.analysis.Clustering
A class to cluster structures.
Definition: pmi/Analysis.py:186

IMP::pmi.representation
Representation of the system.
Definition: representation.py:1

IMP::core::XYZR::get_is_setup
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Definition: XYZR.h:47

IMP::core::XYZR::setup_particle
static XYZR setup_particle(kernel::Model *m, ParticleIndex pi)
Definition: XYZR.h:48

IMP::pmi.tools.scatter_and_gather
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:970

IMP::core::transform
void transform(XYZ a, const algebra::Transformation3D &tr)
Apply a transformation to the particle.

IMP::atom::get_by_type
Hierarchies get_by_type(Hierarchy mhd, GetByType t)

IMP::core::XYZ
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30

IMP::pmi.macros.BuildModel0
def BuildModel0
The macro constructs a component for each subunit (no splitting, nothing fancy). You can pass the resol...
Definition: macros.py:333

IMP::core::RigidBodyMember::get_is_setup
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Definition: rigid_bodies.h:369

IMP::pmi.output.Output
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:18

IMP::pmi.analysis
Analysis tools.
Definition: pmi/Analysis.py:1

IMP::pmi.output
Classes for writing output files and processing them.
Definition: output.py:1

IMP::pmi.samplers
Sampling of the system.
Definition: samplers.py:1

IMP::pmi.macros.BuildModel1
this building scheme needs a data structure with the following fields comp_name hier_name color fasta...
Definition: macros.py:416

IMP::pmi.macros.ReplicaExchange0
A macro to help setup and run replica exchange, supporting monte carlo and molecular dynamics...
Definition: macros.py:19

IMP::atom::get_leaves
Hierarchies get_leaves(const Selection &h)

IMP::pmi.analysis.GetModelDensity
A class to compute mean density maps from structures.
Definition: pmi/Analysis.py:457

IMP::rmf
Support for the RMF file format for storing hierarchical molecular data and markup.

IMP::pmi.tools.get_residue_indexes
def get_residue_indexes
This "overloaded" function retrieves the residue indexes for each particle which is an instance of Fr...
Definition: tools.py:929

IMP::core::XYZR
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27