IMP  2.3.0
The Integrative Modeling Platform
macros.py
1 """@namespace IMP.pmi.macros
2 Protocols for sampling structures and analyzing them.
3 """
4 
6 import IMP.pmi.tools
7 import IMP.pmi.samplers
8 import IMP.pmi.output
9 import IMP.pmi.analysis
10 import IMP.pmi.io.input
11 import IMP.rmf
12 import RMF
13 import os
14 import glob
15 from operator import itemgetter
16 from collections import defaultdict
17 import numpy as np
18 
19 class ReplicaExchange0(object):
20  """ A macro to help setup and run replica exchange,
21  supporting monte carlo and molecular dynamics.
22  Produces trajectory RMF files, best PDB structures,
23  and output stat files
24  """
25 
26  def __init__(self, model,
27  representation=None,
28  root_hier=None,
29  sample_objects=None, # DEPRECATED
30  monte_carlo_sample_objects=None,
31  molecular_dynamics_sample_objects=None,
32  output_objects=None,
33  crosslink_restraints=None,
34  monte_carlo_temperature=1.0,
35  replica_exchange_minimum_temperature=1.0,
36  replica_exchange_maximum_temperature=2.5,
37  num_sample_rounds=1,
38  number_of_best_scoring_models=500,
39  monte_carlo_steps=10,
40  molecular_dynamics_steps=10,
41  number_of_frames=1000,
42  nframes_write_coordinates=1,
43  write_initial_rmf=True,
44  initial_rmf_name_suffix="initial",
45  stat_file_name_suffix="stat",
46  best_pdb_name_suffix="model",
47  do_clean_first=True,
48  do_create_directories=True,
49  global_output_directory="./",
50  rmf_dir="rmfs/",
51  best_pdb_dir="pdbs/",
52  replica_stat_file_suffix="stat_replica",
53  em_object_for_rmf=None,
54  atomistic=False,
55  replica_exchange_object=None):
56  """
57  Setup replica exchange.
58  @param model The IMP model
59  @param representation PMI.Representation() (or list of them, for multi-state modeling)
60  @param root_hier Instead of passing Representation, just pass a hierarchy
61  @param mote_carlo_sample_objcts Objects for MC sampling
62  @param molecular_dynamics_sample_objects Objects for MD sampling
63  @param output_objects Objects with get_output() for packing into stat files
64  @param crosslink_restraints Harmonic restraints to go in output RMF files
65  @param monte_carlo_temperature MC temp
66  @param replica_exchange_minimum_temperature Low temp for REX
67  @param replica_exchange_maximum_temperature High temp for REX
68  @param num_sample_rounds Number of rounds of MC/MD per cycle
69  @param number_of_best_scoring_models Number of top-scoring PDB models to keep around
70  @param monte_carlo_steps Number of MC steps per round
71  @param molecular_dynamics_steps Number of MD steps per round
72  @param number_of_frames Number of REX frames to run
73  @param nframes_write_coordinates How often to write the coordinates of a frame
74  @param write_initial_rmf Write the initial configuration
75  """
76 
77  self.model = model
78  self.vars = {}
79 
80  ### add check hierarchy is multistate
81  if representation:
82  if type(representation) == list:
83  self.is_multi_state = True
84  self.root_hiers = [r.prot for r in representation]
85  self.vars["number_of_states"] = len(representation)
86  else:
87  self.is_multi_state = False
88  self.root_hier = representation.prot
89  self.vars["number_of_states"] = 1
90  elif root_hier:
91  states = IMP.atom.get_by_type(root_hier,IMP.atom.STATE_TYPE)
92  self.vars["number_of_states"] = len(states)
93  if len(states)>1:
94  self.root_hiers = states
95  self.is_multi_state = True
96  else:
97  self.root_hier = root_hier
98  self.is_multi_state = False
99  else:
100  print "ERROR: Must provide representation or root_hier"
101  return
102 
103  self.crosslink_restraints = crosslink_restraints
104  self.em_object_for_rmf = em_object_for_rmf
105  self.monte_carlo_sample_objects = monte_carlo_sample_objects
106  if sample_objects is not None:
107  self.monte_carlo_sample_objects+=sample_objects
108  self.molecular_dynamics_sample_objects=molecular_dynamics_sample_objects
109  self.output_objects = output_objects
110  self.replica_exchange_object = replica_exchange_object
111  self.vars["monte_carlo_temperature"] = monte_carlo_temperature
112  self.vars[
113  "replica_exchange_minimum_temperature"] = replica_exchange_minimum_temperature
114  self.vars[
115  "replica_exchange_maximum_temperature"] = replica_exchange_maximum_temperature
116  self.vars["num_sample_rounds"] = num_sample_rounds
117  self.vars[
118  "number_of_best_scoring_models"] = number_of_best_scoring_models
119  self.vars["monte_carlo_steps"] = monte_carlo_steps
120  self.vars["molecular_dynamics_steps"]=molecular_dynamics_steps
121  self.vars["number_of_frames"] = number_of_frames
122  self.vars["nframes_write_coordinates"] = nframes_write_coordinates
123  self.vars["write_initial_rmf"] = write_initial_rmf
124  self.vars["initial_rmf_name_suffix"] = initial_rmf_name_suffix
125  self.vars["best_pdb_name_suffix"] = best_pdb_name_suffix
126  self.vars["stat_file_name_suffix"] = stat_file_name_suffix
127  self.vars["do_clean_first"] = do_clean_first
128  self.vars["do_create_directories"] = do_create_directories
129  self.vars["global_output_directory"] = global_output_directory
130  self.vars["rmf_dir"] = rmf_dir
131  self.vars["best_pdb_dir"] = best_pdb_dir
132  self.vars["atomistic"] = atomistic
133  self.vars["replica_stat_file_suffix"] = replica_stat_file_suffix
134 
135  def show_info(self):
136  print "ReplicaExchange0: it generates initial.*.rmf3, stat.*.out, rmfs/*.rmf3 for each replica "
137  print "--- it stores the best scoring pdb models in pdbs/"
138  print "--- the stat.*.out and rmfs/*.rmf3 are saved only at the lowest temperature"
139  print "--- variables:"
140  keys = self.vars.keys()
141  keys.sort()
142  for v in keys:
143  print "------", v.ljust(30), self.vars[v]
144 
145  def get_replica_exchange_object(self):
146  return self.replica_exchange_object
147 
148  def execute_macro(self):
149 
150  temp_index_factor = 100000.0
151  samplers=[]
152  sampler_mc=None
153  sampler_md=None
154  if self.monte_carlo_sample_objects is not None:
155  print "Setting up MonteCarlo"
156  sampler_mc = IMP.pmi.samplers.MonteCarlo(self.model,
157  self.monte_carlo_sample_objects,
158  self.vars["monte_carlo_temperature"])
159  self.output_objects.append(sampler_mc)
160  samplers.append(sampler_mc)
161  if self.molecular_dynamics_sample_objects is not None:
162  print "Setting up MolecularDynamics"
163  sampler_md = IMP.pmi.samplers.MolecularDynamics(self.model,
164  self.molecular_dynamics_sample_objects,
165  self.vars["monte_carlo_temperature"])
166  self.output_objects.append(sampler_md)
167  samplers.append(sampler_md)
168 # -------------------------------------------------------------------------
169 
170  print "Setting up ReplicaExchange"
171  rex = IMP.pmi.samplers.ReplicaExchange(self.model,
172  self.vars[
173  "replica_exchange_minimum_temperature"],
174  self.vars[
175  "replica_exchange_maximum_temperature"],
176  samplers,
177  replica_exchange_object=self.replica_exchange_object)
178  self.replica_exchange_object = rex.rem
179 
180  myindex = rex.get_my_index()
181  self.output_objects.append(rex)
182 
183  # must reset the minimum temperature due to the
184  # different binary length of rem.get_my_parameter double and python
185  # float
186  min_temp_index = int(min(rex.get_temperatures()) * temp_index_factor)
187 
188 # -------------------------------------------------------------------------
189 
190  globaldir = self.vars["global_output_directory"] + "/"
191  rmf_dir = globaldir + self.vars["rmf_dir"]
192  pdb_dir = globaldir + self.vars["best_pdb_dir"]
193 
194  if self.vars["do_clean_first"]:
195  pass
196 
197  if self.vars["do_create_directories"]:
198 
199  try:
200  os.makedirs(globaldir)
201  except:
202  pass
203  try:
204  os.makedirs(rmf_dir)
205  except:
206  pass
207 
208  if not self.is_multi_state:
209  try:
210  os.makedirs(pdb_dir)
211  except:
212  pass
213  else:
214  for n in range(self.vars["number_of_states"]):
215  try:
216  os.makedirs(pdb_dir + "/" + str(n))
217  except:
218  pass
219 
220 # -------------------------------------------------------------------------
221 
222  sw = IMP.pmi.tools.Stopwatch()
223  self.output_objects.append(sw)
224 
225  print "Setting up stat file"
226  output = IMP.pmi.output.Output(atomistic=self.vars["atomistic"])
227  low_temp_stat_file = globaldir + \
228  self.vars["stat_file_name_suffix"] + "." + str(myindex) + ".out"
229  output.init_stat2(low_temp_stat_file,
230  self.output_objects,
231  extralabels=["rmf_file", "rmf_frame_index"])
232 
233  print "Setting up replica stat file"
234  replica_stat_file = globaldir + \
235  self.vars["replica_stat_file_suffix"] + "." + str(myindex) + ".out"
236  output.init_stat2(replica_stat_file, [rex], extralabels=["score"])
237 
238  print "Setting up best pdb files"
239  if not self.is_multi_state:
240  output.init_pdb_best_scoring(pdb_dir + "/" +
241  self.vars["best_pdb_name_suffix"],
242  self.root_hier,
243  self.vars[
244  "number_of_best_scoring_models"],
245  replica_exchange=True)
246  else:
247  for n in range(self.vars["number_of_states"]):
248  output.init_pdb_best_scoring(pdb_dir + "/" + str(n) + "/" +
249  self.vars["best_pdb_name_suffix"],
250  self.root_hiers[n],
251  self.vars[
252  "number_of_best_scoring_models"],
253  replica_exchange=True)
254 
255 # ---------------------------------------------
256 
257  if not self.em_object_for_rmf is None:
258  if not self.is_multi_state:
259  output_hierarchies = [
260  self.root_hier,
261  self.em_object_for_rmf.get_density_as_hierarchy(
262  )]
263  else:
264  output_hierarchies = self.root_hiers
265  output_hierarchies.append(
266  self.em_object_for_rmf.get_density_as_hierarchy())
267  else:
268  if not self.is_multi_state:
269  output_hierarchies = [self.root_hier]
270  else:
271  output_hierarchies = self.root_hiers
272 
273 #----------------------------------------------
274  print "Setting up and writing initial rmf coordinate file"
275  init_suffix = globaldir + self.vars["initial_rmf_name_suffix"]
276  output.init_rmf(init_suffix + "." + str(myindex) + ".rmf3",
277  output_hierarchies)
278  if self.crosslink_restraints:
279  output.add_restraints_to_rmf(
280  init_suffix + "." + str(myindex) + ".rmf3",
281  self.crosslink_restraints)
282  output.write_rmf(init_suffix + "." + str(myindex) + ".rmf3")
283  output.close_rmf(init_suffix + "." + str(myindex) + ".rmf3")
284 
285 #----------------------------------------------
286 
287  print "Setting up production rmf files"
288 
289  rmfname = rmf_dir + "/" + str(myindex) + ".rmf3"
290  output.init_rmf(rmfname, output_hierarchies)
291 
292  if self.crosslink_restraints:
293  output.add_restraints_to_rmf(rmfname, self.crosslink_restraints)
294 
295  ntimes_at_low_temp = 0
296 
297  if myindex == 0:
298  self.show_info()
299 
300  for i in range(self.vars["number_of_frames"]):
301  for nr in range(self.vars["num_sample_rounds"]):
302  if sampler_mc is not None:
303  sampler_mc.optimize(self.vars["monte_carlo_steps"])
304  if sampler_md is not None:
305  sampler_md.optimize(self.vars["molecular_dynamics_steps"])
306  score = self.model.evaluate(False)
307  output.set_output_entry("score", score)
308 
309  my_temp_index = int(rex.get_my_temp() * temp_index_factor)
310 
311  if min_temp_index == my_temp_index:
312  print "--- frame %s score %s " % (str(i), str(score))
313 
314  if i % self.vars["nframes_write_coordinates"]==0:
315  print '--- writing coordinates'
316  output.write_pdb_best_scoring(score)
317  output.write_rmf(rmfname)
318  output.set_output_entry("rmf_file", rmfname)
319  output.set_output_entry("rmf_frame_index", ntimes_at_low_temp)
320  else:
321  output.set_output_entry("rmf_file", rmfname)
322  output.set_output_entry("rmf_frame_index", '-1')
323  output.write_stat2(low_temp_stat_file)
324  ntimes_at_low_temp += 1
325 
326  output.write_stat2(replica_stat_file)
327  rex.swap_temp(i, score)
328 
329 
330 # -----------------------------------------------------------------------
331 
332 
334  m,
335  data,
336  resolutions=[1,
337  10],
338  missing_bead_size=20,
339  residue_per_gaussian=None):
340  '''
341  The macro construct a component for each subunit (no splitting, nothing fancy)
342  You can pass the resolutions and the bead size for the missing residue regions.
343  To use this macro, you must provide the following data structure:
344 
345  Component pdbfile chainid rgb color fastafile sequence id
346  in fastafile
347 
348 data = [("Rpb1", pdbfile, "A", 0.00000000, (fastafile, 0)),
349  ("Rpb2", pdbfile, "B", 0.09090909, (fastafile, 1)),
350  ("Rpb3", pdbfile, "C", 0.18181818, (fastafile, 2)),
351  ("Rpb4", pdbfile, "D", 0.27272727, (fastafile, 3)),
352  ("Rpb5", pdbfile, "E", 0.36363636, (fastafile, 4)),
353  ("Rpb6", pdbfile, "F", 0.45454545, (fastafile, 5)),
354  ("Rpb7", pdbfile, "G", 0.54545455, (fastafile, 6)),
355  ("Rpb8", pdbfile, "H", 0.63636364, (fastafile, 7)),
356  ("Rpb9", pdbfile, "I", 0.72727273, (fastafile, 8)),
357  ("Rpb10", pdbfile, "L", 0.81818182, (fastafile, 9)),
358  ("Rpb11", pdbfile, "J", 0.90909091, (fastafile, 10)),
359  ("Rpb12", pdbfile, "K", 1.00000000, (fastafile, 11))]
360 
361  '''
362 
363  r = IMP.pmi.representation.Representation(m)
364 
365  # the dictionary for the hierarchies,
366  hierarchies = {}
367 
368  for d in data:
369  # retrieve the information from the data structure
370  component_name = d[0]
371  pdb_file = d[1]
372  chain_id = d[2]
373  color_id = d[3]
374  fasta_file = d[4][0]
375  # this function
376  fastids = IMP.pmi.tools.get_ids_from_fasta_file(fasta_file)
377  fasta_file_id = d[4][1]
378  # avoid to add a component with the same name
379  r.create_component(component_name,
380  color=color_id)
381 
382  r.add_component_sequence(component_name,
383  fasta_file,
384  id=fastids[fasta_file_id])
385 
386  hierarchies = r.autobuild_model(component_name,
387  pdb_file,
388  chain_id,
389  resolutions=resolutions,
390  missingbeadsize=missing_bead_size)
391 
392  r.show_component_table(component_name)
393 
394  r.set_rigid_bodies([component_name])
395 
396  r.set_chain_of_super_rigid_bodies(
397  hierarchies,
398  min_length=2,
399  max_length=2)
400 
401  r.setup_component_sequence_connectivity(component_name, resolution=1)
402  r.setup_component_geometry(component_name)
403 
404  r.setup_bonds()
405  # put it at the end of rigid bodies
406  r.set_floppy_bodies()
407 
408  # set current coordinates as reference for RMSD calculation
409  r.set_current_coordinates_as_reference_for_rmsd("Reference")
410 
411  return r
412 
413 
414 # ----------------------------------------------------------------------
415 
416 class BuildModel1(object):
417 
418  ''' this building scheme needs a data structure with the following fields
419  comp_name
420  hier_name
421  color
422  fasta_file
423  fasta_id
424  pdb_name
425  chain_id
426  res_range
427  read_em_files
428  bead_size
429  rb
430  super_rb
431  em_num_components
432  em_txt_file_name
433  em_mrc_file_name
434  '''
435 
436  def __init__(self, representation):
437  self.simo=representation
438  self.gmm_models_directory="."
439 
440  def set_gmm_models_directory(self,directory_name):
441  self.gmm_models_directory=directory_name
442 
443  def build_model(self,data_structure,sequence_connectivity_scale=4.0):
444 
445  self.domain_dict={}
446  self.resdensities={}
447  super_rigid_bodies={}
448  chain_super_rigid_bodies={}
449  rigid_bodies={}
450 
451  for d in data_structure:
452  comp_name = d[0]
453  hier_name = d[1]
454  color = d[2]
455  fasta_file = d[3]
456  fasta_id = d[4]
457  pdb_name = d[5]
458  chain_id = d[6]
459  res_range = d[7][0:2]
460  try:
461  offset = d[7][2]
462  except:
463  offset = 0
464  read_em_files = d[8]
465  bead_size = d[9]
466  rb = d[10]
467  super_rb = d[11]
468  em_num_components = d[12]
469  em_txt_file_name = d[13]
470  em_mrc_file_name = d[14]
471  chain_of_super_rb = d[15]
472 
473  if comp_name not in self.simo.get_component_names():
474  self.simo.create_component(comp_name,color=0.0)
475  self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
476  outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,res_range,read=read_em_files,beadsize=bead_size,color=color,offset=offset)
477 
478  if not read_em_files is None:
479  if em_txt_file_name is " ": em_txt_file_name=self.gmm_models_directory+"/"+hier_name+".txt"
480  if em_mrc_file_name is " ": em_mrc_file_name=self.gmm_models_directory+"/"+hier_name+".mrc"
481 
482 
483  dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,em_mrc_file_name,em_num_components,read_em_files)
484  self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
485  dens_hier+=beads
486 
487  else:
488  dens_hier=[]
489 
490  self.resdensities[hier_name]=dens_hier
491  self.domain_dict[hier_name]=outhier+dens_hier
492 
493  if rb is not None:
494  if rb not in rigid_bodies:
495  rigid_bodies[rb]=[h for h in self.domain_dict[hier_name]]
496  else:
497  rigid_bodies[rb]+=[h for h in self.domain_dict[hier_name]]
498 
499 
500  if super_rb is not None:
501  for k in super_rb:
502  if k not in super_rigid_bodies:
503  super_rigid_bodies[k]=[h for h in self.domain_dict[hier_name]]
504  else:
505  super_rigid_bodies[k]+=[h for h in self.domain_dict[hier_name]]
506 
507  if chain_of_super_rb is not None:
508  for k in chain_of_super_rb:
509  if k not in chain_super_rigid_bodies:
510  chain_super_rigid_bodies[k]=[h for h in self.domain_dict[hier_name]]
511  else:
512  chain_super_rigid_bodies[k]+=[h for h in self.domain_dict[hier_name]]
513 
514  self.rigid_bodies=rigid_bodies
515 
516 
517  for c in self.simo.get_component_names():
518  self.simo.setup_component_sequence_connectivity(c,scale=sequence_connectivity_scale)
519  self.simo.setup_component_geometry(c)
520 
521 
522  for rb in rigid_bodies:
523  self.simo.set_rigid_body_from_hierarchies(rigid_bodies[rb])
524 
525  for k in super_rigid_bodies:
526  self.simo.set_super_rigid_body_from_hierarchies(super_rigid_bodies[k])
527 
528  for k in chain_super_rigid_bodies:
529  self.simo.set_chain_of_super_rigid_bodies(chain_super_rigid_bodies[k],2,3)
530 
531  self.simo.set_floppy_bodies()
532  self.simo.setup_bonds()
533 
534  def get_density_hierarchies(self,hier_name_list):
535  # return a list of density hierarchies
536  # specify the list of hierarchy names
537  dens_hier_list=[]
538  for hn in hier_name_list:
539  print hn
540  dens_hier_list+=self.resdensities[hn]
541  return dens_hier_list
542 
543  def get_pdb_bead_bits(self,hierarchy):
544  pdbbits=[]
545  beadbits=[]
546  helixbits=[]
547  for h in hierarchy:
548  if "_pdb" in h.get_name():pdbbits.append(h)
549  if "_bead" in h.get_name():beadbits.append(h)
550  if "_helix" in h.get_name():helixbits.append(h)
551  return (pdbbits,beadbits,helixbits)
552 
553  def scale_bead_radii(self,nresidues,scale):
554  scaled_beads=set()
555  for h in self.domain_dict:
556  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
557  slope=(1.0-scale)/(1.0-float(nresidues))
558 
559  for b in beadbits:
560  # I have to do the following
561  # because otherwise we'll scale more than once
562  if b not in scaled_beads:
563  scaled_beads.add(b)
564  else:
565  continue
566  radius=IMP.core.XYZR(b).get_radius()
567  num_residues=len(IMP.pmi.tools.get_residue_indexes(b))
568  scale_factor=slope*float(num_residues)+1.0
569  print scale_factor
570  new_radius=scale_factor*radius
571  IMP.core.XYZR(b).set_radius(new_radius)
572  print b.get_name()
573  print "particle with radius "+str(radius)+" and "+str(num_residues)+" residues scaled to a new radius "+str(new_radius)
574 
575 
576  def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
577  #density generation for the EM restraint
578  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
579 
580 
581  outhier=[]
582  if read:
583  if len(pdbbits)!=0:
584  outhier+=simo.add_component_density(compname,
585  pdbbits,
586  num_components=num_components, # number of gaussian into which the simulated density is approximated
587  resolution=0, # resolution that you want to calculate the simulated density
588  inputfile=txtfilename) # read what it was calculated before
589  if len(helixbits)!=0:
590  outhier+=simo.add_component_density(compname,
591  helixbits,
592  num_components=num_components, # number of gaussian into which the simulated density is approximated
593  resolution=1, # resolution that you want to calculate the simulated density
594  inputfile=txtfilename) # read what it was calculated before
595 
596 
597  else:
598  if len(pdbbits)!=0:
599  outhier+=simo.add_component_density(compname,
600  pdbbits,
601  num_components=num_components, # number of gaussian into which the simulated density is approximated
602  resolution=0, # resolution that you want to calculate the simulated density
603  outputfile=txtfilename, # do the calculation
604  outputmap=mrcfilename,
605  multiply_by_total_mass=True) # do the calculation and output the mrc
606 
607  if len(helixbits)!=0:
608  outhier+=simo.add_component_density(compname,
609  helixbits,
610  num_components=num_components, # number of gaussian into which the simulated density is approximated
611  resolution=1, # resolution that you want to calculate the simulated density
612  outputfile=txtfilename, # do the calculation
613  outputmap=mrcfilename,
614  multiply_by_total_mass=True) # do the calculation and output the mrc
615 
616  return outhier,beadbits
617 
618  def autobuild(self,simo,comname,pdbname,chain,resrange,read=True,beadsize=5,color=0.0,offset=0):
619 
620  if pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is not "BEADS" :
621  if resrange[-1]==-1: resrange=(resrange[0],len(simo.sequence_dict[comname]))
622  if read==False:
623  outhier=simo.autobuild_model(comname,
624  pdbname=pdbname,
625  chain=chain,
626  resrange=resrange,
627  resolutions=[0,1,10],
628  offset=offset,
629  color=color,
630  missingbeadsize=beadsize)
631  else:
632  outhier=simo.autobuild_model(comname,
633  pdbname=pdbname,
634  chain=chain,
635  resrange=resrange,
636  resolutions=[1,10],
637  offset=offset,
638  color=color,
639  missingbeadsize=beadsize)
640 
641 
642  elif pdbname is not None and pdbname is "IDEAL_HELIX" and pdbname is not "BEADS" :
643 
644  outhier=simo.add_component_ideal_helix(comname,
645  resolutions=[1,10],
646  resrange=resrange,
647  color=color,
648  show=False)
649 
650  elif pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is "BEADS" :
651  outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
652 
653  else:
654 
655  seq_len=len(simo.sequence_dict[comname])
656  outhier=simo.add_component_necklace(comname,
657  begin=1,
658  end=seq_len,
659  length=beadsize)
660 
661  return outhier
662 
663 
664 # ----------------------------------------------------------------------
665 
667  """A macro for running all the basic operations of analysis.
668  Including clustering, precision analysis, and making ensemble density maps.
669  A number of plots are also supported.
670  """
671  def __init__(self, model,
672  stat_file_name_suffix="stat",
673  # if you want to merge two calculation directories
674  merge_directories=["./"],
675  best_pdb_name_suffix="model",
676  do_clean_first=True,
677  do_create_directories=True,
678  global_output_directory="./",
679  replica_stat_file_suffix="stat_replica",
680  global_analysis_result_directory="./analysis/",
681  rmf_dir='', #NOT USED
682  ):
683 
684  """ Setup analysis.
685  @param model The IMP model
686  @param stat_file_name_suffix
687  @param merge_directories The directories containing output files
688  @param best_pdb_name_suffix
689  @param do_clean_first
690  @param do_create_directories
691  @param global_output_directory Where everything is
692  @param replica_stat_file_suffix
693  @param global_analysis_result_directory
694  """
695 
696  self.model = model
697  stat_dir = global_output_directory
698  self.stat_files = []
699  # it contains the position of the root directories
700  for rd in merge_directories:
701  stat_files = glob.glob(rd + "/" + stat_dir + "/stat.*.out")
702  self.stat_files += stat_files
703 
704  def clustering(self,
705  score_key="SimplifiedModel_Total_Score_None",
706  rmf_file_key="rmf_file",
707  rmf_file_frame_key="rmf_frame_index",
708  prefiltervalue=None,
709  feature_keys=[],
710  outputdir="./",
711  alignment_components=None,
712  number_of_best_scoring_models=10,
713  rmsd_calculation_components=None,
714  distance_matrix_file=None,
715  load_distance_matrix_file=False,
716  is_mpi=False,
717  skip_clustering=False,
718  number_of_clusters=1,
719  display_plot=False,
720  exit_after_display=True,
721  get_every=1,
722  first_and_last_frames=None,
723  density_custom_ranges=None,
724  write_pdb_with_centered_coordinates=False,
725  voxel_size=5.0):
726  """ Get the best scoring models, compute a distance matrix, cluster them, and create density maps
727  @param score_key The score for ranking models
728  @param rmf_file_key Key pointing to RMF filename
729  @param rmf_file_frame_key Key pointing to RMF frame number
730  @param prefiltervalue Only include frames where the score key is below this value
731  @param feature_keys Keywords for which you want to calculate average,
732  medians, etc,
733  @param outputdir The local output directory used in the run
734  @param alignment_components List of tuples for aligning the structures
735  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
736  @param number_of_best_scoring_models Num models to keep per run
737  @param rmsd_calculation_components List of tuples for calculating RMSD
738  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
739  @param distance_matrix_file Where to store/read the distance matrix
740  @param load_distance_matrix_file Try to load the distance matrix file
741  @param is_mpi Enable MPI
742  @param skip_clustering Just extract the best scoring models and save the pdbs
743  @param number_of_clusters Number of k-means clusters
744  @param display_plot Display the distance matrix
745  @param exit_after_display Exit after displaying distance matrix
746  @param get_every Extract every nth frame
747  @param first_and_last_frames A tuple with the first and last frames to be
748  analyzed. Values are percentages!
749  Default: get all frames
750  @param density_custom_ranges List of tuples or strings for density calculation
751  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
752  @param write_pdb_with_centered_coordinates
753  @param voxel_size Used for the density output
754  """
755  if is_mpi:
756  from mpi4py import MPI
757  comm = MPI.COMM_WORLD
758  rank = comm.Get_rank()
759  number_of_processes = comm.size
760  else:
761  rank = 0
762  number_of_processes = 1
763 
764 
765 
766  if not load_distance_matrix_file:
767  if len(self.stat_files)==0: print "ERROR: no stat file found in the given path"; return
768  my_stat_files=IMP.pmi.tools.chunk_list_into_segments(self.stat_files,number_of_processes)[rank]
769  best_models = IMP.pmi.io.input.get_best_models(my_stat_files,
770  score_key,
771  feature_keys,
772  rmf_file_key,
773  rmf_file_frame_key,
774  prefiltervalue,
775  get_every)
776  rmf_file_list=best_models[0]
777  rmf_file_frame_list=best_models[1]
778  score_list=best_models[2]
779  feature_keyword_list_dict=best_models[3]
780 
781 # ------------------------------------------------------------------------
782 # collect all the files and scores
783 # ------------------------------------------------------------------------
784 
785  if number_of_processes > 1:
786  score_list = IMP.pmi.tools.scatter_and_gather(score_list)
787  rmf_file_list = IMP.pmi.tools.scatter_and_gather(rmf_file_list)
788  rmf_file_frame_list = IMP.pmi.tools.scatter_and_gather(
789  rmf_file_frame_list)
790  for k in feature_keyword_list_dict:
791  feature_keyword_list_dict[k] = IMP.pmi.tools.scatter_and_gather(
792  feature_keyword_list_dict[k])
793 
794  # sort by score and get the best scoring ones
795  score_rmf_tuples = zip(score_list,
796  rmf_file_list,
797  rmf_file_frame_list,
798  range(len(score_list)))
799 
800  # keep subset of frames if requested
801  if first_and_last_frames is not None:
802  nframes = len(score_rmf_tuples)
803  first_frame = int(first_and_last_frames[0] * nframes)
804  last_frame = int(first_and_last_frames[1] * nframes)
805  if last_frame > len(score_rmf_tuples):
806  last_frame = -1
807  score_rmf_tuples = score_rmf_tuples[first_frame:last_frame]
808 
809  # sort RMFs by the score_key in ascending order, and store the rank
810  best_score_rmf_tuples = sorted(score_rmf_tuples,
811  key=lambda x: float(x[0]))[:number_of_best_scoring_models]
812  best_score_rmf_tuples=[t+(n,) for n,t in enumerate(best_score_rmf_tuples)]
813 
814  # sort the feature scores in the same way
815  best_score_feature_keyword_list_dict = defaultdict(list)
816  for tpl in best_score_rmf_tuples:
817  index = tpl[3]
818  for f in feature_keyword_list_dict:
819  best_score_feature_keyword_list_dict[f].append(
820  feature_keyword_list_dict[f][index])
821 
822  my_best_score_rmf_tuples = IMP.pmi.tools.chunk_list_into_segments(
823  best_score_rmf_tuples,
824  number_of_processes)[rank]
825 
826 # ------------------------------------------------------------------------
827 # optionally don't compute distance matrix or cluster, just write top files
828 # ------------------------------------------------------------------------
829  if skip_clustering:
830  dircluster=os.path.join(outputdir,"all_models."+str(n))
831  try:
832  os.mkdir(outputdir)
833  except:
834  pass
835  try:
836  os.mkdir(dircluster)
837  except:
838  pass
839  clusstat=open(os.path.join(dircluster,"stat."+str(rank)+".out"),"w")
840  for cnt,tpl in enumerate(my_best_score_rmf_tuples):
841  rmf_name=tpl[1]
842  rmf_frame_number=tpl[2]
843  tmp_dict={}
844  index=tpl[4]
845  for key in best_score_feature_keyword_list_dict:
846  tmp_dict[key]=best_score_feature_keyword_list_dict[key][index]
847 
848  prot=IMP.pmi.analysis.get_hier_from_rmf(self.model,rmf_frame_number,rmf_name)
849  if not prot:
850  continue
851 
853  out_pdb_fn=os.path.join(dircluster,str(cnt)+"."+str(rank)+".pdb")
854  out_rmf_fn=os.path.join(dircluster,str(cnt)+"."+str(rank)+".rmf")
855  o.init_pdb(out_pdb_fn,prot)
856  o.write_pdb(out_pdb_fn,
857  translate_to_geometric_center=write_pdb_with_centered_coordinates)
858 
859  tmp_dict["local_pdb_file_name"]=os.path.basename(out_pdb_fn)
860  tmp_dict["rmf_file_full_path"]=rmf_name
861  tmp_dict["local_rmf_file_name"]=os.path.basename(out_rmf_fn)
862  tmp_dict["local_rmf_frame_number"]=0
863 
864  clusstat.write(str(tmp_dict)+"\n")
865  o.init_rmf(out_rmf_fn,[prot])
866  o.write_rmf(out_rmf_fn)
867  o.close_rmf(out_rmf_fn)
868  return
869 
870 
871 #-------------------------------------------------------------
872 # read the coordinates
873 # ------------------------------------------------------------
874  rmsd_weights = IMP.pmi.io.input.get_bead_sizes(self.model,
875  my_best_score_rmf_tuples[0],
876  rmsd_calculation_components)
877  got_coords = IMP.pmi.io.input.read_coordinates_of_rmfs(self.model,
878  my_best_score_rmf_tuples,
879  alignment_components,
880  rmsd_calculation_components)
881  all_coordinates=got_coords[0] # dict:key=component name,val=coords per hit
882  alignment_coordinates=got_coords[1] # same as above, limited to alignment bits
883  rmsd_coordinates=got_coords[2] # same as above, limited to RMSD bits
884  rmf_file_name_index_dict=got_coords[3] # dictionary with key=RMF, value=score rank
885  all_rmf_file_names=got_coords[4] # RMF file per hit
886 
887 
888  # broadcast the coordinates
889  if number_of_processes > 1:
890  all_coordinates = IMP.pmi.tools.scatter_and_gather(
891  all_coordinates)
892  all_rmf_file_names = IMP.pmi.tools.scatter_and_gather(
893  all_rmf_file_names)
894  rmf_file_name_index_dict = IMP.pmi.tools.scatter_and_gather(
895  rmf_file_name_index_dict)
896  alignment_coordinates=IMP.pmi.tools.scatter_and_gather(
897  alignment_coordinates)
898  rmsd_coordinates=IMP.pmi.tools.scatter_and_gather(
899  rmsd_coordinates)
900 
901  if rank == 0:
902  # save needed informations in external files
903  self.save_objects(
904  [best_score_feature_keyword_list_dict,
905  rmf_file_name_index_dict],
906  ".macro.pkl")
907 
908 # ------------------------------------------------------------------------
909 # Calculate distance matrix and cluster
910 # ------------------------------------------------------------------------
911  print "setup clustering class"
912  Clusters = IMP.pmi.analysis.Clustering(rmsd_weights)
913 
914  for n, model_coordinate_dict in enumerate(all_coordinates):
915  template_coordinate_dict = {}
916  # let's try to align
917  if alignment_components is not None and len(Clusters.all_coords) == 0:
918  # set the first model as template coordinates
919  Clusters.set_template(alignment_coordinates[n])
920  Clusters.fill(all_rmf_file_names[n], rmsd_coordinates[n])
921 
922  print "Global calculating the distance matrix"
923 
924  # calculate distance matrix, all against all
925  Clusters.dist_matrix(is_mpi=is_mpi)
926 
927  # perform clustering and optionally display
928  if rank == 0:
929  Clusters.do_cluster(number_of_clusters)
930  if display_plot:
931  if rank == 0:
932  Clusters.plot_matrix()
933  if number_of_processes > 1:
934  comm.Barrier()
935  if exit_after_display:
936  exit()
937  Clusters.save_distance_matrix_file(file_name=distance_matrix_file)
938 
939 # ------------------------------------------------------------------------
940 # Alteratively, load the distance matrix from file and cluster that
941 # ------------------------------------------------------------------------
942  else:
943  if rank==0:
944  print "setup clustering class"
945  Clusters = IMP.pmi.analysis.Clustering()
946  Clusters.load_distance_matrix_file(file_name=distance_matrix_file)
947  print "clustering with %s clusters" % str(number_of_clusters)
948  Clusters.do_cluster(number_of_clusters)
949  [best_score_feature_keyword_list_dict,
950  rmf_file_name_index_dict] = self.load_objects(".macro.pkl")
951  if display_plot:
952  if rank == 0:
953  Clusters.plot_matrix()
954  if number_of_processes > 1:
955  comm.Barrier()
956  if exit_after_display:
957  exit()
958 
959 # ------------------------------------------------------------------------
960 # now save all informations about the clusters
961 # ------------------------------------------------------------------------
962 
963  if rank == 0:
964  print Clusters.get_cluster_labels()
965  for n, cl in enumerate(Clusters.get_cluster_labels()):
966  print "rank %s " % str(rank)
967  print "cluster %s " % str(n)
968  print "cluster label %s " % str(cl)
969  print Clusters.get_cluster_label_names(cl)
970 
971  # first initialize the Density class if requested
972 
973  if density_custom_ranges:
975  density_custom_ranges,
976  voxel=voxel_size)
977 
978  dircluster = outputdir + "/cluster." + str(n) + "/"
979  try:
980  os.mkdir(outputdir)
981  except:
982  pass
983  try:
984  os.mkdir(dircluster)
985  except:
986  pass
987 
988  rmsd_dict = {"AVERAGE_RMSD":
989  str(Clusters.get_cluster_label_average_rmsd(cl))}
990  clusstat = open(dircluster + "stat.out", "w")
991  for k, structure_name in enumerate(Clusters.get_cluster_label_names(cl)):
992 
993  # extract the features
994  tmp_dict = {}
995  tmp_dict.update(rmsd_dict)
996  index = rmf_file_name_index_dict[structure_name]
997  for key in best_score_feature_keyword_list_dict:
998  tmp_dict[
999  key] = best_score_feature_keyword_list_dict[
1000  key][
1001  index]
1002 
1003  # get the rmf name and the frame number from the list of
1004  # frame names
1005  rmf_name = structure_name.split("|")[0]
1006  rmf_frame_number = int(structure_name.split("|")[1])
1007 
1008  clusstat.write(str(tmp_dict) + "\n")
1009  prot,rs = IMP.pmi.analysis.get_hier_and_restraints_from_rmf(
1010  self.model,
1011  rmf_frame_number,
1012  rmf_name)
1013  if not prot:
1014  continue
1015 
1016  if k > 0:
1017  model_index = Clusters.get_model_index_from_name(
1018  structure_name)
1019  transformation = Clusters.get_transformation_to_first_member(
1020  cl,
1021  model_index)
1022 
1023  rbs = set()
1024  for p in IMP.atom.get_leaves(prot):
1025  if not IMP.core.XYZR.get_is_setup(p):
1027  IMP.core.XYZR(p).set_radius(0.0001)
1028  IMP.core.XYZR(p).set_coordinates((0, 0, 0))
1029 
1031  rb = IMP.core.RigidBodyMember(p).get_rigid_body()
1032  rbs.add(rb)
1033  else:
1035  transformation)
1036  for rb in rbs:
1037  IMP.core.transform(rb,transformation)
1038 
1039  # add the density
1040  if density_custom_ranges:
1041  DensModule.add_subunits_density(prot)
1042 
1043  o = IMP.pmi.output.Output()
1044  o.init_pdb(dircluster + str(k) + ".pdb", prot)
1045  o.write_pdb(dircluster + str(k) + ".pdb")
1046 
1047  o.init_rmf(dircluster + str(k) + ".rmf3", [prot],rs)
1048  # IMP.rmf.add_restraints(o.dictionary_rmfs[dircluster+str(n)+".rmf3"],restraints)
1049  o.write_rmf(dircluster + str(k) + ".rmf3")
1050  o.close_rmf(dircluster + str(k) + ".rmf3")
1051 
1052  del o
1053  # IMP.atom.destroy(prot)
1054 
1055  if density_custom_ranges:
1056  DensModule.write_mrc(path=dircluster)
1057  del DensModule
1058 
1059  if is_mpi:
1060  comm.Barrier()
1061 
1062  def save_objects(self, objects, file_name):
1063  import pickle
1064  outf = open(file_name, 'w')
1065  pickle.dump(objects, outf)
1066  outf.close()
1067 
1068  def load_objects(self, file_name):
1069  import pickle
1070  inputf = open(file_name, 'r')
1071  objects = pickle.load(inputf)
1072  inputf.close()
1073  return objects
A macro for running all the basic operations of analysis.
Definition: macros.py:666
A member of a rigid body, it has internal (local) coordinates.
Definition: rigid_bodies.h:368
Miscellaneous utilities.
Definition: tools.py:1
def clustering
Get the best scoring models, compute a distance matrix, cluster them, and create density maps...
Definition: macros.py:707
A class to cluster structures.
Representation of the system.
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Definition: XYZR.h:47
static XYZR setup_particle(kernel::Model *m, ParticleIndex pi)
Definition: XYZR.h:48
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:970
void transform(XYZ a, const algebra::Transformation3D &tr)
Apply a transformation to the particle.
Hierarchies get_by_type(Hierarchy mhd, GetByType t)
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def BuildModel0
The macro construct a component for each subunit (no splitting, nothing fancy) You can pass the resol...
Definition: macros.py:333
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Definition: rigid_bodies.h:369
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:18
Analysis tools.
Definition: pmi/Analysis.py:1
Classes for writing output files and processing them.
Definition: output.py:1
Sampling of the system.
Definition: samplers.py:1
this building scheme needs a data structure with the following fields comp_name hier_name color fasta...
Definition: macros.py:416
A macro to help setup and run replica exchange, supporting monte carlo and molecular dynamics...
Definition: macros.py:19
Hierarchies get_leaves(const Selection &h)
A class to compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
This "overloaded" function retrieves the residue indexes for each particle which is an instance of Fr...
Definition: tools.py:929
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27