IMP logo
IMP Reference Guide  2.5.0
The Integrative Modeling Platform
macros.py
1 """@namespace IMP.pmi.macros
2 Protocols for sampling structures and analyzing them.
3 """
4 
5 from __future__ import print_function
7 import IMP.pmi.tools
8 import IMP.pmi.samplers
9 import IMP.pmi.output
10 import IMP.pmi.analysis
11 import IMP.pmi.io
12 import IMP.rmf
13 import RMF
14 import os
15 import glob
16 from operator import itemgetter
17 from collections import defaultdict
18 import numpy as np
19 
20 class ReplicaExchange0(object):
21  """A macro to help setup and run replica exchange.
22  Supports Monte Carlo and molecular dynamics.
23  Produces trajectory RMF files, best PDB structures,
24  and output stat files.
25  """
26  def __init__(self, model,
27  representation=None,
28  root_hier=None,
29  sample_objects=None, # DEPRECATED
30  monte_carlo_sample_objects=None,
31  molecular_dynamics_sample_objects=None,
32  output_objects=None,
33  crosslink_restraints=None,
34  monte_carlo_temperature=1.0,
35  simulated_annealing=False,
36  simulated_annealing_minimum_temperature=1.0,
37  simulated_annealing_maximum_temperature=2.5,
38  simulated_annealing_minimum_temperature_nframes=100,
39  simulated_annealing_maximum_temperature_nframes=100,
40  replica_exchange_minimum_temperature=1.0,
41  replica_exchange_maximum_temperature=2.5,
42  num_sample_rounds=1,
43  number_of_best_scoring_models=500,
44  monte_carlo_steps=10,
45  molecular_dynamics_steps=10,
46  molecular_dynamics_max_time_step=1.0,
47  number_of_frames=1000,
48  nframes_write_coordinates=1,
49  write_initial_rmf=True,
50  initial_rmf_name_suffix="initial",
51  stat_file_name_suffix="stat",
52  best_pdb_name_suffix="model",
53  do_clean_first=True,
54  do_create_directories=True,
55  global_output_directory="./",
56  rmf_dir="rmfs/",
57  best_pdb_dir="pdbs/",
58  replica_stat_file_suffix="stat_replica",
59  em_object_for_rmf=None,
60  atomistic=False,
61  replica_exchange_object=None):
62  """Constructor.
63  @param model The IMP model
64  @param representation PMI.representation.Representation object
65  (or list of them, for multi-state modeling)
66  @param root_hier Instead of passing Representation, just pass
67  a hierarchy
68  @param monte_carlo_sample_objects Objects for MC sampling; a list of
69  structural components (generally the representation) that will
70  be moved and restraints with parameters that need to
71  be sampled. OR a DegreesOfFreedom object!
72  @param molecular_dynamics_sample_objects Objects for MD sampling
73  @param output_objects A list of structural objects and restraints
74  that will be included in output (statistics files). Any object
75  that provides a get_output() method can be used here.
76  @param crosslink_restraints List of cross-link restraints that will
77  be included in output RMF files (for visualization).
78  @param monte_carlo_temperature MC temp (may need to be optimized
79  based on post-sampling analysis)
80  @param simulated_annealing If True, perform simulated annealing
81  @param simulated_annealing_minimum_temperature Should generally be
82  the same as monte_carlo_temperature.
83  @param simulated_annealing_minimum_temperature_nframes Number of
84  frames to compute at minimum temperature.
85  @param simulated_annealing_maximum_temperature_nframes Number of
86  frames to compute at
87  temps > simulated_annealing_maximum_temperature.
88  @param replica_exchange_minimum_temperature Low temp for REX; should
89  generally be the same as monte_carlo_temperature.
90  @param replica_exchange_maximum_temperature High temp for REX
91  @param num_sample_rounds Number of rounds of MC/MD per cycle
92  @param number_of_best_scoring_models Number of top-scoring PDB models
93  to keep around for analysis
94  @param monte_carlo_steps Number of MC steps per round
95  @param molecular_dynamics_steps Number of MD steps per round
96  @param molecular_dynamics_max_time_step Max time step for MD
97  @param number_of_frames Number of REX frames to run
98  @param nframes_write_coordinates How often to write the coordinates
99  of a frame
100  @param write_initial_rmf Write the initial configuration
101  @param global_output_directory Folder that will be created to house
102  output.
103  """
104  self.model = model
105  self.vars = {}
106 
107  ### add check hierarchy is multistate
108  if representation:
109  if type(representation) == list:
110  self.is_multi_state = True
111  self.root_hiers = [r.prot for r in representation]
112  self.vars["number_of_states"] = len(representation)
113  else:
114  self.is_multi_state = False
115  self.root_hier = representation.prot
116  self.vars["number_of_states"] = 1
117  elif root_hier:
118  states = IMP.atom.get_by_type(root_hier,IMP.atom.STATE_TYPE)
119  self.vars["number_of_states"] = len(states)
120  if len(states)>1:
121  self.root_hiers = states
122  self.is_multi_state = True
123  else:
124  self.root_hier = root_hier
125  self.is_multi_state = False
126  else:
127  print("ERROR: Must provide representation or root_hier")
128  return
129 
130  self.crosslink_restraints = crosslink_restraints
131  self.em_object_for_rmf = em_object_for_rmf
132  self.monte_carlo_sample_objects = monte_carlo_sample_objects
133  if sample_objects is not None:
134  self.monte_carlo_sample_objects+=sample_objects
135  self.molecular_dynamics_sample_objects=molecular_dynamics_sample_objects
136  self.output_objects = output_objects
137  self.replica_exchange_object = replica_exchange_object
138  self.molecular_dynamics_max_time_step = molecular_dynamics_max_time_step
139  self.vars["monte_carlo_temperature"] = monte_carlo_temperature
140  self.vars[
141  "replica_exchange_minimum_temperature"] = replica_exchange_minimum_temperature
142  self.vars[
143  "replica_exchange_maximum_temperature"] = replica_exchange_maximum_temperature
144 
145  self.vars["simulated_annealing"]=\
146  simulated_annealing
147  self.vars["simulated_annealing_minimum_temperature"]=\
148  simulated_annealing_minimum_temperature
149  self.vars["simulated_annealing_maximum_temperature"]=\
150  simulated_annealing_maximum_temperature
151  self.vars["simulated_annealing_minimum_temperature_nframes"]=\
152  simulated_annealing_minimum_temperature_nframes
153  self.vars["simulated_annealing_maximum_temperature_nframes"]=\
154  simulated_annealing_maximum_temperature_nframes
155 
156  self.vars["num_sample_rounds"] = num_sample_rounds
157  self.vars[
158  "number_of_best_scoring_models"] = number_of_best_scoring_models
159  self.vars["monte_carlo_steps"] = monte_carlo_steps
160  self.vars["molecular_dynamics_steps"]=molecular_dynamics_steps
161  self.vars["number_of_frames"] = number_of_frames
162  self.vars["nframes_write_coordinates"] = nframes_write_coordinates
163  self.vars["write_initial_rmf"] = write_initial_rmf
164  self.vars["initial_rmf_name_suffix"] = initial_rmf_name_suffix
165  self.vars["best_pdb_name_suffix"] = best_pdb_name_suffix
166  self.vars["stat_file_name_suffix"] = stat_file_name_suffix
167  self.vars["do_clean_first"] = do_clean_first
168  self.vars["do_create_directories"] = do_create_directories
169  self.vars["global_output_directory"] = global_output_directory
170  self.vars["rmf_dir"] = rmf_dir
171  self.vars["best_pdb_dir"] = best_pdb_dir
172  self.vars["atomistic"] = atomistic
173  self.vars["replica_stat_file_suffix"] = replica_stat_file_suffix
174 
175  def show_info(self):
176  print("ReplicaExchange0: it generates initial.*.rmf3, stat.*.out, rmfs/*.rmf3 for each replica ")
177  print("--- it stores the best scoring pdb models in pdbs/")
178  print("--- the stat.*.out and rmfs/*.rmf3 are saved only at the lowest temperature")
179  print("--- variables:")
180  keys = list(self.vars.keys())
181  keys.sort()
182  for v in keys:
183  print("------", v.ljust(30), self.vars[v])
184 
185  def get_replica_exchange_object(self):
186  return self.replica_exchange_object
187 
188  def execute_macro(self):
189 
190  temp_index_factor = 100000.0
191  samplers=[]
192  sampler_mc=None
193  sampler_md=None
194  if self.monte_carlo_sample_objects is not None:
195  print("Setting up MonteCarlo")
196  sampler_mc = IMP.pmi.samplers.MonteCarlo(self.model,
197  self.monte_carlo_sample_objects,
198  self.vars["monte_carlo_temperature"])
199  if self.vars["simulated_annealing"]:
200  tmin=self.vars["simulated_annealing_minimum_temperature"]
201  tmax=self.vars["simulated_annealing_maximum_temperature"]
202  nfmin=self.vars["simulated_annealing_minimum_temperature_nframes"]
203  nfmax=self.vars["simulated_annealing_maximum_temperature_nframes"]
204  sampler_mc.set_simulated_annealing(tmin,tmax,nfmin,nfmax)
205  self.output_objects.append(sampler_mc)
206  samplers.append(sampler_mc)
207 
208 
209  if self.molecular_dynamics_sample_objects is not None:
210  print("Setting up MolecularDynamics")
211  sampler_md = IMP.pmi.samplers.MolecularDynamics(self.model,
212  self.molecular_dynamics_sample_objects,
213  self.vars["monte_carlo_temperature"],
214  maximum_time_step=self.molecular_dynamics_max_time_step)
215  if self.vars["simulated_annealing"]:
216  tmin=self.vars["simulated_annealing_minimum_temperature"]
217  tmax=self.vars["simulated_annealing_maximum_temperature"]
218  nfmin=self.vars["simulated_annealing_minimum_temperature_nframes"]
219  nfmax=self.vars["simulated_annealing_maximum_temperature_nframes"]
220  sampler_md.set_simulated_annealing(tmin,tmax,nfmin,nfmax)
221  self.output_objects.append(sampler_md)
222  samplers.append(sampler_md)
223 # -------------------------------------------------------------------------
224 
225  print("Setting up ReplicaExchange")
226  rex = IMP.pmi.samplers.ReplicaExchange(self.model,
227  self.vars[
228  "replica_exchange_minimum_temperature"],
229  self.vars[
230  "replica_exchange_maximum_temperature"],
231  samplers,
232  replica_exchange_object=self.replica_exchange_object)
233  self.replica_exchange_object = rex.rem
234 
235  myindex = rex.get_my_index()
236  self.output_objects.append(rex)
237 
238  # must reset the minimum temperature due to the
239  # different binary length of rem.get_my_parameter double and python
240  # float
241  min_temp_index = int(min(rex.get_temperatures()) * temp_index_factor)
242 
243 # -------------------------------------------------------------------------
244 
245  globaldir = self.vars["global_output_directory"] + "/"
246  rmf_dir = globaldir + self.vars["rmf_dir"]
247  pdb_dir = globaldir + self.vars["best_pdb_dir"]
248 
249  if self.vars["do_clean_first"]:
250  pass
251 
252  if self.vars["do_create_directories"]:
253 
254  try:
255  os.makedirs(globaldir)
256  except:
257  pass
258  try:
259  os.makedirs(rmf_dir)
260  except:
261  pass
262 
263  if not self.is_multi_state:
264  try:
265  os.makedirs(pdb_dir)
266  except:
267  pass
268  else:
269  for n in range(self.vars["number_of_states"]):
270  try:
271  os.makedirs(pdb_dir + "/" + str(n))
272  except:
273  pass
274 
275 # -------------------------------------------------------------------------
276 
277  sw = IMP.pmi.tools.Stopwatch()
278  self.output_objects.append(sw)
279 
280  print("Setting up stat file")
281  output = IMP.pmi.output.Output(atomistic=self.vars["atomistic"])
282  low_temp_stat_file = globaldir + \
283  self.vars["stat_file_name_suffix"] + "." + str(myindex) + ".out"
284  output.init_stat2(low_temp_stat_file,
285  self.output_objects,
286  extralabels=["rmf_file", "rmf_frame_index"])
287 
288  print("Setting up replica stat file")
289  replica_stat_file = globaldir + \
290  self.vars["replica_stat_file_suffix"] + "." + str(myindex) + ".out"
291  output.init_stat2(replica_stat_file, [rex], extralabels=["score"])
292 
293  print("Setting up best pdb files")
294  if not self.is_multi_state:
295  if self.vars["number_of_best_scoring_models"] > 0:
296  output.init_pdb_best_scoring(pdb_dir + "/" +
297  self.vars["best_pdb_name_suffix"],
298  self.root_hier,
299  self.vars[
300  "number_of_best_scoring_models"],
301  replica_exchange=True)
302  output.write_psf(pdb_dir + "/" +"model.psf",pdb_dir + "/" +
303  self.vars["best_pdb_name_suffix"]+".0.pdb")
304  else:
305  if self.vars["number_of_best_scoring_models"] > 0:
306  for n in range(self.vars["number_of_states"]):
307  output.init_pdb_best_scoring(pdb_dir + "/" + str(n) + "/" +
308  self.vars["best_pdb_name_suffix"],
309  self.root_hiers[n],
310  self.vars[
311  "number_of_best_scoring_models"],
312  replica_exchange=True)
313  output.write_psf(pdb_dir + "/" + str(n) + "/" +"model.psf",pdb_dir + "/" + str(n) + "/" +
314  self.vars["best_pdb_name_suffix"]+".0.pdb")
315 # ---------------------------------------------
316 
317  if not self.em_object_for_rmf is None:
318  if not self.is_multi_state:
319  output_hierarchies = [
320  self.root_hier,
321  self.em_object_for_rmf.get_density_as_hierarchy(
322  )]
323  else:
324  output_hierarchies = self.root_hiers
325  output_hierarchies.append(
326  self.em_object_for_rmf.get_density_as_hierarchy())
327  else:
328  if not self.is_multi_state:
329  output_hierarchies = [self.root_hier]
330  else:
331  output_hierarchies = self.root_hiers
332 
333 #----------------------------------------------
334  print("Setting up and writing initial rmf coordinate file")
335  init_suffix = globaldir + self.vars["initial_rmf_name_suffix"]
336  output.init_rmf(init_suffix + "." + str(myindex) + ".rmf3",
337  output_hierarchies)
338  if self.crosslink_restraints:
339  output.add_restraints_to_rmf(
340  init_suffix + "." + str(myindex) + ".rmf3",
341  self.crosslink_restraints)
342  output.write_rmf(init_suffix + "." + str(myindex) + ".rmf3")
343  output.close_rmf(init_suffix + "." + str(myindex) + ".rmf3")
344 
345 #----------------------------------------------
346 
347  print("Setting up production rmf files")
348 
349  rmfname = rmf_dir + "/" + str(myindex) + ".rmf3"
350  output.init_rmf(rmfname, output_hierarchies)
351 
352  if self.crosslink_restraints:
353  output.add_restraints_to_rmf(rmfname, self.crosslink_restraints)
354 
355  ntimes_at_low_temp = 0
356 
357  if myindex == 0:
358  self.show_info()
359 
360  for i in range(self.vars["number_of_frames"]):
361  for nr in range(self.vars["num_sample_rounds"]):
362  if sampler_md is not None:
363  sampler_md.optimize(self.vars["molecular_dynamics_steps"])
364  if sampler_mc is not None:
365  sampler_mc.optimize(self.vars["monte_carlo_steps"])
366  score = IMP.pmi.tools.get_restraint_set(self.model).evaluate(False)
367  output.set_output_entry("score", score)
368 
369  my_temp_index = int(rex.get_my_temp() * temp_index_factor)
370 
371  if min_temp_index == my_temp_index:
372  print("--- frame %s score %s " % (str(i), str(score)))
373 
374  if i % self.vars["nframes_write_coordinates"]==0:
375  print('--- writing coordinates')
376  if self.vars["number_of_best_scoring_models"] > 0:
377  output.write_pdb_best_scoring(score)
378  output.write_rmf(rmfname)
379  output.set_output_entry("rmf_file", rmfname)
380  output.set_output_entry("rmf_frame_index", ntimes_at_low_temp)
381  else:
382  output.set_output_entry("rmf_file", rmfname)
383  output.set_output_entry("rmf_frame_index", '-1')
384  output.write_stat2(low_temp_stat_file)
385  ntimes_at_low_temp += 1
386 
387  output.write_stat2(replica_stat_file)
388  rex.swap_temp(i, score)
389 
390 
391 # -----------------------------------------------------------------------
392 
393 
395  m,
396  data,
397  resolutions=[1,
398  10],
399  missing_bead_size=20,
400  residue_per_gaussian=None):
401  '''
402  Construct a component for each subunit (no splitting, nothing fancy).
403 
404  You can pass the resolutions and the bead size for the missing residue regions.
405  To use this macro, you must provide the following data structure:
406 
407  Component pdbfile chainid rgb color fastafile sequence id
408  in fastafile
409 
410 data = [("Rpb1", pdbfile, "A", 0.00000000, (fastafile, 0)),
411  ("Rpb2", pdbfile, "B", 0.09090909, (fastafile, 1)),
412  ("Rpb3", pdbfile, "C", 0.18181818, (fastafile, 2)),
413  ("Rpb4", pdbfile, "D", 0.27272727, (fastafile, 3)),
414  ("Rpb5", pdbfile, "E", 0.36363636, (fastafile, 4)),
415  ("Rpb6", pdbfile, "F", 0.45454545, (fastafile, 5)),
416  ("Rpb7", pdbfile, "G", 0.54545455, (fastafile, 6)),
417  ("Rpb8", pdbfile, "H", 0.63636364, (fastafile, 7)),
418  ("Rpb9", pdbfile, "I", 0.72727273, (fastafile, 8)),
419  ("Rpb10", pdbfile, "L", 0.81818182, (fastafile, 9)),
420  ("Rpb11", pdbfile, "J", 0.90909091, (fastafile, 10)),
421  ("Rpb12", pdbfile, "K", 1.00000000, (fastafile, 11))]
422 
423  '''
424 
426 
427  # the dictionary for the hierarchies,
428  hierarchies = {}
429 
430  for d in data:
431  # retrieve the information from the data structure
432  component_name = d[0]
433  pdb_file = d[1]
434  chain_id = d[2]
435  color_id = d[3]
436  fasta_file = d[4][0]
437  # this function
438  fastids = IMP.pmi.tools.get_ids_from_fasta_file(fasta_file)
439  fasta_file_id = d[4][1]
440  # avoid to add a component with the same name
441  r.create_component(component_name,
442  color=color_id)
443 
444  r.add_component_sequence(component_name,
445  fasta_file,
446  id=fastids[fasta_file_id])
447 
448  hierarchies = r.autobuild_model(component_name,
449  pdb_file,
450  chain_id,
451  resolutions=resolutions,
452  missingbeadsize=missing_bead_size)
453 
454  r.show_component_table(component_name)
455 
456  r.set_rigid_bodies([component_name])
457 
458  r.set_chain_of_super_rigid_bodies(
459  hierarchies,
460  min_length=2,
461  max_length=2)
462 
463  r.setup_component_sequence_connectivity(component_name, resolution=1)
464  r.setup_component_geometry(component_name)
465 
466  r.setup_bonds()
467  # put it at the end of rigid bodies
468  r.set_floppy_bodies()
469 
470  # set current coordinates as reference for RMSD calculation
471  r.set_current_coordinates_as_reference_for_rmsd("Reference")
472 
473  return r
474 
475 
476 # ----------------------------------------------------------------------
477 
478 class BuildModel(object):
479  """A macro to build a Representation based on a Topology and lists of movers
480  """
481  def __init__(self,
482  model,
483  component_topologies,
484  list_of_rigid_bodies=[],
485  list_of_super_rigid_bodies=[],
486  chain_of_super_rigid_bodies=[],
487  sequence_connectivity_scale=4.0,
488  add_each_domain_as_rigid_body=False,
489  force_create_gmm_files=False):
490  """Constructor.
491  @param model The IMP model
492  @param component_topologies List of
493  IMP.pmi.topology.ComponentTopology items
494  @param list_of_rigid_bodies List of lists of domain names that will
495  be moved as rigid bodies.
496  @param list_of_super_rigid_bodies List of lists of domain names
497  that will move together in an additional Monte Carlo move.
498  @param chain_of_super_rigid_bodies List of lists of domain names
499  (choices can only be from the same molecule). Each of these
500  groups will be moved rigidly. This helps to sample more
501  efficiently complex topologies, made of several rigid bodies,
502  connected by flexible linkers.
503  @param sequence_connectivity_scale For scaling the connectivity
504  restraint
505  @param add_each_domain_as_rigid_body That way you don't have to
506  put all of them in the list
507  @param force_create_gmm_files If True, will sample and create GMMs
508  no matter what. If False, will only only sample if the
509  files don't exist. If number of Gaussians is zero, won't
510  do anything.
511  """
512  self.m = model
513  self.simo = IMP.pmi.representation.Representation(self.m,
514  upperharmonic=True,
515  disorderedlength=False)
516 
517  data=component_topologies
518  if list_of_rigid_bodies==[]:
519  print("WARNING: No list of rigid bodies inputted to build_model()")
520  if list_of_super_rigid_bodies==[]:
521  print("WARNING: No list of super rigid bodies inputted to build_model()")
522  if chain_of_super_rigid_bodies==[]:
523  print("WARNING: No chain of super rigid bodies inputted to build_model()")
524  all_dnames = set([d for sublist in list_of_rigid_bodies+list_of_super_rigid_bodies\
525  +chain_of_super_rigid_bodies for d in sublist])
526  all_available = set([c.domain_name for c in component_topologies])
527  if not all_dnames <= all_available:
528  raise ValueError("All requested movers must reference domain "
529  "names in the component topologies")
530 
531  self.domain_dict={}
532  self.resdensities={}
533  super_rigid_bodies={}
534  chain_super_rigid_bodies={}
535  rigid_bodies={}
536 
537  for c in data:
538  comp_name = c.name
539  hier_name = c.domain_name
540  color = c.color
541  fasta_file = c.fasta_file
542  fasta_id = c.fasta_id
543  pdb_name = c.pdb_file
544  chain_id = c.chain
545  res_range = c.residue_range
546  offset = c.pdb_offset
547  bead_size = c.bead_size
548  em_num_components = c.em_residues_per_gaussian
549  em_txt_file_name = c.gmm_file
550  em_mrc_file_name = c.mrc_file
551 
552  if comp_name not in self.simo.get_component_names():
553  self.simo.create_component(comp_name,color=0.0)
554  self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
555 
556  # create hierarchy (adds resolutions, beads) with autobuild and optionally add EM data
557  if em_num_components==0:
558  read_em_files=False
559  include_res0=False
560  else:
561  if (not os.path.isfile(em_txt_file_name)) or force_create_gmm_files:
562  read_em_files=False
563  include_res0=True
564  else:
565  read_em_files=True
566  include_res0=False
567 
568  outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,
569  res_range,include_res0,beadsize=bead_size,
570  color=color,offset=offset)
571  if em_num_components!=0:
572  if read_em_files:
573  print("will read GMM files")
574  else:
575  print("will calculate GMMs")
576 
577  dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,
578  em_mrc_file_name,em_num_components,read_em_files)
579  self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
580  dens_hier+=beads
581  else:
582  dens_hier=[]
583 
584  self.resdensities[hier_name]=dens_hier
585  self.domain_dict[hier_name]=outhier+dens_hier
586 
587  # setup basic restraints
588  for c in self.simo.get_component_names():
589  self.simo.setup_component_sequence_connectivity(c,scale=sequence_connectivity_scale)
590  self.simo.setup_component_geometry(c)
591 
592  # create movers
593  for rblist in list_of_rigid_bodies:
594  rb=[]
595  for rbmember in rblist:
596  rb+=[h for h in self.domain_dict[rbmember]]
597  self.simo.set_rigid_body_from_hierarchies(rb)
598  for srblist in list_of_super_rigid_bodies:
599  srb=[]
600  for srbmember in rblist:
601  srb+=[h for h in self.domain_dict[srbmember]]
602  self.simo.set_super_rigid_body_from_hierarchies(srb)
603  for clist in chain_of_super_rigid_bodies:
604  crb=[]
605  for crbmember in rblist:
606  crb+=[h for h in self.domain_dict[crbmember]]
607  self.simo.set_chain_of_super_rigid_bodies(crb,2,3)
608 
609  self.simo.set_floppy_bodies()
610  self.simo.setup_bonds()
611 
613  '''Return the Representation object'''
614  return self.simo
615 
616  def get_density_hierarchies(self,hier_name_list):
617  # return a list of density hierarchies
618  # specify the list of hierarchy names
619  dens_hier_list=[]
620  for hn in hier_name_list:
621  print(hn)
622  dens_hier_list+=self.resdensities[hn]
623  return dens_hier_list
624 
625  def set_gmm_models_directory(self,directory_name):
626  self.gmm_models_directory=directory_name
627 
628  def get_pdb_bead_bits(self,hierarchy):
629  pdbbits=[]
630  beadbits=[]
631  helixbits=[]
632  for h in hierarchy:
633  if "_pdb" in h.get_name():pdbbits.append(h)
634  if "_bead" in h.get_name():beadbits.append(h)
635  if "_helix" in h.get_name():helixbits.append(h)
636  return (pdbbits,beadbits,helixbits)
637 
638  def scale_bead_radii(self,nresidues,scale):
639  scaled_beads=set()
640  for h in self.domain_dict:
641  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
642  slope=(1.0-scale)/(1.0-float(nresidues))
643 
644  for b in beadbits:
645  # I have to do the following
646  # because otherwise we'll scale more than once
647  if b not in scaled_beads:
648  scaled_beads.add(b)
649  else:
650  continue
651  radius=IMP.core.XYZR(b).get_radius()
652  num_residues=len(IMP.pmi.tools.get_residue_indexes(b))
653  scale_factor=slope*float(num_residues)+1.0
654  print(scale_factor)
655  new_radius=scale_factor*radius
656  IMP.core.XYZR(b).set_radius(new_radius)
657  print(b.get_name())
658  print("particle with radius "+str(radius)+" and "+str(num_residues)+" residues scaled to a new radius "+str(new_radius))
659 
660 
661  def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
662  #density generation for the EM restraint
663  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
664  #get the number of residues from the pdb bits
665  res_ind=[]
666  for pb in pdbbits+helixbits:
667  for p in IMP.core.get_leaves(pb):
669 
670  number_of_residues=len(set(res_ind))
671  outhier=[]
672  if read:
673  if len(pdbbits)!=0:
674  outhier+=simo.add_component_density(compname,
675  pdbbits,
676  num_components=num_components,
677  resolution=0,
678  inputfile=txtfilename)
679  if len(helixbits)!=0:
680  outhier+=simo.add_component_density(compname,
681  helixbits,
682  num_components=num_components,
683  resolution=1,
684  inputfile=txtfilename)
685 
686 
687  else:
688  if len(pdbbits)!=0:
689  num_components=number_of_residues/abs(num_components)
690  outhier+=simo.add_component_density(compname,
691  pdbbits,
692  num_components=num_components,
693  resolution=0,
694  outputfile=txtfilename,
695  outputmap=mrcfilename,
696  multiply_by_total_mass=True)
697 
698  if len(helixbits)!=0:
699  num_components=number_of_residues/abs(num_components)
700  outhier+=simo.add_component_density(compname,
701  helixbits,
702  num_components=num_components,
703  resolution=1,
704  outputfile=txtfilename,
705  outputmap=mrcfilename,
706  multiply_by_total_mass=True)
707 
708  return outhier,beadbits
709 
710  def autobuild(self,simo,comname,pdbname,chain,resrange,include_res0=False,
711  beadsize=5,color=0.0,offset=0):
712  if pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is not "BEADS" :
713  if resrange[-1]==-1:
714  resrange=(resrange[0],len(simo.sequence_dict[comname]))
715  if include_res0:
716  outhier=simo.autobuild_model(comname,
717  pdbname=pdbname,
718  chain=chain,
719  resrange=resrange,
720  resolutions=[0,1,10],
721  offset=offset,
722  color=color,
723  missingbeadsize=beadsize)
724  else:
725  outhier=simo.autobuild_model(comname,
726  pdbname=pdbname,
727  chain=chain,
728  resrange=resrange,
729  resolutions=[1,10],
730  offset=offset,
731  color=color,
732  missingbeadsize=beadsize)
733 
734 
735  elif pdbname is not None and pdbname is "IDEAL_HELIX" and pdbname is not "BEADS" :
736  outhier=simo.add_component_ideal_helix(comname,
737  resolutions=[1,10],
738  resrange=resrange,
739  color=color,
740  show=False)
741 
742  elif pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is "BEADS" :
743  outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
744 
745  else:
746 
747  seq_len=len(simo.sequence_dict[comname])
748  outhier=simo.add_component_necklace(comname,
749  begin=1,
750  end=seq_len,
751  length=beadsize)
752 
753  return outhier
754 
755 
756 @IMP.deprecated_object("2.5", "Use BuildModel instead")
757 class BuildModel1(object):
758  """Deprecated building macro - use BuildModel()"""
759 
760  def __init__(self, representation):
761  """Constructor.
762  @param representation The PMI representation
763  """
764  self.simo=representation
765  self.gmm_models_directory="."
766  self.rmf_file={}
767  self.rmf_frame_number={}
768 
769  def set_gmm_models_directory(self,directory_name):
770  self.gmm_models_directory=directory_name
771 
772  def build_model(self,data_structure,sequence_connectivity_scale=4.0,
773  sequence_connectivity_resolution=10,rmf_file=None,rmf_frame_number=0):
774  """Create model.
775  @param data_structure List of lists containing these entries:
776  comp_name, hier_name, color, fasta_file, fasta_id, pdb_name, chain_id,
777  res_range, read_em_files, bead_size, rb, super_rb,
778  em_num_components, em_txt_file_name, em_mrc_file_name
779  @param sequence_connectivity_scale
780  @param rmf_file
781  @param rmf_frame_number
782  """
783  self.domain_dict={}
784  self.resdensities={}
785  super_rigid_bodies={}
786  chain_super_rigid_bodies={}
787  rigid_bodies={}
788 
789  for d in data_structure:
790  comp_name = d[0]
791  hier_name = d[1]
792  color = d[2]
793  fasta_file = d[3]
794  fasta_id = d[4]
795  pdb_name = d[5]
796  chain_id = d[6]
797  res_range = d[7][0:2]
798  try:
799  offset = d[7][2]
800  except:
801  offset = 0
802  read_em_files = d[8]
803  bead_size = d[9]
804  rb = d[10]
805  super_rb = d[11]
806  em_num_components = d[12]
807  em_txt_file_name = d[13]
808  em_mrc_file_name = d[14]
809  chain_of_super_rb = d[15]
810 
811  if comp_name not in self.simo.get_component_names():
812  self.simo.create_component(comp_name,color=0.0)
813  self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
814  outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,res_range,read=read_em_files,beadsize=bead_size,color=color,offset=offset)
815 
816 
817  if not read_em_files is None:
818  if em_txt_file_name is " ": em_txt_file_name=self.gmm_models_directory+"/"+hier_name+".txt"
819  if em_mrc_file_name is " ": em_mrc_file_name=self.gmm_models_directory+"/"+hier_name+".mrc"
820 
821 
822  dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,em_mrc_file_name,em_num_components,read_em_files)
823  self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
824  dens_hier+=beads
825 
826  else:
827  dens_hier=[]
828 
829  self.resdensities[hier_name]=dens_hier
830  self.domain_dict[hier_name]=outhier+dens_hier
831 
832  if rb is not None:
833  if rb not in rigid_bodies:
834  rigid_bodies[rb]=[h for h in self.domain_dict[hier_name]]
835  else:
836  rigid_bodies[rb]+=[h for h in self.domain_dict[hier_name]]
837 
838 
839  if super_rb is not None:
840  for k in super_rb:
841  if k not in super_rigid_bodies:
842  super_rigid_bodies[k]=[h for h in self.domain_dict[hier_name]]
843  else:
844  super_rigid_bodies[k]+=[h for h in self.domain_dict[hier_name]]
845 
846  if chain_of_super_rb is not None:
847  for k in chain_of_super_rb:
848  if k not in chain_super_rigid_bodies:
849  chain_super_rigid_bodies[k]=[h for h in self.domain_dict[hier_name]]
850  else:
851  chain_super_rigid_bodies[k]+=[h for h in self.domain_dict[hier_name]]
852 
853 
854 
855  self.rigid_bodies=rigid_bodies
856 
857  for c in self.simo.get_component_names():
858  if rmf_file is not None:
859  rf=rmf_file
860  rfn=rmf_frame_number
861  self.simo.set_coordinates_from_rmf(c, rf,rfn)
862  else:
863  if c in self.rmf_file:
864  rf=self.rmf_file[c]
865  rfn=self.rmf_frame_number[c]
866  self.simo.set_coordinates_from_rmf(c, rf,rfn)
867  self.simo.setup_component_sequence_connectivity(c,resolution=sequence_connectivity_resolution,scale=sequence_connectivity_scale)
868  self.simo.setup_component_geometry(c)
869 
870  for rb in rigid_bodies:
871  self.simo.set_rigid_body_from_hierarchies(rigid_bodies[rb])
872 
873  for k in super_rigid_bodies:
874  self.simo.set_super_rigid_body_from_hierarchies(super_rigid_bodies[k])
875 
876  for k in chain_super_rigid_bodies:
877  self.simo.set_chain_of_super_rigid_bodies(chain_super_rigid_bodies[k],2,3)
878 
879  self.simo.set_floppy_bodies()
880  self.simo.setup_bonds()
881 
882 
883 
884  def set_rmf_file(self,component_name,rmf_file,rmf_frame_number):
885  self.rmf_file[component_name]=rmf_file
886  self.rmf_frame_number[component_name]=rmf_frame_number
887 
888  def get_density_hierarchies(self,hier_name_list):
889  # return a list of density hierarchies
890  # specify the list of hierarchy names
891  dens_hier_list=[]
892  for hn in hier_name_list:
893  print(hn)
894  dens_hier_list+=self.resdensities[hn]
895  return dens_hier_list
896 
897  def get_pdb_bead_bits(self,hierarchy):
898  pdbbits=[]
899  beadbits=[]
900  helixbits=[]
901  for h in hierarchy:
902  if "_pdb" in h.get_name():pdbbits.append(h)
903  if "_bead" in h.get_name():beadbits.append(h)
904  if "_helix" in h.get_name():helixbits.append(h)
905  return (pdbbits,beadbits,helixbits)
906 
907  def scale_bead_radii(self,nresidues,scale):
908  scaled_beads=set()
909  for h in self.domain_dict:
910  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
911  slope=(1.0-scale)/(1.0-float(nresidues))
912 
913  for b in beadbits:
914  # I have to do the following
915  # because otherwise we'll scale more than once
916  if b not in scaled_beads:
917  scaled_beads.add(b)
918  else:
919  continue
920  radius=IMP.core.XYZR(b).get_radius()
921  num_residues=len(IMP.pmi.tools.get_residue_indexes(b))
922  scale_factor=slope*float(num_residues)+1.0
923  print(scale_factor)
924  new_radius=scale_factor*radius
925  IMP.core.XYZR(b).set_radius(new_radius)
926  print(b.get_name())
927  print("particle with radius "+str(radius)+" and "+str(num_residues)+" residues scaled to a new radius "+str(new_radius))
928 
929 
930  def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
931  #density generation for the EM restraint
932  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
933 
934  #get the number of residues from the pdb bits
935  res_ind=[]
936  for pb in pdbbits+helixbits:
937  for p in IMP.core.get_leaves(pb):
939 
940  number_of_residues=len(set(res_ind))
941  outhier=[]
942  if read:
943  if len(pdbbits)!=0:
944  outhier+=simo.add_component_density(compname,
945  pdbbits,
946  num_components=num_components, # number of gaussian into which the simulated density is approximated
947  resolution=0, # resolution that you want to calculate the simulated density
948  inputfile=txtfilename) # read what it was calculated before
949  if len(helixbits)!=0:
950  outhier+=simo.add_component_density(compname,
951  helixbits,
952  num_components=num_components, # number of gaussian into which the simulated density is approximated
953  resolution=1, # resolution that you want to calculate the simulated density
954  inputfile=txtfilename) # read what it was calculated before
955 
956 
957  else:
958  if len(pdbbits)!=0:
959  if num_components<0:
960  #if negative calculate the number of gmm components automatically
961  # from the number of residues
962  num_components=number_of_residues/abs(num_components)
963  outhier+=simo.add_component_density(compname,
964  pdbbits,
965  num_components=num_components, # number of gaussian into which the simulated density is approximated
966  resolution=0, # resolution that you want to calculate the simulated density
967  outputfile=txtfilename, # do the calculation
968  outputmap=mrcfilename,
969  multiply_by_total_mass=True) # do the calculation and output the mrc
970 
971  if len(helixbits)!=0:
972  if num_components<0:
973  #if negative calculate the number of gmm components automatically
974  # from the number of residues
975  num_components=number_of_residues/abs(num_components)
976  outhier+=simo.add_component_density(compname,
977  helixbits,
978  num_components=num_components, # number of gaussian into which the simulated density is approximated
979  resolution=1, # resolution that you want to calculate the simulated density
980  outputfile=txtfilename, # do the calculation
981  outputmap=mrcfilename,
982  multiply_by_total_mass=True) # do the calculation and output the mrc
983 
984  return outhier,beadbits
985 
986  def autobuild(self,simo,comname,pdbname,chain,resrange,read=True,beadsize=5,color=0.0,offset=0):
987 
988  if pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is not "BEADS" and pdbname is not "DENSITY" :
989  if resrange[-1]==-1: resrange=(resrange[0],len(simo.sequence_dict[comname]))
990  if read==False:
991  outhier=simo.autobuild_model(comname,
992  pdbname=pdbname,
993  chain=chain,
994  resrange=resrange,
995  resolutions=[0,1,10],
996  offset=offset,
997  color=color,
998  missingbeadsize=beadsize)
999  else:
1000  outhier=simo.autobuild_model(comname,
1001  pdbname=pdbname,
1002  chain=chain,
1003  resrange=resrange,
1004  resolutions=[1,10],
1005  offset=offset,
1006  color=color,
1007  missingbeadsize=beadsize)
1008 
1009 
1010  elif pdbname is not None and pdbname is "IDEAL_HELIX" and pdbname is not "BEADS" and pdbname is not "DENSITY" :
1011 
1012  outhier=simo.add_component_ideal_helix(comname,
1013  resolutions=[1,10],
1014  resrange=resrange,
1015  color=color,
1016  show=False)
1017 
1018  elif pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is "BEADS" and pdbname is not "DENSITY" :
1019  outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
1020 
1021  elif pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is not "BEADS" and pdbname is "DENSITY" :
1022  outhier=[]
1023 
1024  else:
1025 
1026  seq_len=len(simo.sequence_dict[comname])
1027  outhier=simo.add_component_necklace(comname,
1028  begin=1,
1029  end=seq_len,
1030  length=beadsize)
1031 
1032  return outhier
1033 
1034  def set_coordinates(self,hier_name,xyz_tuple):
1035  hier=self.domain_dict[hier_name]
1036  for h in IMP.atom.get_leaves(hier):
1037  p=h.get_particle()
1039  pass
1040  else:
1041  IMP.core.XYZ(p).set_coordinates(xyz_tuple)
1042 
1043  def save_rmf(self,rmfname):
1044 
1046  o.init_rmf(rmfname,[self.simo.prot])
1047  o.write_rmf(rmfname)
1048  o.close_rmf(rmfname)
1049 
1050 # ----------------------------------------------------------------------
1051 
1053  """A macro for running all the basic operations of analysis.
1054  Includes clustering, precision analysis, and making ensemble density maps.
1055  A number of plots are also supported.
1056  """
1057  def __init__(self, model,
1058  merge_directories=["./"],
1059  stat_file_name_suffix="stat",
1060  best_pdb_name_suffix="model",
1061  do_clean_first=True,
1062  do_create_directories=True,
1063  global_output_directory="output/",
1064  replica_stat_file_suffix="stat_replica",
1065  global_analysis_result_directory="./analysis/"):
1066  """Constructor.
1067  @param model The IMP model
1068  @param stat_file_name_suffix
1069  @param merge_directories The directories containing output files
1070  @param best_pdb_name_suffix
1071  @param do_clean_first
1072  @param do_create_directories
1073  @param global_output_directory Where everything is
1074  @param replica_stat_file_suffix
1075  @param global_analysis_result_directory
1076  """
1077 
1078  try:
1079  from mpi4py import MPI
1080  self.comm = MPI.COMM_WORLD
1081  self.rank = self.comm.Get_rank()
1082  self.number_of_processes = self.comm.size
1083  except ImportError:
1084  self.rank = 0
1085  self.number_of_processes = 1
1086 
1087  self.model = model
1088  stat_dir = global_output_directory
1089  self.stat_files = []
1090  # it contains the position of the root directories
1091  for rd in merge_directories:
1092  stat_files = glob.glob(rd + "/" + stat_dir + "/stat.*.out")
1093  if len(stat_files)==0:
1094  print("WARNING: no stat files found in",rd + "/" + stat_dir + "/stat.*.out")
1095  self.stat_files += stat_files
1096 
1097 
1099  score_key="SimplifiedModel_Total_Score_None",
1100  rmf_file_key="rmf_file",
1101  rmf_file_frame_key="rmf_frame_index",
1102  outputdir="./",
1103  get_every=1,
1104  nframes_trajectory=10000):
1105  """ Get a trajectory of the modeling run, for generating demonstrative movies
1106  @param score_key The score for ranking models
1107  @param rmf_file_key Key pointing to RMF filename
1108  @param rmf_file_frame_key Key pointing to RMF frame number
1109  @param outputdir The local output directory used in the run
1110  @param get_every Extract every nth frame
1111  @param nframes_trajectory Total number of frames of the trajectory
1112  """
1113  from operator import itemgetter
1114  import math
1115 
1116  trajectory_models = IMP.pmi.io.get_trajectory_models(self.stat_files,
1117  score_key,
1118  rmf_file_key,
1119  rmf_file_frame_key,
1120  get_every)
1121  rmf_file_list=trajectory_models[0]
1122  rmf_file_frame_list=trajectory_models[1]
1123  score_list=list(map(float, trajectory_models[2]))
1124 
1125  max_score=max(score_list)
1126  min_score=min(score_list)
1127 
1128 
1129  bins=[(max_score-min_score)*math.exp(-float(i))+min_score for i in range(nframes_trajectory)]
1130  binned_scores=[None]*nframes_trajectory
1131  binned_model_indexes=[-1]*nframes_trajectory
1132 
1133  for model_index,s in enumerate(score_list):
1134  bins_score_diffs=[abs(s-b) for b in bins]
1135  bin_index=min(enumerate(bins_score_diffs), key=itemgetter(1))[0]
1136  if binned_scores[bin_index]==None:
1137  binned_scores[bin_index]=s
1138  binned_model_indexes[bin_index]=model_index
1139  else:
1140  old_diff=abs(binned_scores[bin_index]-bins[bin_index])
1141  new_diff=abs(s-bins[bin_index])
1142  if new_diff < old_diff:
1143  binned_scores[bin_index]=s
1144  binned_model_indexes[bin_index]=model_index
1145 
1146  print(binned_scores)
1147  print(binned_model_indexes)
1148 
1149 
1150 
1151  def clustering(self,
1152  score_key="SimplifiedModel_Total_Score_None",
1153  rmf_file_key="rmf_file",
1154  rmf_file_frame_key="rmf_frame_index",
1155  state_number=0,
1156  prefiltervalue=None,
1157  feature_keys=[],
1158  outputdir="./",
1159  alignment_components=None,
1160  number_of_best_scoring_models=10,
1161  rmsd_calculation_components=None,
1162  distance_matrix_file=None,
1163  load_distance_matrix_file=False,
1164  skip_clustering=False,
1165  number_of_clusters=1,
1166  display_plot=False,
1167  exit_after_display=True,
1168  get_every=1,
1169  first_and_last_frames=None,
1170  density_custom_ranges=None,
1171  write_pdb_with_centered_coordinates=False,
1172  voxel_size=5.0):
1173  """ Get the best scoring models, compute a distance matrix, cluster them, and create density maps
1174  @param score_key The score for ranking models
1175  @param rmf_file_key Key pointing to RMF filename
1176  @param rmf_file_frame_key Key pointing to RMF frame number
1177  @param state_number State number to analyse
1178  @param prefiltervalue Only include frames where the score key is below this value
1179  @param feature_keys Keywords for which you want to calculate average,
1180  medians, etc,
1181  @param outputdir The local output directory used in the run
1182  @param alignment_components List of tuples for aligning the structures
1183  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1184  @param number_of_best_scoring_models Num models to keep per run
1185  @param rmsd_calculation_components List of tuples for calculating RMSD
1186  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1187  @param distance_matrix_file Where to store/read the distance matrix
1188  @param load_distance_matrix_file Try to load the distance matrix file
1189  @param skip_clustering Just extract the best scoring models and save the pdbs
1190  @param number_of_clusters Number of k-means clusters
1191  @param display_plot Display the distance matrix
1192  @param exit_after_display Exit after displaying distance matrix
1193  @param get_every Extract every nth frame
1194  @param first_and_last_frames A tuple with the first and last frames to be
1195  analyzed. Values are percentages!
1196  Default: get all frames
1197  @param density_custom_ranges List of tuples or strings for density calculation
1198  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1199  @param write_pdb_with_centered_coordinates
1200  @param voxel_size Used for the density output
1201  """
1202 
1203 
1204  if self.rank==0:
1205  try:
1206  os.mkdir(outputdir)
1207  except:
1208  pass
1209 
1210 
1211  if not load_distance_matrix_file:
1212  if len(self.stat_files)==0: print("ERROR: no stat file found in the given path"); return
1213  my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
1214  self.stat_files,self.number_of_processes)[self.rank]
1215  best_models = IMP.pmi.io.get_best_models(my_stat_files,
1216  score_key,
1217  feature_keys,
1218  rmf_file_key,
1219  rmf_file_frame_key,
1220  prefiltervalue,
1221  get_every)
1222  rmf_file_list=best_models[0]
1223  rmf_file_frame_list=best_models[1]
1224  score_list=best_models[2]
1225  feature_keyword_list_dict=best_models[3]
1226 
1227 # ------------------------------------------------------------------------
1228 # collect all the files and scores
1229 # ------------------------------------------------------------------------
1230 
1231  if self.number_of_processes > 1:
1232  score_list = IMP.pmi.tools.scatter_and_gather(score_list)
1233  rmf_file_list = IMP.pmi.tools.scatter_and_gather(rmf_file_list)
1234  rmf_file_frame_list = IMP.pmi.tools.scatter_and_gather(
1235  rmf_file_frame_list)
1236  for k in feature_keyword_list_dict:
1237  feature_keyword_list_dict[k] = IMP.pmi.tools.scatter_and_gather(
1238  feature_keyword_list_dict[k])
1239 
1240  # sort by score and get the best scoring ones
1241  score_rmf_tuples = list(zip(score_list,
1242  rmf_file_list,
1243  rmf_file_frame_list,
1244  list(range(len(score_list)))))
1245 
1246  # keep subset of frames if requested
1247  if first_and_last_frames is not None:
1248  nframes = len(score_rmf_tuples)
1249  first_frame = int(first_and_last_frames[0] * nframes)
1250  last_frame = int(first_and_last_frames[1] * nframes)
1251  if last_frame > len(score_rmf_tuples):
1252  last_frame = -1
1253  score_rmf_tuples = score_rmf_tuples[first_frame:last_frame]
1254 
1255  # sort RMFs by the score_key in ascending order, and store the rank
1256  best_score_rmf_tuples = sorted(score_rmf_tuples,
1257  key=lambda x: float(x[0]))[:number_of_best_scoring_models]
1258  best_score_rmf_tuples=[t+(n,) for n,t in enumerate(best_score_rmf_tuples)]
1259 
1260  # sort the feature scores in the same way
1261  best_score_feature_keyword_list_dict = defaultdict(list)
1262  for tpl in best_score_rmf_tuples:
1263  index = tpl[3]
1264  for f in feature_keyword_list_dict:
1265  best_score_feature_keyword_list_dict[f].append(
1266  feature_keyword_list_dict[f][index])
1267 
1268  my_best_score_rmf_tuples = IMP.pmi.tools.chunk_list_into_segments(
1269  best_score_rmf_tuples,
1270  self.number_of_processes)[self.rank]
1271 
1272 
1273 #-------------------------------------------------------------
1274 # read the coordinates
1275 # ------------------------------------------------------------
1276  rmsd_weights = IMP.pmi.io.get_bead_sizes(self.model,
1277  my_best_score_rmf_tuples[0],
1278  rmsd_calculation_components,
1279  state_number=state_number)
1280  got_coords = IMP.pmi.io.read_coordinates_of_rmfs(self.model,
1281  my_best_score_rmf_tuples,
1282  alignment_components,
1283  rmsd_calculation_components,
1284  state_number=state_number)
1285 
1286  # note! the coordinates are simply float tuples, NOT decorators, NOT Vector3D,
1287  # NOR particles, because these object cannot be serialized. We need serialization
1288  # for the parallel computation based on mpi.
1289  all_coordinates=got_coords[0] # dict:key=component name,val=coords per hit
1290  alignment_coordinates=got_coords[1] # same as above, limited to alignment bits
1291  rmsd_coordinates=got_coords[2] # same as above, limited to RMSD bits
1292  rmf_file_name_index_dict=got_coords[3] # dictionary with key=RMF, value=score rank
1293  all_rmf_file_names=got_coords[4] # RMF file per hit
1294 
1295 # ------------------------------------------------------------------------
1296 # optionally don't compute distance matrix or cluster, just write top files
1297 # ------------------------------------------------------------------------
1298  if skip_clustering:
1299  if density_custom_ranges:
1300  DensModule = IMP.pmi.analysis.GetModelDensity(
1301  density_custom_ranges,
1302  voxel=voxel_size)
1303 
1304  dircluster=os.path.join(outputdir,"all_models."+str(n))
1305  try:
1306  os.mkdir(outputdir)
1307  except:
1308  pass
1309  try:
1310  os.mkdir(dircluster)
1311  except:
1312  pass
1313  clusstat=open(os.path.join(dircluster,"stat."+str(self.rank)+".out"),"w")
1314  for cnt,tpl in enumerate(my_best_score_rmf_tuples):
1315  rmf_name=tpl[1]
1316  rmf_frame_number=tpl[2]
1317  tmp_dict={}
1318  index=tpl[4]
1319  for key in best_score_feature_keyword_list_dict:
1320  tmp_dict[key]=best_score_feature_keyword_list_dict[key][index]
1321 
1322  if cnt==0:
1323  prots,rs = IMP.pmi.analysis.get_hiers_and_restraints_from_rmf(
1324  self.model,
1325  rmf_frame_number,
1326  rmf_name)
1327  else:
1328  IMP.pmi.analysis.link_hiers_and_restraints_to_rmf(
1329  self.model,
1330  prots,
1331  rs,
1332  rmf_frame_number,
1333  rmf_name)
1334 
1335  if not prots:
1336  continue
1337 
1338  prot=prots[state_number]
1339 
1340  if cnt==0:
1341  coords_f1=alignment_coordinates[cnt]
1342  if cnt > 0:
1343  coords_f2=alignment_coordinates[cnt]
1344  Ali = IMP.pmi.analysis.Alignment(coords_f1, coords_f2)
1345  transformation = Ali.align()[1]
1346  rbs = set()
1347  for p in IMP.atom.get_leaves(prot):
1348  if not IMP.core.XYZR.get_is_setup(p):
1350  IMP.core.XYZR(p).set_radius(0.0001)
1351  IMP.core.XYZR(p).set_coordinates((0, 0, 0))
1352 
1354  rb = IMP.core.RigidBodyMember(p).get_rigid_body()
1355  rbs.add(rb)
1356  else:
1358  transformation)
1359  for rb in rbs:
1360  IMP.core.transform(rb,transformation)
1361 
1363  out_pdb_fn=os.path.join(dircluster,str(cnt)+"."+str(self.rank)+".pdb")
1364  out_rmf_fn=os.path.join(dircluster,str(cnt)+"."+str(self.rank)+".rmf3")
1365  o.init_pdb(out_pdb_fn,prot)
1366  o.write_pdb(out_pdb_fn,
1367  translate_to_geometric_center=write_pdb_with_centered_coordinates)
1368 
1369  tmp_dict["local_pdb_file_name"]=os.path.basename(out_pdb_fn)
1370  tmp_dict["rmf_file_full_path"]=rmf_name
1371  tmp_dict["local_rmf_file_name"]=os.path.basename(out_rmf_fn)
1372  tmp_dict["local_rmf_frame_number"]=0
1373 
1374  clusstat.write(str(tmp_dict)+"\n")
1375  o.init_rmf(out_rmf_fn,[prot],rs)
1376  o.write_rmf(out_rmf_fn)
1377  o.close_rmf(out_rmf_fn)
1378  # add the density
1379  if density_custom_ranges:
1380  DensModule.add_subunits_density(prot)
1381 
1382  if density_custom_ranges:
1383  DensModule.write_mrc(path=dircluster)
1384  del DensModule
1385  return
1386 
1387 
1388 
1389  # broadcast the coordinates
1390  if self.number_of_processes > 1:
1391  all_coordinates = IMP.pmi.tools.scatter_and_gather(
1392  all_coordinates)
1393  all_rmf_file_names = IMP.pmi.tools.scatter_and_gather(
1394  all_rmf_file_names)
1395  rmf_file_name_index_dict = IMP.pmi.tools.scatter_and_gather(
1396  rmf_file_name_index_dict)
1397  alignment_coordinates=IMP.pmi.tools.scatter_and_gather(
1398  alignment_coordinates)
1399  rmsd_coordinates=IMP.pmi.tools.scatter_and_gather(
1400  rmsd_coordinates)
1401 
1402  if self.rank == 0:
1403  # save needed informations in external files
1404  self.save_objects(
1405  [best_score_feature_keyword_list_dict,
1406  rmf_file_name_index_dict],
1407  ".macro.pkl")
1408 
1409 # ------------------------------------------------------------------------
1410 # Calculate distance matrix and cluster
1411 # ------------------------------------------------------------------------
1412  print("setup clustering class")
1413  Clusters = IMP.pmi.analysis.Clustering(rmsd_weights)
1414 
1415  for n, model_coordinate_dict in enumerate(all_coordinates):
1416  template_coordinate_dict = {}
1417  # let's try to align
1418  if alignment_components is not None and len(Clusters.all_coords) == 0:
1419  # set the first model as template coordinates
1420  Clusters.set_template(alignment_coordinates[n])
1421  Clusters.fill(all_rmf_file_names[n], rmsd_coordinates[n])
1422 
1423  print("Global calculating the distance matrix")
1424 
1425  # calculate distance matrix, all against all
1426  Clusters.dist_matrix()
1427 
1428  # perform clustering and optionally display
1429  if self.rank == 0:
1430  Clusters.do_cluster(number_of_clusters)
1431  if display_plot:
1432  if self.rank == 0:
1433  Clusters.plot_matrix(figurename=os.path.join(outputdir,'dist_matrix.pdf'))
1434  if exit_after_display:
1435  exit()
1436  Clusters.save_distance_matrix_file(file_name=distance_matrix_file)
1437 
1438 # ------------------------------------------------------------------------
1439 # Alteratively, load the distance matrix from file and cluster that
1440 # ------------------------------------------------------------------------
1441  else:
1442  if self.rank==0:
1443  print("setup clustering class")
1444  Clusters = IMP.pmi.analysis.Clustering()
1445  Clusters.load_distance_matrix_file(file_name=distance_matrix_file)
1446  print("clustering with %s clusters" % str(number_of_clusters))
1447  Clusters.do_cluster(number_of_clusters)
1448  [best_score_feature_keyword_list_dict,
1449  rmf_file_name_index_dict] = self.load_objects(".macro.pkl")
1450  if display_plot:
1451  if self.rank == 0:
1452  Clusters.plot_matrix(figurename=os.path.join(outputdir,'dist_matrix.pdf'))
1453  if exit_after_display:
1454  exit()
1455  if self.number_of_processes > 1:
1456  self.comm.Barrier()
1457 
1458 # ------------------------------------------------------------------------
1459 # now save all informations about the clusters
1460 # ------------------------------------------------------------------------
1461 
1462  if self.rank == 0:
1463  print(Clusters.get_cluster_labels())
1464  for n, cl in enumerate(Clusters.get_cluster_labels()):
1465  print("rank %s " % str(self.rank))
1466  print("cluster %s " % str(n))
1467  print("cluster label %s " % str(cl))
1468  print(Clusters.get_cluster_label_names(cl))
1469 
1470  # first initialize the Density class if requested
1471 
1472  if density_custom_ranges:
1473  DensModule = IMP.pmi.analysis.GetModelDensity(
1474  density_custom_ranges,
1475  voxel=voxel_size)
1476 
1477  dircluster = outputdir + "/cluster." + str(n) + "/"
1478  try:
1479  os.mkdir(dircluster)
1480  except:
1481  pass
1482 
1483  rmsd_dict = {"AVERAGE_RMSD":
1484  str(Clusters.get_cluster_label_average_rmsd(cl))}
1485  clusstat = open(dircluster + "stat.out", "w")
1486  for k, structure_name in enumerate(Clusters.get_cluster_label_names(cl)):
1487 
1488  # extract the features
1489  tmp_dict = {}
1490  tmp_dict.update(rmsd_dict)
1491  index = rmf_file_name_index_dict[structure_name]
1492  for key in best_score_feature_keyword_list_dict:
1493  tmp_dict[
1494  key] = best_score_feature_keyword_list_dict[
1495  key][
1496  index]
1497  # get the rmf name and the frame number from the list of
1498  # frame names
1499  rmf_name = structure_name.split("|")[0]
1500  rmf_frame_number = int(structure_name.split("|")[1])
1501 
1502  clusstat.write(str(tmp_dict) + "\n")
1503 
1504  if k==0:
1505  prots,rs = IMP.pmi.analysis.get_hiers_and_restraints_from_rmf(
1506  self.model,
1507  rmf_frame_number,
1508  rmf_name)
1509  else:
1510  IMP.pmi.analysis.link_hiers_and_restraints_to_rmf(
1511  self.model,
1512  prots,
1513  rs,
1514  rmf_frame_number,
1515  rmf_name)
1516 
1517  if not prots:
1518  continue
1519 
1520  prot=prots[state_number]
1521 
1522  if k > 0:
1523  model_index = Clusters.get_model_index_from_name(
1524  structure_name)
1525  transformation = Clusters.get_transformation_to_first_member(
1526  cl,
1527  model_index)
1528  rbs = set()
1529  for p in IMP.atom.get_leaves(prot):
1530  if not IMP.core.XYZR.get_is_setup(p):
1532  IMP.core.XYZR(p).set_radius(0.0001)
1533  IMP.core.XYZR(p).set_coordinates((0, 0, 0))
1534 
1536  rb = IMP.core.RigidBodyMember(p).get_rigid_body()
1537  rbs.add(rb)
1538  else:
1540  transformation)
1541  for rb in rbs:
1542  IMP.core.transform(rb,transformation)
1543 
1544  # add the density
1545  if density_custom_ranges:
1546  DensModule.add_subunits_density(prot)
1547 
1548  # pdb writing should be optimized!
1549  o = IMP.pmi.output.Output()
1550  o.init_pdb(dircluster + str(k) + ".pdb", prot)
1551  o.write_pdb(dircluster + str(k) + ".pdb")
1552 
1553  o.init_rmf(dircluster + str(k) + ".rmf3", [prot],rs)
1554  o.write_rmf(dircluster + str(k) + ".rmf3")
1555  o.close_rmf(dircluster + str(k) + ".rmf3")
1556 
1557  del o
1558  # IMP.atom.destroy(prot)
1559 
1560  if density_custom_ranges:
1561  DensModule.write_mrc(path=dircluster)
1562  del DensModule
1563 
1564  if self.number_of_processes>1:
1565  self.comm.Barrier()
1566 
1567  def save_objects(self, objects, file_name):
1568  import pickle
1569  outf = open(file_name, 'wb')
1570  pickle.dump(objects, outf)
1571  outf.close()
1572 
1573  def load_objects(self, file_name):
1574  import pickle
1575  inputf = open(file_name, 'rb')
1576  objects = pickle.load(inputf)
1577  inputf.close()
1578  return objects
A macro for running all the basic operations of analysis.
Definition: macros.py:1052
def get_restraint_set
Get a RestraintSet containing all PMI restraints added to the model.
Definition: tools.py:34
Sample using molecular dynamics.
Definition: samplers.py:352
A member of a rigid body, it has internal (local) coordinates.
Definition: rigid_bodies.h:358
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:359
def __init__
Constructor.
Definition: macros.py:500
static XYZR setup_particle(Model *m, ParticleIndex pi)
Definition: XYZR.h:48
Utility classes and functions for reading and storing PMI files.
def get_best_models
Given a list of stat files, read them all and find the best models.
Miscellaneous utilities.
Definition: tools.py:1
def __init__
Constructor.
Definition: macros.py:62
def clustering
Get the best scoring models, compute a distance matrix, cluster them, and create density maps...
Definition: macros.py:1151
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
A class to cluster structures.
Representation of the system.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: XYZR.h:47
def get_modeling_trajectory
Get a trajectory of the modeling run, for generating demonstrative movies.
Definition: macros.py:1098
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def build_model
Create model.
Definition: macros.py:772
Performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:23
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:1001
void transform(XYZ a, const algebra::Transformation3D &tr)
Apply a transformation to the particle.
Hierarchies get_by_type(Hierarchy mhd, GetByType t)
Gather all the molecular particles of a certain level in the hierarchy.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def BuildModel0
Construct a component for each subunit (no splitting, nothing fancy).
Definition: macros.py:394
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:21
def __init__
Constructor.
Definition: macros.py:760
A macro to build a Representation based on a Topology and lists of movers.
Definition: macros.py:478
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
Classes for writing output files and processing them.
Definition: output.py:1
def deprecated_object
Python decorator to mark a class as deprecated.
Definition: __init__.py:9367
Sampling of the system.
Definition: samplers.py:1
Sample using Monte Carlo.
Definition: samplers.py:35
Deprecated building macro - use BuildModel()
Definition: macros.py:756
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Set up the representation of all proteins and nucleic acid macromolecules.
A macro to help setup and run replica exchange.
Definition: macros.py:20
Hierarchies get_leaves(const Selection &h)
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:959
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:490
Sample using replica exchange.
Definition: samplers.py:442
def get_representation
Return the Representation object.
Definition: macros.py:612
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27