IMP  2.4.0
The Integrative Modeling Platform
macros.py
1 """@namespace IMP.pmi.macros
2 Protocols for sampling structures and analyzing them.
3 """
4 
5 from __future__ import print_function
7 import IMP.pmi.tools
8 import IMP.pmi.samplers
9 import IMP.pmi.output
10 import IMP.pmi.analysis
11 import IMP.pmi.io
12 import IMP.rmf
13 import RMF
14 import os
15 import glob
16 from operator import itemgetter
17 from collections import defaultdict
18 import numpy as np
19 
20 class ReplicaExchange0(object):
21  """A macro to help setup and run replica exchange.
22  Supports Monte Carlo and molecular dynamics.
23  Produces trajectory RMF files, best PDB structures,
24  and output stat files.
25  """
26  def __init__(self, model,
27  representation=None,
28  root_hier=None,
29  sample_objects=None, # DEPRECATED
30  monte_carlo_sample_objects=None,
31  molecular_dynamics_sample_objects=None,
32  output_objects=None,
33  crosslink_restraints=None,
34  monte_carlo_temperature=1.0,
35  simulated_annealing=False,
36  simulated_annealing_minimum_temperature=1.0,
37  simulated_annealing_maximum_temperature=2.5,
38  simulated_annealing_minimum_temperature_nframes=100,
39  simulated_annealing_maximum_temperature_nframes=100,
40  replica_exchange_minimum_temperature=1.0,
41  replica_exchange_maximum_temperature=2.5,
42  num_sample_rounds=1,
43  number_of_best_scoring_models=500,
44  monte_carlo_steps=10,
45  molecular_dynamics_steps=10,
46  molecular_dynamics_max_time_step=1.0,
47  number_of_frames=1000,
48  nframes_write_coordinates=1,
49  write_initial_rmf=True,
50  initial_rmf_name_suffix="initial",
51  stat_file_name_suffix="stat",
52  best_pdb_name_suffix="model",
53  do_clean_first=True,
54  do_create_directories=True,
55  global_output_directory="./",
56  rmf_dir="rmfs/",
57  best_pdb_dir="pdbs/",
58  replica_stat_file_suffix="stat_replica",
59  em_object_for_rmf=None,
60  atomistic=False,
61  replica_exchange_object=None):
62  """Constructor.
63  @param model The IMP model
64  @param representation PMI.representation.Representation object
65  (or list of them, for multi-state modeling)
66  @param root_hier Instead of passing Representation, just pass
67  a hierarchy
68  @param monte_carlo_sample_objcts Objects for MC sampling; a list of
69  structural components (generally the representation) that will
70  be moved and restraints with parameters that need to
71  be sampled
72  @param molecular_dynamics_sample_objects Objects for MD sampling
73  @param output_objects A list of structural objects and restraints
74  that will be included in output (statistics files). Any object
75  that provides a get_output() method can be used here.
76  @param crosslink_restraints List of cross-link restraints that will
77  be included in output RMF files (for visualization).
78  @param monte_carlo_temperature MC temp (may need to be optimized
79  based on post-sampling analysis)
80  @param simulated_annealing If True, perform simulated annealing
81  @param simulated_annealing_minimum_temperature Should generally be
82  the same as monte_carlo_temperature.
83  @param simulated_annealing_minimum_temperature_frames Number of
84  frames to compute at minimum temperature.
85  @param simulated_annealing_maximum_temperature_frames Number of
86  frames to compute at
87  temps > simulated_annealing_maximum_temperature.
88  @param replica_exchange_minimum_temperature Low temp for REX; should
89  generally be the same as monte_carlo_temperature.
90  @param replica_exchange_maximum_temperature High temp for REX
91  @param num_sample_rounds Number of rounds of MC/MD per cycle
92  @param number_of_best_scoring_models Number of top-scoring PDB models
93  to keep around for analysis
94  @param monte_carlo_steps Number of MC steps per round
95  @param molecular_dynamics_steps Number of MD steps per round
96  @param molecular_dynamics_max_time_step Max time step for MD
97  @param number_of_frames Number of REX frames to run
98  @param nframes_write_coordinates How often to write the coordinates
99  of a frame
100  @param write_initial_rmf Write the initial configuration
101  @param global_output_directory Folder that will be created to house
102  output.
103  """
104  self.model = model
105  self.vars = {}
106 
107  ### add check hierarchy is multistate
108  if representation:
109  if type(representation) == list:
110  self.is_multi_state = True
111  self.root_hiers = [r.prot for r in representation]
112  self.vars["number_of_states"] = len(representation)
113  else:
114  self.is_multi_state = False
115  self.root_hier = representation.prot
116  self.vars["number_of_states"] = 1
117  elif root_hier:
118  states = IMP.atom.get_by_type(root_hier,IMP.atom.STATE_TYPE)
119  self.vars["number_of_states"] = len(states)
120  if len(states)>1:
121  self.root_hiers = states
122  self.is_multi_state = True
123  else:
124  self.root_hier = root_hier
125  self.is_multi_state = False
126  else:
127  print("ERROR: Must provide representation or root_hier")
128  return
129 
130  self.crosslink_restraints = crosslink_restraints
131  self.em_object_for_rmf = em_object_for_rmf
132  self.monte_carlo_sample_objects = monte_carlo_sample_objects
133  if sample_objects is not None:
134  self.monte_carlo_sample_objects+=sample_objects
135  self.molecular_dynamics_sample_objects=molecular_dynamics_sample_objects
136  self.output_objects = output_objects
137  self.replica_exchange_object = replica_exchange_object
138  self.molecular_dynamics_max_time_step = molecular_dynamics_max_time_step
139  self.vars["monte_carlo_temperature"] = monte_carlo_temperature
140  self.vars[
141  "replica_exchange_minimum_temperature"] = replica_exchange_minimum_temperature
142  self.vars[
143  "replica_exchange_maximum_temperature"] = replica_exchange_maximum_temperature
144 
145  self.vars["simulated_annealing"]=\
146  simulated_annealing
147  self.vars["simulated_annealing_minimum_temperature"]=\
148  simulated_annealing_minimum_temperature
149  self.vars["simulated_annealing_maximum_temperature"]=\
150  simulated_annealing_maximum_temperature
151  self.vars["simulated_annealing_minimum_temperature_nframes"]=\
152  simulated_annealing_minimum_temperature_nframes
153  self.vars["simulated_annealing_maximum_temperature_nframes"]=\
154  simulated_annealing_maximum_temperature_nframes
155 
156  self.vars["num_sample_rounds"] = num_sample_rounds
157  self.vars[
158  "number_of_best_scoring_models"] = number_of_best_scoring_models
159  self.vars["monte_carlo_steps"] = monte_carlo_steps
160  self.vars["molecular_dynamics_steps"]=molecular_dynamics_steps
161  self.vars["number_of_frames"] = number_of_frames
162  self.vars["nframes_write_coordinates"] = nframes_write_coordinates
163  self.vars["write_initial_rmf"] = write_initial_rmf
164  self.vars["initial_rmf_name_suffix"] = initial_rmf_name_suffix
165  self.vars["best_pdb_name_suffix"] = best_pdb_name_suffix
166  self.vars["stat_file_name_suffix"] = stat_file_name_suffix
167  self.vars["do_clean_first"] = do_clean_first
168  self.vars["do_create_directories"] = do_create_directories
169  self.vars["global_output_directory"] = global_output_directory
170  self.vars["rmf_dir"] = rmf_dir
171  self.vars["best_pdb_dir"] = best_pdb_dir
172  self.vars["atomistic"] = atomistic
173  self.vars["replica_stat_file_suffix"] = replica_stat_file_suffix
174 
175  def show_info(self):
176  print("ReplicaExchange0: it generates initial.*.rmf3, stat.*.out, rmfs/*.rmf3 for each replica ")
177  print("--- it stores the best scoring pdb models in pdbs/")
178  print("--- the stat.*.out and rmfs/*.rmf3 are saved only at the lowest temperature")
179  print("--- variables:")
180  keys = list(self.vars.keys())
181  keys.sort()
182  for v in keys:
183  print("------", v.ljust(30), self.vars[v])
184 
185  def get_replica_exchange_object(self):
186  return self.replica_exchange_object
187 
188  def execute_macro(self):
189 
190  temp_index_factor = 100000.0
191  samplers=[]
192  sampler_mc=None
193  sampler_md=None
194  if self.monte_carlo_sample_objects is not None:
195  print("Setting up MonteCarlo")
196  sampler_mc = IMP.pmi.samplers.MonteCarlo(self.model,
197  self.monte_carlo_sample_objects,
198  self.vars["monte_carlo_temperature"])
199  if self.vars["simulated_annealing"]:
200  tmin=self.vars["simulated_annealing_minimum_temperature"]
201  tmax=self.vars["simulated_annealing_maximum_temperature"]
202  nfmin=self.vars["simulated_annealing_minimum_temperature_nframes"]
203  nfmax=self.vars["simulated_annealing_maximum_temperature_nframes"]
204  sampler_mc.set_simulated_annealing(tmin,tmax,nfmin,nfmax)
205  self.output_objects.append(sampler_mc)
206  samplers.append(sampler_mc)
207 
208 
209  if self.molecular_dynamics_sample_objects is not None:
210  print("Setting up MolecularDynamics")
211  sampler_md = IMP.pmi.samplers.MolecularDynamics(self.model,
212  self.molecular_dynamics_sample_objects,
213  self.vars["monte_carlo_temperature"],
214  maximum_time_step=self.molecular_dynamics_max_time_step)
215  if self.vars["simulated_annealing"]:
216  tmin=self.vars["simulated_annealing_minimum_temperature"]
217  tmax=self.vars["simulated_annealing_maximum_temperature"]
218  nfmin=self.vars["simulated_annealing_minimum_temperature_nframes"]
219  nfmax=self.vars["simulated_annealing_maximum_temperature_nframes"]
220  sampler_md.set_simulated_annealing(tmin,tmax,nfmin,nfmax)
221  self.output_objects.append(sampler_md)
222  samplers.append(sampler_md)
223 # -------------------------------------------------------------------------
224 
225  print("Setting up ReplicaExchange")
226  rex = IMP.pmi.samplers.ReplicaExchange(self.model,
227  self.vars[
228  "replica_exchange_minimum_temperature"],
229  self.vars[
230  "replica_exchange_maximum_temperature"],
231  samplers,
232  replica_exchange_object=self.replica_exchange_object)
233  self.replica_exchange_object = rex.rem
234 
235  myindex = rex.get_my_index()
236  self.output_objects.append(rex)
237 
238  # must reset the minimum temperature due to the
239  # different binary length of rem.get_my_parameter double and python
240  # float
241  min_temp_index = int(min(rex.get_temperatures()) * temp_index_factor)
242 
243 # -------------------------------------------------------------------------
244 
245  globaldir = self.vars["global_output_directory"] + "/"
246  rmf_dir = globaldir + self.vars["rmf_dir"]
247  pdb_dir = globaldir + self.vars["best_pdb_dir"]
248 
249  if self.vars["do_clean_first"]:
250  pass
251 
252  if self.vars["do_create_directories"]:
253 
254  try:
255  os.makedirs(globaldir)
256  except:
257  pass
258  try:
259  os.makedirs(rmf_dir)
260  except:
261  pass
262 
263  if not self.is_multi_state:
264  try:
265  os.makedirs(pdb_dir)
266  except:
267  pass
268  else:
269  for n in range(self.vars["number_of_states"]):
270  try:
271  os.makedirs(pdb_dir + "/" + str(n))
272  except:
273  pass
274 
275 # -------------------------------------------------------------------------
276 
277  sw = IMP.pmi.tools.Stopwatch()
278  self.output_objects.append(sw)
279 
280  print("Setting up stat file")
281  output = IMP.pmi.output.Output(atomistic=self.vars["atomistic"])
282  low_temp_stat_file = globaldir + \
283  self.vars["stat_file_name_suffix"] + "." + str(myindex) + ".out"
284  output.init_stat2(low_temp_stat_file,
285  self.output_objects,
286  extralabels=["rmf_file", "rmf_frame_index"])
287 
288  print("Setting up replica stat file")
289  replica_stat_file = globaldir + \
290  self.vars["replica_stat_file_suffix"] + "." + str(myindex) + ".out"
291  output.init_stat2(replica_stat_file, [rex], extralabels=["score"])
292 
293  print("Setting up best pdb files")
294  if not self.is_multi_state:
295  if self.vars["number_of_best_scoring_models"] > 0:
296  output.init_pdb_best_scoring(pdb_dir + "/" +
297  self.vars["best_pdb_name_suffix"],
298  self.root_hier,
299  self.vars[
300  "number_of_best_scoring_models"],
301  replica_exchange=True)
302  output.write_psf(pdb_dir + "/" +"model.psf",pdb_dir + "/" +
303  self.vars["best_pdb_name_suffix"]+".0.pdb")
304  else:
305  if self.vars["number_of_best_scoring_models"] > 0:
306  for n in range(self.vars["number_of_states"]):
307  output.init_pdb_best_scoring(pdb_dir + "/" + str(n) + "/" +
308  self.vars["best_pdb_name_suffix"],
309  self.root_hiers[n],
310  self.vars[
311  "number_of_best_scoring_models"],
312  replica_exchange=True)
313  output.write_psf(pdb_dir + "/" + str(n) + "/" +"model.psf",pdb_dir + "/" + str(n) + "/" +
314  self.vars["best_pdb_name_suffix"]+".0.pdb")
315 # ---------------------------------------------
316 
317  if not self.em_object_for_rmf is None:
318  if not self.is_multi_state:
319  output_hierarchies = [
320  self.root_hier,
321  self.em_object_for_rmf.get_density_as_hierarchy(
322  )]
323  else:
324  output_hierarchies = self.root_hiers
325  output_hierarchies.append(
326  self.em_object_for_rmf.get_density_as_hierarchy())
327  else:
328  if not self.is_multi_state:
329  output_hierarchies = [self.root_hier]
330  else:
331  output_hierarchies = self.root_hiers
332 
333 #----------------------------------------------
334  print("Setting up and writing initial rmf coordinate file")
335  init_suffix = globaldir + self.vars["initial_rmf_name_suffix"]
336  output.init_rmf(init_suffix + "." + str(myindex) + ".rmf3",
337  output_hierarchies)
338  if self.crosslink_restraints:
339  output.add_restraints_to_rmf(
340  init_suffix + "." + str(myindex) + ".rmf3",
341  self.crosslink_restraints)
342  output.write_rmf(init_suffix + "." + str(myindex) + ".rmf3")
343  output.close_rmf(init_suffix + "." + str(myindex) + ".rmf3")
344 
345 #----------------------------------------------
346 
347  print("Setting up production rmf files")
348 
349  rmfname = rmf_dir + "/" + str(myindex) + ".rmf3"
350  output.init_rmf(rmfname, output_hierarchies)
351 
352  if self.crosslink_restraints:
353  output.add_restraints_to_rmf(rmfname, self.crosslink_restraints)
354 
355  ntimes_at_low_temp = 0
356 
357  if myindex == 0:
358  self.show_info()
359 
360  for i in range(self.vars["number_of_frames"]):
361  for nr in range(self.vars["num_sample_rounds"]):
362  if sampler_md is not None:
363  sampler_md.optimize(self.vars["molecular_dynamics_steps"])
364  if sampler_mc is not None:
365  sampler_mc.optimize(self.vars["monte_carlo_steps"])
366  score = self.model.evaluate(False)
367  output.set_output_entry("score", score)
368 
369  my_temp_index = int(rex.get_my_temp() * temp_index_factor)
370 
371  if min_temp_index == my_temp_index:
372  print("--- frame %s score %s " % (str(i), str(score)))
373 
374  if i % self.vars["nframes_write_coordinates"]==0:
375  print('--- writing coordinates')
376  if self.vars["number_of_best_scoring_models"] > 0:
377  output.write_pdb_best_scoring(score)
378  output.write_rmf(rmfname)
379  output.set_output_entry("rmf_file", rmfname)
380  output.set_output_entry("rmf_frame_index", ntimes_at_low_temp)
381  else:
382  output.set_output_entry("rmf_file", rmfname)
383  output.set_output_entry("rmf_frame_index", '-1')
384  output.write_stat2(low_temp_stat_file)
385  ntimes_at_low_temp += 1
386 
387  output.write_stat2(replica_stat_file)
388  rex.swap_temp(i, score)
389 
390 
391 # -----------------------------------------------------------------------
392 
393 
395  m,
396  data,
397  resolutions=[1,
398  10],
399  missing_bead_size=20,
400  residue_per_gaussian=None):
401  '''
402  Construct a component for each subunit (no splitting, nothing fancy).
403 
404  You can pass the resolutions and the bead size for the missing residue regions.
405  To use this macro, you must provide the following data structure:
406 
407  Component pdbfile chainid rgb color fastafile sequence id
408  in fastafile
409 
410 data = [("Rpb1", pdbfile, "A", 0.00000000, (fastafile, 0)),
411  ("Rpb2", pdbfile, "B", 0.09090909, (fastafile, 1)),
412  ("Rpb3", pdbfile, "C", 0.18181818, (fastafile, 2)),
413  ("Rpb4", pdbfile, "D", 0.27272727, (fastafile, 3)),
414  ("Rpb5", pdbfile, "E", 0.36363636, (fastafile, 4)),
415  ("Rpb6", pdbfile, "F", 0.45454545, (fastafile, 5)),
416  ("Rpb7", pdbfile, "G", 0.54545455, (fastafile, 6)),
417  ("Rpb8", pdbfile, "H", 0.63636364, (fastafile, 7)),
418  ("Rpb9", pdbfile, "I", 0.72727273, (fastafile, 8)),
419  ("Rpb10", pdbfile, "L", 0.81818182, (fastafile, 9)),
420  ("Rpb11", pdbfile, "J", 0.90909091, (fastafile, 10)),
421  ("Rpb12", pdbfile, "K", 1.00000000, (fastafile, 11))]
422 
423  '''
424 
426 
427  # the dictionary for the hierarchies,
428  hierarchies = {}
429 
430  for d in data:
431  # retrieve the information from the data structure
432  component_name = d[0]
433  pdb_file = d[1]
434  chain_id = d[2]
435  color_id = d[3]
436  fasta_file = d[4][0]
437  # this function
438  fastids = IMP.pmi.tools.get_ids_from_fasta_file(fasta_file)
439  fasta_file_id = d[4][1]
440  # avoid to add a component with the same name
441  r.create_component(component_name,
442  color=color_id)
443 
444  r.add_component_sequence(component_name,
445  fasta_file,
446  id=fastids[fasta_file_id])
447 
448  hierarchies = r.autobuild_model(component_name,
449  pdb_file,
450  chain_id,
451  resolutions=resolutions,
452  missingbeadsize=missing_bead_size)
453 
454  r.show_component_table(component_name)
455 
456  r.set_rigid_bodies([component_name])
457 
458  r.set_chain_of_super_rigid_bodies(
459  hierarchies,
460  min_length=2,
461  max_length=2)
462 
463  r.setup_component_sequence_connectivity(component_name, resolution=1)
464  r.setup_component_geometry(component_name)
465 
466  r.setup_bonds()
467  # put it at the end of rigid bodies
468  r.set_floppy_bodies()
469 
470  # set current coordinates as reference for RMSD calculation
471  r.set_current_coordinates_as_reference_for_rmsd("Reference")
472 
473  return r
474 
475 
476 # ----------------------------------------------------------------------
477 
478 class BuildModel(object):
479  """A macro to build a Representation based on a Topology and lists of movers
480  """
481  def __init__(self,
482  model,
483  component_topologies,
484  list_of_rigid_bodies=[],
485  list_of_super_rigid_bodies=[],
486  chain_of_super_rigid_bodies=[],
487  sequence_connectivity_scale=4.0,
488  add_each_domain_as_rigid_body=False,
489  force_create_gmm_files=False):
490  """Constructor.
491  @param model The IMP model
492  @param component_topologies List of
493  IMP.pmi.topology.ComponentTopology items
494  @param list_of_rigid_bodies List of lists of domain names that will
495  be moved as rigid bodies.
496  @param list_of_super_rigid_bodies List of lists of domain names
497  that will move together in an additional Monte Carlo move.
498  @param chain_of_super_rigid_bodies List of lists of domain names
499  (choices can only be from the same molecule). Each of these
500  groups will be moved rigidly. This helps to sample more
501  efficiently complex topologies, made of several rigid bodies,
502  connected by flexible linkers.
503  @param sequence_connectivity_scale For scaling the connectivity
504  restraint
505  @param add_each_domain_as_rigid_body That way you don't have to
506  put all of them in the list
507  @param force_create_gmm_files If True, will sample and create GMMs
508  no matter what. If False, will only only sample if the
509  files don't exist. If number of Gaussians is zero, won't
510  do anything.
511  """
512  self.m = model
513  self.simo = IMP.pmi.representation.Representation(self.m,
514  upperharmonic=True,
515  disorderedlength=False)
516 
517  data=component_topologies
518  if list_of_rigid_bodies==[]:
519  print("WARNING: No list of rigid bodies inputted to build_model()")
520  if list_of_super_rigid_bodies==[]:
521  print("WARNING: No list of super rigid bodies inputted to build_model()")
522  if chain_of_super_rigid_bodies==[]:
523  print("WARNING: No chain of super rigid bodies inputted to build_model()")
524  all_dnames = set([d for sublist in list_of_rigid_bodies+list_of_super_rigid_bodies\
525  +chain_of_super_rigid_bodies for d in sublist])
526  all_available = set([c.domain_name for c in component_topologies])
527  if not all_dnames <= all_available:
528  raise ValueError("All requested movers must reference domain "
529  "names in the component topologies")
530 
531  self.domain_dict={}
532  self.resdensities={}
533  super_rigid_bodies={}
534  chain_super_rigid_bodies={}
535  rigid_bodies={}
536 
537  for c in data:
538  comp_name = c.name
539  hier_name = c.domain_name
540  color = c.color
541  fasta_file = c.fasta_file
542  fasta_id = c.fasta_id
543  pdb_name = c.pdb_file
544  chain_id = c.chain
545  res_range = c.residue_range
546  offset = c.pdb_offset
547  bead_size = c.bead_size
548  em_num_components = c.em_residues_per_gaussian
549  em_txt_file_name = c.gmm_file
550  em_mrc_file_name = c.mrc_file
551 
552  if comp_name not in self.simo.get_component_names():
553  self.simo.create_component(comp_name,color=0.0)
554  self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
555 
556  # create hierarchy (adds resolutions, beads) with autobuild and optionally add EM data
557  if em_num_components==0:
558  read_em_files=False
559  include_res0=False
560  else:
561  if (not os.path.isfile(em_txt_file_name)) or force_create_gmm_files:
562  read_em_files=False
563  include_res0=True
564  else:
565  read_em_files=True
566  include_res0=False
567 
568  outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,
569  res_range,include_res0,beadsize=bead_size,
570  color=color,offset=offset)
571  if em_num_components!=0:
572  if read_em_files:
573  print("will read GMM files")
574  else:
575  print("will calculate GMMs")
576 
577  dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,
578  em_mrc_file_name,em_num_components,read_em_files)
579  self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
580  dens_hier+=beads
581  else:
582  dens_hier=[]
583 
584  self.resdensities[hier_name]=dens_hier
585  self.domain_dict[hier_name]=outhier+dens_hier
586 
587  # setup basic restraints
588  for c in self.simo.get_component_names():
589  self.simo.setup_component_sequence_connectivity(c,scale=sequence_connectivity_scale)
590  self.simo.setup_component_geometry(c)
591 
592  # create movers
593  for rblist in list_of_rigid_bodies:
594  rb=[]
595  for rbmember in rblist:
596  rb+=[h for h in self.domain_dict[rbmember]]
597  self.simo.set_rigid_body_from_hierarchies(rb)
598  for srblist in list_of_super_rigid_bodies:
599  srb=[]
600  for srbmember in rblist:
601  srb+=[h for h in self.domain_dict[srbmember]]
602  self.simo.set_super_rigid_body_from_hierarchies(srb)
603  for clist in chain_of_super_rigid_bodies:
604  crb=[]
605  for crbmember in rblist:
606  crb+=[h for h in self.domain_dict[crbmember]]
607  self.simo.set_chain_of_super_rigid_bodies(crb,2,3)
608 
609  self.simo.set_floppy_bodies()
610  self.simo.setup_bonds()
611 
613  '''Return the Representation object'''
614  return self.simo
615 
616  def get_density_hierarchies(self,hier_name_list):
617  # return a list of density hierarchies
618  # specify the list of hierarchy names
619  dens_hier_list=[]
620  for hn in hier_name_list:
621  print(hn)
622  dens_hier_list+=self.resdensities[hn]
623  return dens_hier_list
624 
625  def set_gmm_models_directory(self,directory_name):
626  self.gmm_models_directory=directory_name
627 
628  def get_pdb_bead_bits(self,hierarchy):
629  pdbbits=[]
630  beadbits=[]
631  helixbits=[]
632  for h in hierarchy:
633  if "_pdb" in h.get_name():pdbbits.append(h)
634  if "_bead" in h.get_name():beadbits.append(h)
635  if "_helix" in h.get_name():helixbits.append(h)
636  return (pdbbits,beadbits,helixbits)
637 
638  def scale_bead_radii(self,nresidues,scale):
639  scaled_beads=set()
640  for h in self.domain_dict:
641  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
642  slope=(1.0-scale)/(1.0-float(nresidues))
643 
644  for b in beadbits:
645  # I have to do the following
646  # because otherwise we'll scale more than once
647  if b not in scaled_beads:
648  scaled_beads.add(b)
649  else:
650  continue
651  radius=IMP.core.XYZR(b).get_radius()
652  num_residues=len(IMP.pmi.tools.get_residue_indexes(b))
653  scale_factor=slope*float(num_residues)+1.0
654  print(scale_factor)
655  new_radius=scale_factor*radius
656  IMP.core.XYZR(b).set_radius(new_radius)
657  print(b.get_name())
658  print("particle with radius "+str(radius)+" and "+str(num_residues)+" residues scaled to a new radius "+str(new_radius))
659 
660 
661  def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
662  #density generation for the EM restraint
663  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
664  #get the number of residues from the pdb bits
665  res_ind=[]
666  for pb in pdbbits+helixbits:
667  for p in IMP.core.get_leaves(pb):
669 
670  number_of_residues=len(set(res_ind))
671  outhier=[]
672  if read:
673  if len(pdbbits)!=0:
674  outhier+=simo.add_component_density(compname,
675  pdbbits,
676  num_components=num_components,
677  resolution=0,
678  inputfile=txtfilename)
679  if len(helixbits)!=0:
680  outhier+=simo.add_component_density(compname,
681  helixbits,
682  num_components=num_components,
683  resolution=1,
684  inputfile=txtfilename)
685 
686 
687  else:
688  if len(pdbbits)!=0:
689  num_components=number_of_residues/abs(num_components)
690  outhier+=simo.add_component_density(compname,
691  pdbbits,
692  num_components=num_components,
693  resolution=0,
694  outputfile=txtfilename,
695  outputmap=mrcfilename,
696  multiply_by_total_mass=True)
697 
698  if len(helixbits)!=0:
699  num_components=number_of_residues/abs(num_components)
700  outhier+=simo.add_component_density(compname,
701  helixbits,
702  num_components=num_components,
703  resolution=1,
704  outputfile=txtfilename,
705  outputmap=mrcfilename,
706  multiply_by_total_mass=True)
707 
708  return outhier,beadbits
709 
710  def autobuild(self,simo,comname,pdbname,chain,resrange,include_res0=False,
711  beadsize=5,color=0.0,offset=0):
712  if pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is not "BEADS" :
713  if resrange[-1]==-1:
714  resrange=(resrange[0],len(simo.sequence_dict[comname]))
715  if include_res0:
716  outhier=simo.autobuild_model(comname,
717  pdbname=pdbname,
718  chain=chain,
719  resrange=resrange,
720  resolutions=[0,1,10],
721  offset=offset,
722  color=color,
723  missingbeadsize=beadsize)
724  else:
725  outhier=simo.autobuild_model(comname,
726  pdbname=pdbname,
727  chain=chain,
728  resrange=resrange,
729  resolutions=[1,10],
730  offset=offset,
731  color=color,
732  missingbeadsize=beadsize)
733 
734 
735  elif pdbname is not None and pdbname is "IDEAL_HELIX" and pdbname is not "BEADS" :
736  outhier=simo.add_component_ideal_helix(comname,
737  resolutions=[1,10],
738  resrange=resrange,
739  color=color,
740  show=False)
741 
742  elif pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is "BEADS" :
743  outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
744 
745  else:
746 
747  seq_len=len(simo.sequence_dict[comname])
748  outhier=simo.add_component_necklace(comname,
749  begin=1,
750  end=seq_len,
751  length=beadsize)
752 
753  return outhier
754 
755 
756 class BuildModel1(object):
757  """Deprecated building macro - use BuildModel()"""
758 
759  def __init__(self, representation):
760  """Constructor.
761  @param representation The PMI representation
762  """
763  self.simo=representation
764  self.gmm_models_directory="."
765  self.rmf_file={}
766  self.rmf_frame_number={}
767 
768  def set_gmm_models_directory(self,directory_name):
769  self.gmm_models_directory=directory_name
770 
771  def build_model(self,data_structure,sequence_connectivity_scale=4.0,rmf_file=None,rmf_frame_number=0):
772  """Create model.
773  @param data_structure List of lists containing these entries:
774  comp_name, hier_name, color, fasta_file, fasta_id, pdb_name, chain_id,
775  res_range, read_em_files, bead_size, rb, super_rb,
776  em_num_components, em_txt_file_name, em_mrc_file_name
777  @param sequence_connectivity_scale
778  """
779  self.domain_dict={}
780  self.resdensities={}
781  super_rigid_bodies={}
782  chain_super_rigid_bodies={}
783  rigid_bodies={}
784 
785  for d in data_structure:
786  comp_name = d[0]
787  hier_name = d[1]
788  color = d[2]
789  fasta_file = d[3]
790  fasta_id = d[4]
791  pdb_name = d[5]
792  chain_id = d[6]
793  res_range = d[7][0:2]
794  try:
795  offset = d[7][2]
796  except:
797  offset = 0
798  read_em_files = d[8]
799  bead_size = d[9]
800  rb = d[10]
801  super_rb = d[11]
802  em_num_components = d[12]
803  em_txt_file_name = d[13]
804  em_mrc_file_name = d[14]
805  chain_of_super_rb = d[15]
806 
807  if comp_name not in self.simo.get_component_names():
808  self.simo.create_component(comp_name,color=0.0)
809  self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
810  outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,res_range,read=read_em_files,beadsize=bead_size,color=color,offset=offset)
811 
812 
813  if not read_em_files is None:
814  if em_txt_file_name is " ": em_txt_file_name=self.gmm_models_directory+"/"+hier_name+".txt"
815  if em_mrc_file_name is " ": em_mrc_file_name=self.gmm_models_directory+"/"+hier_name+".mrc"
816 
817 
818  dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,em_mrc_file_name,em_num_components,read_em_files)
819  self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
820  dens_hier+=beads
821 
822  else:
823  dens_hier=[]
824 
825  self.resdensities[hier_name]=dens_hier
826  self.domain_dict[hier_name]=outhier+dens_hier
827 
828  if rb is not None:
829  if rb not in rigid_bodies:
830  rigid_bodies[rb]=[h for h in self.domain_dict[hier_name]]
831  else:
832  rigid_bodies[rb]+=[h for h in self.domain_dict[hier_name]]
833 
834 
835  if super_rb is not None:
836  for k in super_rb:
837  if k not in super_rigid_bodies:
838  super_rigid_bodies[k]=[h for h in self.domain_dict[hier_name]]
839  else:
840  super_rigid_bodies[k]+=[h for h in self.domain_dict[hier_name]]
841 
842  if chain_of_super_rb is not None:
843  for k in chain_of_super_rb:
844  if k not in chain_super_rigid_bodies:
845  chain_super_rigid_bodies[k]=[h for h in self.domain_dict[hier_name]]
846  else:
847  chain_super_rigid_bodies[k]+=[h for h in self.domain_dict[hier_name]]
848 
849 
850 
851  self.rigid_bodies=rigid_bodies
852 
853  for c in self.simo.get_component_names():
854  if rmf_file is not None:
855  rf=rmf_file
856  rfn=rmf_frame_number
857  self.simo.set_coordinates_from_rmf(c, rf,rfn)
858  else:
859  if c in self.rmf_file:
860  rf=self.rmf_file[c]
861  rfn=self.rmf_frame_number[c]
862  self.simo.set_coordinates_from_rmf(c, rf,rfn)
863  self.simo.setup_component_sequence_connectivity(c,scale=sequence_connectivity_scale)
864  self.simo.setup_component_geometry(c)
865 
866  for rb in rigid_bodies:
867  self.simo.set_rigid_body_from_hierarchies(rigid_bodies[rb])
868 
869  for k in super_rigid_bodies:
870  self.simo.set_super_rigid_body_from_hierarchies(super_rigid_bodies[k])
871 
872  for k in chain_super_rigid_bodies:
873  self.simo.set_chain_of_super_rigid_bodies(chain_super_rigid_bodies[k],2,3)
874 
875  self.simo.set_floppy_bodies()
876  self.simo.setup_bonds()
877 
878 
879 
880  def set_rmf_file(self,component_name,rmf_file,rmf_frame_number):
881  self.rmf_file[component_name]=rmf_file
882  self.rmf_frame_number[component_name]=rmf_frame_number
883 
884  def get_density_hierarchies(self,hier_name_list):
885  # return a list of density hierarchies
886  # specify the list of hierarchy names
887  dens_hier_list=[]
888  for hn in hier_name_list:
889  print(hn)
890  dens_hier_list+=self.resdensities[hn]
891  return dens_hier_list
892 
893  def get_pdb_bead_bits(self,hierarchy):
894  pdbbits=[]
895  beadbits=[]
896  helixbits=[]
897  for h in hierarchy:
898  if "_pdb" in h.get_name():pdbbits.append(h)
899  if "_bead" in h.get_name():beadbits.append(h)
900  if "_helix" in h.get_name():helixbits.append(h)
901  return (pdbbits,beadbits,helixbits)
902 
903  def scale_bead_radii(self,nresidues,scale):
904  scaled_beads=set()
905  for h in self.domain_dict:
906  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
907  slope=(1.0-scale)/(1.0-float(nresidues))
908 
909  for b in beadbits:
910  # I have to do the following
911  # because otherwise we'll scale more than once
912  if b not in scaled_beads:
913  scaled_beads.add(b)
914  else:
915  continue
916  radius=IMP.core.XYZR(b).get_radius()
917  num_residues=len(IMP.pmi.tools.get_residue_indexes(b))
918  scale_factor=slope*float(num_residues)+1.0
919  print(scale_factor)
920  new_radius=scale_factor*radius
921  IMP.core.XYZR(b).set_radius(new_radius)
922  print(b.get_name())
923  print("particle with radius "+str(radius)+" and "+str(num_residues)+" residues scaled to a new radius "+str(new_radius))
924 
925 
926  def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
927  #density generation for the EM restraint
928  (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
929 
930  #get the number of residues from the pdb bits
931  res_ind=[]
932  for pb in pdbbits+helixbits:
933  for p in IMP.core.get_leaves(pb):
935 
936  number_of_residues=len(set(res_ind))
937  outhier=[]
938  if read:
939  if len(pdbbits)!=0:
940  outhier+=simo.add_component_density(compname,
941  pdbbits,
942  num_components=num_components, # number of gaussian into which the simulated density is approximated
943  resolution=0, # resolution that you want to calculate the simulated density
944  inputfile=txtfilename) # read what it was calculated before
945  if len(helixbits)!=0:
946  outhier+=simo.add_component_density(compname,
947  helixbits,
948  num_components=num_components, # number of gaussian into which the simulated density is approximated
949  resolution=1, # resolution that you want to calculate the simulated density
950  inputfile=txtfilename) # read what it was calculated before
951 
952 
953  else:
954  if len(pdbbits)!=0:
955  if num_components<0:
956  #if negative calculate the number of gmm components automatically
957  # from the number of residues
958  num_components=number_of_residues/abs(num_components)
959  outhier+=simo.add_component_density(compname,
960  pdbbits,
961  num_components=num_components, # number of gaussian into which the simulated density is approximated
962  resolution=0, # resolution that you want to calculate the simulated density
963  outputfile=txtfilename, # do the calculation
964  outputmap=mrcfilename,
965  multiply_by_total_mass=True) # do the calculation and output the mrc
966 
967  if len(helixbits)!=0:
968  if num_components<0:
969  #if negative calculate the number of gmm components automatically
970  # from the number of residues
971  num_components=number_of_residues/abs(num_components)
972  outhier+=simo.add_component_density(compname,
973  helixbits,
974  num_components=num_components, # number of gaussian into which the simulated density is approximated
975  resolution=1, # resolution that you want to calculate the simulated density
976  outputfile=txtfilename, # do the calculation
977  outputmap=mrcfilename,
978  multiply_by_total_mass=True) # do the calculation and output the mrc
979 
980  return outhier,beadbits
981 
982  def autobuild(self,simo,comname,pdbname,chain,resrange,read=True,beadsize=5,color=0.0,offset=0):
983 
984  if pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is not "BEADS" :
985  if resrange[-1]==-1: resrange=(resrange[0],len(simo.sequence_dict[comname]))
986  if read==False:
987  outhier=simo.autobuild_model(comname,
988  pdbname=pdbname,
989  chain=chain,
990  resrange=resrange,
991  resolutions=[0,1,10],
992  offset=offset,
993  color=color,
994  missingbeadsize=beadsize)
995  else:
996  outhier=simo.autobuild_model(comname,
997  pdbname=pdbname,
998  chain=chain,
999  resrange=resrange,
1000  resolutions=[1,10],
1001  offset=offset,
1002  color=color,
1003  missingbeadsize=beadsize)
1004 
1005 
1006  elif pdbname is not None and pdbname is "IDEAL_HELIX" and pdbname is not "BEADS" :
1007 
1008  outhier=simo.add_component_ideal_helix(comname,
1009  resolutions=[1,10],
1010  resrange=resrange,
1011  color=color,
1012  show=False)
1013 
1014  elif pdbname is not None and pdbname is not "IDEAL_HELIX" and pdbname is "BEADS" :
1015  outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
1016 
1017  else:
1018 
1019  seq_len=len(simo.sequence_dict[comname])
1020  outhier=simo.add_component_necklace(comname,
1021  begin=1,
1022  end=seq_len,
1023  length=beadsize)
1024 
1025  return outhier
1026 
1027 
1028 
1029 # ----------------------------------------------------------------------
1030 
1032  """A macro for running all the basic operations of analysis.
1033  Includes clustering, precision analysis, and making ensemble density maps.
1034  A number of plots are also supported.
1035  """
1036  def __init__(self, model,
1037  merge_directories=["./"],
1038  stat_file_name_suffix="stat",
1039  best_pdb_name_suffix="model",
1040  do_clean_first=True,
1041  do_create_directories=True,
1042  global_output_directory="output/",
1043  replica_stat_file_suffix="stat_replica",
1044  global_analysis_result_directory="./analysis/"):
1045  """Constructor.
1046  @param model The IMP model
1047  @param stat_file_name_suffix
1048  @param merge_directories The directories containing output files
1049  @param best_pdb_name_suffix
1050  @param do_clean_first
1051  @param do_create_directories
1052  @param global_output_directory Where everything is
1053  @param replica_stat_file_suffix
1054  @param global_analysis_result_directory
1055  """
1056 
1057  try:
1058  from mpi4py import MPI
1059  self.comm = MPI.COMM_WORLD
1060  self.rank = self.comm.Get_rank()
1061  self.number_of_processes = self.comm.size
1062  except ImportError:
1063  self.rank = 0
1064  self.number_of_processes = 1
1065 
1066  self.model = model
1067  stat_dir = global_output_directory
1068  self.stat_files = []
1069  # it contains the position of the root directories
1070  for rd in merge_directories:
1071  stat_files = glob.glob(rd + "/" + stat_dir + "/stat.*.out")
1072  if len(stat_files)==0:
1073  print("WARNING: no stat files found in",rd + "/" + stat_dir + "/stat.*.out")
1074  self.stat_files += stat_files
1075 
1076 
1078  score_key="SimplifiedModel_Total_Score_None",
1079  rmf_file_key="rmf_file",
1080  rmf_file_frame_key="rmf_frame_index",
1081  outputdir="./",
1082  get_every=1,
1083  nframes_trajectory=10000):
1084  """ Get a trajectory of the modeling run, for generating demonstrative movies
1085  @param score_key The score for ranking models
1086  @param rmf_file_key Key pointing to RMF filename
1087  @param rmf_file_frame_key Key pointing to RMF frame number
1088  @param outputdir The local output directory used in the run
1089  @param get_every Extract every nth frame
1090  @param nframes_trajectory Total number of frames of the trajectory
1091  """
1092  from operator import itemgetter
1093  import math
1094 
1095  trajectory_models = IMP.pmi.io.get_trajectory_models(self.stat_files,
1096  score_key,
1097  rmf_file_key,
1098  rmf_file_frame_key,
1099  get_every)
1100  rmf_file_list=trajectory_models[0]
1101  rmf_file_frame_list=trajectory_models[1]
1102  score_list=list(map(float, trajectory_models[2]))
1103 
1104  max_score=max(score_list)
1105  min_score=min(score_list)
1106 
1107 
1108  bins=[(max_score-min_score)*math.exp(-float(i))+min_score for i in range(nframes_trajectory)]
1109  binned_scores=[None]*nframes_trajectory
1110  binned_model_indexes=[-1]*nframes_trajectory
1111 
1112  for model_index,s in enumerate(score_list):
1113  bins_score_diffs=[abs(s-b) for b in bins]
1114  bin_index=min(enumerate(bins_score_diffs), key=itemgetter(1))[0]
1115  if binned_scores[bin_index]==None:
1116  binned_scores[bin_index]=s
1117  binned_model_indexes[bin_index]=model_index
1118  else:
1119  old_diff=abs(binned_scores[bin_index]-bins[bin_index])
1120  new_diff=abs(s-bins[bin_index])
1121  if new_diff < old_diff:
1122  binned_scores[bin_index]=s
1123  binned_model_indexes[bin_index]=model_index
1124 
1125  print(binned_scores)
1126  print(binned_model_indexes)
1127 
1128 
1129 
1130  def clustering(self,
1131  score_key="SimplifiedModel_Total_Score_None",
1132  rmf_file_key="rmf_file",
1133  rmf_file_frame_key="rmf_frame_index",
1134  state_number=0,
1135  prefiltervalue=None,
1136  feature_keys=[],
1137  outputdir="./",
1138  alignment_components=None,
1139  number_of_best_scoring_models=10,
1140  rmsd_calculation_components=None,
1141  distance_matrix_file=None,
1142  load_distance_matrix_file=False,
1143  skip_clustering=False,
1144  number_of_clusters=1,
1145  display_plot=False,
1146  exit_after_display=True,
1147  get_every=1,
1148  first_and_last_frames=None,
1149  density_custom_ranges=None,
1150  write_pdb_with_centered_coordinates=False,
1151  voxel_size=5.0):
1152  """ Get the best scoring models, compute a distance matrix, cluster them, and create density maps
1153  @param score_key The score for ranking models
1154  @param rmf_file_key Key pointing to RMF filename
1155  @param rmf_file_frame_key Key pointing to RMF frame number
1156  @param state_number State number to analyse
1157  @param prefiltervalue Only include frames where the score key is below this value
1158  @param feature_keys Keywords for which you want to calculate average,
1159  medians, etc,
1160  @param outputdir The local output directory used in the run
1161  @param alignment_components List of tuples for aligning the structures
1162  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1163  @param number_of_best_scoring_models Num models to keep per run
1164  @param rmsd_calculation_components List of tuples for calculating RMSD
1165  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1166  @param distance_matrix_file Where to store/read the distance matrix
1167  @param load_distance_matrix_file Try to load the distance matrix file
1168  @param skip_clustering Just extract the best scoring models and save the pdbs
1169  @param number_of_clusters Number of k-means clusters
1170  @param display_plot Display the distance matrix
1171  @param exit_after_display Exit after displaying distance matrix
1172  @param get_every Extract every nth frame
1173  @param first_and_last_frames A tuple with the first and last frames to be
1174  analyzed. Values are percentages!
1175  Default: get all frames
1176  @param density_custom_ranges List of tuples or strings for density calculation
1177  e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1178  @param write_pdb_with_centered_coordinates
1179  @param voxel_size Used for the density output
1180  """
1181 
1182 
1183  if self.rank==0:
1184  try:
1185  os.mkdir(outputdir)
1186  except:
1187  pass
1188 
1189 
1190  if not load_distance_matrix_file:
1191  if len(self.stat_files)==0: print("ERROR: no stat file found in the given path"); return
1192  my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
1193  self.stat_files,self.number_of_processes)[self.rank]
1194  best_models = IMP.pmi.io.get_best_models(my_stat_files,
1195  score_key,
1196  feature_keys,
1197  rmf_file_key,
1198  rmf_file_frame_key,
1199  prefiltervalue,
1200  get_every)
1201  rmf_file_list=best_models[0]
1202  rmf_file_frame_list=best_models[1]
1203  score_list=best_models[2]
1204  feature_keyword_list_dict=best_models[3]
1205 
1206 # ------------------------------------------------------------------------
1207 # collect all the files and scores
1208 # ------------------------------------------------------------------------
1209 
1210  if self.number_of_processes > 1:
1211  score_list = IMP.pmi.tools.scatter_and_gather(score_list)
1212  rmf_file_list = IMP.pmi.tools.scatter_and_gather(rmf_file_list)
1213  rmf_file_frame_list = IMP.pmi.tools.scatter_and_gather(
1214  rmf_file_frame_list)
1215  for k in feature_keyword_list_dict:
1216  feature_keyword_list_dict[k] = IMP.pmi.tools.scatter_and_gather(
1217  feature_keyword_list_dict[k])
1218 
1219  # sort by score and get the best scoring ones
1220  score_rmf_tuples = list(zip(score_list,
1221  rmf_file_list,
1222  rmf_file_frame_list,
1223  list(range(len(score_list)))))
1224 
1225  # keep subset of frames if requested
1226  if first_and_last_frames is not None:
1227  nframes = len(score_rmf_tuples)
1228  first_frame = int(first_and_last_frames[0] * nframes)
1229  last_frame = int(first_and_last_frames[1] * nframes)
1230  if last_frame > len(score_rmf_tuples):
1231  last_frame = -1
1232  score_rmf_tuples = score_rmf_tuples[first_frame:last_frame]
1233 
1234  # sort RMFs by the score_key in ascending order, and store the rank
1235  best_score_rmf_tuples = sorted(score_rmf_tuples,
1236  key=lambda x: float(x[0]))[:number_of_best_scoring_models]
1237  best_score_rmf_tuples=[t+(n,) for n,t in enumerate(best_score_rmf_tuples)]
1238 
1239  # sort the feature scores in the same way
1240  best_score_feature_keyword_list_dict = defaultdict(list)
1241  for tpl in best_score_rmf_tuples:
1242  index = tpl[3]
1243  for f in feature_keyword_list_dict:
1244  best_score_feature_keyword_list_dict[f].append(
1245  feature_keyword_list_dict[f][index])
1246 
1247  my_best_score_rmf_tuples = IMP.pmi.tools.chunk_list_into_segments(
1248  best_score_rmf_tuples,
1249  self.number_of_processes)[self.rank]
1250 
1251 
1252 #-------------------------------------------------------------
1253 # read the coordinates
1254 # ------------------------------------------------------------
1255  rmsd_weights = IMP.pmi.io.get_bead_sizes(self.model,
1256  my_best_score_rmf_tuples[0],
1257  rmsd_calculation_components)
1258  got_coords = IMP.pmi.io.read_coordinates_of_rmfs(self.model,
1259  my_best_score_rmf_tuples,
1260  alignment_components,
1261  rmsd_calculation_components)
1262 
1263  # note! the coordinates are simple float tuples, NOT decorators, NOT Vector3D,
1264  # NOR particles, because these object cannot be serialized. We need serialization
1265  # for the parallel computation based on mpi.
1266  all_coordinates=got_coords[0] # dict:key=component name,val=coords per hit
1267  alignment_coordinates=got_coords[1] # same as above, limited to alignment bits
1268  rmsd_coordinates=got_coords[2] # same as above, limited to RMSD bits
1269  rmf_file_name_index_dict=got_coords[3] # dictionary with key=RMF, value=score rank
1270  all_rmf_file_names=got_coords[4] # RMF file per hit
1271 
1272 # ------------------------------------------------------------------------
1273 # optionally don't compute distance matrix or cluster, just write top files
1274 # ------------------------------------------------------------------------
1275  if skip_clustering:
1276  if density_custom_ranges:
1277  DensModule = IMP.pmi.analysis.GetModelDensity(
1278  density_custom_ranges,
1279  voxel=voxel_size)
1280 
1281  dircluster=os.path.join(outputdir,"all_models."+str(n))
1282  try:
1283  os.mkdir(outputdir)
1284  except:
1285  pass
1286  try:
1287  os.mkdir(dircluster)
1288  except:
1289  pass
1290  clusstat=open(os.path.join(dircluster,"stat."+str(self.rank)+".out"),"w")
1291  for cnt,tpl in enumerate(my_best_score_rmf_tuples):
1292  rmf_name=tpl[1]
1293  rmf_frame_number=tpl[2]
1294  tmp_dict={}
1295  index=tpl[4]
1296  for key in best_score_feature_keyword_list_dict:
1297  tmp_dict[key]=best_score_feature_keyword_list_dict[key][index]
1298 
1299  prot=IMP.pmi.analysis.get_hier_from_rmf(self.model,rmf_frame_number,rmf_name,state_number=state_number)
1300  if not prot:
1301  continue
1302 
1303  if cnt==0:
1304  coords_f1=alignment_coordinates[cnt]
1305  if cnt > 0:
1306  coords_f2=alignment_coordinates[cnt]
1307  Ali = IMP.pmi.analysis.Alignment(coords_f1, coords_f2)
1308  transformation = Ali.align()[1]
1309  rbs = set()
1310  for p in IMP.atom.get_leaves(prot):
1311  if not IMP.core.XYZR.get_is_setup(p):
1313  IMP.core.XYZR(p).set_radius(0.0001)
1314  IMP.core.XYZR(p).set_coordinates((0, 0, 0))
1315 
1317  rb = IMP.core.RigidBodyMember(p).get_rigid_body()
1318  rbs.add(rb)
1319  else:
1321  transformation)
1322  for rb in rbs:
1323  IMP.core.transform(rb,transformation)
1324 
1326  out_pdb_fn=os.path.join(dircluster,str(cnt)+"."+str(self.rank)+".pdb")
1327  out_rmf_fn=os.path.join(dircluster,str(cnt)+"."+str(self.rank)+".rmf")
1328  o.init_pdb(out_pdb_fn,prot)
1329  o.write_pdb(out_pdb_fn,
1330  translate_to_geometric_center=write_pdb_with_centered_coordinates)
1331 
1332  tmp_dict["local_pdb_file_name"]=os.path.basename(out_pdb_fn)
1333  tmp_dict["rmf_file_full_path"]=rmf_name
1334  tmp_dict["local_rmf_file_name"]=os.path.basename(out_rmf_fn)
1335  tmp_dict["local_rmf_frame_number"]=0
1336 
1337  clusstat.write(str(tmp_dict)+"\n")
1338  o.init_rmf(out_rmf_fn,[prot])
1339  o.write_rmf(out_rmf_fn)
1340  o.close_rmf(out_rmf_fn)
1341  # add the density
1342  if density_custom_ranges:
1343  DensModule.add_subunits_density(prot)
1344 
1345  if density_custom_ranges:
1346  DensModule.write_mrc(path=dircluster)
1347  del DensModule
1348  return
1349 
1350 
1351 
1352  # broadcast the coordinates
1353  if self.number_of_processes > 1:
1354  all_coordinates = IMP.pmi.tools.scatter_and_gather(
1355  all_coordinates)
1356  all_rmf_file_names = IMP.pmi.tools.scatter_and_gather(
1357  all_rmf_file_names)
1358  rmf_file_name_index_dict = IMP.pmi.tools.scatter_and_gather(
1359  rmf_file_name_index_dict)
1360  alignment_coordinates=IMP.pmi.tools.scatter_and_gather(
1361  alignment_coordinates)
1362  rmsd_coordinates=IMP.pmi.tools.scatter_and_gather(
1363  rmsd_coordinates)
1364 
1365  if self.rank == 0:
1366  # save needed informations in external files
1367  self.save_objects(
1368  [best_score_feature_keyword_list_dict,
1369  rmf_file_name_index_dict],
1370  ".macro.pkl")
1371 
1372 # ------------------------------------------------------------------------
1373 # Calculate distance matrix and cluster
1374 # ------------------------------------------------------------------------
1375  print("setup clustering class")
1376  Clusters = IMP.pmi.analysis.Clustering(rmsd_weights)
1377 
1378  for n, model_coordinate_dict in enumerate(all_coordinates):
1379  template_coordinate_dict = {}
1380  # let's try to align
1381  if alignment_components is not None and len(Clusters.all_coords) == 0:
1382  # set the first model as template coordinates
1383  Clusters.set_template(alignment_coordinates[n])
1384  Clusters.fill(all_rmf_file_names[n], rmsd_coordinates[n])
1385 
1386  print("Global calculating the distance matrix")
1387 
1388  # calculate distance matrix, all against all
1389  Clusters.dist_matrix()
1390 
1391  # perform clustering and optionally display
1392  if self.rank == 0:
1393  Clusters.do_cluster(number_of_clusters)
1394  if display_plot:
1395  if self.rank == 0:
1396  Clusters.plot_matrix(figurename=os.path.join(outputdir,'dist_matrix.pdf'))
1397  if exit_after_display:
1398  exit()
1399  Clusters.save_distance_matrix_file(file_name=distance_matrix_file)
1400 
1401 # ------------------------------------------------------------------------
1402 # Alteratively, load the distance matrix from file and cluster that
1403 # ------------------------------------------------------------------------
1404  else:
1405  if self.rank==0:
1406  print("setup clustering class")
1407  Clusters = IMP.pmi.analysis.Clustering()
1408  Clusters.load_distance_matrix_file(file_name=distance_matrix_file)
1409  print("clustering with %s clusters" % str(number_of_clusters))
1410  Clusters.do_cluster(number_of_clusters)
1411  [best_score_feature_keyword_list_dict,
1412  rmf_file_name_index_dict] = self.load_objects(".macro.pkl")
1413  if display_plot:
1414  if self.rank == 0:
1415  Clusters.plot_matrix(figurename=os.path.join(outputdir,'dist_matrix.pdf'))
1416  if exit_after_display:
1417  exit()
1418  if self.number_of_processes > 1:
1419  self.comm.Barrier()
1420 
1421 # ------------------------------------------------------------------------
1422 # now save all informations about the clusters
1423 # ------------------------------------------------------------------------
1424 
1425  if self.rank == 0:
1426  print(Clusters.get_cluster_labels())
1427  for n, cl in enumerate(Clusters.get_cluster_labels()):
1428  print("rank %s " % str(self.rank))
1429  print("cluster %s " % str(n))
1430  print("cluster label %s " % str(cl))
1431  print(Clusters.get_cluster_label_names(cl))
1432 
1433  # first initialize the Density class if requested
1434 
1435  if density_custom_ranges:
1436  DensModule = IMP.pmi.analysis.GetModelDensity(
1437  density_custom_ranges,
1438  voxel=voxel_size)
1439 
1440  dircluster = outputdir + "/cluster." + str(n) + "/"
1441  try:
1442  os.mkdir(dircluster)
1443  except:
1444  pass
1445 
1446  rmsd_dict = {"AVERAGE_RMSD":
1447  str(Clusters.get_cluster_label_average_rmsd(cl))}
1448  clusstat = open(dircluster + "stat.out", "w")
1449  for k, structure_name in enumerate(Clusters.get_cluster_label_names(cl)):
1450 
1451  # extract the features
1452  tmp_dict = {}
1453  tmp_dict.update(rmsd_dict)
1454  index = rmf_file_name_index_dict[structure_name]
1455  for key in best_score_feature_keyword_list_dict:
1456  tmp_dict[
1457  key] = best_score_feature_keyword_list_dict[
1458  key][
1459  index]
1460  # get the rmf name and the frame number from the list of
1461  # frame names
1462  rmf_name = structure_name.split("|")[0]
1463  rmf_frame_number = int(structure_name.split("|")[1])
1464 
1465  clusstat.write(str(tmp_dict) + "\n")
1466  prot,rs = IMP.pmi.analysis.get_hier_and_restraints_from_rmf(
1467  self.model,
1468  rmf_frame_number,
1469  rmf_name,
1470  state_number)
1471  if not prot:
1472  continue
1473 
1474  if k > 0:
1475  model_index = Clusters.get_model_index_from_name(
1476  structure_name)
1477  transformation = Clusters.get_transformation_to_first_member(
1478  cl,
1479  model_index)
1480  rbs = set()
1481  for p in IMP.atom.get_leaves(prot):
1482  if not IMP.core.XYZR.get_is_setup(p):
1484  IMP.core.XYZR(p).set_radius(0.0001)
1485  IMP.core.XYZR(p).set_coordinates((0, 0, 0))
1486 
1488  rb = IMP.core.RigidBodyMember(p).get_rigid_body()
1489  rbs.add(rb)
1490  else:
1492  transformation)
1493  for rb in rbs:
1494  IMP.core.transform(rb,transformation)
1495 
1496  # add the density
1497  if density_custom_ranges:
1498  DensModule.add_subunits_density(prot)
1499 
1500  # pdb writing should be optimized!
1501  o = IMP.pmi.output.Output()
1502  o.init_pdb(dircluster + str(k) + ".pdb", prot)
1503  o.write_pdb(dircluster + str(k) + ".pdb")
1504 
1505  o.init_rmf(dircluster + str(k) + ".rmf3", [prot],rs)
1506  o.write_rmf(dircluster + str(k) + ".rmf3")
1507  o.close_rmf(dircluster + str(k) + ".rmf3")
1508 
1509  del o
1510  # IMP.atom.destroy(prot)
1511 
1512  if density_custom_ranges:
1513  DensModule.write_mrc(path=dircluster)
1514  del DensModule
1515 
1516  if self.number_of_processes>1:
1517  self.comm.Barrier()
1518 
1519  def save_objects(self, objects, file_name):
1520  import pickle
1521  outf = open(file_name, 'wb')
1522  pickle.dump(objects, outf)
1523  outf.close()
1524 
1525  def load_objects(self, file_name):
1526  import pickle
1527  inputf = open(file_name, 'rb')
1528  objects = pickle.load(inputf)
1529  inputf.close()
1530  return objects
A macro for running all the basic operations of analysis.
Definition: macros.py:1031
Sample using molecular dynamics.
Definition: samplers.py:340
A member of a rigid body, it has internal (local) coordinates.
Definition: rigid_bodies.h:368
def __init__
Constructor.
Definition: macros.py:500
Utility classes and functions for reading and storing PMI files.
def get_best_models
Given a list of stat files, read them all and find the best models.
Miscellaneous utilities.
Definition: tools.py:1
def __init__
Constructor.
Definition: macros.py:62
def clustering
Get the best scoring models, compute a distance matrix, cluster them, and create density maps...
Definition: macros.py:1130
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
A class to cluster structures.
Representation of the system.
def get_modeling_trajectory
Get a trajectory of the modeling run, for generating demonstrative movies.
Definition: macros.py:1077
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Definition: XYZR.h:47
static XYZR setup_particle(kernel::Model *m, ParticleIndex pi)
Definition: XYZR.h:48
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def build_model
Create model.
Definition: macros.py:771
Performs alignment and RMSD calculation for two sets of coordinates.
Definition: pmi/Analysis.py:23
def scatter_and_gather
Synchronize data over a parallel run.
Definition: tools.py:972
void transform(XYZ a, const algebra::Transformation3D &tr)
Apply a transformation to the particle.
Hierarchies get_by_type(Hierarchy mhd, GetByType t)
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def BuildModel0
Construct a component for each subunit (no splitting, nothing fancy).
Definition: macros.py:394
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Definition: rigid_bodies.h:369
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:20
def __init__
Constructor.
Definition: macros.py:759
A macro to build a Representation based on a Topology and lists of movers.
Definition: macros.py:478
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
Classes for writing output files and processing them.
Definition: output.py:1
Sampling of the system.
Definition: samplers.py:1
Sample using Monte Carlo.
Definition: samplers.py:33
Deprecated building macro - use BuildModel()
Definition: macros.py:756
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Set up the representation of all proteins and nucleic acid macromolecules.
A macro to help setup and run replica exchange.
Definition: macros.py:20
Hierarchies get_leaves(const Selection &h)
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:930
Sample using replica exchange.
Definition: samplers.py:425
def get_representation
Return the Representation object.
Definition: macros.py:612
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27