1 """@namespace IMP.pmi.macros
2 Protocols for sampling structures and analyzing them.
5 from __future__
import print_function
16 from operator
import itemgetter
17 from collections
import defaultdict
21 """A macro to help setup and run replica exchange.
22 Supports Monte Carlo and molecular dynamics.
23 Produces trajectory RMF files, best PDB structures,
24 and output stat files.
30 monte_carlo_sample_objects=
None,
31 molecular_dynamics_sample_objects=
None,
33 crosslink_restraints=
None,
34 monte_carlo_temperature=1.0,
35 simulated_annealing=
False,
36 simulated_annealing_minimum_temperature=1.0,
37 simulated_annealing_maximum_temperature=2.5,
38 simulated_annealing_minimum_temperature_nframes=100,
39 simulated_annealing_maximum_temperature_nframes=100,
40 replica_exchange_minimum_temperature=1.0,
41 replica_exchange_maximum_temperature=2.5,
43 number_of_best_scoring_models=500,
45 molecular_dynamics_steps=10,
46 molecular_dynamics_max_time_step=1.0,
47 number_of_frames=1000,
48 nframes_write_coordinates=1,
49 write_initial_rmf=
True,
50 initial_rmf_name_suffix=
"initial",
51 stat_file_name_suffix=
"stat",
52 best_pdb_name_suffix=
"model",
54 do_create_directories=
True,
55 global_output_directory=
"./",
58 replica_stat_file_suffix=
"stat_replica",
59 em_object_for_rmf=
None,
61 replica_exchange_object=
None):
63 @param model The IMP model
64 @param representation PMI.representation.Representation object
65 (or list of them, for multi-state modeling)
66 @param root_hier Instead of passing Representation, just pass
68 @param monte_carlo_sample_objcts Objects for MC sampling; a list of
69 structural components (generally the representation) that will
70 be moved and restraints with parameters that need to
72 @param molecular_dynamics_sample_objects Objects for MD sampling
73 @param output_objects A list of structural objects and restraints
74 that will be included in output (statistics files). Any object
75 that provides a get_output() method can be used here.
76 @param crosslink_restraints List of cross-link restraints that will
77 be included in output RMF files (for visualization).
78 @param monte_carlo_temperature MC temp (may need to be optimized
79 based on post-sampling analysis)
80 @param simulated_annealing If True, perform simulated annealing
81 @param simulated_annealing_minimum_temperature Should generally be
82 the same as monte_carlo_temperature.
83 @param simulated_annealing_minimum_temperature_frames Number of
84 frames to compute at minimum temperature.
85 @param simulated_annealing_maximum_temperature_frames Number of
87 temps > simulated_annealing_maximum_temperature.
88 @param replica_exchange_minimum_temperature Low temp for REX; should
89 generally be the same as monte_carlo_temperature.
90 @param replica_exchange_maximum_temperature High temp for REX
91 @param num_sample_rounds Number of rounds of MC/MD per cycle
92 @param number_of_best_scoring_models Number of top-scoring PDB models
93 to keep around for analysis
94 @param monte_carlo_steps Number of MC steps per round
95 @param molecular_dynamics_steps Number of MD steps per round
96 @param molecular_dynamics_max_time_step Max time step for MD
97 @param number_of_frames Number of REX frames to run
98 @param nframes_write_coordinates How often to write the coordinates
100 @param write_initial_rmf Write the initial configuration
101 @param global_output_directory Folder that will be created to house
109 if type(representation) == list:
110 self.is_multi_state =
True
111 self.root_hiers = [r.prot
for r
in representation]
112 self.vars[
"number_of_states"] = len(representation)
114 self.is_multi_state =
False
115 self.root_hier = representation.prot
116 self.vars[
"number_of_states"] = 1
119 self.vars[
"number_of_states"] = len(states)
121 self.root_hiers = states
122 self.is_multi_state =
True
124 self.root_hier = root_hier
125 self.is_multi_state =
False
127 print(
"ERROR: Must provide representation or root_hier")
130 self.crosslink_restraints = crosslink_restraints
131 self.em_object_for_rmf = em_object_for_rmf
132 self.monte_carlo_sample_objects = monte_carlo_sample_objects
133 if sample_objects
is not None:
134 self.monte_carlo_sample_objects+=sample_objects
135 self.molecular_dynamics_sample_objects=molecular_dynamics_sample_objects
136 self.output_objects = output_objects
137 self.replica_exchange_object = replica_exchange_object
138 self.molecular_dynamics_max_time_step = molecular_dynamics_max_time_step
139 self.vars[
"monte_carlo_temperature"] = monte_carlo_temperature
141 "replica_exchange_minimum_temperature"] = replica_exchange_minimum_temperature
143 "replica_exchange_maximum_temperature"] = replica_exchange_maximum_temperature
145 self.vars[
"simulated_annealing"]=\
147 self.vars[
"simulated_annealing_minimum_temperature"]=\
148 simulated_annealing_minimum_temperature
149 self.vars[
"simulated_annealing_maximum_temperature"]=\
150 simulated_annealing_maximum_temperature
151 self.vars[
"simulated_annealing_minimum_temperature_nframes"]=\
152 simulated_annealing_minimum_temperature_nframes
153 self.vars[
"simulated_annealing_maximum_temperature_nframes"]=\
154 simulated_annealing_maximum_temperature_nframes
156 self.vars[
"num_sample_rounds"] = num_sample_rounds
158 "number_of_best_scoring_models"] = number_of_best_scoring_models
159 self.vars[
"monte_carlo_steps"] = monte_carlo_steps
160 self.vars[
"molecular_dynamics_steps"]=molecular_dynamics_steps
161 self.vars[
"number_of_frames"] = number_of_frames
162 self.vars[
"nframes_write_coordinates"] = nframes_write_coordinates
163 self.vars[
"write_initial_rmf"] = write_initial_rmf
164 self.vars[
"initial_rmf_name_suffix"] = initial_rmf_name_suffix
165 self.vars[
"best_pdb_name_suffix"] = best_pdb_name_suffix
166 self.vars[
"stat_file_name_suffix"] = stat_file_name_suffix
167 self.vars[
"do_clean_first"] = do_clean_first
168 self.vars[
"do_create_directories"] = do_create_directories
169 self.vars[
"global_output_directory"] = global_output_directory
170 self.vars[
"rmf_dir"] = rmf_dir
171 self.vars[
"best_pdb_dir"] = best_pdb_dir
172 self.vars[
"atomistic"] = atomistic
173 self.vars[
"replica_stat_file_suffix"] = replica_stat_file_suffix
176 print(
"ReplicaExchange0: it generates initial.*.rmf3, stat.*.out, rmfs/*.rmf3 for each replica ")
177 print(
"--- it stores the best scoring pdb models in pdbs/")
178 print(
"--- the stat.*.out and rmfs/*.rmf3 are saved only at the lowest temperature")
179 print(
"--- variables:")
180 keys = list(self.vars.keys())
183 print(
"------", v.ljust(30), self.vars[v])
185 def get_replica_exchange_object(self):
186 return self.replica_exchange_object
188 def execute_macro(self):
190 temp_index_factor = 100000.0
194 if self.monte_carlo_sample_objects
is not None:
195 print(
"Setting up MonteCarlo")
197 self.monte_carlo_sample_objects,
198 self.vars[
"monte_carlo_temperature"])
199 if self.vars[
"simulated_annealing"]:
200 tmin=self.vars[
"simulated_annealing_minimum_temperature"]
201 tmax=self.vars[
"simulated_annealing_maximum_temperature"]
202 nfmin=self.vars[
"simulated_annealing_minimum_temperature_nframes"]
203 nfmax=self.vars[
"simulated_annealing_maximum_temperature_nframes"]
204 sampler_mc.set_simulated_annealing(tmin,tmax,nfmin,nfmax)
205 self.output_objects.append(sampler_mc)
206 samplers.append(sampler_mc)
209 if self.molecular_dynamics_sample_objects
is not None:
210 print(
"Setting up MolecularDynamics")
212 self.molecular_dynamics_sample_objects,
213 self.vars[
"monte_carlo_temperature"],
214 maximum_time_step=self.molecular_dynamics_max_time_step)
215 if self.vars[
"simulated_annealing"]:
216 tmin=self.vars[
"simulated_annealing_minimum_temperature"]
217 tmax=self.vars[
"simulated_annealing_maximum_temperature"]
218 nfmin=self.vars[
"simulated_annealing_minimum_temperature_nframes"]
219 nfmax=self.vars[
"simulated_annealing_maximum_temperature_nframes"]
220 sampler_md.set_simulated_annealing(tmin,tmax,nfmin,nfmax)
221 self.output_objects.append(sampler_md)
222 samplers.append(sampler_md)
225 print(
"Setting up ReplicaExchange")
228 "replica_exchange_minimum_temperature"],
230 "replica_exchange_maximum_temperature"],
232 replica_exchange_object=self.replica_exchange_object)
233 self.replica_exchange_object = rex.rem
235 myindex = rex.get_my_index()
236 self.output_objects.append(rex)
241 min_temp_index = int(min(rex.get_temperatures()) * temp_index_factor)
245 globaldir = self.vars[
"global_output_directory"] +
"/"
246 rmf_dir = globaldir + self.vars[
"rmf_dir"]
247 pdb_dir = globaldir + self.vars[
"best_pdb_dir"]
249 if self.vars[
"do_clean_first"]:
252 if self.vars[
"do_create_directories"]:
255 os.makedirs(globaldir)
263 if not self.is_multi_state:
269 for n
in range(self.vars[
"number_of_states"]):
271 os.makedirs(pdb_dir +
"/" + str(n))
277 sw = IMP.pmi.tools.Stopwatch()
278 self.output_objects.append(sw)
280 print(
"Setting up stat file")
282 low_temp_stat_file = globaldir + \
283 self.vars[
"stat_file_name_suffix"] +
"." + str(myindex) +
".out"
284 output.init_stat2(low_temp_stat_file,
286 extralabels=[
"rmf_file",
"rmf_frame_index"])
288 print(
"Setting up replica stat file")
289 replica_stat_file = globaldir + \
290 self.vars[
"replica_stat_file_suffix"] +
"." + str(myindex) +
".out"
291 output.init_stat2(replica_stat_file, [rex], extralabels=[
"score"])
293 print(
"Setting up best pdb files")
294 if not self.is_multi_state:
295 if self.vars[
"number_of_best_scoring_models"] > 0:
296 output.init_pdb_best_scoring(pdb_dir +
"/" +
297 self.vars[
"best_pdb_name_suffix"],
300 "number_of_best_scoring_models"],
301 replica_exchange=
True)
302 output.write_psf(pdb_dir +
"/" +
"model.psf",pdb_dir +
"/" +
303 self.vars[
"best_pdb_name_suffix"]+
".0.pdb")
305 if self.vars[
"number_of_best_scoring_models"] > 0:
306 for n
in range(self.vars[
"number_of_states"]):
307 output.init_pdb_best_scoring(pdb_dir +
"/" + str(n) +
"/" +
308 self.vars[
"best_pdb_name_suffix"],
311 "number_of_best_scoring_models"],
312 replica_exchange=
True)
313 output.write_psf(pdb_dir +
"/" + str(n) +
"/" +
"model.psf",pdb_dir +
"/" + str(n) +
"/" +
314 self.vars[
"best_pdb_name_suffix"]+
".0.pdb")
317 if not self.em_object_for_rmf
is None:
318 if not self.is_multi_state:
319 output_hierarchies = [
321 self.em_object_for_rmf.get_density_as_hierarchy(
324 output_hierarchies = self.root_hiers
325 output_hierarchies.append(
326 self.em_object_for_rmf.get_density_as_hierarchy())
328 if not self.is_multi_state:
329 output_hierarchies = [self.root_hier]
331 output_hierarchies = self.root_hiers
334 print(
"Setting up and writing initial rmf coordinate file")
335 init_suffix = globaldir + self.vars[
"initial_rmf_name_suffix"]
336 output.init_rmf(init_suffix +
"." + str(myindex) +
".rmf3",
338 if self.crosslink_restraints:
339 output.add_restraints_to_rmf(
340 init_suffix +
"." + str(myindex) +
".rmf3",
341 self.crosslink_restraints)
342 output.write_rmf(init_suffix +
"." + str(myindex) +
".rmf3")
343 output.close_rmf(init_suffix +
"." + str(myindex) +
".rmf3")
347 print(
"Setting up production rmf files")
349 rmfname = rmf_dir +
"/" + str(myindex) +
".rmf3"
350 output.init_rmf(rmfname, output_hierarchies)
352 if self.crosslink_restraints:
353 output.add_restraints_to_rmf(rmfname, self.crosslink_restraints)
355 ntimes_at_low_temp = 0
360 for i
in range(self.vars[
"number_of_frames"]):
361 for nr
in range(self.vars[
"num_sample_rounds"]):
362 if sampler_md
is not None:
363 sampler_md.optimize(self.vars[
"molecular_dynamics_steps"])
364 if sampler_mc
is not None:
365 sampler_mc.optimize(self.vars[
"monte_carlo_steps"])
366 score = self.model.evaluate(
False)
367 output.set_output_entry(
"score", score)
369 my_temp_index = int(rex.get_my_temp() * temp_index_factor)
371 if min_temp_index == my_temp_index:
372 print(
"--- frame %s score %s " % (str(i), str(score)))
374 if i % self.vars[
"nframes_write_coordinates"]==0:
375 print(
'--- writing coordinates')
376 if self.vars[
"number_of_best_scoring_models"] > 0:
377 output.write_pdb_best_scoring(score)
378 output.write_rmf(rmfname)
379 output.set_output_entry(
"rmf_file", rmfname)
380 output.set_output_entry(
"rmf_frame_index", ntimes_at_low_temp)
382 output.set_output_entry(
"rmf_file", rmfname)
383 output.set_output_entry(
"rmf_frame_index",
'-1')
384 output.write_stat2(low_temp_stat_file)
385 ntimes_at_low_temp += 1
387 output.write_stat2(replica_stat_file)
388 rex.swap_temp(i, score)
399 missing_bead_size=20,
400 residue_per_gaussian=
None):
402 Construct a component for each subunit (no splitting, nothing fancy).
404 You can pass the resolutions and the bead size for the missing residue regions.
405 To use this macro, you must provide the following data structure:
407 Component pdbfile chainid rgb color fastafile sequence id
410 data = [("Rpb1", pdbfile, "A", 0.00000000, (fastafile, 0)),
411 ("Rpb2", pdbfile, "B", 0.09090909, (fastafile, 1)),
412 ("Rpb3", pdbfile, "C", 0.18181818, (fastafile, 2)),
413 ("Rpb4", pdbfile, "D", 0.27272727, (fastafile, 3)),
414 ("Rpb5", pdbfile, "E", 0.36363636, (fastafile, 4)),
415 ("Rpb6", pdbfile, "F", 0.45454545, (fastafile, 5)),
416 ("Rpb7", pdbfile, "G", 0.54545455, (fastafile, 6)),
417 ("Rpb8", pdbfile, "H", 0.63636364, (fastafile, 7)),
418 ("Rpb9", pdbfile, "I", 0.72727273, (fastafile, 8)),
419 ("Rpb10", pdbfile, "L", 0.81818182, (fastafile, 9)),
420 ("Rpb11", pdbfile, "J", 0.90909091, (fastafile, 10)),
421 ("Rpb12", pdbfile, "K", 1.00000000, (fastafile, 11))]
432 component_name = d[0]
438 fastids = IMP.pmi.tools.get_ids_from_fasta_file(fasta_file)
439 fasta_file_id = d[4][1]
441 r.create_component(component_name,
444 r.add_component_sequence(component_name,
446 id=fastids[fasta_file_id])
448 hierarchies = r.autobuild_model(component_name,
451 resolutions=resolutions,
452 missingbeadsize=missing_bead_size)
454 r.show_component_table(component_name)
456 r.set_rigid_bodies([component_name])
458 r.set_chain_of_super_rigid_bodies(
463 r.setup_component_sequence_connectivity(component_name, resolution=1)
464 r.setup_component_geometry(component_name)
468 r.set_floppy_bodies()
471 r.set_current_coordinates_as_reference_for_rmsd(
"Reference")
479 """A macro to build a Representation based on a Topology and lists of movers
483 component_topologies,
484 list_of_rigid_bodies=[],
485 list_of_super_rigid_bodies=[],
486 chain_of_super_rigid_bodies=[],
487 sequence_connectivity_scale=4.0,
488 add_each_domain_as_rigid_body=
False,
489 force_create_gmm_files=
False):
491 @param model The IMP model
492 @param component_topologies List of
493 IMP.pmi.topology.ComponentTopology items
494 @param list_of_rigid_bodies List of lists of domain names that will
495 be moved as rigid bodies.
496 @param list_of_super_rigid_bodies List of lists of domain names
497 that will move together in an additional Monte Carlo move.
498 @param chain_of_super_rigid_bodies List of lists of domain names
499 (choices can only be from the same molecule). Each of these
500 groups will be moved rigidly. This helps to sample more
501 efficiently complex topologies, made of several rigid bodies,
502 connected by flexible linkers.
503 @param sequence_connectivity_scale For scaling the connectivity
505 @param add_each_domain_as_rigid_body That way you don't have to
506 put all of them in the list
507 @param force_create_gmm_files If True, will sample and create GMMs
508 no matter what. If False, will only only sample if the
509 files don't exist. If number of Gaussians is zero, won't
515 disorderedlength=
False)
517 data=component_topologies
518 if list_of_rigid_bodies==[]:
519 print(
"WARNING: No list of rigid bodies inputted to build_model()")
520 if list_of_super_rigid_bodies==[]:
521 print(
"WARNING: No list of super rigid bodies inputted to build_model()")
522 if chain_of_super_rigid_bodies==[]:
523 print(
"WARNING: No chain of super rigid bodies inputted to build_model()")
524 all_dnames = set([d
for sublist
in list_of_rigid_bodies+list_of_super_rigid_bodies\
525 +chain_of_super_rigid_bodies
for d
in sublist])
526 all_available = set([c.domain_name
for c
in component_topologies])
527 if not all_dnames <= all_available:
528 raise ValueError(
"All requested movers must reference domain "
529 "names in the component topologies")
533 super_rigid_bodies={}
534 chain_super_rigid_bodies={}
539 hier_name = c.domain_name
541 fasta_file = c.fasta_file
542 fasta_id = c.fasta_id
543 pdb_name = c.pdb_file
545 res_range = c.residue_range
546 offset = c.pdb_offset
547 bead_size = c.bead_size
548 em_num_components = c.em_residues_per_gaussian
549 em_txt_file_name = c.gmm_file
550 em_mrc_file_name = c.mrc_file
552 if comp_name
not in self.simo.get_component_names():
553 self.simo.create_component(comp_name,color=0.0)
554 self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
557 if em_num_components==0:
561 if (
not os.path.isfile(em_txt_file_name))
or force_create_gmm_files:
568 outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,
569 res_range,include_res0,beadsize=bead_size,
570 color=color,offset=offset)
571 if em_num_components!=0:
573 print(
"will read GMM files")
575 print(
"will calculate GMMs")
577 dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,
578 em_mrc_file_name,em_num_components,read_em_files)
579 self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
584 self.resdensities[hier_name]=dens_hier
585 self.domain_dict[hier_name]=outhier+dens_hier
588 for c
in self.simo.get_component_names():
589 self.simo.setup_component_sequence_connectivity(c,scale=sequence_connectivity_scale)
590 self.simo.setup_component_geometry(c)
593 for rblist
in list_of_rigid_bodies:
595 for rbmember
in rblist:
596 rb+=[h
for h
in self.domain_dict[rbmember]]
597 self.simo.set_rigid_body_from_hierarchies(rb)
598 for srblist
in list_of_super_rigid_bodies:
600 for srbmember
in rblist:
601 srb+=[h
for h
in self.domain_dict[srbmember]]
602 self.simo.set_super_rigid_body_from_hierarchies(srb)
603 for clist
in chain_of_super_rigid_bodies:
605 for crbmember
in rblist:
606 crb+=[h
for h
in self.domain_dict[crbmember]]
607 self.simo.set_chain_of_super_rigid_bodies(crb,2,3)
609 self.simo.set_floppy_bodies()
610 self.simo.setup_bonds()
613 '''Return the Representation object'''
616 def get_density_hierarchies(self,hier_name_list):
620 for hn
in hier_name_list:
622 dens_hier_list+=self.resdensities[hn]
623 return dens_hier_list
625 def set_gmm_models_directory(self,directory_name):
626 self.gmm_models_directory=directory_name
628 def get_pdb_bead_bits(self,hierarchy):
633 if "_pdb" in h.get_name():pdbbits.append(h)
634 if "_bead" in h.get_name():beadbits.append(h)
635 if "_helix" in h.get_name():helixbits.append(h)
636 return (pdbbits,beadbits,helixbits)
638 def scale_bead_radii(self,nresidues,scale):
640 for h
in self.domain_dict:
641 (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
642 slope=(1.0-scale)/(1.0-float(nresidues))
647 if b
not in scaled_beads:
653 scale_factor=slope*float(num_residues)+1.0
655 new_radius=scale_factor*radius
658 print(
"particle with radius "+str(radius)+
" and "+str(num_residues)+
" residues scaled to a new radius "+str(new_radius))
661 def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
663 (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
666 for pb
in pdbbits+helixbits:
670 number_of_residues=len(set(res_ind))
674 outhier+=simo.add_component_density(compname,
676 num_components=num_components,
678 inputfile=txtfilename)
679 if len(helixbits)!=0:
680 outhier+=simo.add_component_density(compname,
682 num_components=num_components,
684 inputfile=txtfilename)
689 num_components=number_of_residues/abs(num_components)
690 outhier+=simo.add_component_density(compname,
692 num_components=num_components,
694 outputfile=txtfilename,
695 outputmap=mrcfilename,
696 multiply_by_total_mass=
True)
698 if len(helixbits)!=0:
699 num_components=number_of_residues/abs(num_components)
700 outhier+=simo.add_component_density(compname,
702 num_components=num_components,
704 outputfile=txtfilename,
705 outputmap=mrcfilename,
706 multiply_by_total_mass=
True)
708 return outhier,beadbits
710 def autobuild(self,simo,comname,pdbname,chain,resrange,include_res0=False,
711 beadsize=5,color=0.0,offset=0):
712 if pdbname
is not None and pdbname
is not "IDEAL_HELIX" and pdbname
is not "BEADS" :
714 resrange=(resrange[0],len(simo.sequence_dict[comname]))
716 outhier=simo.autobuild_model(comname,
720 resolutions=[0,1,10],
723 missingbeadsize=beadsize)
725 outhier=simo.autobuild_model(comname,
732 missingbeadsize=beadsize)
735 elif pdbname
is not None and pdbname
is "IDEAL_HELIX" and pdbname
is not "BEADS" :
736 outhier=simo.add_component_ideal_helix(comname,
742 elif pdbname
is not None and pdbname
is not "IDEAL_HELIX" and pdbname
is "BEADS" :
743 outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
747 seq_len=len(simo.sequence_dict[comname])
748 outhier=simo.add_component_necklace(comname,
757 """Deprecated building macro - use BuildModel()"""
761 @param representation The PMI representation
763 self.simo=representation
764 self.gmm_models_directory=
"."
766 self.rmf_frame_number={}
768 def set_gmm_models_directory(self,directory_name):
769 self.gmm_models_directory=directory_name
771 def build_model(self,data_structure,sequence_connectivity_scale=4.0,rmf_file=None,rmf_frame_number=0):
773 @param data_structure List of lists containing these entries:
774 comp_name, hier_name, color, fasta_file, fasta_id, pdb_name, chain_id,
775 res_range, read_em_files, bead_size, rb, super_rb,
776 em_num_components, em_txt_file_name, em_mrc_file_name
777 @param sequence_connectivity_scale
781 super_rigid_bodies={}
782 chain_super_rigid_bodies={}
785 for d
in data_structure:
793 res_range = d[7][0:2]
802 em_num_components = d[12]
803 em_txt_file_name = d[13]
804 em_mrc_file_name = d[14]
805 chain_of_super_rb = d[15]
807 if comp_name
not in self.simo.get_component_names():
808 self.simo.create_component(comp_name,color=0.0)
809 self.simo.add_component_sequence(comp_name,fasta_file,fasta_id)
810 outhier=self.autobuild(self.simo,comp_name,pdb_name,chain_id,res_range,read=read_em_files,beadsize=bead_size,color=color,offset=offset)
813 if not read_em_files
is None:
814 if em_txt_file_name
is " ": em_txt_file_name=self.gmm_models_directory+
"/"+hier_name+
".txt"
815 if em_mrc_file_name
is " ": em_mrc_file_name=self.gmm_models_directory+
"/"+hier_name+
".mrc"
818 dens_hier,beads=self.create_density(self.simo,comp_name,outhier,em_txt_file_name,em_mrc_file_name,em_num_components,read_em_files)
819 self.simo.add_all_atom_densities(comp_name, hierarchies=beads)
825 self.resdensities[hier_name]=dens_hier
826 self.domain_dict[hier_name]=outhier+dens_hier
829 if rb
not in rigid_bodies:
830 rigid_bodies[rb]=[h
for h
in self.domain_dict[hier_name]]
832 rigid_bodies[rb]+=[h
for h
in self.domain_dict[hier_name]]
835 if super_rb
is not None:
837 if k
not in super_rigid_bodies:
838 super_rigid_bodies[k]=[h
for h
in self.domain_dict[hier_name]]
840 super_rigid_bodies[k]+=[h
for h
in self.domain_dict[hier_name]]
842 if chain_of_super_rb
is not None:
843 for k
in chain_of_super_rb:
844 if k
not in chain_super_rigid_bodies:
845 chain_super_rigid_bodies[k]=[h
for h
in self.domain_dict[hier_name]]
847 chain_super_rigid_bodies[k]+=[h
for h
in self.domain_dict[hier_name]]
851 self.rigid_bodies=rigid_bodies
853 for c
in self.simo.get_component_names():
854 if rmf_file
is not None:
857 self.simo.set_coordinates_from_rmf(c, rf,rfn)
859 if c
in self.rmf_file:
861 rfn=self.rmf_frame_number[c]
862 self.simo.set_coordinates_from_rmf(c, rf,rfn)
863 self.simo.setup_component_sequence_connectivity(c,scale=sequence_connectivity_scale)
864 self.simo.setup_component_geometry(c)
866 for rb
in rigid_bodies:
867 self.simo.set_rigid_body_from_hierarchies(rigid_bodies[rb])
869 for k
in super_rigid_bodies:
870 self.simo.set_super_rigid_body_from_hierarchies(super_rigid_bodies[k])
872 for k
in chain_super_rigid_bodies:
873 self.simo.set_chain_of_super_rigid_bodies(chain_super_rigid_bodies[k],2,3)
875 self.simo.set_floppy_bodies()
876 self.simo.setup_bonds()
880 def set_rmf_file(self,component_name,rmf_file,rmf_frame_number):
881 self.rmf_file[component_name]=rmf_file
882 self.rmf_frame_number[component_name]=rmf_frame_number
884 def get_density_hierarchies(self,hier_name_list):
888 for hn
in hier_name_list:
890 dens_hier_list+=self.resdensities[hn]
891 return dens_hier_list
893 def get_pdb_bead_bits(self,hierarchy):
898 if "_pdb" in h.get_name():pdbbits.append(h)
899 if "_bead" in h.get_name():beadbits.append(h)
900 if "_helix" in h.get_name():helixbits.append(h)
901 return (pdbbits,beadbits,helixbits)
903 def scale_bead_radii(self,nresidues,scale):
905 for h
in self.domain_dict:
906 (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(self.domain_dict[h])
907 slope=(1.0-scale)/(1.0-float(nresidues))
912 if b
not in scaled_beads:
918 scale_factor=slope*float(num_residues)+1.0
920 new_radius=scale_factor*radius
923 print(
"particle with radius "+str(radius)+
" and "+str(num_residues)+
" residues scaled to a new radius "+str(new_radius))
926 def create_density(self,simo,compname,comphier,txtfilename,mrcfilename,num_components,read=True):
928 (pdbbits,beadbits,helixbits)=self.get_pdb_bead_bits(comphier)
932 for pb
in pdbbits+helixbits:
936 number_of_residues=len(set(res_ind))
940 outhier+=simo.add_component_density(compname,
942 num_components=num_components,
944 inputfile=txtfilename)
945 if len(helixbits)!=0:
946 outhier+=simo.add_component_density(compname,
948 num_components=num_components,
950 inputfile=txtfilename)
958 num_components=number_of_residues/abs(num_components)
959 outhier+=simo.add_component_density(compname,
961 num_components=num_components,
963 outputfile=txtfilename,
964 outputmap=mrcfilename,
965 multiply_by_total_mass=
True)
967 if len(helixbits)!=0:
971 num_components=number_of_residues/abs(num_components)
972 outhier+=simo.add_component_density(compname,
974 num_components=num_components,
976 outputfile=txtfilename,
977 outputmap=mrcfilename,
978 multiply_by_total_mass=
True)
980 return outhier,beadbits
982 def autobuild(self,simo,comname,pdbname,chain,resrange,read=True,beadsize=5,color=0.0,offset=0):
984 if pdbname
is not None and pdbname
is not "IDEAL_HELIX" and pdbname
is not "BEADS" :
985 if resrange[-1]==-1: resrange=(resrange[0],len(simo.sequence_dict[comname]))
987 outhier=simo.autobuild_model(comname,
991 resolutions=[0,1,10],
994 missingbeadsize=beadsize)
996 outhier=simo.autobuild_model(comname,
1003 missingbeadsize=beadsize)
1006 elif pdbname
is not None and pdbname
is "IDEAL_HELIX" and pdbname
is not "BEADS" :
1008 outhier=simo.add_component_ideal_helix(comname,
1014 elif pdbname
is not None and pdbname
is not "IDEAL_HELIX" and pdbname
is "BEADS" :
1015 outhier=simo.add_component_necklace(comname,resrange[0],resrange[1],beadsize,color=color)
1019 seq_len=len(simo.sequence_dict[comname])
1020 outhier=simo.add_component_necklace(comname,
1032 """A macro for running all the basic operations of analysis.
1033 Includes clustering, precision analysis, and making ensemble density maps.
1034 A number of plots are also supported.
1037 merge_directories=[
"./"],
1038 stat_file_name_suffix=
"stat",
1039 best_pdb_name_suffix=
"model",
1040 do_clean_first=
True,
1041 do_create_directories=
True,
1042 global_output_directory=
"output/",
1043 replica_stat_file_suffix=
"stat_replica",
1044 global_analysis_result_directory=
"./analysis/"):
1046 @param model The IMP model
1047 @param stat_file_name_suffix
1048 @param merge_directories The directories containing output files
1049 @param best_pdb_name_suffix
1050 @param do_clean_first
1051 @param do_create_directories
1052 @param global_output_directory Where everything is
1053 @param replica_stat_file_suffix
1054 @param global_analysis_result_directory
1058 from mpi4py
import MPI
1059 self.comm = MPI.COMM_WORLD
1060 self.rank = self.comm.Get_rank()
1061 self.number_of_processes = self.comm.size
1064 self.number_of_processes = 1
1067 stat_dir = global_output_directory
1068 self.stat_files = []
1070 for rd
in merge_directories:
1071 stat_files = glob.glob(rd +
"/" + stat_dir +
"/stat.*.out")
1072 if len(stat_files)==0:
1073 print(
"WARNING: no stat files found in",rd +
"/" + stat_dir +
"/stat.*.out")
1074 self.stat_files += stat_files
1078 score_key=
"SimplifiedModel_Total_Score_None",
1079 rmf_file_key=
"rmf_file",
1080 rmf_file_frame_key=
"rmf_frame_index",
1083 nframes_trajectory=10000):
1084 """ Get a trajectory of the modeling run, for generating demonstrative movies
1085 @param score_key The score for ranking models
1086 @param rmf_file_key Key pointing to RMF filename
1087 @param rmf_file_frame_key Key pointing to RMF frame number
1088 @param outputdir The local output directory used in the run
1089 @param get_every Extract every nth frame
1090 @param nframes_trajectory Total number of frames of the trajectory
1092 from operator
import itemgetter
1100 rmf_file_list=trajectory_models[0]
1101 rmf_file_frame_list=trajectory_models[1]
1102 score_list=list(map(float, trajectory_models[2]))
1104 max_score=max(score_list)
1105 min_score=min(score_list)
1108 bins=[(max_score-min_score)*math.exp(-float(i))+min_score
for i
in range(nframes_trajectory)]
1109 binned_scores=[
None]*nframes_trajectory
1110 binned_model_indexes=[-1]*nframes_trajectory
1112 for model_index,s
in enumerate(score_list):
1113 bins_score_diffs=[abs(s-b)
for b
in bins]
1114 bin_index=min(enumerate(bins_score_diffs), key=itemgetter(1))[0]
1115 if binned_scores[bin_index]==
None:
1116 binned_scores[bin_index]=s
1117 binned_model_indexes[bin_index]=model_index
1119 old_diff=abs(binned_scores[bin_index]-bins[bin_index])
1120 new_diff=abs(s-bins[bin_index])
1121 if new_diff < old_diff:
1122 binned_scores[bin_index]=s
1123 binned_model_indexes[bin_index]=model_index
1125 print(binned_scores)
1126 print(binned_model_indexes)
1131 score_key=
"SimplifiedModel_Total_Score_None",
1132 rmf_file_key=
"rmf_file",
1133 rmf_file_frame_key=
"rmf_frame_index",
1135 prefiltervalue=
None,
1138 alignment_components=
None,
1139 number_of_best_scoring_models=10,
1140 rmsd_calculation_components=
None,
1141 distance_matrix_file=
None,
1142 load_distance_matrix_file=
False,
1143 skip_clustering=
False,
1144 number_of_clusters=1,
1146 exit_after_display=
True,
1148 first_and_last_frames=
None,
1149 density_custom_ranges=
None,
1150 write_pdb_with_centered_coordinates=
False,
1152 """ Get the best scoring models, compute a distance matrix, cluster them, and create density maps
1153 @param score_key The score for ranking models
1154 @param rmf_file_key Key pointing to RMF filename
1155 @param rmf_file_frame_key Key pointing to RMF frame number
1156 @param state_number State number to analyse
1157 @param prefiltervalue Only include frames where the score key is below this value
1158 @param feature_keys Keywords for which you want to calculate average,
1160 @param outputdir The local output directory used in the run
1161 @param alignment_components List of tuples for aligning the structures
1162 e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1163 @param number_of_best_scoring_models Num models to keep per run
1164 @param rmsd_calculation_components List of tuples for calculating RMSD
1165 e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1166 @param distance_matrix_file Where to store/read the distance matrix
1167 @param load_distance_matrix_file Try to load the distance matrix file
1168 @param skip_clustering Just extract the best scoring models and save the pdbs
1169 @param number_of_clusters Number of k-means clusters
1170 @param display_plot Display the distance matrix
1171 @param exit_after_display Exit after displaying distance matrix
1172 @param get_every Extract every nth frame
1173 @param first_and_last_frames A tuple with the first and last frames to be
1174 analyzed. Values are percentages!
1175 Default: get all frames
1176 @param density_custom_ranges List of tuples or strings for density calculation
1177 e.g. ["Rpb1", (20,100,"Rpb2"), .....]
1178 @param write_pdb_with_centered_coordinates
1179 @param voxel_size Used for the density output
1190 if not load_distance_matrix_file:
1191 if len(self.stat_files)==0: print(
"ERROR: no stat file found in the given path");
return
1192 my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
1193 self.stat_files,self.number_of_processes)[self.rank]
1201 rmf_file_list=best_models[0]
1202 rmf_file_frame_list=best_models[1]
1203 score_list=best_models[2]
1204 feature_keyword_list_dict=best_models[3]
1210 if self.number_of_processes > 1:
1214 rmf_file_frame_list)
1215 for k
in feature_keyword_list_dict:
1217 feature_keyword_list_dict[k])
1220 score_rmf_tuples = list(zip(score_list,
1222 rmf_file_frame_list,
1223 list(range(len(score_list)))))
1226 if first_and_last_frames
is not None:
1227 nframes = len(score_rmf_tuples)
1228 first_frame = int(first_and_last_frames[0] * nframes)
1229 last_frame = int(first_and_last_frames[1] * nframes)
1230 if last_frame > len(score_rmf_tuples):
1232 score_rmf_tuples = score_rmf_tuples[first_frame:last_frame]
1235 best_score_rmf_tuples = sorted(score_rmf_tuples,
1236 key=
lambda x: float(x[0]))[:number_of_best_scoring_models]
1237 best_score_rmf_tuples=[t+(n,)
for n,t
in enumerate(best_score_rmf_tuples)]
1240 best_score_feature_keyword_list_dict = defaultdict(list)
1241 for tpl
in best_score_rmf_tuples:
1243 for f
in feature_keyword_list_dict:
1244 best_score_feature_keyword_list_dict[f].append(
1245 feature_keyword_list_dict[f][index])
1247 my_best_score_rmf_tuples = IMP.pmi.tools.chunk_list_into_segments(
1248 best_score_rmf_tuples,
1249 self.number_of_processes)[self.rank]
1256 my_best_score_rmf_tuples[0],
1257 rmsd_calculation_components)
1259 my_best_score_rmf_tuples,
1260 alignment_components,
1261 rmsd_calculation_components)
1266 all_coordinates=got_coords[0]
1267 alignment_coordinates=got_coords[1]
1268 rmsd_coordinates=got_coords[2]
1269 rmf_file_name_index_dict=got_coords[3]
1270 all_rmf_file_names=got_coords[4]
1276 if density_custom_ranges:
1278 density_custom_ranges,
1281 dircluster=os.path.join(outputdir,
"all_models."+str(n))
1287 os.mkdir(dircluster)
1290 clusstat=open(os.path.join(dircluster,
"stat."+str(self.rank)+
".out"),
"w")
1291 for cnt,tpl
in enumerate(my_best_score_rmf_tuples):
1293 rmf_frame_number=tpl[2]
1296 for key
in best_score_feature_keyword_list_dict:
1297 tmp_dict[key]=best_score_feature_keyword_list_dict[key][index]
1299 prot=IMP.pmi.analysis.get_hier_from_rmf(self.model,rmf_frame_number,rmf_name,state_number=state_number)
1304 coords_f1=alignment_coordinates[cnt]
1306 coords_f2=alignment_coordinates[cnt]
1308 transformation = Ali.align()[1]
1326 out_pdb_fn=os.path.join(dircluster,str(cnt)+
"."+str(self.rank)+
".pdb")
1327 out_rmf_fn=os.path.join(dircluster,str(cnt)+
"."+str(self.rank)+
".rmf")
1328 o.init_pdb(out_pdb_fn,prot)
1329 o.write_pdb(out_pdb_fn,
1330 translate_to_geometric_center=write_pdb_with_centered_coordinates)
1332 tmp_dict[
"local_pdb_file_name"]=os.path.basename(out_pdb_fn)
1333 tmp_dict[
"rmf_file_full_path"]=rmf_name
1334 tmp_dict[
"local_rmf_file_name"]=os.path.basename(out_rmf_fn)
1335 tmp_dict[
"local_rmf_frame_number"]=0
1337 clusstat.write(str(tmp_dict)+
"\n")
1338 o.init_rmf(out_rmf_fn,[prot])
1339 o.write_rmf(out_rmf_fn)
1340 o.close_rmf(out_rmf_fn)
1342 if density_custom_ranges:
1343 DensModule.add_subunits_density(prot)
1345 if density_custom_ranges:
1346 DensModule.write_mrc(path=dircluster)
1353 if self.number_of_processes > 1:
1359 rmf_file_name_index_dict)
1361 alignment_coordinates)
1368 [best_score_feature_keyword_list_dict,
1369 rmf_file_name_index_dict],
1375 print(
"setup clustering class")
1378 for n, model_coordinate_dict
in enumerate(all_coordinates):
1379 template_coordinate_dict = {}
1381 if alignment_components
is not None and len(Clusters.all_coords) == 0:
1383 Clusters.set_template(alignment_coordinates[n])
1384 Clusters.fill(all_rmf_file_names[n], rmsd_coordinates[n])
1386 print(
"Global calculating the distance matrix")
1389 Clusters.dist_matrix()
1393 Clusters.do_cluster(number_of_clusters)
1396 Clusters.plot_matrix(figurename=os.path.join(outputdir,
'dist_matrix.pdf'))
1397 if exit_after_display:
1399 Clusters.save_distance_matrix_file(file_name=distance_matrix_file)
1406 print(
"setup clustering class")
1408 Clusters.load_distance_matrix_file(file_name=distance_matrix_file)
1409 print(
"clustering with %s clusters" % str(number_of_clusters))
1410 Clusters.do_cluster(number_of_clusters)
1411 [best_score_feature_keyword_list_dict,
1412 rmf_file_name_index_dict] = self.load_objects(
".macro.pkl")
1415 Clusters.plot_matrix(figurename=os.path.join(outputdir,
'dist_matrix.pdf'))
1416 if exit_after_display:
1418 if self.number_of_processes > 1:
1426 print(Clusters.get_cluster_labels())
1427 for n, cl
in enumerate(Clusters.get_cluster_labels()):
1428 print(
"rank %s " % str(self.rank))
1429 print(
"cluster %s " % str(n))
1430 print(
"cluster label %s " % str(cl))
1431 print(Clusters.get_cluster_label_names(cl))
1435 if density_custom_ranges:
1437 density_custom_ranges,
1440 dircluster = outputdir +
"/cluster." + str(n) +
"/"
1442 os.mkdir(dircluster)
1446 rmsd_dict = {
"AVERAGE_RMSD":
1447 str(Clusters.get_cluster_label_average_rmsd(cl))}
1448 clusstat = open(dircluster +
"stat.out",
"w")
1449 for k, structure_name
in enumerate(Clusters.get_cluster_label_names(cl)):
1453 tmp_dict.update(rmsd_dict)
1454 index = rmf_file_name_index_dict[structure_name]
1455 for key
in best_score_feature_keyword_list_dict:
1457 key] = best_score_feature_keyword_list_dict[
1462 rmf_name = structure_name.split(
"|")[0]
1463 rmf_frame_number = int(structure_name.split(
"|")[1])
1465 clusstat.write(str(tmp_dict) +
"\n")
1466 prot,rs = IMP.pmi.analysis.get_hier_and_restraints_from_rmf(
1475 model_index = Clusters.get_model_index_from_name(
1477 transformation = Clusters.get_transformation_to_first_member(
1497 if density_custom_ranges:
1498 DensModule.add_subunits_density(prot)
1502 o.init_pdb(dircluster + str(k) +
".pdb", prot)
1503 o.write_pdb(dircluster + str(k) +
".pdb")
1505 o.init_rmf(dircluster + str(k) +
".rmf3", [prot],rs)
1506 o.write_rmf(dircluster + str(k) +
".rmf3")
1507 o.close_rmf(dircluster + str(k) +
".rmf3")
1512 if density_custom_ranges:
1513 DensModule.write_mrc(path=dircluster)
1516 if self.number_of_processes>1:
1519 def save_objects(self, objects, file_name):
1521 outf = open(file_name,
'wb')
1522 pickle.dump(objects, outf)
1525 def load_objects(self, file_name):
1527 inputf = open(file_name,
'rb')
1528 objects = pickle.load(inputf)
A macro for running all the basic operations of analysis.
Sample using molecular dynamics.
A member of a rigid body, it has internal (local) coordinates.
Utility classes and functions for reading and storing PMI files.
def get_best_models
Given a list of stat files, read them all and find the best models.
def clustering
Get the best scoring models, compute a distance matrix, cluster them, and create density maps...
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
A class to cluster structures.
Representation of the system.
def get_modeling_trajectory
Get a trajectory of the modeling run, for generating demonstrative movies.
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
static XYZR setup_particle(kernel::Model *m, ParticleIndex pi)
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def build_model
Create model.
Performs alignment and RMSD calculation for two sets of coordinates.
void transform(XYZ a, const algebra::Transformation3D &tr)
Apply a transformation to the particle.
Hierarchies get_by_type(Hierarchy mhd, GetByType t)
A decorator for a particle with x,y,z coordinates.
def BuildModel0
Construct a component for each subunit (no splitting, nothing fancy).
static bool get_is_setup(const IMP::kernel::ParticleAdaptor &p)
Class for easy writing of PDBs, RMFs, and stat files.
A macro to build a Representation based on a Topology and lists of movers.
Tools for clustering and cluster analysis.
Classes for writing output files and processing them.
Sample using Monte Carlo.
Deprecated building macro - use BuildModel()
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Set up the representation of all proteins and nucleic acid macromolecules.
A macro to help setup and run replica exchange.
Hierarchies get_leaves(const Selection &h)
Compute mean density maps from structures.
Support for the RMF file format for storing hierarchical molecular data and markup.
Sample using replica exchange.
def get_representation
Return the Representation object.
A decorator for a particle with x,y,z coordinates and a radius.