1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
17 from collections
import defaultdict
def parse_dssp(dssp_fn, limit_to_chains='', name_map=None):
    """Read a DSSP file, and return secondary structure elements (SSEs).
    Values are all PDB residue numbering.
    @param dssp_fn The file to read
    @param limit_to_chains Only read/return these chain IDs
    @param name_map If passed, return tuples organized by molecule name
           (name_map should be a dictionary with chain IDs as keys and
           molecule names as values).

    @return a dictionary with keys 'helix', 'beta', 'loop'.
            Each contains a list of SSEs.
            Each SSE is a list of elements (e.g. strands in a sheet).
            Each element is a tuple (residue start, residue end, chain).

    Example for a structure with helix A:5-7 and beta strands A:1-3, A:9-11:

    ret = { 'helix' : [ [ (5,7,'A') ], ...],
            'beta'  : [ [ (1,3,'A'),
                          (9,11,'A'), ...], ...],
            'loop'  : same format as helix }
    """
    def convert_chain(ch):
        if name_map is None:
            return ch
        else:
            return name_map.get(ch, ch)
    # map DSSP class codes to our SSE types
    helix_classes = 'GHI'
    strand_classes = 'EB'
    loop_classes = [' ', '', 'T', 'S']
    sse_dict = {}
    for h in helix_classes:
        sse_dict[h] = 'helix'
    for s in strand_classes:
        sse_dict[s] = 'beta'
    for lc in loop_classes:
        sse_dict[lc] = 'loop'
    sses = {'helix': [],
            'beta': [],
            'loop': []}
    beta_dict = defaultdict(list)
    cur_sse = None
    prev_sstype = None
    prev_beta_id = None
    # parse the DSSP output, one residue record per line
    with open(dssp_fn, 'r') as fh:
        for line in fh:
            fields = line.split()
            # the residue table starts after the "#  RESIDUE AA ..." header
            if fields[1] == "RESIDUE":
            elif limit_to_chains != '' and line[11] not in limit_to_chains:
                continue
            # extract residue number, chain, and SSE type from fixed columns
            pdb_res_num = int(line[5:10])
            chain = line[11]
            sstype = sse_dict[line[16]]
            # decide whether to extend the current SSE or start a new one
            if prev_sstype is None:
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            elif sstype != prev_sstype or chain_break:
                # store the finished SSE and start a new one
                if prev_sstype in ['helix', 'loop']:
                    sses[prev_sstype].append([cur_sse])
                elif prev_sstype == 'beta':
                    beta_dict[prev_beta_id].append(cur_sse)
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            else:
                # same SSE type and chain: extend the current element
                cur_sse[1] = pdb_res_num
            # update previous values
            prev_sstype = sstype
            prev_beta_id = beta_id
    # handle the last SSE in the file
    if prev_sstype in ['helix', 'loop']:
        sses[prev_sstype].append([cur_sse])
    elif prev_sstype == 'beta':
        beta_dict[prev_beta_id].append(cur_sse)
    # all strands in a sheet are grouped into a single beta SSE
    for beta_sheet in beta_dict:
        sses['beta'].append(beta_dict[beta_sheet])
    return sses
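# A usage sketch for parse_dssp (the DSSP file name and the chain-to-molecule
# mapping below are hypothetical):
#
#   sses = parse_dssp('prot.dssp', limit_to_chains='A',
#                     name_map={'A': 'Prot1'})
#   for helix in sses['helix']:
#       start, end, mol = helix[0]   # e.g. (5, 7, 'Prot1')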
def save_best_models(model, out_dir, stat_files,
                     number_of_best_scoring_models=10, get_every=1,
                     score_key="SimplifiedModel_Total_Score_None",
                     feature_keys=None, rmf_file_key="rmf_file",
                     rmf_file_frame_key="rmf_frame_index",
                     override_rmf_dir=None):
132 """Given a list of stat files, read them all and find the best models.
133 Save to a single RMF along with a stat file.
134 @param model The IMP Model
135 @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
136 @param stat_files List of all stat files to collect
137 @param number_of_best_scoring_models Num best models to gather
138 @param get_every Skip frames
139 @param score_key Used for the ranking
140 @param feature_keys Keys to keep around
141 @param rmf_file_key The key that says RMF file name
142 @param rmf_file_frame_key The key that says RMF frame number
143 @param override_rmf_dir For output, change the name of the RMF
144 directory (experiment)
    # split the stat files among MPI processes, if mpi4py is available
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        number_of_processes = comm.size
    except ImportError:
        rank = 0
        number_of_processes = 1
    my_stat_files = IMP.pmi.tools.chunk_list_into_segments(
        stat_files, number_of_processes)[rank]
    out_stat_fn = os.path.join(
        out_dir, "top_" + str(number_of_best_scoring_models) + ".out")
    out_rmf_fn = os.path.join(
        out_dir, "top_" + str(number_of_best_scoring_models) + ".rmf3")
    for nsf, sf in enumerate(my_stat_files):
        root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)
        all_keys = [score_key,
                    rmf_file_key,
                    rmf_file_frame_key]
        if feature_keys:
            for k in po.get_keys():
                for fk in feature_keys:
                    if fk in k:
                        all_keys.append(k)
        fields = po.get_fields(all_keys,
                               get_every=get_every)
        length_set = set([len(fields[f]) for f in fields])
        minlen = min(length_set)
        # if some of the fields are missing, truncate to the shortest
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]
        # accumulate the fields from all stat files
        if nsf == 0:
            all_fields = fields
        else:
            for k in fields:
                all_fields[k] += fields[k]
        # optionally replace the RMF directory in the recorded paths
        if override_rmf_dir is not None:
            for i in range(minlen):
                all_fields[rmf_file_key][i] = os.path.join(
                    override_rmf_dir,
                    os.path.basename(all_fields[rmf_file_key][i]))
    # gather the results on the master process
    if number_of_processes != 1:
        if rank != 0:
            comm.send(all_fields, dest=0, tag=11)
        else:
            for i in range(1, number_of_processes):
                data_tmp = comm.recv(source=i, tag=11)
                for k in all_fields:
                    all_fields[k] += data_tmp[k]
    # rank all the frames by score
    order = sorted(range(len(all_fields[score_key])),
                   key=lambda i: float(all_fields[score_key][i]))
    # open the first input RMF to recreate the hierarchy, then write the
    # best frames to a single output RMF
    stat = open(out_stat_fn, 'w')
    rh0 = RMF.open_rmf_file_read_only(
        os.path.join(root_directory_of_stat_file,
                     all_fields[rmf_file_key][0]))
    prots = IMP.rmf.create_hierarchies(rh0, model)
    del rh0
    outf = RMF.create_rmf_file(out_rmf_fn)
    IMP.rmf.add_hierarchies(outf, prots)
    for nm, i in enumerate(order[:number_of_best_scoring_models]):
        dline = dict((k, all_fields[k][i]) for k in all_fields)
        dline['orig_rmf_file'] = dline[rmf_file_key]
        dline['orig_rmf_frame_index'] = dline[rmf_file_frame_key]
        dline[rmf_file_key] = out_rmf_fn
        dline[rmf_file_frame_key] = nm
        rh = RMF.open_rmf_file_read_only(
            os.path.join(root_directory_of_stat_file,
                         all_fields[rmf_file_key][i]))
        IMP.rmf.link_hierarchies(rh, prots)
        IMP.rmf.load_frame(rh,
                           RMF.FrameID(all_fields[rmf_file_frame_key][i]))
        del rh
        IMP.rmf.save_frame(outf)
        stat.write(str(dline) + '\n')
    stat.close()
    print('wrote stats to', out_stat_fn)
    print('wrote rmfs to', out_rmf_fn)
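# A usage sketch for save_best_models (the output directory and the stat-file
# glob are hypothetical); it collects the 100 best frames from all stat files
# into top_100.rmf3 and top_100.out inside out_dir:
#
#   import glob
#   m = IMP.Model()
#   save_best_models(m, 'best_models',
#                    sorted(glob.glob('output/stat.*.out')),
#                    number_of_best_scoring_models=100)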
class _TempProvenance(object):
    """Placeholder to track provenance information added to the IMP model.
       This is needed because we typically don't preserve the IMP::Model
       object throughout a PMI protocol."""
    def __init__(self, provclass, particle_name, *args, **kwargs):
        self.provclass = provclass
        self.particle_name = particle_name
        self.args = args
        self.kwargs = kwargs

    def get_decorator(self, model):
        """Instantiate and return an IMP Provenance object in the model."""
        pi = model.add_particle(self.particle_name)
        return self.provclass.setup_particle(model, pi, *self.args,
                                             **self.kwargs)
class ClusterProvenance(_TempProvenance):
    """Track creation of a system fragment from clustering."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.ClusterProvenance,
                                 "clustering", *args, **kwargs)


class FilterProvenance(_TempProvenance):
    """Track creation of a system fragment by filtering."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.FilterProvenance,
                                 "filtering", *args, **kwargs)


class CombineProvenance(_TempProvenance):
    """Track creation of a system fragment by combination."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.CombineProvenance,
                                 "combine runs", *args, **kwargs)
285 """Add provenance information in `prov` (a list of _TempProvenance
286 objects) to each of the IMP hierarchies provided.
287 Note that we do this all at once since we typically don't preserve
288 the IMP::Model object throughout a PMI protocol."""
290 IMP.pmi.tools._add_pmi_provenance(h)
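# A usage sketch for the provenance helpers (the score threshold and frame
# count are hypothetical): provenance records are collected first as
# _TempProvenance placeholders and attached to the hierarchies in one pass,
# since the IMP::Model is usually rebuilt during analysis.
#
#   prov = [FilterProvenance("Total score", -100.0, 500)]
#   add_provenance(prov, hiers)   # hiers: list of IMP.atom.Hierarchy roots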
def get_best_models(stat_files,
                    score_key="SimplifiedModel_Total_Score_None",
                    feature_keys=None,
                    rmf_file_key="rmf_file",
                    rmf_file_frame_key="rmf_frame_index",
                    prefiltervalue=None,
                    get_every=1, provenance=None):
    """Given a list of stat files, read them all and find the best models.
    Returns the best rmf filenames, frame numbers, scores, and values
    for feature keywords
    """
    rmf_file_list = []
    rmf_file_frame_list = []
    score_list = []
    feature_keyword_list_dict = defaultdict(list)
    statistics = IMP.pmi.output.OutputStatistics()
    for sf in stat_files:
        root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
        if sf[-4:] == 'rmf3':
            root_directory_of_stat_file = os.path.dirname(
                os.path.abspath(root_directory_of_stat_file))
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)
        file_keywords = po.get_keys()
        keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        # search the file keys for matches to each requested feature key
        if feature_keys:
            for requested_key in feature_keys:
                for file_k in file_keywords:
                    if requested_key in file_k:
                        if file_k not in keywords:
                            keywords.append(file_k)
        if prefiltervalue is None:
            fields = po.get_fields(keywords,
                                   get_every=get_every,
                                   statistics=statistics)
        else:
            fields = po.get_fields(
                keywords, filtertuple=(score_key, "<", prefiltervalue),
                get_every=get_every, statistics=statistics)
        # check that all the fields have the same number of frames
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))
        # if some of the fields are missing, truncate to the shortest
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf = os.path.normpath(rmf)
            if root_directory_of_stat_file not in rmf:
                rmf_local_path = os.path.join(
                    os.path.basename(os.path.dirname(rmf)),
                    os.path.basename(rmf))
                rmf = os.path.join(root_directory_of_stat_file,
                                   rmf_local_path)
            rmf_file_list.append(rmf)
        rmf_file_frame_list += fields[rmf_file_frame_key]

        for k in keywords:
            feature_keyword_list_dict[k] += fields[k]
    # record how the stat files were combined and filtered as provenance
    if provenance is not None:
        if len(stat_files) > 1:
            provenance.append(CombineProvenance(len(stat_files),
                                                statistics.total))
        if get_every != 1:
            provenance.append(
                FilterProvenance("Keep fraction", 0.,
                                 statistics.passed_get_every))
        if prefiltervalue is not None:
            provenance.append(FilterProvenance("Total score",
                                               prefiltervalue,
                                               statistics.passed_filtertuple))
    return (rmf_file_list, rmf_file_frame_list, score_list,
            feature_keyword_list_dict)
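# A usage sketch for get_best_models (the stat-file paths and the score
# cutoff are hypothetical):
#
#   import glob
#   rmf_files, rmf_frames, scores, features = get_best_models(
#       sorted(glob.glob('run*/output/stat.*.out')),
#       score_key="SimplifiedModel_Total_Score_None",
#       prefiltervalue=-50.0, get_every=5)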
def get_trajectory_models(stat_files,
                          score_key="SimplifiedModel_Total_Score_None",
                          rmf_file_key="rmf_file",
                          rmf_file_frame_key="rmf_frame_index",
                          get_every=1):
    """Given a list of stat files, read them all and find a trajectory
    of models. Returns the rmf filenames, frame numbers, scores, and
    values for feature keywords
    """
    rmf_file_list = []
    rmf_file_frame_list = []
    score_list = []
    for sf in stat_files:
        root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)
        feature_keywords = [score_key, rmf_file_key, rmf_file_frame_key]
        fields = po.get_fields(feature_keywords, get_every=get_every)
        # check that all the fields have the same number of frames
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))
        # if some of the fields are missing, truncate to the shortest
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]
        # append to the output lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf_file_list.append(os.path.join(root_directory_of_stat_file,
                                              rmf))
        rmf_file_frame_list += fields[rmf_file_frame_key]
    return rmf_file_list, rmf_file_frame_list, score_list
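# A usage sketch for get_trajectory_models (the stat-file paths are
# hypothetical); unlike get_best_models, frames are returned in stat-file
# order rather than ranked by score:
#
#   rmf_files, rmf_frames, scores = get_trajectory_models(
#       ['output/stat.0.out', 'output/stat.1.out'], get_every=10)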
def read_coordinates_of_rmfs(model, rmf_tuples,
                             alignment_components=None,
                             rmsd_calculation_components=None,
                             state_number=0):
    """Read in coordinates of a set of RMF tuples.
    Returns the coordinates split as requested (all, alignment only, rmsd
    only) as well as RMF file names (as keys in a dictionary, with values
    being the rank number) and just a plain list.
    @param model The IMP model
    @param rmf_tuples [score,filename,frame number,original order number,
           rank]
    @param alignment_components Tuples to specify what you're aligning on
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    """
    all_coordinates = []
    rmsd_coordinates = []
    alignment_coordinates = []
    all_rmf_file_names = []
    rmf_file_name_index_dict = {}
    for cnt, tpl in enumerate(rmf_tuples):
        rmf_file = tpl[1]
        frame_number = tpl[2]
        if cnt == 0:
            prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                        frame_number,
                                                        rmf_file)
        else:
            IMP.pmi.analysis.link_hiers_to_rmf(model, prots, frame_number,
                                               rmf_file)
        if IMP.pmi.get_is_canonical(prots[0]):
            states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
            prot = states[state_number]
        else:
            prot = prots[state_number]
        # get all particles grouped by molecule at resolution 1
        part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
        all_particles = [pp for key in part_dict for pp in part_dict[key]]
        all_ps_set = set(all_particles)
        model_coordinate_dict = {}
        template_coordinate_dict = {}
        rmsd_coordinate_dict = {}
        for pr in part_dict:
            model_coordinate_dict[pr] = np.array(
                [np.array(IMP.core.XYZ(i).get_coordinates())
                 for i in part_dict[pr]])
        # extract the subsets requested for alignment and RMSD calculation
        for tuple_dict, result_dict in zip(
                (alignment_components, rmsd_calculation_components),
                (template_coordinate_dict, rmsd_coordinate_dict)):
            if tuple_dict is None:
                continue

            # PMI2: select the name and resolution at the same time
            if IMP.pmi.get_is_canonical(prot):
                for pr in tuple_dict:
                    ps = IMP.pmi.tools.select_by_tuple_2(
                        prot, tuple_dict[pr], resolution=1)
                    result_dict[pr] = \
                        [list(map(float, IMP.core.XYZ(p).get_coordinates()))
                         for p in ps]
            else:
                for pr in tuple_dict:
                    if type(tuple_dict[pr]) is str:
                        name = tuple_dict[pr]
                        s = IMP.atom.Selection(prot, molecule=name)
                    elif type(tuple_dict[pr]) is tuple:
                        name = tuple_dict[pr][2]
                        rend = tuple_dict[pr][1]
                        rbegin = tuple_dict[pr][0]
                        s = IMP.atom.Selection(
                            prot, molecule=name,
                            residue_indexes=range(rbegin, rend+1))
                    ps = s.get_selected_particles()
                    filtered_particles = [p for p in ps if p in all_ps_set]
                    result_dict[pr] = \
                        [list(map(float, IMP.core.XYZ(p).get_coordinates()))
                         for p in filtered_particles]
        all_coordinates.append(model_coordinate_dict)
        alignment_coordinates.append(template_coordinate_dict)
        rmsd_coordinates.append(rmsd_coordinate_dict)
        frame_name = rmf_file + '|' + str(frame_number)
        all_rmf_file_names.append(frame_name)
        rmf_file_name_index_dict[frame_name] = tpl[4]
    return (all_coordinates, alignment_coordinates, rmsd_coordinates,
            rmf_file_name_index_dict, all_rmf_file_names)
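# A usage sketch for read_coordinates_of_rmfs (the rmf_tuples entry, file
# name, frame number, and selection tuples are hypothetical; in practice the
# tuples come from the output of get_best_models):
#
#   m = IMP.Model()
#   rmf_tuples = [(-120.5, 'output/rmfs/0.rmf3', 3, 0, 0)]
#   (all_coords, align_coords, rmsd_coords,
#    name_index_dict, rmf_names) = read_coordinates_of_rmfs(
#       m, rmf_tuples,
#       alignment_components={'Prot1': 'Prot1'},
#       rmsd_calculation_components={'Prot2': (1, 100, 'Prot2')})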
def get_bead_sizes(model, rmf_tuple, rmsd_calculation_components=None,
                   state_number=0):
    """Get the bead sizes of the components used for RMSD calculation.
    @param model The IMP model
    @param rmf_tuple score,filename,frame number,original order number, rank
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    """
    if rmsd_calculation_components is None:
        return {}

    rmf_file = rmf_tuple[1]
    frame_number = rmf_tuple[2]
    prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                frame_number,
                                                rmf_file)
    if IMP.pmi.get_is_canonical(prots[0]):
        states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
        prot = states[state_number]
    else:
        prot = prots[state_number]
    rmsd_bead_size_dict = {}

    # PMI2: select the name and resolution at the same time
    if IMP.pmi.get_is_canonical(prot):
        for pr in rmsd_calculation_components:
            ps = IMP.pmi.tools.select_by_tuple_2(
                prot, rmsd_calculation_components[pr], resolution=1)
            rmsd_bead_size_dict[pr] = [
                len(IMP.pmi.tools.get_residue_indexes(p)) for p in ps]
    else:
        # get all particles grouped by molecule at resolution 1
        part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
        all_particles = [pp for key in part_dict for pp in part_dict[key]]
        all_ps_set = set(all_particles)
        for pr in rmsd_calculation_components:
            if type(rmsd_calculation_components[pr]) is str:
                name = rmsd_calculation_components[pr]
                s = IMP.atom.Selection(prot, molecule=name)
            elif type(rmsd_calculation_components[pr]) is tuple:
                name = rmsd_calculation_components[pr][2]
                rend = rmsd_calculation_components[pr][1]
                rbegin = rmsd_calculation_components[pr][0]
                s = IMP.atom.Selection(
                    prot, molecule=name,
                    residue_indexes=range(rbegin, rend+1))
            ps = s.get_selected_particles()
            filtered_particles = [p for p in ps if p in all_ps_set]
            rmsd_bead_size_dict[pr] = \
                [len(IMP.pmi.tools.get_residue_indexes(p))
                 for p in filtered_particles]

    return rmsd_bead_size_dict
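# A usage sketch for get_bead_sizes (file name, frame number, and selection
# are hypothetical); the result maps each requested component to a list of
# bead sizes, one per selected bead:
#
#   m = IMP.Model()
#   sizes = get_bead_sizes(m, (-120.5, 'output/rmfs/0.rmf3', 3, 0, 0),
#                          rmsd_calculation_components={'Prot2': 'Prot2'})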
594 """A helper output for model evaluation"""
595 def __init__(self, model):
599 def get_output(self):
600 score = self.rs.evaluate(
False)
602 output[
"Total_Score"] = str(score)