1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
17 from collections
import defaultdict
19 from pathlib
import Path
21 from IMP._compat_pathlib
import Path


def parse_dssp(dssp_fn, limit_to_chains='', name_map=None):
    """Read a DSSP file, and return secondary structure elements (SSEs).
    Values are all in PDB residue numbering.
    @param dssp_fn The file to read
    @param limit_to_chains Only read/return these chain IDs
    @param name_map If passed, return tuples organized by molecule name
           (name_map should be a dictionary with chain IDs as keys and
           molecule names as values).

    @return a dictionary with keys 'helix', 'beta', 'loop'.
            Each contains a list of SSEs.
            Each SSE is a list of elements (e.g. strands in a sheet).
            Each element is a tuple (residue start, residue end, chain).

    Example for a structure with helix A:5-7 and beta strands A:1-3, A:9-11:

    @code{.py}
    ret = { 'helix' : [ [ (5,7,'A') ], ...],
            'beta'  : [ [ (1,3,'A'),
                          (9,11,'A'), ...], ...],
            'loop'  : same format as helix
          }
    @endcode
    """

    def convert_chain(ch):
        if name_map is None:
            return ch
        else:
            return name_map.get(ch, ch)

    # setup: map each DSSP secondary structure code to helix/beta/loop
    helix_classes = 'GHI'
    strand_classes = 'EB'
    loop_classes = [' ', '', 'T', 'S']
    sse_dict = {}
    for h in helix_classes:
        sse_dict[h] = 'helix'
    for s in strand_classes:
        sse_dict[s] = 'beta'
    for lc in loop_classes:
        sse_dict[lc] = 'loop'
    sses = {'helix': [], 'beta': [], 'loop': []}

    # read the file and parse residue records
    start = False
    beta_dict = defaultdict(list)   # temporary beta dict, keyed by sheet ID
    prev_sstype = None
    prev_beta_id = None

    with open(dssp_fn, 'r') as fh:
        for line in fh:
            fields = line.split()
            chain_break = False
            if len(fields) < 2:
                continue
            if fields[1] == "RESIDUE":
                # residue records start after this header line
                start = True
                continue
            if not start:
                continue
            if line[9] == " ":
                chain_break = True
            elif limit_to_chains != '' and line[11] not in limit_to_chains:
                continue

            # gather line info
            if not chain_break:
                pdb_res_num = int(line[5:10])
                chain = line[11]
                sstype = sse_dict[line[16]]
                beta_id = line[33]

            # decide whether to extend the current SSE or start a new one
            if prev_sstype is None:
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            elif sstype != prev_sstype or chain_break:
                # store the finished SSE in the right place
                if prev_sstype in ['helix', 'loop']:
                    sses[prev_sstype].append([cur_sse])
                elif prev_sstype == 'beta':
                    beta_dict[prev_beta_id].append(cur_sse)
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            else:
                cur_sse[1] = pdb_res_num
            if chain_break:
                prev_sstype = None
                prev_beta_id = None
            else:
                prev_sstype = sstype
                prev_beta_id = beta_id

    # process the final SSE and collect the beta sheets
    if prev_sstype in ['helix', 'loop']:
        sses[prev_sstype].append([cur_sse])
    elif prev_sstype == 'beta':
        beta_dict[prev_beta_id].append(cur_sse)
    for beta_sheet in beta_dict:
        sses['beta'].append(beta_dict[beta_sheet])
    return sses
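

# Illustrative usage sketch (not part of the original module): parse a DSSP
# file and walk over the returned helices. The file name and the name_map
# values below are hypothetical placeholders.
def _example_parse_dssp():
    sses = parse_dssp("example.dssp", limit_to_chains="AB",
                      name_map={"A": "Prot1", "B": "Prot2"})
    for sse in sses['helix']:
        for start, end, chain in sse:
            print("helix %s:%d-%d" % (chain, start, end))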


def save_best_models(model, out_dir, stat_files,
                     number_of_best_scoring_models=10, get_every=1,
                     score_key="SimplifiedModel_Total_Score_None",
                     feature_keys=None,
                     rmf_file_key="rmf_file",
                     rmf_file_frame_key="rmf_frame_index",
                     override_rmf_dir=None):
    """Given a list of stat files, read them all and find the best models.
    Save them to a single RMF file along with a stat file.
    @param model The IMP Model
    @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
    @param stat_files List of all stat files to collect
    @param number_of_best_scoring_models Number of best models to gather
    @param get_every Skip frames
    @param score_key Used for the ranking
    @param feature_keys Keys to keep around
    @param rmf_file_key The key that gives the RMF file name
    @param rmf_file_frame_key The key that gives the RMF frame number
    @param override_rmf_dir For output, change the name of the RMF
           directory (experiment)
    """

    # split the stat files between MPI processes, if MPI is available
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        number_of_processes = comm.size
    except ImportError:
        rank = 0
        number_of_processes = 1
    my_stat_files = IMP.pmi.tools.chunk_list_into_segments(
        stat_files, number_of_processes)[rank]

    # output file names
    out_dir = Path(out_dir)
    out_stat_fn = out_dir / ("top_%d.out" % number_of_best_scoring_models)
    out_rmf_fn = out_dir / ("top_%d.rmf3" % number_of_best_scoring_models)

    # extract all the models
    all_fields = []
    for nsf, sf in enumerate(my_stat_files):

        # get the list of keywords
        root_directory_of_stat_file = Path(sf).parent.parent
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)
        all_keys = [score_key,
                    rmf_file_key,
                    rmf_file_frame_key]
        for k in po.get_keys():
            # feature_keys may be None (the default)
            for fk in (feature_keys or []):
                if fk in k:
                    all_keys.append(k)
        fields = po.get_fields(all_keys, get_every=get_every)

        # check that all the field lists have equal length
        length_set = set([len(fields[f]) for f in fields])
        minlen = min(length_set)
        # if some of the fields are missing, truncate
        # the feature files to the shortest one
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]
        if nsf == 0:
            all_fields = fields
        else:
            for k in fields:
                all_fields[k] += fields[k]

        if override_rmf_dir is not None:
            for i in range(minlen):
                all_fields[rmf_file_key][i] = os.path.join(
                    override_rmf_dir,
                    os.path.basename(all_fields[rmf_file_key][i]))

    # gather info, sort, write
    if number_of_processes != 1:
        comm.Barrier()
        if rank != 0:
            comm.send(all_fields, dest=0, tag=11)
        else:
            for i in range(1, number_of_processes):
                data_tmp = comm.recv(source=i, tag=11)
                for k in all_fields:
                    all_fields[k] += data_tmp[k]
    if rank == 0:
        order = sorted(range(len(all_fields[score_key])),
                       key=lambda i: float(all_fields[score_key][i]))

        # write the stat and RMF files
        stat = open(str(out_stat_fn), 'w')
        rh0 = RMF.open_rmf_file_read_only(
            str(root_directory_of_stat_file / all_fields[rmf_file_key][0]))
        prots = IMP.rmf.create_hierarchies(rh0, model)
        del rh0  # close the input file handle
        outf = RMF.create_rmf_file(str(out_rmf_fn))
        IMP.rmf.add_hierarchies(outf, prots)
        for nm, i in enumerate(order[:number_of_best_scoring_models]):
            dline = dict((k, all_fields[k][i]) for k in all_fields)
            dline['orig_rmf_file'] = dline[rmf_file_key]
            dline['orig_rmf_frame_index'] = dline[rmf_file_frame_key]
            dline[rmf_file_key] = str(out_rmf_fn)
            dline[rmf_file_frame_key] = nm
            rh = RMF.open_rmf_file_read_only(
                str(root_directory_of_stat_file
                    / all_fields[rmf_file_key][i]))
            IMP.rmf.link_hierarchies(rh, prots)
            IMP.rmf.load_frame(
                rh, RMF.FrameID(all_fields[rmf_file_frame_key][i]))
            IMP.rmf.save_frame(outf)
            del rh
            stat.write(str(dline) + '\n')
        stat.close()
        print('wrote stats to', out_stat_fn)
        print('wrote rmfs to', out_rmf_fn)
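

# Illustrative usage sketch (not part of the original module): gather the top
# 50 models from several independent runs into a single RMF plus stat file.
# The glob pattern, output directory, and feature key are hypothetical
# placeholders.
def _example_save_best_models():
    import glob
    m = IMP.Model()
    save_best_models(m, "analysis", glob.glob("run*/output/stat.*.out"),
                     number_of_best_scoring_models=50,
                     feature_keys=["Total_Score"])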


class _TempProvenance(object):
    """Placeholder to track provenance information added to the IMP model.
       This is needed since we typically don't preserve the IMP::Model object
       throughout a PMI protocol."""
    def __init__(self, provclass, particle_name, *args, **kwargs):
        self.provclass = provclass
        self.particle_name = particle_name
        self.args = args
        self.kwargs = kwargs

    def get_decorator(self, model):
        """Instantiate and return an IMP Provenance object in the model."""
        pi = model.add_particle(self.particle_name)
        return self.provclass.setup_particle(model, pi, *self.args,
                                             **self.kwargs)


class ClusterProvenance(_TempProvenance):
    """Track creation of a system fragment from clustering."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.ClusterProvenance,
                                 "clustering", *args, **kwargs)


class FilterProvenance(_TempProvenance):
    """Track creation of a system fragment by filtering."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.FilterProvenance,
                                 "filtering", *args, **kwargs)


class CombineProvenance(_TempProvenance):
    """Track creation of a system fragment by combination of multiple runs."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.CombineProvenance,
                                 "combine runs", *args, **kwargs)


def add_provenance(prov, hiers):
    """Add provenance information in `prov` (a list of _TempProvenance
       objects) to each of the IMP hierarchies provided.
       Note that we do this all at once since we typically don't preserve
       the IMP::Model object throughout a PMI protocol."""
    for h in hiers:
        IMP.pmi.tools._add_pmi_provenance(h)
        m = h.get_model()
        for p in prov:
            IMP.core.add_provenance(m, h, p.get_decorator(m))
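

# Illustrative sketch (not part of the original module): record that models
# were filtered by total score and then clustered, and attach that provenance
# to a set of hierarchies. The numeric values are hypothetical placeholders.
def _example_add_provenance(hiers):
    prov = [FilterProvenance("Total score", 100.0, 2000),
            ClusterProvenance(10)]
    add_provenance(prov, hiers)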


def get_best_models(stat_files,
                    score_key="SimplifiedModel_Total_Score_None",
                    feature_keys=None,
                    rmf_file_key="rmf_file",
                    rmf_file_frame_key="rmf_frame_index",
                    prefiltervalue=None,
                    get_every=1, provenance=None):
    """Given a list of stat files, read them all and find the best models.
    Returns the best rmf filenames, frame numbers, scores, and values
    for the feature keywords.
    """
    rmf_file_list = []              # best RMF files
    rmf_file_frame_list = []        # best RMF frames
    score_list = []                 # best scores
    # best values of the feature keys
    feature_keyword_list_dict = defaultdict(list)
    statistics = IMP.pmi.output.OutputStatistics()

    for sf in stat_files:
        root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
        if sf[-4:] == 'rmf3':
            root_directory_of_stat_file = os.path.dirname(
                os.path.abspath(root_directory_of_stat_file))
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)

        try:
            file_keywords = po.get_keys()
        except Exception:
            continue

        keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        # check that all requested feature keys are in the file;
        # a requested key matches any file key that contains it
        if feature_keys:
            for requested_key in feature_keys:
                for file_k in file_keywords:
                    if requested_key in file_k:
                        if file_k not in keywords:
                            keywords.append(file_k)

        if prefiltervalue is None:
            fields = po.get_fields(keywords,
                                   get_every=get_every,
                                   statistics=statistics)
        else:
            fields = po.get_fields(
                keywords, filtertuple=(score_key, "<", prefiltervalue),
                get_every=get_every, statistics=statistics)

        # check that all the field lists have equal length
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))

        # if some of the fields are missing, truncate
        # the feature files to the shortest one
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]

        # append to the lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf = os.path.normpath(rmf)
            if root_directory_of_stat_file not in rmf:
                rmf_local_path = os.path.join(
                    os.path.basename(os.path.dirname(rmf)),
                    os.path.basename(rmf))
                rmf = os.path.join(root_directory_of_stat_file,
                                   rmf_local_path)
            rmf_file_list.append(rmf)

        rmf_file_frame_list += fields[rmf_file_frame_key]

        for k in keywords:
            feature_keyword_list_dict[k] += fields[k]

    # record the combining and filtering steps in provenance, if requested
    if provenance is not None:
        if len(stat_files) > 1:
            provenance.append(CombineProvenance(len(stat_files),
                                                statistics.total))
        if get_every != 1:
            provenance.append(
                FilterProvenance("Keep fraction", 0.,
                                 statistics.passed_get_every))
        if prefiltervalue is not None:
            provenance.append(FilterProvenance("Total score", prefiltervalue,
                                               statistics.passed_filtertuple))

    return (rmf_file_list, rmf_file_frame_list, score_list,
            feature_keyword_list_dict)
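

# Illustrative usage sketch (not part of the original module): rank models
# from several stat files, keeping only those scoring below a hypothetical
# cutoff, and record the filtering in a provenance list. The glob pattern
# and feature key name are hypothetical placeholders.
def _example_get_best_models():
    import glob
    prov = []
    rmf_files, rmf_frames, scores, features = get_best_models(
        glob.glob("run*/output/stat.*.out"),
        feature_keys=["Total_Score"],
        prefiltervalue=1500.0, get_every=5, provenance=prov)
    best = min(range(len(scores)), key=lambda i: float(scores[i]))
    print("best model:", rmf_files[best], "frame", rmf_frames[best])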


def get_trajectory_models(stat_files,
                          score_key="SimplifiedModel_Total_Score_None",
                          rmf_file_key="rmf_file",
                          rmf_file_frame_key="rmf_frame_index",
                          get_every=1):
    """Given a list of stat files, read them all and find a trajectory
    of models. Returns the rmf filenames, frame numbers, scores, and
    values for feature keywords.
    """
    rmf_file_list = []              # trajectory RMF files
    rmf_file_frame_list = []        # trajectory RMF frames
    score_list = []                 # trajectory scores
    for sf in stat_files:
        root_directory_of_stat_file = Path(sf).parent.parent
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)

        feature_keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        fields = po.get_fields(feature_keywords, get_every=get_every)

        # check that all the field lists have equal length
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))

        # if some of the fields are missing, truncate
        # the feature files to the shortest one
        if len(length_set) > 1:
            print("get_trajectory_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]

        # append to the lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf_file_list.append(str(root_directory_of_stat_file / rmf))

        rmf_file_frame_list += fields[rmf_file_frame_key]

    return rmf_file_list, rmf_file_frame_list, score_list
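

# Illustrative usage sketch (not part of the original module): collect an
# ordered trajectory of models from a single run, taking every 10th frame.
# The stat file path is a hypothetical placeholder.
def _example_get_trajectory_models():
    rmf_files, rmf_frames, scores = get_trajectory_models(
        ["run1/output/stat.0.out"], get_every=10)
    for fn, fr, sc in zip(rmf_files, rmf_frames, scores):
        print(fn, fr, sc)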


def read_coordinates_of_rmfs(model, rmf_tuples,
                             alignment_components=None,
                             rmsd_calculation_components=None,
                             state_number=0):
    """Read in coordinates of a set of RMF tuples.
    Returns the coordinates split as requested (all, alignment only,
    RMSD only) as well as the RMF file names (as keys in a dictionary,
    with values being the rank number) and just a plain list.
    @param model The IMP model
    @param rmf_tuples [score,filename,frame number,original order number,rank]
    @param alignment_components Tuples to specify what you're aligning on
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    """
    all_coordinates = []
    rmsd_coordinates = []
    alignment_coordinates = []
    all_rmf_file_names = []
    rmf_file_name_index_dict = {}  # maps frame name to rank

    for cnt, tpl in enumerate(rmf_tuples):
        rmf_file = tpl[1]
        frame_number = tpl[2]
        if cnt == 0:
            prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                        frame_number,
                                                        rmf_file)
        else:
            IMP.pmi.analysis.link_hiers_to_rmf(model, prots, frame_number,
                                               rmf_file)
        if not prots:
            continue
        if IMP.pmi.get_is_canonical(prots[0]):
            states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
            prot = states[state_number]
        else:
            prot = prots[state_number]

        # getting the particles
        part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
        model_coordinate_dict = {}
        template_coordinate_dict = {}
        rmsd_coordinate_dict = {}

        for pr in part_dict:
            model_coordinate_dict[pr] = np.array(
                [np.array(IMP.core.XYZ(i).get_coordinates())
                 for i in part_dict[pr]])
        # for each file, get a list of all requested coordinates
        # (as floats), organized as dictionaries
        for tuple_dict, result_dict in zip(
                (alignment_components, rmsd_calculation_components),
                (template_coordinate_dict, rmsd_coordinate_dict)):

            if tuple_dict is None:
                continue

            # select the requested resolution and molecule name together
            for pr in tuple_dict:
                ps = IMP.pmi.tools.select_by_tuple_2(
                    prot, tuple_dict[pr], resolution=1)
                result_dict[pr] = [
                    list(map(float, IMP.core.XYZ(p).get_coordinates()))
                    for p in ps]

        all_coordinates.append(model_coordinate_dict)
        alignment_coordinates.append(template_coordinate_dict)
        rmsd_coordinates.append(rmsd_coordinate_dict)
        frame_name = rmf_file + '|' + str(frame_number)
        all_rmf_file_names.append(frame_name)
        rmf_file_name_index_dict[frame_name] = tpl[4]
    return (all_coordinates, alignment_coordinates, rmsd_coordinates,
            rmf_file_name_index_dict, all_rmf_file_names)
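

# Illustrative usage sketch (not part of the original module): read the
# coordinates of two ranked frames, aligning on one molecule and computing
# RMSD on another. Molecule names, file names, and scores are hypothetical
# placeholders.
def _example_read_coordinates_of_rmfs():
    m = IMP.Model()
    rmf_tuples = [[-1234.5, "output/rmfs/0.rmf3", 12, 0, 0],
                  [-1230.1, "output/rmfs/1.rmf3", 47, 1, 1]]
    (all_coords, align_coords, rmsd_coords,
     name_to_rank, names) = read_coordinates_of_rmfs(
        m, rmf_tuples,
        alignment_components={"Prot1": "Prot1"},
        rmsd_calculation_components={"Prot2": "Prot2"})
    print(names, name_to_rank)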


def get_bead_sizes(model, rmf_tuple, rmsd_calculation_components=None,
                   state_number=0):
    """Get the number of residues in each bead of the requested components.
    @param model The IMP model
    @param rmf_tuple score,filename,frame number,original order number, rank
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    """
    if rmsd_calculation_components is None:
        return {}

    rmf_file = rmf_tuple[1]
    frame_number = rmf_tuple[2]
    prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                frame_number,
                                                rmf_file)

    if IMP.pmi.get_is_canonical(prots[0]):
        states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
        prot = states[state_number]
    else:
        prot = prots[state_number]

    rmsd_bead_size_dict = {}

    # select the requested resolution and molecule name together
    for pr in rmsd_calculation_components:
        ps = IMP.pmi.tools.select_by_tuple_2(
            prot, rmsd_calculation_components[pr], resolution=1)
        rmsd_bead_size_dict[pr] = [
            len(IMP.pmi.tools.get_residue_indexes(p)) for p in ps]

    return rmsd_bead_size_dict
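

# Illustrative usage sketch (not part of the original module): get the number
# of residues per bead for an RMSD selection. The molecule name and file path
# are hypothetical placeholders.
def _example_get_bead_sizes():
    m = IMP.Model()
    rmf_tuple = [-1234.5, "output/rmfs/0.rmf3", 12, 0, 0]
    sizes = get_bead_sizes(m, rmf_tuple,
                           rmsd_calculation_components={"Prot2": "Prot2"})
    print(sizes)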
549 """A helper output for model evaluation"""
550 def __init__(self, model):
554 def get_output(self):
555 score = self.rs.evaluate(
False)
557 output[
"Total_Score"] = str(score)