1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
import os

import numpy as np

import IMP
import IMP.atom
import IMP.core
import IMP.pmi
import IMP.pmi.analysis
import IMP.pmi.output
import IMP.pmi.tools
import IMP.rmf
import RMF
from collections import defaultdict
from pathlib import Path
def parse_dssp(dssp_fn, limit_to_chains='', name_map=None):
    """Read a DSSP file, and return secondary structure elements (SSEs).
    Values all use PDB residue numbering.
    @param dssp_fn The file to read
    @param limit_to_chains Only read/return these chain IDs
    @param name_map If passed, return tuples organized by molecule name
           (name_map should be a dictionary with chain IDs as keys and
           molecule names as values).

    @return a dictionary with keys 'helix', 'beta', 'loop'.
            Each contains a list of SSEs.
            Each SSE is a list of elements (e.g. strands in a sheet).
            Each element is a tuple (residue start, residue end, chain).

    Example for a structure with helix A:5-7 and beta strands A:1-3, A:9-11:

    @code{.py}
    ret = { 'helix' : [ [ (5,7,'A') ], ...],
            'beta'  : [ [ (1,3,'A'),
                          (9,11,'A'), ...], ...],
            'loop'  : same format as helix
          }
    @endcode
    """
    def convert_chain(ch):
        # map the chain ID to a molecule name, if a mapping was provided
        if name_map is None:
            return ch
        return name_map.get(ch, ch)
    # setup: map each DSSP secondary-structure code to an SSE type
    helix_classes = 'GHI'
    strand_classes = 'EB'
    loop_classes = [' ', '', 'T', 'S']
    sse_dict = {}
    for h in helix_classes:
        sse_dict[h] = 'helix'
    for s in strand_classes:
        sse_dict[s] = 'beta'
    for lc in loop_classes:
        sse_dict[lc] = 'loop'
    sses = {'helix': [], 'beta': [], 'loop': []}
    # read the file and parse the residue records
    start = False

    # temporary beta-strand storage, indexed by DSSP's sheet ID
    beta_dict = defaultdict(list)
    prev_sstype = None
    prev_beta_id = None

    with open(dssp_fn, 'r') as fh:
        for line in fh:
            fields = line.split()
            chain_break = False
            if len(fields) < 2:
                continue
            if fields[1] == "RESIDUE":
                # Start parsing from here
                start = True
                continue
            if not start:
                continue
            if line[9] == " ":
                chain_break = True
            elif limit_to_chains != '' and line[11] not in limit_to_chains:
                continue

            # gather line info
            if not chain_break:
                pdb_res_num = int(line[5:10])
                chain = line[11]
                sstype = sse_dict[line[16]]
                beta_id = line[33]

            # decide whether to extend the current SSE or store it
            if prev_sstype is None:
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            elif sstype != prev_sstype or chain_break:
                # add cur_sse to the right place
                if prev_sstype in ['helix', 'loop']:
                    sses[prev_sstype].append([cur_sse])
                elif prev_sstype == 'beta':
                    beta_dict[prev_beta_id].append(cur_sse)
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            else:
                cur_sse[1] = pdb_res_num

            # update the previous type
            if chain_break:
                prev_sstype = None
                prev_beta_id = None
            else:
                prev_sstype = sstype
                prev_beta_id = beta_id

    # process the final SSE
    if prev_sstype in ['helix', 'loop']:
        sses[prev_sstype].append([cur_sse])
    elif prev_sstype == 'beta':
        beta_dict[prev_beta_id].append(cur_sse)
    # group the stored strands into sheets
    for beta_sheet in beta_dict:
        sses['beta'].append(beta_dict[beta_sheet])
    return sses
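

# Illustrative usage sketch (not part of the original module): consuming the
# dictionary returned by parse_dssp(). The DSSP file name and chain ID below
# are placeholders chosen for the example.
def _example_parse_dssp_usage(dssp_path="structure.dssp"):
    """Hypothetical helper that prints the helices and strands of chain A."""
    sses = parse_dssp(dssp_path, limit_to_chains='A')
    for sse in sses['helix']:
        for start, end, chain in sse:
            print("helix  %s:%d-%d" % (chain, start, end))
    for sheet in sses['beta']:
        for start, end, chain in sheet:
            print("strand %s:%d-%d" % (chain, start, end))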
def save_best_models(model, out_dir, stat_files,
                     number_of_best_scoring_models=10, get_every=1,
                     score_key="SimplifiedModel_Total_Score_None",
                     feature_keys=None, rmf_file_key="rmf_file",
                     rmf_file_frame_key="rmf_frame_index",
                     override_rmf_dir=None):
    """Given a list of stat files, read them all and find the best models.
    Save them to a single RMF along with a stat file.
    @param model The IMP Model
    @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
    @param stat_files List of all stat files to collect
    @param number_of_best_scoring_models Number of best models to gather
    @param get_every Skip frames, keeping only every Nth
    @param score_key The key used for ranking
    @param feature_keys Keys to keep around
    @param rmf_file_key The key that gives the RMF file name
    @param rmf_file_frame_key The key that gives the RMF frame number
    @param override_rmf_dir For output, change the name of the RMF
           directory (experiment)
    """
    # split the list of stat files across processes (MPI, if available)
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        number_of_processes = comm.size
    except ImportError:
        rank = 0
        number_of_processes = 1
    my_stat_files = IMP.pmi.tools.chunk_list_into_segments(
        stat_files, number_of_processes)[rank]

    # output filenames
    out_dir = Path(out_dir)
    out_stat_fn = out_dir / ("top_%d.out" % number_of_best_scoring_models)
    out_rmf_fn = out_dir / ("top_%d.rmf3" % number_of_best_scoring_models)
    # guard against the default of None so the key matching below works
    if feature_keys is None:
        feature_keys = []

    # extract all the model information
    all_fields = None
    for nsf, sf in enumerate(my_stat_files):

        # get the list of keywords to collect from this stat file
        root_directory_of_stat_file = Path(sf).parent.parent
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)
        all_keys = [score_key,
                    rmf_file_key,
                    rmf_file_frame_key]
        for k in po.get_keys():
            for fk in feature_keys:
                if fk in k:
                    all_keys.append(k)
        fields = po.get_fields(all_keys,
                               get_every=get_every)

        # check that all the field lists have equal length
        length_set = set([len(fields[f]) for f in fields])
        minlen = min(length_set)
        # if some of the fields are missing, truncate
        # the feature lists to the shortest one
        if len(length_set) > 1:
            print("save_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]
        if nsf == 0:
            all_fields = fields
        else:
            for k in fields:
                all_fields[k] += fields[k]

        if override_rmf_dir is not None:
            for i in range(minlen):
                all_fields[rmf_file_key][i] = os.path.join(
                    override_rmf_dir,
                    os.path.basename(all_fields[rmf_file_key][i]))
    # gather the data onto rank 0, then sort by score and write the output
    if number_of_processes != 1:
        if rank != 0:
            comm.send(all_fields, dest=0, tag=11)
        else:
            for i in range(1, number_of_processes):
                data_tmp = comm.recv(source=i, tag=11)
                for k in all_fields:
                    all_fields[k] += data_tmp[k]
    if rank == 0:
        order = sorted(range(len(all_fields[score_key])),
                       key=lambda i: float(all_fields[score_key][i]))

        # write the best-scoring frames to a single stat file and RMF file
        stat = open(str(out_stat_fn), 'w')
        rh0 = RMF.open_rmf_file_read_only(
            str(root_directory_of_stat_file / all_fields[rmf_file_key][0]))
        prots = IMP.rmf.create_hierarchies(rh0, model)
        del rh0
        outf = RMF.create_rmf_file(str(out_rmf_fn))
        IMP.rmf.add_hierarchies(outf, prots)
        for nm, i in enumerate(order[:number_of_best_scoring_models]):
            dline = dict((k, all_fields[k][i]) for k in all_fields)
            dline['orig_rmf_file'] = dline[rmf_file_key]
            dline['orig_rmf_frame_index'] = dline[rmf_file_frame_key]
            dline[rmf_file_key] = str(out_rmf_fn)
            dline[rmf_file_frame_key] = nm
            rh = RMF.open_rmf_file_read_only(
                str(root_directory_of_stat_file / all_fields[rmf_file_key][i]))
            IMP.rmf.link_hierarchies(rh, prots)
            IMP.rmf.load_frame(
                rh, RMF.FrameID(all_fields[rmf_file_frame_key][i]))
            IMP.rmf.save_frame(outf)
            del rh
            stat.write(str(dline) + '\n')
        stat.close()
        print('wrote stats to', out_stat_fn)
        print('wrote rmfs to', out_rmf_fn)
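

# Illustrative usage sketch (not part of the original module): gathering the
# ten best-scoring frames from a set of stat files into one RMF. The stat
# file glob, output directory, and feature key are placeholders.
def _example_save_best_models():
    """Hypothetical driver showing one way save_best_models() might be
    called."""
    import glob
    model = IMP.Model()
    stat_files = glob.glob("run*/output/stat.*.out")
    save_best_models(model, "best_models", stat_files,
                     number_of_best_scoring_models=10,
                     feature_keys=["ConnectivityRestraint"])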
class _TempProvenance:
    """Placeholder to track provenance information added to the IMP model.
    This is needed since we typically don't preserve the IMP::Model object
    throughout a PMI protocol."""
    def __init__(self, provclass, particle_name, *args, **kwargs):
        self.provclass = provclass
        self.particle_name = particle_name
        self.args = args
        self.kwargs = kwargs

    def get_decorator(self, model):
        """Instantiate and return an IMP Provenance object in the model."""
        pi = model.add_particle(self.particle_name)
        return self.provclass.setup_particle(model, pi, *self.args,
                                             **self.kwargs)
class ClusterProvenance(_TempProvenance):
    """Track creation of a system fragment from clustering."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.ClusterProvenance,
                                 "clustering", *args, **kwargs)


class FilterProvenance(_TempProvenance):
    """Track creation of a system fragment by filtering."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.FilterProvenance,
                                 "filtering", *args, **kwargs)


class CombineProvenance(_TempProvenance):
    """Track creation of a system fragment by combination."""
    def __init__(self, *args, **kwargs):
        _TempProvenance.__init__(self, IMP.core.CombineProvenance,
                                 "combine runs", *args, **kwargs)
282 """Add provenance information in `prov` (a list of _TempProvenance
283 objects) to each of the IMP hierarchies provided.
284 Note that we do this all at once since we typically don't preserve
285 the IMP::Model object throughout a PMI protocol."""
287 IMP.pmi.tools._add_pmi_provenance(h)
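

# Illustrative sketch (not part of the original module): recording that an
# ensemble was filtered by total score and then clustered. The threshold and
# frame/member counts below are placeholder numbers.
def _example_add_provenance(hiers):
    """Hypothetical helper attaching filter and cluster provenance to a list
    of hierarchies read back from an RMF file."""
    prov = [FilterProvenance("Total score", 100.0, 500),
            ClusterProvenance(10)]
    add_provenance(prov, hiers)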
def get_best_models(stat_files,
                    score_key="SimplifiedModel_Total_Score_None",
                    feature_keys=None,
                    rmf_file_key="rmf_file",
                    rmf_file_frame_key="rmf_frame_index",
                    prefiltervalue=None,
                    get_every=1, provenance=None):
    """Given a list of stat files, read them all and find the best models.
    Returns the best rmf filenames, frame numbers, scores, and values
    for feature keywords.
    """
    rmf_file_list = []              # best RMF files
    rmf_file_frame_list = []        # best RMF frames
    score_list = []                 # best scores
    # best values of the feature keys
    feature_keyword_list_dict = defaultdict(list)
    statistics = IMP.pmi.output.OutputStatistics()
    for sf in stat_files:
        root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
        if sf[-4:] == 'rmf3':
            root_directory_of_stat_file = os.path.dirname(
                os.path.abspath(root_directory_of_stat_file))
        print("getting data from file %s" % sf)

        po = IMP.pmi.output.ProcessOutput(sf)
        file_keywords = po.get_keys()

        keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        # check that all requested feature keys are in the file
        # (a requested key matches any file key that contains it)
        if feature_keys:
            for requested_key in feature_keys:
                for file_k in file_keywords:
                    if requested_key in file_k:
                        if file_k not in keywords:
                            keywords.append(file_k)

        if prefiltervalue is None:
            fields = po.get_fields(keywords,
                                   get_every=get_every,
                                   statistics=statistics)
        else:
            fields = po.get_fields(
                keywords, filtertuple=(score_key, "<", prefiltervalue),
                get_every=get_every, statistics=statistics)

        # check that all the field lists have equal length
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))

        # if some of the fields are missing, truncate
        # the feature lists to the shortest one
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]

        # append to the lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf = os.path.normpath(rmf)
            if root_directory_of_stat_file not in rmf:
                rmf_local_path = os.path.join(
                    os.path.basename(os.path.dirname(rmf)),
                    os.path.basename(rmf))
                rmf = os.path.join(root_directory_of_stat_file, rmf_local_path)
            rmf_file_list.append(rmf)

        rmf_file_frame_list += fields[rmf_file_frame_key]

        for k in keywords:
            feature_keyword_list_dict[k] += fields[k]

    # record the combining and filtering steps as provenance, if requested
    if provenance is not None:
        if len(stat_files) > 1:
            provenance.append(CombineProvenance(len(stat_files),
                                                statistics.total))
        if get_every != 1:
            provenance.append(
                FilterProvenance("Keep fraction", 0.,
                                 statistics.passed_get_every))
        if prefiltervalue is not None:
            provenance.append(FilterProvenance("Total score", prefiltervalue,
                                               statistics.passed_filtertuple))

    return (rmf_file_list, rmf_file_frame_list, score_list,
            feature_keyword_list_dict)
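

# Illustrative usage sketch (not part of the original module): ranking frames
# from several stat files while recording provenance. The stat file glob,
# score threshold, and feature key are placeholders for the example.
def _example_get_best_models():
    """Hypothetical driver showing one way get_best_models() might be
    called."""
    import glob
    prov = []
    stat_files = glob.glob("run*/output/stat.*.out")
    rmfs, frames, scores, features = get_best_models(
        stat_files, prefiltervalue=1000.0, get_every=5,
        feature_keys=["ConnectivityRestraint"], provenance=prov)
    if scores:
        best = min(range(len(scores)), key=lambda i: float(scores[i]))
        print("best frame: %s index %s score %s"
              % (rmfs[best], frames[best], scores[best]))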
def get_trajectory_models(stat_files,
                          score_key="SimplifiedModel_Total_Score_None",
                          rmf_file_key="rmf_file",
                          rmf_file_frame_key="rmf_frame_index",
                          get_every=1):
    """Given a list of stat files, read them all and find a trajectory
    of models. Returns the rmf filenames, frame numbers, scores, and
    values for feature keywords.
    """
    rmf_file_list = []              # trajectory RMF files
    rmf_file_frame_list = []        # trajectory RMF frames
    score_list = []                 # trajectory scores
    for sf in stat_files:
        root_directory_of_stat_file = Path(sf).parent.parent
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)

        feature_keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        fields = po.get_fields(feature_keywords, get_every=get_every)

        # check that all the field lists have equal length
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))

        # if some of the fields are missing, truncate
        # the feature lists to the shortest one
        if len(length_set) > 1:
            print("get_trajectory_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]

        # append to the lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf_file_list.append(str(root_directory_of_stat_file / rmf))

        rmf_file_frame_list += fields[rmf_file_frame_key]

    return rmf_file_list, rmf_file_frame_list, score_list
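

# Illustrative usage sketch (not part of the original module): collecting
# every 10th frame of a trajectory from a set of stat files. The stat file
# glob is a placeholder.
def _example_get_trajectory_models():
    """Hypothetical driver showing one way get_trajectory_models() might be
    called."""
    import glob
    stat_files = glob.glob("run*/output/stat.*.out")
    rmfs, frames, scores = get_trajectory_models(stat_files, get_every=10)
    print("collected %d frames" % len(rmfs))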
def read_coordinates_of_rmfs(model, rmf_tuples,
                             alignment_components=None,
                             rmsd_calculation_components=None,
                             state_number=0):
    """Read in coordinates of a set of RMF tuples.
    Returns the coordinates split as requested (all, alignment only, rmsd
    only) as well as RMF file names (as keys in a dictionary, with values
    being the rank number) and just a plain list.
    @param model The IMP model
    @param rmf_tuples [score,filename,frame number,original order number,rank]
    @param alignment_components Tuples to specify what you're aligning on
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    @param state_number The state to extract from the file
    """
    all_coordinates = []
    rmsd_coordinates = []
    alignment_coordinates = []
    all_rmf_file_names = []
    rmf_file_name_index_dict = {}

    for cnt, tpl in enumerate(rmf_tuples):
        rmf_file = tpl[1]
        frame_number = tpl[2]
        # the first file creates the hierarchies; later files just link them
        if cnt == 0:
            prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                        frame_number,
                                                        rmf_file)
        else:
            IMP.pmi.analysis.link_hiers_to_rmf(model, prots, frame_number,
                                               rmf_file)
        if not prots:
            continue

        # select the requested state, falling back to the raw hierarchy list
        states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
        if states:
            prot = states[state_number]
        else:
            prot = prots[state_number]

        # get the particles and store all coordinates, per molecule name
        part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
        model_coordinate_dict = {}
        template_coordinate_dict = {}
        rmsd_coordinate_dict = {}

        for pr in part_dict:
            model_coordinate_dict[pr] = np.array(
                [np.array(IMP.core.XYZ(i).get_coordinates())
                 for i in part_dict[pr]])

        # for the alignment and RMSD component dictionaries, select the
        # requested particles and store their coordinates
        for tuple_dict, result_dict in zip(
                (alignment_components, rmsd_calculation_components),
                (template_coordinate_dict, rmsd_coordinate_dict)):

            if tuple_dict is None:
                continue

            for pr in tuple_dict:
                ps = IMP.pmi.tools.select_by_tuple_2(
                    prot, tuple_dict[pr], resolution=1)
                result_dict[pr] = [
                    list(map(float, IMP.core.XYZ(p).get_coordinates()))
                    for p in ps]

        all_coordinates.append(model_coordinate_dict)
        alignment_coordinates.append(template_coordinate_dict)
        rmsd_coordinates.append(rmsd_coordinate_dict)
        frame_name = rmf_file + '|' + str(frame_number)
        all_rmf_file_names.append(frame_name)
        rmf_file_name_index_dict[frame_name] = tpl[4]
    return (all_coordinates, alignment_coordinates, rmsd_coordinates,
            rmf_file_name_index_dict, all_rmf_file_names)
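

# Illustrative usage sketch (not part of the original module): reading
# coordinates for clustering, aligning on one subunit and computing RMSD on
# another. The molecule names, RMF path, score, and rank are placeholders.
def _example_read_coordinates_of_rmfs():
    """Hypothetical driver showing how scored RMF tuples might be fed to
    read_coordinates_of_rmfs()."""
    model = IMP.Model()
    # each tuple is [score, filename, frame number, original order, rank]
    rmf_tuples = [[12.3, "output/rmfs/0.rmf3", 0, 0, 0]]
    (all_coords, align_coords, rmsd_coords,
     name_index_dict, names) = read_coordinates_of_rmfs(
        model, rmf_tuples,
        alignment_components={"Rpb1": "Rpb1"},
        rmsd_calculation_components={"Rpb4": "Rpb4"})
    print("read coordinates for %d frames" % len(all_coords))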
def get_bead_sizes(model, rmf_tuple, rmsd_calculation_components=None,
                   state_number=0):
    """Get the number of residues per bead for the RMSD components.
    @param model The IMP model
    @param rmf_tuple score,filename,frame number,original order number, rank
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    @param state_number The state to extract from the file
    """
    if rmsd_calculation_components is None:
        return {}

    rmf_file = rmf_tuple[1]
    frame_number = rmf_tuple[2]
    prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                frame_number,
                                                rmf_file)

    # select the requested state
    states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
    prot = states[state_number]

    rmsd_bead_size_dict = {}

    # select the RMSD components and record how many residues each bead spans
    for pr in rmsd_calculation_components:
        ps = IMP.pmi.tools.select_by_tuple_2(
            prot, rmsd_calculation_components[pr], resolution=1)
        rmsd_bead_size_dict[pr] = [
            len(IMP.pmi.tools.get_residue_indexes(p)) for p in ps]

    return rmsd_bead_size_dict
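

# Illustrative usage sketch (not part of the original module): bead sizes for
# the same components used for the RMSD calculation above. The tuple contents
# and molecule name are placeholders.
def _example_get_bead_sizes():
    """Hypothetical driver showing one way get_bead_sizes() might be
    called."""
    model = IMP.Model()
    rmf_tuple = [12.3, "output/rmfs/0.rmf3", 0, 0, 0]
    sizes = get_bead_sizes(model, rmf_tuple,
                           rmsd_calculation_components={"Rpb4": "Rpb4"})
    print(sizes)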
547 """A helper output for model evaluation"""
548 def __init__(self, model):
552 def get_output(self):
553 score = self.rs.evaluate(
False)
555 output[
"Total_Score"] = str(score)