IMP logo
IMP Reference Guide  develop.e004443c3b,2024/04/25
The Integrative Modeling Platform
pmi/io/__init__.py
1 """@namespace IMP.pmi.io
2  Utility classes and functions for reading and storing PMI files
3 """
4 
5 from __future__ import print_function
6 import IMP
7 import IMP.algebra
8 import IMP.atom
9 import IMP.pmi
10 import IMP.rmf
11 import IMP.pmi.analysis
12 import IMP.pmi.output
13 import IMP.pmi.tools
14 import RMF
15 import os
16 import numpy as np
17 from collections import defaultdict
18 try:
19  from pathlib import Path
20 except ImportError: # Use bundled pathlib on Python 2 without pathlib
21  from IMP._compat_pathlib import Path
22 
23 
24 def parse_dssp(dssp_fn, limit_to_chains='', name_map=None):
25  """Read a DSSP file, and return secondary structure elements (SSEs).
26  Values are all PDB residue numbering.
27  @param dssp_fn The file to read
28  @param limit_to_chains Only read/return these chain IDs
29  @param name_map If passed, return tuples organized by molecule name
30  (name_map should be a dictionary with chain IDs as keys and
31  molecule names as values).
32 
33  @return a dictionary with keys 'helix', 'beta', 'loop'.
34  Each contains a list of SSEs.
35  Each SSE is a list of elements (e.g. strands in a sheet).
36  Each element is a tuple (residue start, residue end, chain).
37 
38  Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
39 
40  @code{.py}
41  ret = { 'helix' : [ [ (5,7,'A') ], ...]
42  'beta' : [ [ (1,3,'A'),
43  (9,11,'A'), ...], ...]
44  'loop' : same format as helix
45  }
46  @endcode
47  """
48 
49  def convert_chain(ch):
50  if name_map is None:
51  return ch
52  else:
53  return name_map.get(ch, ch)
54 
55  # setup
56  helix_classes = 'GHI'
57  strand_classes = 'EB'
58  loop_classes = [' ', '', 'T', 'S']
59  sse_dict = {}
60  for h in helix_classes:
61  sse_dict[h] = 'helix'
62  for s in strand_classes:
63  sse_dict[s] = 'beta'
64  for lc in loop_classes:
65  sse_dict[lc] = 'loop'
66  sses = {'helix': [], 'beta': [], 'loop': []}
67 
68  # read file and parse
69  start = False
70 
71  # temporary beta dictionary indexed by DSSP's ID
72  beta_dict = IMP.pmi.tools.OrderedDefaultDict(list)
73  prev_sstype = None
74  prev_beta_id = None
75 
76  with open(dssp_fn, 'r') as fh:
77  for line in fh:
78  fields = line.split()
79  chain_break = False
80  if len(fields) < 2:
81  continue
82  if fields[1] == "RESIDUE":
83  # Start parsing from here
84  start = True
85  continue
86  if not start:
87  continue
88  if line[9] == " ":
89  chain_break = True
90  elif limit_to_chains != '' and line[11] not in limit_to_chains:
91  continue
92 
93  # gather line info
94  if not chain_break:
95  pdb_res_num = int(line[5:10])
96  chain = line[11]
97  sstype = sse_dict[line[16]]
98  beta_id = line[33]
99 
100  # decide whether to extend or store the SSE
101  if prev_sstype is None:
102  cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
103  elif sstype != prev_sstype or chain_break:
104  # add cur_sse to the right place
105  if prev_sstype in ['helix', 'loop']:
106  sses[prev_sstype].append([cur_sse])
107  else: # prev_sstype == 'beta'
108  beta_dict[prev_beta_id].append(cur_sse)
109  cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
110  else:
111  cur_sse[1] = pdb_res_num
112  if chain_break:
113  prev_sstype = None
114  prev_beta_id = None
115  else:
116  prev_sstype = sstype
117  prev_beta_id = beta_id
118 
119  # final SSE processing
120  if prev_sstype in ['helix', 'loop']:
121  sses[prev_sstype].append([cur_sse])
122  elif prev_sstype == 'beta':
123  beta_dict[prev_beta_id].append(cur_sse)
124  # gather betas
125  for beta_sheet in beta_dict:
126  sses['beta'].append(beta_dict[beta_sheet])
127  return sses
128 
129 
130 def save_best_models(model, out_dir, stat_files,
131  number_of_best_scoring_models=10, get_every=1,
132  score_key="SimplifiedModel_Total_Score_None",
133  feature_keys=None, rmf_file_key="rmf_file",
134  rmf_file_frame_key="rmf_frame_index",
135  override_rmf_dir=None):
136  """Given a list of stat files, read them all and find the best models.
137  Save to a single RMF along with a stat file.
138  @param model The IMP Model
139  @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
140  @param stat_files List of all stat files to collect
141  @param number_of_best_scoring_models Num best models to gather
142  @param get_every Skip frames
143  @param score_key Used for the ranking
144  @param feature_keys Keys to keep around
145  @param rmf_file_key The key that says RMF file name
146  @param rmf_file_frame_key The key that says RMF frame number
147  @param override_rmf_dir For output, change the name of the RMF
148  directory (experiment)
149  """
150 
151  # start by splitting into jobs
152  try:
153  from mpi4py import MPI
154  comm = MPI.COMM_WORLD
155  rank = comm.Get_rank()
156  number_of_processes = comm.size
157  except ImportError:
158  rank = 0
159  number_of_processes = 1
160  my_stat_files = IMP.pmi.tools.chunk_list_into_segments(
161  stat_files, number_of_processes)[rank]
162 
163  # filenames
164  out_dir = Path(out_dir)
165  out_stat_fn = out_dir / ("top_%d.out" % number_of_best_scoring_models)
166  out_rmf_fn = out_dir / ("top_%d.rmf3" % number_of_best_scoring_models)
167 
168  # extract all the models
169  all_fields = []
170  for nsf, sf in enumerate(my_stat_files):
171 
172  # get list of keywords
173  root_directory_of_stat_file = Path(sf).parent.parent
174  print("getting data from file %s" % sf)
176  all_keys = [score_key,
177  rmf_file_key,
178  rmf_file_frame_key]
179  for k in po.get_keys():
180  for fk in feature_keys:
181  if fk in k:
182  all_keys.append(k)
183  fields = po.get_fields(all_keys,
184  get_every=get_every)
185 
186  # check that all lengths are all equal
187  length_set = set([len(fields[f]) for f in fields])
188  minlen = min(length_set)
189  # if some of the fields are missing, truncate
190  # the feature files to the shortest one
191  if len(length_set) > 1:
192  print("get_best_models: the statfile is not synchronous")
193  minlen = min(length_set)
194  for f in fields:
195  fields[f] = fields[f][0:minlen]
196  if nsf == 0:
197  all_fields = fields
198  else:
199  for k in fields:
200  all_fields[k] += fields[k]
201 
202  if override_rmf_dir is not None:
203  for i in range(minlen):
204  all_fields[rmf_file_key][i] = os.path.join(
205  override_rmf_dir,
206  os.path.basename(all_fields[rmf_file_key][i]))
207 
208  # gather info, sort, write
209  if number_of_processes != 1:
210  comm.Barrier()
211  if rank != 0:
212  comm.send(all_fields, dest=0, tag=11)
213  else:
214  for i in range(1, number_of_processes):
215  data_tmp = comm.recv(source=i, tag=11)
216  for k in all_fields:
217  all_fields[k] += data_tmp[k]
218 
219  # sort by total score
220  order = sorted(range(len(all_fields[score_key])),
221  key=lambda i: float(all_fields[score_key][i]))
222 
223  # write the stat and RMF files
224  stat = open(str(out_stat_fn), 'w')
225  rh0 = RMF.open_rmf_file_read_only(
226  str(root_directory_of_stat_file / all_fields[rmf_file_key][0]))
227  prots = IMP.rmf.create_hierarchies(rh0, model)
228  del rh0
229  outf = RMF.create_rmf_file(str(out_rmf_fn))
230  IMP.rmf.add_hierarchies(outf, prots)
231  for nm, i in enumerate(order[:number_of_best_scoring_models]):
232  dline = dict((k, all_fields[k][i]) for k in all_fields)
233  dline['orig_rmf_file'] = dline[rmf_file_key]
234  dline['orig_rmf_frame_index'] = dline[rmf_file_frame_key]
235  dline[rmf_file_key] = str(out_rmf_fn)
236  dline[rmf_file_frame_key] = nm
237  rh = RMF.open_rmf_file_read_only(
238  str(root_directory_of_stat_file / all_fields[rmf_file_key][i]))
239  IMP.rmf.link_hierarchies(rh, prots)
241  RMF.FrameID(all_fields[rmf_file_frame_key][i]))
242  IMP.rmf.save_frame(outf)
243  del rh
244  stat.write(str(dline) + '\n')
245  stat.close()
246  print('wrote stats to', out_stat_fn)
247  print('wrote rmfs to', out_rmf_fn)
248 
249 
250 class _TempProvenance(object):
251  """Placeholder to track provenance information added to the IMP model.
252  This is since we typically don't preserve the IMP::Model object
253  throughout a PMI protocol."""
254  def __init__(self, provclass, particle_name, *args, **kwargs):
255  self.provclass = provclass
256  self.particle_name = particle_name
257  self.args = args
258  self.kwargs = kwargs
259 
260  def get_decorator(self, model):
261  """Instantiate and return an IMP Provenance object in the model."""
262  pi = model.add_particle(self.particle_name)
263  return self.provclass.setup_particle(model, pi, *self.args,
264  **self.kwargs)
265 
266 
267 class ClusterProvenance(_TempProvenance):
268  def __init__(self, *args, **kwargs):
269  _TempProvenance.__init__(self, IMP.core.ClusterProvenance,
270  "clustering", *args, **kwargs)
271 
272 
273 class FilterProvenance(_TempProvenance):
274  def __init__(self, *args, **kwargs):
275  _TempProvenance.__init__(self, IMP.core.FilterProvenance,
276  "filtering", *args, **kwargs)
277 
278 
279 class CombineProvenance(_TempProvenance):
280  def __init__(self, *args, **kwargs):
281  _TempProvenance.__init__(self, IMP.core.CombineProvenance,
282  "combine runs", *args, **kwargs)
283 
284 
285 def add_provenance(prov, hiers):
286  """Add provenance information in `prov` (a list of _TempProvenance
287  objects) to each of the IMP hierarchies provided.
288  Note that we do this all at once since we typically don't preserve
289  the IMP::Model object throughout a PMI protocol."""
290  for h in hiers:
291  IMP.pmi.tools._add_pmi_provenance(h)
292  m = h.get_model()
293  for p in prov:
294  IMP.core.add_provenance(m, h, p.get_decorator(m))
295 
296 
297 def get_best_models(stat_files,
298  score_key="SimplifiedModel_Total_Score_None",
299  feature_keys=None,
300  rmf_file_key="rmf_file",
301  rmf_file_frame_key="rmf_frame_index",
302  prefiltervalue=None,
303  get_every=1, provenance=None):
304  """ Given a list of stat files, read them all and find the best models.
305  Returns the best rmf filenames, frame numbers, scores, and values
306  for feature keywords
307  """
308  rmf_file_list = [] # best RMF files
309  rmf_file_frame_list = [] # best RMF frames
310  score_list = [] # best scores
311  # best values of the feature keys
312  feature_keyword_list_dict = defaultdict(list)
313  statistics = IMP.pmi.output.OutputStatistics()
314  for sf in stat_files:
315  root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
316  if sf[-4:] == 'rmf3':
317  root_directory_of_stat_file = os.path.dirname(
318  os.path.abspath(root_directory_of_stat_file))
319  print("getting data from file %s" % sf)
321 
322  try:
323  file_keywords = po.get_keys()
324  except: # noqa: E722
325  continue
326 
327  keywords = [score_key, rmf_file_key, rmf_file_frame_key]
328 
329  # check all requested keys are in the file
330  # this looks weird because searching for "*requested_key*"
331  if feature_keys:
332  for requested_key in feature_keys:
333  for file_k in file_keywords:
334  if requested_key in file_k:
335  if file_k not in keywords:
336  keywords.append(file_k)
337 
338  if prefiltervalue is None:
339  fields = po.get_fields(keywords,
340  get_every=get_every,
341  statistics=statistics)
342  else:
343  fields = po.get_fields(
344  keywords, filtertuple=(score_key, "<", prefiltervalue),
345  get_every=get_every, statistics=statistics)
346 
347  # check that all lengths are all equal
348  length_set = set()
349  for f in fields:
350  length_set.add(len(fields[f]))
351 
352  # if some of the fields are missing, truncate
353  # the feature files to the shortest one
354  if len(length_set) > 1:
355  print("get_best_models: the statfile is not synchronous")
356  minlen = min(length_set)
357  for f in fields:
358  fields[f] = fields[f][0:minlen]
359 
360  # append to the lists
361  score_list += fields[score_key]
362  for rmf in fields[rmf_file_key]:
363  rmf = os.path.normpath(rmf)
364  if root_directory_of_stat_file not in rmf:
365  rmf_local_path = os.path.join(
366  os.path.basename(os.path.dirname(rmf)),
367  os.path.basename(rmf))
368  rmf = os.path.join(root_directory_of_stat_file, rmf_local_path)
369  rmf_file_list.append(rmf)
370 
371  rmf_file_frame_list += fields[rmf_file_frame_key]
372 
373  for k in keywords:
374  feature_keyword_list_dict[k] += fields[k]
375 
376  # Record combining and filtering operations in provenance, if requested
377  if provenance is not None:
378  if len(stat_files) > 1:
379  provenance.append(CombineProvenance(len(stat_files),
380  statistics.total))
381  if get_every != 1:
382  provenance.append(
383  FilterProvenance("Keep fraction", 0.,
384  statistics.passed_get_every))
385  if prefiltervalue is not None:
386  provenance.append(FilterProvenance("Total score",
387  prefiltervalue,
388  statistics.passed_filtertuple))
389 
390  return (rmf_file_list, rmf_file_frame_list, score_list,
391  feature_keyword_list_dict)
392 
393 
394 def get_trajectory_models(stat_files,
395  score_key="SimplifiedModel_Total_Score_None",
396  rmf_file_key="rmf_file",
397  rmf_file_frame_key="rmf_frame_index",
398  get_every=1):
399  """Given a list of stat files, read them all and find a trajectory
400  of models. Returns the rmf filenames, frame numbers, scores, and
401  values for feature keywords
402  """
403  rmf_file_list = [] # best RMF files
404  rmf_file_frame_list = [] # best RMF frames
405  score_list = [] # best scores
406  for sf in stat_files:
407  root_directory_of_stat_file = Path(sf).parent.parent
408  print("getting data from file %s" % sf)
410 
411  feature_keywords = [score_key, rmf_file_key, rmf_file_frame_key]
412 
413  fields = po.get_fields(feature_keywords, get_every=get_every)
414 
415  # check that all lengths are all equal
416  length_set = set()
417  for f in fields:
418  length_set.add(len(fields[f]))
419 
420  # if some of the fields are missing, truncate
421  # the feature files to the shortest one
422  if len(length_set) > 1:
423  print("get_best_models: the statfile is not synchronous")
424  minlen = min(length_set)
425  for f in fields:
426  fields[f] = fields[f][0:minlen]
427 
428  # append to the lists
429  score_list += fields[score_key]
430  for rmf in fields[rmf_file_key]:
431  rmf_file_list.append(str(root_directory_of_stat_file / rmf))
432 
433  rmf_file_frame_list += fields[rmf_file_frame_key]
434 
435  return rmf_file_list, rmf_file_frame_list, score_list
436 
437 
439  rmf_tuples,
440  alignment_components=None,
441  rmsd_calculation_components=None,
442  state_number=0):
443  """Read in coordinates of a set of RMF tuples.
444  Returns the coordinates split as requested (all, alignment only, rmsd only)
445  as well as RMF file names (as keys in a dictionary, with values being
446  the rank number) and just a plain list
447  @param model The IMP model
448  @param rmf_tuples [score,filename,frame number,original order number, rank]
449  @param alignment_components Tuples to specify what you're aligning on
450  @param rmsd_calculation_components Tuples to specify what components
451  are used for RMSD calc
452  """
453  all_coordinates = []
454  rmsd_coordinates = []
455  alignment_coordinates = []
456  all_rmf_file_names = []
457  rmf_file_name_index_dict = {} # storing the features
458 
459  for cnt, tpl in enumerate(rmf_tuples):
460  rmf_file = tpl[1]
461  frame_number = tpl[2]
462  if cnt == 0:
463  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
464  frame_number,
465  rmf_file)
466  else:
467  IMP.pmi.analysis.link_hiers_to_rmf(model, prots, frame_number,
468  rmf_file)
469 
470  if not prots:
471  continue
472  states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
473  if states:
474  prot = states[state_number]
475  else:
476  # If no states, assume PMI1-style
477  prot = prots[state_number]
478 
479  # getting the particles
481  model_coordinate_dict = {}
482  template_coordinate_dict = {}
483  rmsd_coordinate_dict = {}
484 
485  for pr in part_dict:
486  model_coordinate_dict[pr] = np.array(
487  [np.array(IMP.core.XYZ(i).get_coordinates())
488  for i in part_dict[pr]])
489  # for each file, get (as floats) a list of all coordinates
490  # of all requested tuples, organized as dictionaries.
491  for tuple_dict, result_dict in zip(
492  (alignment_components, rmsd_calculation_components),
493  (template_coordinate_dict, rmsd_coordinate_dict)):
494 
495  if tuple_dict is None:
496  continue
497 
498  # PMI2: do selection of resolution and name at the same time
499  for pr in tuple_dict:
501  prot, tuple_dict[pr], resolution=1)
502  result_dict[pr] = [
503  list(map(float, IMP.core.XYZ(p).get_coordinates()))
504  for p in ps]
505 
506  all_coordinates.append(model_coordinate_dict)
507  alignment_coordinates.append(template_coordinate_dict)
508  rmsd_coordinates.append(rmsd_coordinate_dict)
509  frame_name = rmf_file + '|' + str(frame_number)
510  all_rmf_file_names.append(frame_name)
511  rmf_file_name_index_dict[frame_name] = tpl[4]
512  return (all_coordinates, alignment_coordinates, rmsd_coordinates,
513  rmf_file_name_index_dict, all_rmf_file_names)
514 
515 
516 def get_bead_sizes(model, rmf_tuple, rmsd_calculation_components=None,
517  state_number=0):
518  '''
519  @param model The IMP model
520  @param rmf_tuple score,filename,frame number,original order number, rank
521  @param rmsd_calculation_components Tuples to specify what components
522  are used for RMSD calc
523  '''
524  if rmsd_calculation_components is None:
525  return {}
526 
527  rmf_file = rmf_tuple[1]
528  frame_number = rmf_tuple[2]
529  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
530  frame_number,
531  rmf_file)
532 
533  states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
534  prot = states[state_number]
535 
536  rmsd_bead_size_dict = {}
537 
538  # PMI2: do selection of resolution and name at the same time
539  for pr in rmsd_calculation_components:
541  prot, rmsd_calculation_components[pr], resolution=1)
542  rmsd_bead_size_dict[pr] = [
543  len(IMP.pmi.tools.get_residue_indexes(p)) for p in ps]
544 
545  return rmsd_bead_size_dict
546 
547 
548 class TotalScoreOutput(object):
549  """A helper output for model evaluation"""
550  def __init__(self, model):
551  self.model = model
552  self.rs = IMP.pmi.tools.get_restraint_set(self.model)
553 
554  def get_output(self):
555  score = self.rs.evaluate(False)
556  output = {}
557  output["Total_Score"] = str(score)
558  return output
def get_restraint_set
Get a RestraintSet containing all PMI restraints added to the model.
Definition: tools.py:112
A class for reading stat files (either rmf or ascii v1 and v2)
Definition: output.py:935
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
Read a DSSP file, and return secondary structure elements (SSEs).
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
Miscellaneous utilities.
Definition: tools.py:1
Collect statistics from ProcessOutput.get_fields().
Definition: output.py:924
Track creation of a system fragment by combination.
Definition: provenance.h:280
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def select_by_tuple_2
New tuple format: molname OR (start,stop,molname,copynum,statenum) Copy and state are optional...
Definition: tools.py:438
Track creation of a system fragment by filtering.
Definition: provenance.h:340
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
Classes for writing output files and processing them.
Definition: output.py:1
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Track creation of a system fragment from clustering.
Definition: provenance.h:426
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
void add_provenance(Model *m, ParticleIndex pi, Provenance p)
Add provenance to part of the model.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:512
def add_provenance
Add provenance information in prov (a list of _TempProvenance objects) to each of the IMP hierarchies...
Store objects in order they were added, but with default type.
Definition: tools.py:832