IMP logo
IMP Reference Guide  develop.d97d4ead1f,2024/11/21
The Integrative Modeling Platform
pmi/io/__init__.py
1 """@namespace IMP.pmi.io
2  Utility classes and functions for reading and storing PMI files
3 """
4 
5 import IMP
6 import IMP.algebra
7 import IMP.atom
8 import IMP.pmi
9 import IMP.rmf
10 import IMP.pmi.analysis
11 import IMP.pmi.output
12 import IMP.pmi.tools
13 import RMF
14 import os
15 import numpy as np
16 from collections import defaultdict
17 from pathlib import Path
18 
19 
def parse_dssp(dssp_fn, limit_to_chains='', name_map=None):
    """Read a DSSP file, and return secondary structure elements (SSEs).
    Values are all PDB residue numbering.
    @param dssp_fn The file to read
    @param limit_to_chains Only read/return these chain IDs
    @param name_map If passed, return tuples organized by molecule name
           (name_map should be a dictionary with chain IDs as keys and
           molecule names as values).

    @return a dictionary with keys 'helix', 'beta', 'loop'.
            Each contains a list of SSEs.
            Each SSE is a list of elements (e.g. strands in a sheet).
            Each element is a list [residue start, residue end, chain].

    Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:

    @code{.py}
    ret = { 'helix' : [ [ (5,7,'A') ], ...]
            'beta'  : [ [ (1,3,'A'),
                          (9,11,'A'), ...], ...]
            'loop'  : same format as helix
          }
    @endcode
    """

    def convert_chain(ch):
        # Map the chain ID to a molecule name if a mapping was provided
        if name_map is None:
            return ch
        else:
            return name_map.get(ch, ch)

    # Map DSSP's one-letter secondary-structure codes onto our 3 classes
    helix_classes = 'GHI'
    strand_classes = 'EB'
    loop_classes = [' ', '', 'T', 'S']
    sse_dict = {}
    for h in helix_classes:
        sse_dict[h] = 'helix'
    for s in strand_classes:
        sse_dict[s] = 'beta'
    for lc in loop_classes:
        sse_dict[lc] = 'loop'
    sses = {'helix': [], 'beta': [], 'loop': []}

    # Temporary beta storage indexed by DSSP's sheet ID so that all strands
    # of one sheet end up in a single SSE.  A plain defaultdict preserves
    # insertion order on Python 3.7+, so no ordered variant is needed.
    beta_dict = defaultdict(list)

    start = False       # set once the "#  RESIDUE ..." header line is seen
    cur_sse = None      # [start residue, end residue, chain] in progress
    prev_sstype = None
    prev_beta_id = None

    def store_sse(sse):
        # File a completed SSE under the right class (reads the enclosing
        # prev_sstype/prev_beta_id; does nothing if no SSE is in progress)
        if prev_sstype in ('helix', 'loop'):
            sses[prev_sstype].append([sse])
        elif prev_sstype == 'beta':
            beta_dict[prev_beta_id].append(sse)

    with open(dssp_fn, 'r') as fh:
        for line in fh:
            fields = line.split()
            if len(fields) < 2:
                continue
            if fields[1] == "RESIDUE":
                # Column-header line; real data starts on the next line
                start = True
                continue
            if not start:
                continue
            if line[9] == " ":
                # Blank residue number marks a chain break: flush the SSE
                # in progress and start fresh after the break
                store_sse(cur_sse)
                cur_sse = None
                prev_sstype = None
                prev_beta_id = None
                continue
            if limit_to_chains != '' and line[11] not in limit_to_chains:
                continue

            # Fixed-column DSSP fields
            pdb_res_num = int(line[5:10])
            chain = line[11]
            sstype = sse_dict[line[16]]
            beta_id = line[33]

            # decide whether to extend or store the SSE
            if prev_sstype is None:
                # First residue of the file or after a chain break
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            elif sstype != prev_sstype:
                # SSE class changed: store the finished one, start a new one
                store_sse(cur_sse)
                cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
            else:
                # Same class: extend the current SSE
                cur_sse[1] = pdb_res_num
            prev_sstype = sstype
            prev_beta_id = beta_id

    # Flush the final SSE, then gather strands into their beta sheets
    store_sse(cur_sse)
    for strands in beta_dict.values():
        sses['beta'].append(strands)
    return sses
124 
125 
def save_best_models(model, out_dir, stat_files,
                     number_of_best_scoring_models=10, get_every=1,
                     score_key="SimplifiedModel_Total_Score_None",
                     feature_keys=None, rmf_file_key="rmf_file",
                     rmf_file_frame_key="rmf_frame_index",
                     override_rmf_dir=None):
    """Given a list of stat files, read them all and find the best models.
    Save to a single RMF along with a stat file.
    @param model The IMP Model
    @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
    @param stat_files List of all stat files to collect
    @param number_of_best_scoring_models Num best models to gather
    @param get_every Skip frames
    @param score_key Used for the ranking
    @param feature_keys Keys to keep around
    @param rmf_file_key The key that says RMF file name
    @param rmf_file_frame_key The key that says RMF frame number
    @param override_rmf_dir For output, change the name of the RMF
           directory (experiment)
    """

    # start by splitting into jobs
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        number_of_processes = comm.size
    except ImportError:
        # no MPI available: do everything in this single process
        rank = 0
        number_of_processes = 1
    my_stat_files = IMP.pmi.tools.chunk_list_into_segments(
        stat_files, number_of_processes)[rank]

    # feature_keys is iterated below; guard against the None default
    if feature_keys is None:
        feature_keys = []

    # filenames
    out_dir = Path(out_dir)
    out_stat_fn = out_dir / ("top_%d.out" % number_of_best_scoring_models)
    out_rmf_fn = out_dir / ("top_%d.rmf3" % number_of_best_scoring_models)

    # extract all the models
    all_fields = []
    for nsf, sf in enumerate(my_stat_files):

        # get list of keywords
        # NOTE(review): only the *last* stat file's root directory is used
        # when opening RMFs below — assumes all stat files share one root
        root_directory_of_stat_file = Path(sf).parent.parent
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)
        all_keys = [score_key,
                    rmf_file_key,
                    rmf_file_frame_key]
        # keep any file key that contains one of the requested substrings
        for k in po.get_keys():
            for fk in feature_keys:
                if fk in k:
                    all_keys.append(k)
        fields = po.get_fields(all_keys,
                               get_every=get_every)

        # check that all lengths are all equal
        length_set = set([len(fields[f]) for f in fields])
        minlen = min(length_set)
        # if some of the fields are missing, truncate
        # the feature files to the shortest one
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]
        if nsf == 0:
            all_fields = fields
        else:
            for k in fields:
                all_fields[k] += fields[k]

    if override_rmf_dir is not None:
        # redirect RMF paths into the override directory, keeping basenames
        for i in range(minlen):
            all_fields[rmf_file_key][i] = os.path.join(
                override_rmf_dir,
                os.path.basename(all_fields[rmf_file_key][i]))

    # gather info, sort, write
    if number_of_processes != 1:
        comm.Barrier()
    if rank != 0:
        # worker ranks hand their data to rank 0 and are done
        comm.send(all_fields, dest=0, tag=11)
    else:
        for i in range(1, number_of_processes):
            data_tmp = comm.recv(source=i, tag=11)
            for k in all_fields:
                all_fields[k] += data_tmp[k]

        # sort by total score (ascending: lower is better)
        order = sorted(range(len(all_fields[score_key])),
                       key=lambda i: float(all_fields[score_key][i]))

        # write the stat and RMF files
        rh0 = RMF.open_rmf_file_read_only(
            str(root_directory_of_stat_file / all_fields[rmf_file_key][0]))
        prots = IMP.rmf.create_hierarchies(rh0, model)
        del rh0
        outf = RMF.create_rmf_file(str(out_rmf_fn))
        IMP.rmf.add_hierarchies(outf, prots)
        with open(str(out_stat_fn), 'w') as stat:
            for nm, i in enumerate(order[:number_of_best_scoring_models]):
                dline = dict((k, all_fields[k][i]) for k in all_fields)
                # remember where the frame came from, then point the stat
                # entry at its position in the new combined RMF
                dline['orig_rmf_file'] = dline[rmf_file_key]
                dline['orig_rmf_frame_index'] = dline[rmf_file_frame_key]
                dline[rmf_file_key] = str(out_rmf_fn)
                dline[rmf_file_frame_key] = nm
                rh = RMF.open_rmf_file_read_only(
                    str(root_directory_of_stat_file
                        / all_fields[rmf_file_key][i]))
                IMP.rmf.link_hierarchies(rh, prots)
                IMP.rmf.load_frame(
                    rh, RMF.FrameID(all_fields[rmf_file_frame_key][i]))
                IMP.rmf.save_frame(outf)
                del rh
                stat.write(str(dline) + '\n')
        print('wrote stats to', out_stat_fn)
        print('wrote rmfs to', out_rmf_fn)
244 
245 
246 class _TempProvenance:
247  """Placeholder to track provenance information added to the IMP model.
248  This is since we typically don't preserve the IMP::Model object
249  throughout a PMI protocol."""
250  def __init__(self, provclass, particle_name, *args, **kwargs):
251  self.provclass = provclass
252  self.particle_name = particle_name
253  self.args = args
254  self.kwargs = kwargs
255 
256  def get_decorator(self, model):
257  """Instantiate and return an IMP Provenance object in the model."""
258  pi = model.add_particle(self.particle_name)
259  return self.provclass.setup_particle(model, pi, *self.args,
260  **self.kwargs)
261 
262 
class ClusterProvenance(_TempProvenance):
    """Deferred record of an IMP.core.ClusterProvenance decoration."""
    def __init__(self, *args, **kwargs):
        super().__init__(IMP.core.ClusterProvenance, "clustering",
                         *args, **kwargs)
267 
268 
class FilterProvenance(_TempProvenance):
    """Deferred record of an IMP.core.FilterProvenance decoration."""
    def __init__(self, *args, **kwargs):
        super().__init__(IMP.core.FilterProvenance, "filtering",
                         *args, **kwargs)
273 
274 
class CombineProvenance(_TempProvenance):
    """Deferred record of an IMP.core.CombineProvenance decoration."""
    def __init__(self, *args, **kwargs):
        super().__init__(IMP.core.CombineProvenance, "combine runs",
                         *args, **kwargs)
279 
280 
def add_provenance(prov, hiers):
    """Add provenance information in `prov` (a list of _TempProvenance
    objects) to each of the IMP hierarchies provided.
    Note that we do this all at once since we typically don't preserve
    the IMP::Model object throughout a PMI protocol."""
    for hier in hiers:
        IMP.pmi.tools._add_pmi_provenance(hier)
        model = hier.get_model()
        for prov_item in prov:
            decorator = prov_item.get_decorator(model)
            IMP.core.add_provenance(model, hier, decorator)
291 
292 
def get_best_models(stat_files,
                    score_key="SimplifiedModel_Total_Score_None",
                    feature_keys=None,
                    rmf_file_key="rmf_file",
                    rmf_file_frame_key="rmf_frame_index",
                    prefiltervalue=None,
                    get_every=1, provenance=None):
    """Given a list of stat files, read them all and find the best models.
    Returns the best rmf filenames, frame numbers, scores, and values
    for feature keywords
    @param stat_files List of stat files (or stat-style RMF files) to read
    @param score_key Key used for ranking and prefiltering
    @param feature_keys Substrings; any file key containing one is collected
    @param rmf_file_key The key that says RMF file name
    @param rmf_file_frame_key The key that says RMF frame number
    @param prefiltervalue If given, keep only models scoring below this
    @param get_every Keep only every Nth frame
    @param provenance If a list, combining/filtering operations are
           appended to it as provenance records
    """
    rmf_file_list = []        # best RMF files
    rmf_file_frame_list = []  # best RMF frames
    score_list = []           # best scores
    # best values of the feature keys
    feature_keyword_list_dict = defaultdict(list)
    statistics = IMP.pmi.output.OutputStatistics()
    for sf in stat_files:
        root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
        if sf[-4:] == 'rmf3':
            # stat info stored inside an RMF: root is one level further up
            root_directory_of_stat_file = os.path.dirname(
                os.path.abspath(root_directory_of_stat_file))
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)

        try:
            file_keywords = po.get_keys()
        except Exception:
            # best-effort: skip files that cannot be parsed
            # (was a bare except; narrowed so Ctrl-C still works)
            continue

        keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        # check all requested keys are in the file
        # this looks weird because searching for "*requested_key*"
        if feature_keys:
            for requested_key in feature_keys:
                for file_k in file_keywords:
                    if requested_key in file_k:
                        if file_k not in keywords:
                            keywords.append(file_k)

        if prefiltervalue is None:
            fields = po.get_fields(keywords,
                                   get_every=get_every,
                                   statistics=statistics)
        else:
            fields = po.get_fields(
                keywords, filtertuple=(score_key, "<", prefiltervalue),
                get_every=get_every, statistics=statistics)

        # check that all lengths are all equal
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))

        # if some of the fields are missing, truncate
        # the feature files to the shortest one
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]

        # append to the lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            rmf = os.path.normpath(rmf)
            if root_directory_of_stat_file not in rmf:
                # relative RMF path: re-root it under the stat file's root
                rmf_local_path = os.path.join(
                    os.path.basename(os.path.dirname(rmf)),
                    os.path.basename(rmf))
                rmf = os.path.join(root_directory_of_stat_file,
                                   rmf_local_path)
            rmf_file_list.append(rmf)

        rmf_file_frame_list += fields[rmf_file_frame_key]

        for k in keywords:
            feature_keyword_list_dict[k] += fields[k]

    # Record combining and filtering operations in provenance, if requested
    if provenance is not None:
        if len(stat_files) > 1:
            provenance.append(CombineProvenance(len(stat_files),
                                                statistics.total))
        if get_every != 1:
            provenance.append(
                FilterProvenance("Keep fraction", 0.,
                                 statistics.passed_get_every))
        if prefiltervalue is not None:
            provenance.append(FilterProvenance("Total score",
                                               prefiltervalue,
                                               statistics.passed_filtertuple))

    return (rmf_file_list, rmf_file_frame_list, score_list,
            feature_keyword_list_dict)
388 
389 
def get_trajectory_models(stat_files,
                          score_key="SimplifiedModel_Total_Score_None",
                          rmf_file_key="rmf_file",
                          rmf_file_frame_key="rmf_frame_index",
                          get_every=1):
    """Given a list of stat files, read them all and find a trajectory
    of models. Returns the rmf filenames, frame numbers, scores, and
    values for feature keywords
    @param stat_files List of stat files to read
    @param score_key Key holding each model's score
    @param rmf_file_key The key that says RMF file name
    @param rmf_file_frame_key The key that says RMF frame number
    @param get_every Keep only every Nth frame
    """
    rmf_file_list = []        # best RMF files
    rmf_file_frame_list = []  # best RMF frames
    score_list = []           # best scores
    for sf in stat_files:
        root_directory_of_stat_file = Path(sf).parent.parent
        print("getting data from file %s" % sf)
        po = IMP.pmi.output.ProcessOutput(sf)

        feature_keywords = [score_key, rmf_file_key, rmf_file_frame_key]

        fields = po.get_fields(feature_keywords, get_every=get_every)

        # check that all lengths are all equal
        length_set = set()
        for f in fields:
            length_set.add(len(fields[f]))

        # if some of the fields are missing, truncate
        # the feature files to the shortest one
        if len(length_set) > 1:
            print("get_best_models: the statfile is not synchronous")
            minlen = min(length_set)
            for f in fields:
                fields[f] = fields[f][0:minlen]

        # append to the lists
        score_list += fields[score_key]
        for rmf in fields[rmf_file_key]:
            # RMF paths in the stat file are relative to the run root
            rmf_file_list.append(str(root_directory_of_stat_file / rmf))

        rmf_file_frame_list += fields[rmf_file_frame_key]

    return rmf_file_list, rmf_file_frame_list, score_list
432 
433 
def read_coordinates_of_rmfs(model,
                             rmf_tuples,
                             alignment_components=None,
                             rmsd_calculation_components=None,
                             state_number=0):
    """Read in coordinates of a set of RMF tuples.
    Returns the coordinates split as requested (all, alignment only,
    rmsd only) as well as RMF file names (as keys in a dictionary, with
    values being the rank number) and just a plain list
    @param model The IMP model
    @param rmf_tuples [score,filename,frame number,original order number,
           rank]
    @param alignment_components Tuples to specify what you're aligning on
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    @param state_number The state to extract from the file
    """
    all_coordinates = []
    rmsd_coordinates = []
    alignment_coordinates = []
    all_rmf_file_names = []
    rmf_file_name_index_dict = {}  # storing the features

    for cnt, tpl in enumerate(rmf_tuples):
        rmf_file = tpl[1]
        frame_number = tpl[2]
        if cnt == 0:
            # first file: build the hierarchies...
            prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                        frame_number,
                                                        rmf_file)
        else:
            # ...then just re-link them for each subsequent file
            IMP.pmi.analysis.link_hiers_to_rmf(model, prots, frame_number,
                                               rmf_file)

        if not prots:
            continue
        states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
        if states:
            prot = states[state_number]
        else:
            # If no states, assume PMI1-style
            prot = prots[state_number]

        # getting the particles
        part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
        model_coordinate_dict = {}
        template_coordinate_dict = {}
        rmsd_coordinate_dict = {}

        for pr in part_dict:
            model_coordinate_dict[pr] = np.array(
                [np.array(IMP.core.XYZ(i).get_coordinates())
                 for i in part_dict[pr]])
        # for each file, get (as floats) a list of all coordinates
        # of all requested tuples, organized as dictionaries.
        for tuple_dict, result_dict in zip(
                (alignment_components, rmsd_calculation_components),
                (template_coordinate_dict, rmsd_coordinate_dict)):

            if tuple_dict is None:
                continue

            # PMI2: do selection of resolution and name at the same time
            for pr in tuple_dict:
                ps = IMP.pmi.tools.select_by_tuple_2(
                    prot, tuple_dict[pr], resolution=1)
                result_dict[pr] = [
                    list(map(float, IMP.core.XYZ(p).get_coordinates()))
                    for p in ps]

        all_coordinates.append(model_coordinate_dict)
        alignment_coordinates.append(template_coordinate_dict)
        rmsd_coordinates.append(rmsd_coordinate_dict)
        frame_name = rmf_file + '|' + str(frame_number)
        all_rmf_file_names.append(frame_name)
        rmf_file_name_index_dict[frame_name] = tpl[4]
    return (all_coordinates, alignment_coordinates, rmsd_coordinates,
            rmf_file_name_index_dict, all_rmf_file_names)
511 
512 
def get_bead_sizes(model, rmf_tuple, rmsd_calculation_components=None,
                   state_number=0):
    '''Return, per component, the size (in residues) of each selected bead.
    @param model The IMP model
    @param rmf_tuple score,filename,frame number,original order number, rank
    @param rmsd_calculation_components Tuples to specify what components
           are used for RMSD calc
    @param state_number The state to extract from the file
    @return dict mapping component name to a list of bead sizes
            (empty dict if no components were requested)
    '''
    if rmsd_calculation_components is None:
        return {}

    rmf_file = rmf_tuple[1]
    frame_number = rmf_tuple[2]
    prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
                                                frame_number,
                                                rmf_file)

    states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
    prot = states[state_number]

    rmsd_bead_size_dict = {}

    # PMI2: do selection of resolution and name at the same time
    for pr in rmsd_calculation_components:
        ps = IMP.pmi.tools.select_by_tuple_2(
            prot, rmsd_calculation_components[pr], resolution=1)
        # bead size = number of residues covered by each selected particle
        rmsd_bead_size_dict[pr] = [
            len(IMP.pmi.tools.get_residue_indexes(p)) for p in ps]

    return rmsd_bead_size_dict
544 
545 
547  """A helper output for model evaluation"""
548  def __init__(self, model):
549  self.model = model
550  self.rs = IMP.pmi.tools.get_restraint_set(self.model)
551 
552  def get_output(self):
553  score = self.rs.evaluate(False)
554  output = {}
555  output["Total_Score"] = str(score)
556  return output
def get_restraint_set
Get a RestraintSet containing all PMI restraints added to the model.
Definition: tools.py:104
A class for reading stat files (either rmf or ascii v1 and v2)
Definition: output.py:931
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
Read a DSSP file, and return secondary structure elements (SSEs).
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
Miscellaneous utilities.
Definition: tools.py:1
Collect statistics from ProcessOutput.get_fields().
Definition: output.py:920
Track creation of a system fragment by combination.
Definition: provenance.h:280
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def select_by_tuple_2
New tuple format: molname OR (start,stop,molname,copynum,statenum) Copy and state are optional...
Definition: tools.py:430
Track creation of a system fragment by filtering.
Definition: provenance.h:340
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
Classes for writing output files and processing them.
Definition: output.py:1
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Track creation of a system fragment from clustering.
Definition: provenance.h:426
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
void add_provenance(Model *m, ParticleIndex pi, Provenance p)
Add provenance to part of the model.
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:499
def add_provenance
Add provenance information in prov (a list of _TempProvenance objects) to each of the IMP hierarchies...
Store objects in order they were added, but with default type.
Definition: tools.py:819