IMP Reference Guide  develop.5d113ecbfd,2023/06/06 The Integrative Modeling Platform
pmi/io/__init__.py
1 """@namespace IMP.pmi.io
2  Utility classes and functions for reading and storing PMI files
3 """
4
5 from __future__ import print_function
6 import IMP
7 import IMP.algebra
8 import IMP.atom
9 import IMP.pmi
10 import IMP.rmf
11 import IMP.pmi.analysis
12 import IMP.pmi.output
13 import IMP.pmi.tools
14 import RMF
15 import os
16 import numpy as np
17 from collections import defaultdict
18 try:
19  from pathlib import Path
20 except ImportError: # Use bundled pathlib on Python 2 without pathlib
21  from IMP._compat_pathlib import Path
22
23
24 def parse_dssp(dssp_fn, limit_to_chains='', name_map=None):
25  """Read a DSSP file, and return secondary structure elements (SSEs).
26  Values are all PDB residue numbering.
27  @param dssp_fn The file to read
28  @param limit_to_chains Only read/return these chain IDs
29  @param name_map If passed, return tuples organized by molecule name
30  (name_map should be a dictionary with chain IDs as keys and
31  molecule names as values).
32
33  @return a dictionary with keys 'helix', 'beta', 'loop'.
34  Each contains a list of SSEs.
35  Each SSE is a list of elements (e.g. strands in a sheet).
36  Each element is a tuple (residue start, residue end, chain).
37
38  Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
39
40  @code{.py}
41  ret = { 'helix' : [ [ (5,7,'A') ], ...]
42  'beta' : [ [ (1,3,'A'),
43  (9,11,'A'), ...], ...]
44  'loop' : same format as helix
45  }
46  @endcode
47  """
48
49  def convert_chain(ch):
50  if name_map is None:
51  return ch
52  else:
53  return name_map.get(ch, ch)
54
55  # setup
56  helix_classes = 'GHI'
57  strand_classes = 'EB'
58  loop_classes = [' ', '', 'T', 'S']
59  sse_dict = {}
60  for h in helix_classes:
61  sse_dict[h] = 'helix'
62  for s in strand_classes:
63  sse_dict[s] = 'beta'
64  for lc in loop_classes:
65  sse_dict[lc] = 'loop'
66  sses = {'helix': [], 'beta': [], 'loop': []}
67
68  # read file and parse
69  start = False
70
71  # temporary beta dictionary indexed by DSSP's ID
72  beta_dict = IMP.pmi.tools.OrderedDefaultDict(list)
73  prev_sstype = None
74  prev_beta_id = None
75
76  with open(dssp_fn, 'r') as fh:
77  for line in fh:
78  fields = line.split()
79  chain_break = False
80  if len(fields) < 2:
81  continue
82  if fields[1] == "RESIDUE":
83  # Start parsing from here
84  start = True
85  continue
86  if not start:
87  continue
88  if line[9] == " ":
89  chain_break = True
90  elif limit_to_chains != '' and line[11] not in limit_to_chains:
91  continue
92
93  # gather line info
94  if not chain_break:
95  pdb_res_num = int(line[5:10])
96  chain = line[11]
97  sstype = sse_dict[line[16]]
98  beta_id = line[33]
99
100  # decide whether to extend or store the SSE
101  if prev_sstype is None:
102  cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
103  elif sstype != prev_sstype or chain_break:
104  # add cur_sse to the right place
105  if prev_sstype in ['helix', 'loop']:
106  sses[prev_sstype].append([cur_sse])
107  else: # prev_sstype == 'beta'
108  beta_dict[prev_beta_id].append(cur_sse)
109  cur_sse = [pdb_res_num, pdb_res_num, convert_chain(chain)]
110  else:
111  cur_sse[1] = pdb_res_num
112  if chain_break:
113  prev_sstype = None
114  prev_beta_id = None
115  else:
116  prev_sstype = sstype
117  prev_beta_id = beta_id
118
119  # final SSE processing
120  if prev_sstype in ['helix', 'loop']:
121  sses[prev_sstype].append([cur_sse])
122  elif prev_sstype == 'beta':
123  beta_dict[prev_beta_id].append(cur_sse)
124  # gather betas
125  for beta_sheet in beta_dict:
126  sses['beta'].append(beta_dict[beta_sheet])
127  return sses
128
129
130 def save_best_models(model, out_dir, stat_files,
131  number_of_best_scoring_models=10, get_every=1,
132  score_key="SimplifiedModel_Total_Score_None",
133  feature_keys=None, rmf_file_key="rmf_file",
134  rmf_file_frame_key="rmf_frame_index",
135  override_rmf_dir=None):
136  """Given a list of stat files, read them all and find the best models.
137  Save to a single RMF along with a stat file.
138  @param model The IMP Model
139  @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
140  @param stat_files List of all stat files to collect
141  @param number_of_best_scoring_models Num best models to gather
142  @param get_every Skip frames
143  @param score_key Used for the ranking
144  @param feature_keys Keys to keep around
145  @param rmf_file_key The key that says RMF file name
146  @param rmf_file_frame_key The key that says RMF frame number
147  @param override_rmf_dir For output, change the name of the RMF
148  directory (experiment)
149  """
150
151  # start by splitting into jobs
152  try:
153  from mpi4py import MPI
154  comm = MPI.COMM_WORLD
155  rank = comm.Get_rank()
156  number_of_processes = comm.size
157  except ImportError:
158  rank = 0
159  number_of_processes = 1
160  my_stat_files = IMP.pmi.tools.chunk_list_into_segments(
161  stat_files, number_of_processes)[rank]
162
163  # filenames
164  out_dir = Path(out_dir)
165  out_stat_fn = out_dir / ("top_%d.out" % number_of_best_scoring_models)
166  out_rmf_fn = out_dir / ("top_%d.rmf3" % number_of_best_scoring_models)
167
168  # extract all the models
169  all_fields = []
170  for nsf, sf in enumerate(my_stat_files):
171
172  # get list of keywords
173  root_directory_of_stat_file = Path(sf).parent.parent
174  print("getting data from file %s" % sf)
176  all_keys = [score_key,
177  rmf_file_key,
178  rmf_file_frame_key]
179  for k in po.get_keys():
180  for fk in feature_keys:
181  if fk in k:
182  all_keys.append(k)
183  fields = po.get_fields(all_keys,
184  get_every=get_every)
185
186  # check that all lengths are all equal
187  length_set = set([len(fields[f]) for f in fields])
188  minlen = min(length_set)
189  # if some of the fields are missing, truncate
190  # the feature files to the shortest one
191  if len(length_set) > 1:
192  print("get_best_models: the statfile is not synchronous")
193  minlen = min(length_set)
194  for f in fields:
195  fields[f] = fields[f][0:minlen]
196  if nsf == 0:
197  all_fields = fields
198  else:
199  for k in fields:
200  all_fields[k] += fields[k]
201
202  if override_rmf_dir is not None:
203  for i in range(minlen):
204  all_fields[rmf_file_key][i] = os.path.join(
205  override_rmf_dir,
206  os.path.basename(all_fields[rmf_file_key][i]))
207
208  # gather info, sort, write
209  if number_of_processes != 1:
210  comm.Barrier()
211  if rank != 0:
212  comm.send(all_fields, dest=0, tag=11)
213  else:
214  for i in range(1, number_of_processes):
215  data_tmp = comm.recv(source=i, tag=11)
216  for k in all_fields:
217  all_fields[k] += data_tmp[k]
218
219  # sort by total score
220  order = sorted(range(len(all_fields[score_key])),
221  key=lambda i: float(all_fields[score_key][i]))
222
223  # write the stat and RMF files
224  stat = open(str(out_stat_fn), 'w')
226  str(root_directory_of_stat_file / all_fields[rmf_file_key][0]))
227  prots = IMP.rmf.create_hierarchies(rh0, model)
228  del rh0
229  outf = RMF.create_rmf_file(str(out_rmf_fn))
231  for nm, i in enumerate(order[:number_of_best_scoring_models]):
232  dline = dict((k, all_fields[k][i]) for k in all_fields)
233  dline['orig_rmf_file'] = dline[rmf_file_key]
234  dline['orig_rmf_frame_index'] = dline[rmf_file_frame_key]
235  dline[rmf_file_key] = str(out_rmf_fn)
236  dline[rmf_file_frame_key] = nm
238  str(root_directory_of_stat_file / all_fields[rmf_file_key][i]))
241  RMF.FrameID(all_fields[rmf_file_frame_key][i]))
242  IMP.rmf.save_frame(outf)
243  del rh
244  stat.write(str(dline) + '\n')
245  stat.close()
246  print('wrote stats to', out_stat_fn)
247  print('wrote rmfs to', out_rmf_fn)
248
249
250 class _TempProvenance(object):
251  """Placeholder to track provenance information added to the IMP model.
252  This is since we typically don't preserve the IMP::Model object
253  throughout a PMI protocol."""
254  def __init__(self, provclass, particle_name, *args, **kwargs):
255  self.provclass = provclass
256  self.particle_name = particle_name
257  self.args = args
258  self.kwargs = kwargs
259
260  def get_decorator(self, model):
261  """Instantiate and return an IMP Provenance object in the model."""
263  return self.provclass.setup_particle(model, pi, *self.args,
264  **self.kwargs)
265
266
267 class ClusterProvenance(_TempProvenance):
268  def __init__(self, *args, **kwargs):
269  _TempProvenance.__init__(self, IMP.core.ClusterProvenance,
270  "clustering", *args, **kwargs)
271
272
273 class FilterProvenance(_TempProvenance):
274  def __init__(self, *args, **kwargs):
275  _TempProvenance.__init__(self, IMP.core.FilterProvenance,
276  "filtering", *args, **kwargs)
277
278
279 class CombineProvenance(_TempProvenance):
280  def __init__(self, *args, **kwargs):
281  _TempProvenance.__init__(self, IMP.core.CombineProvenance,
282  "combine runs", *args, **kwargs)
283
284
286  """Add provenance information in prov (a list of _TempProvenance
287  objects) to each of the IMP hierarchies provided.
288  Note that we do this all at once since we typically don't preserve
289  the IMP::Model object throughout a PMI protocol."""
290  for h in hiers:
292  m = h.get_model()
293  for p in prov:
295
296
297 def get_best_models(stat_files,
298  score_key="SimplifiedModel_Total_Score_None",
299  feature_keys=None,
300  rmf_file_key="rmf_file",
301  rmf_file_frame_key="rmf_frame_index",
302  prefiltervalue=None,
303  get_every=1, provenance=None):
304  """ Given a list of stat files, read them all and find the best models.
305  Returns the best rmf filenames, frame numbers, scores, and values
306  for feature keywords
307  """
308  rmf_file_list = [] # best RMF files
309  rmf_file_frame_list = [] # best RMF frames
310  score_list = [] # best scores
311  # best values of the feature keys
312  feature_keyword_list_dict = defaultdict(list)
313  statistics = IMP.pmi.output.OutputStatistics()
314  for sf in stat_files:
315  root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
316  if sf[-4:] == 'rmf3':
317  root_directory_of_stat_file = os.path.dirname(
318  os.path.abspath(root_directory_of_stat_file))
319  print("getting data from file %s" % sf)
321
322  try:
323  file_keywords = po.get_keys()
324  except: # noqa: E722
325  continue
326
327  keywords = [score_key, rmf_file_key, rmf_file_frame_key]
328
329  # check all requested keys are in the file
330  # this looks weird because searching for "*requested_key*"
331  if feature_keys:
332  for requested_key in feature_keys:
333  for file_k in file_keywords:
334  if requested_key in file_k:
335  if file_k not in keywords:
336  keywords.append(file_k)
337
338  if prefiltervalue is None:
339  fields = po.get_fields(keywords,
340  get_every=get_every,
341  statistics=statistics)
342  else:
343  fields = po.get_fields(
344  keywords, filtertuple=(score_key, "<", prefiltervalue),
345  get_every=get_every, statistics=statistics)
346
347  # check that all lengths are all equal
348  length_set = set()
349  for f in fields:
351
352  # if some of the fields are missing, truncate
353  # the feature files to the shortest one
354  if len(length_set) > 1:
355  print("get_best_models: the statfile is not synchronous")
356  minlen = min(length_set)
357  for f in fields:
358  fields[f] = fields[f][0:minlen]
359
360  # append to the lists
361  score_list += fields[score_key]
362  for rmf in fields[rmf_file_key]:
363  rmf = os.path.normpath(rmf)
364  if root_directory_of_stat_file not in rmf:
365  rmf_local_path = os.path.join(
366  os.path.basename(os.path.dirname(rmf)),
367  os.path.basename(rmf))
368  rmf = os.path.join(root_directory_of_stat_file, rmf_local_path)
369  rmf_file_list.append(rmf)
370
371  rmf_file_frame_list += fields[rmf_file_frame_key]
372
373  for k in keywords:
374  feature_keyword_list_dict[k] += fields[k]
375
376  # Record combining and filtering operations in provenance, if requested
377  if provenance is not None:
378  if len(stat_files) > 1:
379  provenance.append(CombineProvenance(len(stat_files),
380  statistics.total))
381  if get_every != 1:
382  provenance.append(
383  FilterProvenance("Keep fraction", 0.,
384  statistics.passed_get_every))
385  if prefiltervalue is not None:
386  provenance.append(FilterProvenance("Total score",
387  prefiltervalue,
388  statistics.passed_filtertuple))
389
390  return (rmf_file_list, rmf_file_frame_list, score_list,
391  feature_keyword_list_dict)
392
393
394 def get_trajectory_models(stat_files,
395  score_key="SimplifiedModel_Total_Score_None",
396  rmf_file_key="rmf_file",
397  rmf_file_frame_key="rmf_frame_index",
398  get_every=1):
399  """Given a list of stat files, read them all and find a trajectory
400  of models. Returns the rmf filenames, frame numbers, scores, and
401  values for feature keywords
402  """
403  rmf_file_list = [] # best RMF files
404  rmf_file_frame_list = [] # best RMF frames
405  score_list = [] # best scores
406  for sf in stat_files:
407  root_directory_of_stat_file = Path(sf).parent.parent
408  print("getting data from file %s" % sf)
410
411  feature_keywords = [score_key, rmf_file_key, rmf_file_frame_key]
412
413  fields = po.get_fields(feature_keywords, get_every=get_every)
414
415  # check that all lengths are all equal
416  length_set = set()
417  for f in fields:
419
420  # if some of the fields are missing, truncate
421  # the feature files to the shortest one
422  if len(length_set) > 1:
423  print("get_best_models: the statfile is not synchronous")
424  minlen = min(length_set)
425  for f in fields:
426  fields[f] = fields[f][0:minlen]
427
428  # append to the lists
429  score_list += fields[score_key]
430  for rmf in fields[rmf_file_key]:
431  rmf_file_list.append(str(root_directory_of_stat_file / rmf))
432
433  rmf_file_frame_list += fields[rmf_file_frame_key]
434
435  return rmf_file_list, rmf_file_frame_list, score_list
436
437
439  rmf_tuples,
440  alignment_components=None,
441  rmsd_calculation_components=None,
442  state_number=0):
443  """Read in coordinates of a set of RMF tuples.
444  Returns the coordinates split as requested (all, alignment only, rmsd only)
445  as well as RMF file names (as keys in a dictionary, with values being
446  the rank number) and just a plain list
447  @param model The IMP model
448  @param rmf_tuples [score,filename,frame number,original order number, rank]
449  @param alignment_components Tuples to specify what you're aligning on
450  @param rmsd_calculation_components Tuples to specify what components
451  are used for RMSD calc
452  """
453  all_coordinates = []
454  rmsd_coordinates = []
455  alignment_coordinates = []
456  all_rmf_file_names = []
457  rmf_file_name_index_dict = {} # storing the features
458
459  for cnt, tpl in enumerate(rmf_tuples):
460  rmf_file = tpl[1]
461  frame_number = tpl[2]
462  if cnt == 0:
463  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
464  frame_number,
465  rmf_file)
466  else:
468  rmf_file)
469
470  if not prots:
471  continue
472  if IMP.pmi.get_is_canonical(prots[0]):
473  states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
474  prot = states[state_number]
475  else:
476  prot = prots[state_number]
477
478  # getting the particles
480  all_particles = [pp for key in part_dict for pp in part_dict[key]]
481  all_ps_set = set(all_particles)
482  model_coordinate_dict = {}
483  template_coordinate_dict = {}
484  rmsd_coordinate_dict = {}
485
486  for pr in part_dict:
487  model_coordinate_dict[pr] = np.array(
488  [np.array(IMP.core.XYZ(i).get_coordinates())
489  for i in part_dict[pr]])
490  # for each file, get (as floats) a list of all coordinates
491  # of all requested tuples, organized as dictionaries.
492  for tuple_dict, result_dict in zip(
493  (alignment_components, rmsd_calculation_components),
494  (template_coordinate_dict, rmsd_coordinate_dict)):
495
496  if tuple_dict is None:
497  continue
498
499  # PMI2: do selection of resolution and name at the same time
500  if IMP.pmi.get_is_canonical(prot):
501  for pr in tuple_dict:
503  prot, tuple_dict[pr], resolution=1)
504  result_dict[pr] = [
505  list(map(float, IMP.core.XYZ(p).get_coordinates()))
506  for p in ps]
507  else:
508  for pr in tuple_dict:
509  if type(tuple_dict[pr]) is str:
510  name = tuple_dict[pr]
511  s = IMP.atom.Selection(prot, molecule=name)
512  elif type(tuple_dict[pr]) is tuple:
513  name = tuple_dict[pr][2]
514  rend = tuple_dict[pr][1]
515  rbegin = tuple_dict[pr][0]
516  s = IMP.atom.Selection(
517  prot, molecule=name,
518  residue_indexes=range(rbegin, rend+1))
519  ps = s.get_selected_particles()
520  filtered_particles = [p for p in ps if p in all_ps_set]
521  result_dict[pr] = \
522  [list(map(float, IMP.core.XYZ(p).get_coordinates()))
523  for p in filtered_particles]
524
525  all_coordinates.append(model_coordinate_dict)
526  alignment_coordinates.append(template_coordinate_dict)
527  rmsd_coordinates.append(rmsd_coordinate_dict)
528  frame_name = rmf_file + '|' + str(frame_number)
529  all_rmf_file_names.append(frame_name)
530  rmf_file_name_index_dict[frame_name] = tpl[4]
531  return (all_coordinates, alignment_coordinates, rmsd_coordinates,
532  rmf_file_name_index_dict, all_rmf_file_names)
533
534
536  state_number=0):
537  '''
538  @param model The IMP model
539  @param rmf_tuple score,filename,frame number,original order number, rank
540  @param rmsd_calculation_components Tuples to specify what components
541  are used for RMSD calc
542  '''
543  if rmsd_calculation_components is None:
544  return {}
545
546  rmf_file = rmf_tuple[1]
547  frame_number = rmf_tuple[2]
548  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
549  frame_number,
550  rmf_file)
551
552  if IMP.pmi.get_is_canonical(prots[0]):
553  states = IMP.atom.get_by_type(prots[0], IMP.atom.STATE_TYPE)
554  prot = states[state_number]
555  else:
556  prot = prots[state_number]
557
559
560  # PMI2: do selection of resolution and name at the same time
561  if IMP.pmi.get_is_canonical(prot):
562  for pr in rmsd_calculation_components:
564  prot, rmsd_calculation_components[pr], resolution=1)
566  len(IMP.pmi.tools.get_residue_indexes(p)) for p in ps]
567  else:
568  # getting the particles
570  all_particles = [pp for key in part_dict for pp in part_dict[key]]
571  all_ps_set = set(all_particles)
572
573  # getting the coordinates
574  for pr in rmsd_calculation_components:
575  if type(rmsd_calculation_components[pr]) is str:
576  name = rmsd_calculation_components[pr]
577  s = IMP.atom.Selection(prot, molecule=name)
578  elif type(rmsd_calculation_components[pr]) is tuple:
579  name = rmsd_calculation_components[pr][2]
580  rend = rmsd_calculation_components[pr][1]
581  rbegin = rmsd_calculation_components[pr][0]
582  s = IMP.atom.Selection(
583  prot, molecule=name, residue_indexes=range(rbegin, rend+1))
584  ps = s.get_selected_particles()
585  filtered_particles = [p for p in ps if p in all_ps_set]
588  for p in filtered_particles]
589
591
592
593 class TotalScoreOutput(object):
594  """A helper output for model evaluation"""
595  def __init__(self, model):
596  self.model = model
597  self.rs = IMP.pmi.tools.get_restraint_set(self.model)
598
599  def get_output(self):
600  score = self.rs.evaluate(False)
601  output = {}
602  output["Total_Score"] = str(score)
603  return output
def get_restraint_set
Get a RestraintSet containing all PMI restraints added to the model.
Definition: tools.py:111
A class for reading stat files (either rmf or ascii v1 and v2)
Definition: output.py:947
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
Read a DSSP file, and return secondary structure elements (SSEs).
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
Miscellaneous utilities.
Definition: tools.py:1
Collect statistics from ProcessOutput.get_fields().
Definition: output.py:936
Track creation of a system fragment by combination.
Definition: provenance.h:280
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def select_by_tuple_2
New tuple format: molname OR (start,stop,molname,copynum,statenum) Copy and state are optional...
Definition: tools.py:437
Track creation of a system fragment by filtering.
Definition: provenance.h:340
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Definition: pmi/utilities.h:91
Classes for writing output files and processing them.
Definition: output.py:1
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Track creation of a system fragment from clustering.
Definition: provenance.h:426
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.