IMP Reference Guide  develop.27926d84dc,2024/04/22
The Integrative Modeling Platform
pmi1/io/__init__.py
1 """@namespace IMP.pmi1.io
2  Utility classes and functions for reading and storing PMI files
3 """
4 
5 from __future__ import print_function
6 import IMP
7 import IMP.algebra
8 import IMP.atom
9 import IMP.pmi1
10 import IMP.rmf
11 import IMP.pmi1.analysis
12 import IMP.pmi1.output
13 import IMP.pmi1.tools
14 import RMF
15 import sys,os
16 import numpy as np
17 import re
18 from collections import defaultdict
19 import itertools
20 
21 def parse_dssp(dssp_fn, limit_to_chains='',name_map=None):
22  """Read a DSSP file, and return secondary structure elements (SSEs).
23  All residue values use PDB numbering.
24  @param dssp_fn The file to read
25  @param limit_to_chains Only read/return these chain IDs
26  @param name_map If passed, return tuples organized by molecule name
27  (name_map should be a dictionary with chain IDs as keys and
28  molecule names as values).
29 
30  @return a dictionary with keys 'helix', 'beta', 'loop'
31  Each contains a list of SSEs.
32  Each SSE is a list of elements (e.g. strands in a sheet)
33  Each element is a tuple (residue start, residue end, chain)
34 
35  Example for a structure with helix A:5-7 and beta strands A:1-3, A:9-11:
36 
37  @code{.py}
38  ret = { 'helix' : [ [ (5,7,'A') ], ... ],
39          'beta'  : [ [ (1,3,'A'),
40                        (9,11,'A'), ... ], ... ],
41          'loop'  : ...  # same format as 'helix'
42        }
43  @endcode
44  """
45 
46  def convert_chain(ch):
47  if name_map is None:
48  return ch
49  else:
50  try:
51  return name_map[ch]
52  except:
53  return ch
54 
55  # setup
56  helix_classes = 'GHI'
57  strand_classes = 'EB'
58  loop_classes = [' ', '', 'T', 'S']
59  sse_dict = {}
60  for h in helix_classes:
61  sse_dict[h] = 'helix'
62  for s in strand_classes:
63  sse_dict[s] = 'beta'
64  for l in loop_classes:
65  sse_dict[l] = 'loop'
66  sses = {'helix':[],
67  'beta':[],
68  'loop':[]}
69 
70  # read file and parse
71  start = False
72 
73  # temporary beta dictionary indexed by DSSP's ID
74  beta_dict = IMP.pmi1.tools.OrderedDefaultDict(list)
75  prev_sstype = None
76  prev_beta_id = None
77 
78  with open(dssp_fn, 'r') as fh:
79  for line in fh:
80  fields = line.split()
81  chain_break = False
82  if len(fields) < 2:
83  continue
84  if fields[1] == "RESIDUE":
85  # Start parsing from here
86  start = True
87  continue
88  if not start:
89  continue
90  if line[9] == " ":
91  chain_break = True
92  elif limit_to_chains != '' and line[11] not in limit_to_chains:
93  continue
94 
95  # gather line info
96  if not chain_break:
97  pdb_res_num = int(line[5:10])
98  chain = line[11]
99  sstype = sse_dict[line[16]]
100  beta_id = line[33]
101 
102  # decide whether to extend or store the SSE
103  if prev_sstype is None:
104  cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
105  elif sstype != prev_sstype or chain_break:
106  # add cur_sse to the right place
107  if prev_sstype in ['helix', 'loop']:
108  sses[prev_sstype].append([cur_sse])
109  elif prev_sstype == 'beta':
110  beta_dict[prev_beta_id].append(cur_sse)
111  cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
112  else:
113  cur_sse[1] = pdb_res_num
114  if chain_break:
115  prev_sstype = None
116  prev_beta_id = None
117  else:
118  prev_sstype = sstype
119  prev_beta_id = beta_id
120 
121  # final SSE processing
122  if prev_sstype is not None:
123  if prev_sstype in ['helix', 'loop']:
124  sses[prev_sstype].append([cur_sse])
125  elif prev_sstype == 'beta':
126  beta_dict[prev_beta_id].append(cur_sse)
127  # gather betas
128  for beta_sheet in beta_dict:
129  sses['beta'].append(beta_dict[beta_sheet])
130  return sses
131 
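As an illustration, a minimal sketch of how the returned dictionary might be consumed; the DSSP file name and the chain-to-molecule mapping below are placeholders, not part of the module:

@code{.py}
import IMP.pmi1.io

# Hypothetical inputs: 'my_structure.dssp' and the 'A' -> 'ProtA' mapping.
sses = IMP.pmi1.io.parse_dssp('my_structure.dssp', limit_to_chains='A',
                              name_map={'A': 'ProtA'})
# Each entry in sses['helix'] is a list with a single (start, end, chain) element.
for helix_group in sses['helix']:
    for start, end, mol in helix_group:
        print("helix %s:%d-%d" % (mol, start, end))
@endcode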
132 def save_best_models(model,
133  out_dir,
134  stat_files,
135  number_of_best_scoring_models=10,
136  get_every=1,
137  score_key="SimplifiedModel_Total_Score_None",
138  feature_keys=None,
139  rmf_file_key="rmf_file",
140  rmf_file_frame_key="rmf_frame_index",
141  override_rmf_dir=None):
142  """Given a list of stat files, read them all and find the best models.
143  Save to a single RMF along with a stat file.
144  @param model The IMP Model
145  @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
146  @param stat_files List of all stat files to collect
147  @param number_of_best_scoring_models Number of best-scoring models to gather
148  @param get_every Only read every Nth frame
149  @param score_key The stat-file key used for ranking
150  @param feature_keys Stat-file keys to keep in the output
151  @param rmf_file_key The stat-file key giving the RMF file name
152  @param rmf_file_frame_key The stat-file key giving the RMF frame number
153  @param override_rmf_dir For output, change the name of the RMF directory (experimental)
154  """
155 
156  # start by splitting into jobs
157  try:
158  from mpi4py import MPI
159  comm = MPI.COMM_WORLD
160  rank = comm.Get_rank()
161  number_of_processes = comm.size
162  except ImportError:
163  rank = 0
164  number_of_processes = 1
165  my_stat_files=IMP.pmi1.tools.chunk_list_into_segments(
166  stat_files,number_of_processes)[rank]
167 
168  # filenames
169  out_stat_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".out")
170  out_rmf_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".rmf3")
171 
172  # extract all the models
173  all_fields=[]
174  for nsf,sf in enumerate(my_stat_files):
175 
176  # get list of keywords
177  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
178  print("getting data from file %s" % sf)
179  po = IMP.pmi1.output.ProcessOutput(sf)
180  all_keys = [score_key,
181  rmf_file_key,
182  rmf_file_frame_key]
183  for k in po.get_keys():
184  for fk in feature_keys:
185  if fk in k:
186  all_keys.append(k)
187  fields = po.get_fields(all_keys,
188  get_every=get_every)
189 
190  # check that all lengths are equal
191  length_set = set([len(fields[f]) for f in fields])
192  minlen = min(length_set)
193  # if some of the fields are missing, truncate
194  # all fields to the shortest one
195  if len(length_set) > 1:
196  print("save_best_models: the statfile is not synchronous")
197  minlen = min(length_set)
198  for f in fields:
199  fields[f] = fields[f][0:minlen]
200  if nsf==0:
201  all_fields=fields
202  else:
203  for k in fields:
204  all_fields[k]+=fields[k]
205 
206  if override_rmf_dir is not None:
207  for i in range(minlen):
208  all_fields[rmf_file_key][i]=os.path.join(
209  override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
210 
211  # gather info, sort, write
212  if number_of_processes!=1:
213  comm.Barrier()
214  if rank!=0:
215  comm.send(all_fields, dest=0, tag=11)
216  else:
217  for i in range(1,number_of_processes):
218  data_tmp = comm.recv(source=i, tag=11)
219  for k in all_fields:
220  all_fields[k]+=data_tmp[k]
221 
222  # sort by total score
223  order = sorted(range(len(all_fields[score_key])),
224  key=lambda i: float(all_fields[score_key][i]))
225 
226  # write the stat and RMF files
227  stat = open(out_stat_fn,'w')
228  rh0 = RMF.open_rmf_file_read_only(
229  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
230  prots = IMP.rmf.create_hierarchies(rh0,model)
231  del rh0
232  outf = RMF.create_rmf_file(out_rmf_fn)
233  IMP.rmf.add_hierarchies(outf,prots)
234  for nm,i in enumerate(order[:number_of_best_scoring_models]):
235  dline=dict((k,all_fields[k][i]) for k in all_fields)
236  dline['orig_rmf_file']=dline[rmf_file_key]
237  dline['orig_rmf_frame_index']=dline[rmf_file_frame_key]
238  dline[rmf_file_key]=out_rmf_fn
239  dline[rmf_file_frame_key]=nm
240  rh = RMF.open_rmf_file_read_only(
241  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
242  IMP.rmf.link_hierarchies(rh,prots)
243  IMP.rmf.load_frame(rh,
244  RMF.FrameID(all_fields[rmf_file_frame_key][i]))
245  IMP.rmf.save_frame(outf)
246  del rh
247  stat.write(str(dline)+'\n')
248  stat.close()
249  print('wrote stats to',out_stat_fn)
250  print('wrote rmfs to',out_rmf_fn)
251 
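A sketch of a typical call, assuming two hypothetical run directories whose stat files were written by IMP.pmi1.output, and an existing output directory named best_models:

@code{.py}
import IMP
import IMP.pmi1.io

model = IMP.Model()
# Hypothetical paths; the 'best_models' directory must already exist.
IMP.pmi1.io.save_best_models(model, 'best_models',
                             ['run1/output/stat.0.out',
                              'run2/output/stat.0.out'],
                             number_of_best_scoring_models=10,
                             feature_keys=['ConnectivityRestraint'])
@endcode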
252 class _TempProvenance(object):
253  """Placeholder to track provenance information added to the IMP model.
254  This is needed because we typically don't preserve the IMP::Model object
255  throughout a PMI protocol."""
256  def __init__(self, provclass, particle_name, *args, **kwargs):
257  self.provclass = provclass
258  self.particle_name = particle_name
259  self.args = args
260  self.kwargs = kwargs
261 
262  def get_decorator(self, model):
263  """Instantiate and return an IMP Provenance object in the model."""
264  pi = model.add_particle(self.particle_name)
265  return self.provclass.setup_particle(model, pi, *self.args,
266  **self.kwargs)
267 
268 
269 class ClusterProvenance(_TempProvenance):
270  def __init__(self, *args, **kwargs):
271  _TempProvenance.__init__(self, IMP.core.ClusterProvenance,
272  "clustering", *args, **kwargs)
273 
274 
275 class FilterProvenance(_TempProvenance):
276  def __init__(self, *args, **kwargs):
277  _TempProvenance.__init__(self, IMP.core.FilterProvenance,
278  "filtering", *args, **kwargs)
279 
280 
281 class CombineProvenance(_TempProvenance):
282  def __init__(self, *args, **kwargs):
283  _TempProvenance.__init__(self, IMP.core.CombineProvenance,
284  "combine runs", *args, **kwargs)
285 
286 
287 def add_provenance(prov, hiers):
288  """Add provenance information in `prov` (a list of _TempProvenance
289  objects) to each of the IMP hierarchies provided.
290  Note that we do this all at once since we typically don't preserve
291  the IMP::Model object throughout a PMI protocol."""
292  for h in hiers:
293  IMP.pmi1.tools._add_pmi_provenance(h)
294  m = h.get_model()
295  for p in prov:
296  IMP.core.add_provenance(m, h, p.get_decorator(m))
297 
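A sketch of how these placeholders are meant to be used: collect them while filtering and clustering, then attach them at the end with add_provenance(). The numbers and the hierarchies variable below are assumptions for illustration:

@code{.py}
import IMP.pmi1.io

prov = []
# e.g. 500 frames passed a (hypothetical) total-score cutoff of 100
prov.append(IMP.pmi1.io.FilterProvenance("Total score", 100.0, 500))
# e.g. the model comes from a (hypothetical) cluster of 25 members
prov.append(IMP.pmi1.io.ClusterProvenance(25))
# 'hierarchies' is assumed to hold the protocol's IMP.atom.Hierarchy objects
IMP.pmi1.io.add_provenance(prov, hierarchies)
@endcode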
298 def get_best_models(stat_files,
299  score_key="SimplifiedModel_Total_Score_None",
300  feature_keys=None,
301  rmf_file_key="rmf_file",
302  rmf_file_frame_key="rmf_frame_index",
303  prefiltervalue=None,
304  get_every=1, provenance=None):
305  """ Given a list of stat files, read them all and find the best models.
306  Returns the best RMF filenames, frame numbers, scores, and values for the feature keywords.
307  """
308  rmf_file_list=[] # best RMF files
309  rmf_file_frame_list=[] # best RMF frames
310  score_list=[] # best scores
311  feature_keyword_list_dict=defaultdict(list) # best values of the feature keys
312  statistics = IMP.pmi1.output.OutputStatistics()
313  for sf in stat_files:
314  root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
315  if sf[-4:]=='rmf3':
316  root_directory_of_stat_file = os.path.dirname(os.path.abspath(root_directory_of_stat_file))
317  print("getting data from file %s" % sf)
318  po = IMP.pmi1.output.ProcessOutput(sf)
319 
320  try:
321  file_keywords = po.get_keys()
322  except:
323  continue
324 
325  keywords = [score_key,
326  rmf_file_key,
327  rmf_file_frame_key]
328 
329  # check that all requested keys are in the file
330  # (keys are matched as substrings, i.e. "*requested_key*")
331  if feature_keys:
332  for requested_key in feature_keys:
333  for file_k in file_keywords:
334  if requested_key in file_k:
335  if file_k not in keywords:
336  keywords.append(file_k)
337 
338  if prefiltervalue is None:
339  fields = po.get_fields(keywords,
340  get_every=get_every,
341  statistics=statistics)
342  else:
343  fields = po.get_fields(keywords,
344  filtertuple=(score_key,"<",prefiltervalue),
345  get_every=get_every,
346  statistics=statistics)
347 
348  # check that all lengths are equal
349  length_set = set()
350  for f in fields:
351  length_set.add(len(fields[f]))
352 
353  # if some of the fields are missing, truncate
354  # all fields to the shortest one
355  if len(length_set) > 1:
356  print("get_best_models: the statfile is not synchronous")
357  minlen = min(length_set)
358  for f in fields:
359  fields[f] = fields[f][0:minlen]
360 
361  # append to the lists
362  score_list += fields[score_key]
363  for rmf in fields[rmf_file_key]:
364  rmf=os.path.normpath(rmf)
365  if root_directory_of_stat_file not in rmf:
366  rmf_local_path=os.path.join(os.path.basename(os.path.dirname(rmf)),os.path.basename(rmf))
367  rmf=os.path.join(root_directory_of_stat_file,rmf_local_path)
368  rmf_file_list.append(rmf)
369 
370  rmf_file_frame_list += fields[rmf_file_frame_key]
371 
372  for k in keywords:
373  feature_keyword_list_dict[k] += fields[k]
374 
375  # Record combining and filtering operations in provenance, if requested
376  if provenance is not None:
377  if len(stat_files) > 1:
378  provenance.append(CombineProvenance(len(stat_files),
379  statistics.total))
380  if get_every != 1:
381  provenance.append(FilterProvenance("Keep fraction",
382  0., statistics.passed_get_every))
383  if prefiltervalue is not None:
384  provenance.append(FilterProvenance("Total score",
385  prefiltervalue,
386  statistics.passed_filtertuple))
387 
388  return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
389 
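A sketch of typical use, with hypothetical stat-file paths, a problem-specific score cutoff, and an optional provenance list that records the filtering:

@code{.py}
import IMP.pmi1.io

prov = []  # will receive Combine/Filter provenance placeholders
rmf_files, rmf_frames, scores, features = IMP.pmi1.io.get_best_models(
    ['run1/output/stat.0.out', 'run2/output/stat.0.out'],  # hypothetical paths
    feature_keys=['ConnectivityRestraint'],                # substring-matched
    prefiltervalue=500.0,                                  # hypothetical cutoff
    get_every=5,
    provenance=prov)
@endcode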
390 def get_trajectory_models(stat_files,
391  score_key="SimplifiedModel_Total_Score_None",
392  rmf_file_key="rmf_file",
393  rmf_file_frame_key="rmf_frame_index",
394  get_every=1):
395  """ Given a list of stat files, read them all and find a trajectory of models.
396  Returns the RMF filenames, frame numbers, and scores.
397  """
398  rmf_file_list=[] # trajectory RMF files
399  rmf_file_frame_list=[] # trajectory RMF frames
400  score_list=[] # trajectory scores
401  for sf in stat_files:
402  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
403  print("getting data from file %s" % sf)
404  po = IMP.pmi1.output.ProcessOutput(sf)
405  keywords = po.get_keys()
406 
407  feature_keywords = [score_key,
408  rmf_file_key,
409  rmf_file_frame_key]
410 
411  fields = po.get_fields(feature_keywords,
412  get_every=get_every)
413 
414  # check that all lengths are equal
415  length_set = set()
416  for f in fields:
417  length_set.add(len(fields[f]))
418 
419  # if some of the fields are missing, truncate
420  # all fields to the shortest one
421  if len(length_set) > 1:
422  print("get_trajectory_models: the statfile is not synchronous")
423  minlen = min(length_set)
424  for f in fields:
425  fields[f] = fields[f][0:minlen]
426 
427  # append to the lists
428  score_list += fields[score_key]
429  for rmf in fields[rmf_file_key]:
430  rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
431 
432  rmf_file_frame_list += fields[rmf_file_frame_key]
433 
434  return rmf_file_list,rmf_file_frame_list,score_list
435 
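For completeness, a sketch of a call that keeps every 10th model along the trajectory; the path is a placeholder:

@code{.py}
import IMP.pmi1.io

rmf_files, rmf_frames, scores = IMP.pmi1.io.get_trajectory_models(
    ['run1/output/stat.0.out'], get_every=10)
@endcode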
436 
437 def read_coordinates_of_rmfs(model,
438  rmf_tuples,
439  alignment_components=None,
440  rmsd_calculation_components=None,
441  state_number=0):
442  """ Read in coordinates of a set of RMF tuples.
443  Returns the coordinates split as requested (all, alignment-only, RMSD-only), plus
444  a dictionary mapping each RMF file name to its rank and a plain list of the file names.
445  @param model The IMP model
446  @param rmf_tuples [score,filename,frame number,original order number, rank]
447  @param alignment_components Tuples to specify what you're aligning on
448  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
449  """
450  all_coordinates = []
451  rmsd_coordinates = []
452  alignment_coordinates = []
453  all_rmf_file_names = []
454  rmf_file_name_index_dict = {} # storing the features
455 
456  for cnt, tpl in enumerate(rmf_tuples):
457  rmf_file = tpl[1]
458  frame_number = tpl[2]
459  if cnt==0:
460  prots = IMP.pmi1.analysis.get_hiers_from_rmf(model,
461  frame_number,
462  rmf_file)
463  else:
464  IMP.pmi1.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
465 
466  if not prots:
467  continue
468  prot = prots[state_number]
469 
470  # getting the particles
471  part_dict = IMP.pmi1.analysis.get_particles_at_resolution_one(prot)
472  all_particles = [pp for key in part_dict for pp in part_dict[key]]
473  all_ps_set = set(all_particles)
474  model_coordinate_dict = {}
475  template_coordinate_dict={}
476  rmsd_coordinate_dict={}
477 
478  for pr in part_dict:
479  model_coordinate_dict[pr] = np.array(
480  [np.array(IMP.core.XYZ(i).get_coordinates()) for i in part_dict[pr]])
481  # for each file, get (as floats) a list of all coordinates
482  # of all requested tuples, organized as dictionaries.
483  for tuple_dict,result_dict in zip((alignment_components,rmsd_calculation_components),
484  (template_coordinate_dict,rmsd_coordinate_dict)):
485 
486  if tuple_dict is None:
487  continue
488 
489  for pr in tuple_dict:
490  if type(tuple_dict[pr]) is str:
491  name=tuple_dict[pr]
492  s=IMP.atom.Selection(prot,molecule=name)
493  elif type(tuple_dict[pr]) is tuple:
494  name=tuple_dict[pr][2]
495  rend=tuple_dict[pr][1]
496  rbegin=tuple_dict[pr][0]
497  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
498  ps=s.get_selected_particles()
499  filtered_particles=[p for p in ps if p in all_ps_set]
500  result_dict[pr] = \
501  [list(map(float,IMP.core.XYZ(p).get_coordinates())) for p in filtered_particles]
502 
503  all_coordinates.append(model_coordinate_dict)
504  alignment_coordinates.append(template_coordinate_dict)
505  rmsd_coordinates.append(rmsd_coordinate_dict)
506  frame_name = rmf_file + '|' + str(frame_number)
507  all_rmf_file_names.append(frame_name)
508  rmf_file_name_index_dict[frame_name] = tpl[4]
509  return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
510 
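A sketch that builds the expected [score, filename, frame number, original order, rank] tuples from a previous get_best_models() call; the molecule name 'ProtA' and the residue range are assumptions:

@code{.py}
import IMP
import IMP.pmi1.io

model = IMP.Model()
# Assumes rmf_files, rmf_frames and scores came from get_best_models() above;
# here the rank is simply the original order.
rmf_tuples = [[scores[i], rmf_files[i], rmf_frames[i], i, i]
              for i in range(len(scores))]
(all_coords, align_coords, rmsd_coords,
 name_index_dict, rmf_names) = IMP.pmi1.io.read_coordinates_of_rmfs(
    model, rmf_tuples,
    alignment_components={'ProtA': 'ProtA'},                   # align on the whole molecule
    rmsd_calculation_components={'ProtA': (1, 100, 'ProtA')})  # residues 1-100
@endcode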
511 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
512  '''
513  @param model The IMP model
514  @param rmf_tuple score,filename,frame number,original order number, rank
515  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
516  '''
517  if rmsd_calculation_components is None:
518  return {}
519 
520  rmf_file = rmf_tuple[1]
521  frame_number = rmf_tuple[2]
522  prots = IMP.pmi1.analysis.get_hiers_from_rmf(model,
523  frame_number,
524  rmf_file)
525 
526  prot = prots[state_number]
527 
528  rmsd_bead_size_dict = {}
529 
530  # getting the particles
531  part_dict = IMP.pmi1.analysis.get_particles_at_resolution_one(prot)
532  all_particles=[pp for key in part_dict for pp in part_dict[key]]
533  all_ps_set=set(all_particles)
534 
535  # getting the coordinates
536  for pr in rmsd_calculation_components:
537  if type(rmsd_calculation_components[pr]) is str:
538  name=rmsd_calculation_components[pr]
539  s=IMP.atom.Selection(prot,molecule=name)
540  elif type(rmsd_calculation_components[pr]) is tuple:
541  name=rmsd_calculation_components[pr][2]
542  rend=rmsd_calculation_components[pr][1]
543  rbegin=rmsd_calculation_components[pr][0]
544  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
545  ps=s.get_selected_particles()
546  filtered_particles=[p for p in ps if p in all_ps_set]
547  rmsd_bead_size_dict[pr] = \
548  [len(IMP.pmi1.tools.get_residue_indexes(p)) for p in filtered_particles]
549 
550  return rmsd_bead_size_dict
551 
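Continuing the sketch above, bead sizes for the same hypothetical RMSD selection can be read from a single tuple:

@code{.py}
# 'model' and 'rmf_tuples' as in the read_coordinates_of_rmfs() sketch above.
bead_sizes = IMP.pmi1.io.get_bead_sizes(
    model, rmf_tuples[0],
    rmsd_calculation_components={'ProtA': (1, 100, 'ProtA')})
@endcode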
552 class RMSDOutput(object):
553  """A helper output based on the RMSD to the initial coordinates"""
554  def __init__(self,ps,label,init_coords=None):
555  self.model = ps[0].get_model()
556  self.ps = ps
557  if init_coords is None:
558  self.init_coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
559  else:
560  self.init_coords = init_coords
561  self.label = label
562 
563  @property
564  @IMP.deprecated_method("3.0", "Model should be accessed with `.model`.")
565  def mdl(self):
566  return self.model
567 
568  def get_output(self):
569  output = {}
570  coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
571  rmsd = IMP.algebra.get_rmsd(coords,self.init_coords)
572  output["RMSD_"+self.label] = str(rmsd)
573  return output
574 
575 class TotalScoreOutput(object):
576  """A helper output for model evaluation"""
577  def __init__(self,model):
578  self.model = model
579  self.rs = IMP.pmi1.tools.get_restraint_set(self.model)
580 
581  @property
582  @IMP.deprecated_method("3.0", "Model should be accessed with `.model`.")
583  def mdl(self):
584  return self.model
585 
586  def get_output(self):
587  score = self.rs.evaluate(False)
588  output = {}
589  output["Total_Score"] = str(score)
590  return output
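A sketch of how these helpers report their values; ps is assumed to be a list of particles decorated with IMP.core.XYZ, and model a PMI-built IMP.Model with registered restraints:

@code{.py}
import IMP.pmi1.io

# Hypothetical inputs: 'ps' (XYZ-decorated particles) and 'model' (PMI model).
rmsd_out = IMP.pmi1.io.RMSDOutput(ps, "ProtA")
total_out = IMP.pmi1.io.TotalScoreOutput(model)
# ... sampling moves the particles ...
print(rmsd_out.get_output())   # {'RMSD_ProtA': '...'}
print(total_out.get_output())  # {'Total_Score': '...'}
@endcode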