IMP Reference Guide 2.8.0
The Integrative Modeling Platform
pmi/io/__init__.py
1 """@namespace IMP.pmi.io
2  Utility classes and functions for reading and storing PMI files
3 """
4 
5 from __future__ import print_function
6 import IMP
7 import IMP.algebra
8 import IMP.atom
9 import IMP.pmi
10 import IMP.rmf
11 import IMP.pmi.analysis
12 import IMP.pmi.output
13 import IMP.pmi.tools
14 import RMF
15 import sys,os
16 import numpy as np
17 import re
18 from collections import defaultdict
19 import itertools
20 
21 def parse_dssp(dssp_fn, limit_to_chains='',name_map=None):
22  """Read a DSSP file, and return secondary structure elements (SSEs).
23  Values are all PDB residue numbering.
24  @param dssp_fn The file to read
25  @param limit_to_chains Only read/return these chain IDs
26  @param name_map If passed, return tuples organized by molecule name
27  (name_map should be a dictionary with chain IDs as keys and
28  molecule names as values).
29 
30  @return a dictionary with keys 'helix', 'beta', 'loop'
31  Each contains a list of SSEs.
32  Each SSE is a list of elements (e.g. strands in a sheet)
33  Each element is a tuple (residue start, residue end, chain)
34 
35  Example for a structure with helix A:5-7 and beta strands A:1-3, A:9-11:
36 
37  @code{.py}
38  ret = { 'helix' : [ [ (5,7,'A') ], ... ],
39          'beta'  : [ [ (1,3,'A'),
40                        (9,11,'A'), ... ], ... ],
41          'loop'  : same format as helix
42        }
43  @endcode
44  """
45 
46  def convert_chain(ch):
47  if name_map is None:
48  return ch
49  else:
50  try:
51  return name_map[ch]
52  except:
53  return ch
54 
55  # setup
56  helix_classes = 'GHI'
57  strand_classes = 'EB'
58  loop_classes = [' ', '', 'T', 'S']
59  sse_dict = {}
60  for h in helix_classes:
61  sse_dict[h] = 'helix'
62  for s in strand_classes:
63  sse_dict[s] = 'beta'
64  for l in loop_classes:
65  sse_dict[l] = 'loop'
66  sses = {'helix':[],
67  'beta':[],
68  'loop':[]}
69 
70  # read file and parse
71  start = False
72 
73  # temporary beta dictionary indexed by DSSP's ID
74  beta_dict = IMP.pmi.tools.OrderedDefaultDict(list)
75  prev_sstype = None
76  prev_beta_id = None
77 
78  with open(dssp_fn, 'r') as fh:
79  for line in fh:
80  fields = line.split()
81  chain_break = False
82  if len(fields) < 2:
83  continue
84  if fields[1] == "RESIDUE":
85  # Start parsing from here
86  start = True
87  continue
88  if not start:
89  continue
90  if line[9] == " ":
91  chain_break = True
92  elif limit_to_chains != '' and line[11] not in limit_to_chains:
93  continue
94 
95  # gather line info
96  if not chain_break:
97  pdb_res_num = int(line[5:10])
98  chain = line[11]
99  sstype = sse_dict[line[16]]
100  beta_id = line[33]
101 
102  # decide whether to extend or store the SSE
103  if prev_sstype is None:
104  cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
105  elif sstype != prev_sstype or chain_break:
106  # add cur_sse to the right place
107  if prev_sstype in ['helix', 'loop']:
108  sses[prev_sstype].append([cur_sse])
109  elif prev_sstype == 'beta':
110  beta_dict[prev_beta_id].append(cur_sse)
111  cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
112  else:
113  cur_sse[1] = pdb_res_num
114  if chain_break:
115  prev_sstype = None
116  prev_beta_id = None
117  else:
118  prev_sstype = sstype
119  prev_beta_id = beta_id
120 
121  # final SSE processing
122  if prev_sstype is not None:
123  if prev_sstype in ['helix', 'loop']:
124  sses[prev_sstype].append([cur_sse])
125  elif prev_sstype == 'beta':
126  beta_dict[prev_beta_id].append(cur_sse)
127  # gather betas
128  for beta_sheet in beta_dict:
129  sses['beta'].append(beta_dict[beta_sheet])
130  return sses
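For example, a minimal usage sketch (the DSSP file name and the chain-to-molecule mapping below are hypothetical):

@code{.py}
import IMP.pmi.io

# hypothetical DSSP output for a structure with chain A
sses = IMP.pmi.io.parse_dssp("structure.dssp", limit_to_chains='A',
                             name_map={'A': 'ProtA'})
for helix in sses['helix']:
    start, end, mol = helix[0]      # each helix SSE holds a single element
    print("helix %s:%d-%d" % (mol, start, end))
@endcode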
131 
132 def save_best_models(mdl,
133  out_dir,
134  stat_files,
135  number_of_best_scoring_models=10,
136  get_every=1,
137  score_key="SimplifiedModel_Total_Score_None",
138  feature_keys=None,
139  rmf_file_key="rmf_file",
140  rmf_file_frame_key="rmf_frame_index",
141  override_rmf_dir=None):
142  """Given a list of stat files, read them all and find the best models.
143  Save to a single RMF along with a stat file.
144  @param mdl The IMP Model
145  @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
146  @param stat_files List of all stat files to collect
147  @param number_of_best_scoring_models Number of best-scoring models to gather
148  @param get_every Only read every nth frame
149  @param score_key The stat-file key used for ranking the models
150  @param feature_keys Additional stat-file keys to keep in the output
151  @param rmf_file_key The stat-file key that stores the RMF file name
152  @param rmf_file_frame_key The stat-file key that stores the RMF frame number
153  @param override_rmf_dir For output, change the name of the RMF directory (experimental)
154  """
155 
156  # start by splitting into jobs
157  try:
158  from mpi4py import MPI
159  comm = MPI.COMM_WORLD
160  rank = comm.Get_rank()
161  number_of_processes = comm.size
162  except ImportError:
163  rank = 0
164  number_of_processes = 1
165  my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
166  stat_files,number_of_processes)[rank]
167 
168  # filenames
169  out_stat_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".out")
170  out_rmf_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".rmf3")
171 
172  # extract all the models
173  all_fields=[]
174  for nsf,sf in enumerate(my_stat_files):
175 
176  # get list of keywords
177  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
178  print("getting data from file %s" % sf)
179  po = IMP.pmi.output.ProcessOutput(sf)
180  all_keys = [score_key,
181  rmf_file_key,
182  rmf_file_frame_key]
183  for k in po.get_keys():
184  for fk in feature_keys:
185  if fk in k:
186  all_keys.append(k)
187  fields = po.get_fields(all_keys,
188  get_every=get_every)
189 
190  # check that all field lengths are equal
191  length_set = set([len(fields[f]) for f in fields])
192  minlen = min(length_set)
193  # if some of the fields are missing, truncate
194  # the feature files to the shortest one
195  if len(length_set) > 1:
196  print("save_best_models: the statfile is not synchronous")
197  minlen = min(length_set)
198  for f in fields:
199  fields[f] = fields[f][0:minlen]
200  if nsf==0:
201  all_fields=fields
202  else:
203  for k in fields:
204  all_fields[k]+=fields[k]
205 
206  if override_rmf_dir is not None:
207  for i in range(minlen):
208  all_fields[rmf_file_key][i]=os.path.join(
209  override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
210 
211  # gather info, sort, write
212  if number_of_processes!=1:
213  comm.Barrier()
214  if rank!=0:
215  comm.send(all_fields, dest=0, tag=11)
216  else:
217  for i in range(1,number_of_processes):
218  data_tmp = comm.recv(source=i, tag=11)
219  for k in all_fields:
220  all_fields[k]+=data_tmp[k]
221 
222  # sort by total score
223  order = sorted(range(len(all_fields[score_key])),
224  key=lambda i: float(all_fields[score_key][i]))
225 
226  # write the stat and RMF files
227  stat = open(out_stat_fn,'w')
228  rh0 = RMF.open_rmf_file_read_only(
229  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
230  prots = IMP.rmf.create_hierarchies(rh0,mdl)
231  del rh0
232  outf = RMF.create_rmf_file(out_rmf_fn)
233  IMP.rmf.add_hierarchies(outf,prots)
234  for nm,i in enumerate(order[:number_of_best_scoring_models]):
235  dline=dict((k,all_fields[k][i]) for k in all_fields)
236  dline['orig_rmf_file']=dline[rmf_file_key]
237  dline['orig_rmf_frame_index']=dline[rmf_file_frame_key]
238  dline[rmf_file_key]=out_rmf_fn
239  dline[rmf_file_frame_key]=nm
240  rh = RMF.open_rmf_file_read_only(
241  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
242  IMP.rmf.link_hierarchies(rh,prots)
243  IMP.rmf.load_frame(rh,
244  RMF.FrameID(all_fields[rmf_file_frame_key][i]))
245  IMP.rmf.save_frame(outf)
246  del rh
247  stat.write(str(dline)+'\n')
248  stat.close()
249  print('wrote stats to',out_stat_fn)
250  print('wrote rmfs to',out_rmf_fn)
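A hedged usage sketch for save_best_models, assuming an existing output directory and a set of stat files produced by a PMI sampling run; the paths, the score key, and the feature-key substring are hypothetical and must match what your stat files actually contain:

@code{.py}
import glob
import IMP
import IMP.pmi.io

mdl = IMP.Model()
stat_files = glob.glob("run1/output/stat.*.out")   # hypothetical sampling output
IMP.pmi.io.save_best_models(mdl, "best_models", stat_files,
                            number_of_best_scoring_models=50,
                            score_key="Total_Score",
                            feature_keys=["ExcludedVolumeSphere"],
                            get_every=5)
@endcode

When mpi4py is available the stat files are split across ranks and the results gathered on rank 0, so the same script can be run serially or under mpirun.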
251 
252 
253 
254 
255 def get_best_models(stat_files,
256  score_key="SimplifiedModel_Total_Score_None",
257  feature_keys=None,
258  rmf_file_key="rmf_file",
259  rmf_file_frame_key="rmf_frame_index",
260  prefiltervalue=None,
261  get_every=1):
262  """ Given a list of stat files, read them all and find the best models.
263  Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
264  """
265  rmf_file_list=[] # best RMF files
266  rmf_file_frame_list=[] # best RMF frames
267  score_list=[] # best scores
268  feature_keyword_list_dict=defaultdict(list) # best values of the feature keys
269  for sf in stat_files:
270  root_directory_of_stat_file = os.path.dirname(os.path.dirname(os.path.abspath(sf)))
271  print("getting data from file %s" % sf)
272  po = IMP.pmi.output.ProcessOutput(sf)
273 
274  try:
275  file_keywords = po.get_keys()
276  except:
277  continue
278 
279  keywords = [score_key,
280  rmf_file_key,
281  rmf_file_frame_key]
282 
283  # check all requested keys are in the file
284  # (substring match: any file key containing a requested key is kept)
285  if feature_keys:
286  for requested_key in feature_keys:
287  for file_k in file_keywords:
288  if requested_key in file_k:
289  keywords.append(file_k)
290 
291  if prefiltervalue is None:
292  fields = po.get_fields(keywords,
293  get_every=get_every)
294  else:
295  fields = po.get_fields(keywords,
296  filtertuple=(score_key,"<",prefiltervalue),
297  get_every=get_every)
298 
299  # check that all field lengths are equal
300  length_set = set()
301  for f in fields:
302  length_set.add(len(fields[f]))
303 
304  # if some of the fields are missing, truncate
305  # the feature files to the shortest one
306  if len(length_set) > 1:
307  print("get_best_models: the statfile is not synchronous")
308  minlen = min(length_set)
309  for f in fields:
310  fields[f] = fields[f][0:minlen]
311 
312  # append to the lists
313  score_list += fields[score_key]
314  for rmf in fields[rmf_file_key]:
315  rmf=os.path.normpath(rmf)
316  if root_directory_of_stat_file not in rmf:
317  rmf=os.path.join(root_directory_of_stat_file,rmf)
318  rmf_file_list.append(rmf)
319 
320  rmf_file_frame_list += fields[rmf_file_frame_key]
321 
322  for k in keywords:
323  feature_keyword_list_dict[k] += fields[k]
324 
325  return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
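A minimal filtering sketch, assuming the same kind of stat files as above; the score key, feature-key substring, and prefilter value are placeholders to adapt to your own runs:

@code{.py}
import glob
import IMP.pmi.io

stat_files = glob.glob("run1/output/stat.*.out")   # hypothetical paths
rmfs, frames, scores, features = IMP.pmi.io.get_best_models(
    stat_files,
    score_key="Total_Score",      # must match the score field in your stat files
    feature_keys=["ExcludedVolumeSphere"],
    prefiltervalue=500.0,         # keep only frames scoring below this value
    get_every=10)
print("kept %d models" % len(rmfs))
@endcode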
326 
327 def get_trajectory_models(stat_files,
328  score_key="SimplifiedModel_Total_Score_None",
329  rmf_file_key="rmf_file",
330  rmf_file_frame_key="rmf_frame_index",
331  get_every=1):
332  """ Given a list of stat files, read them all and find a trajectory of models.
333  Returns the rmf filenames, frame numbers, scores, and values for feature keywords
334  """
335  rmf_file_list=[] # best RMF files
336  rmf_file_frame_list=[] # best RMF frames
337  score_list=[] # best scores
338  for sf in stat_files:
339  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
340  print("getting data from file %s" % sf)
341  po = IMP.pmi.output.ProcessOutput(sf)
342  keywords = po.get_keys()
343 
344  feature_keywords = [score_key,
345  rmf_file_key,
346  rmf_file_frame_key]
347 
348  fields = po.get_fields(feature_keywords,
349  get_every=get_every)
350 
351  # check that all field lengths are equal
352  length_set = set()
353  for f in fields:
354  length_set.add(len(fields[f]))
355 
356  # if some of the fields are missing, truncate
357  # the feature files to the shortest one
358  if len(length_set) > 1:
359  print("get_trajectory_models: the statfile is not synchronous")
360  minlen = min(length_set)
361  for f in fields:
362  fields[f] = fields[f][0:minlen]
363 
364  # append to the lists
365  score_list += fields[score_key]
366  for rmf in fields[rmf_file_key]:
367  rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
368 
369  rmf_file_frame_list += fields[rmf_file_frame_key]
370 
371  return rmf_file_list,rmf_file_frame_list,score_list
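In contrast to get_best_models, get_trajectory_models keeps every (optionally subsampled) frame in its original order; a short sketch reusing the hypothetical stat files from above:

@code{.py}
rmfs, frames, scores = IMP.pmi.io.get_trajectory_models(
    stat_files, score_key="Total_Score", get_every=5)
@endcode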
372 
373 
374 def read_coordinates_of_rmfs(model,
375  rmf_tuples,
376  alignment_components=None,
377  rmsd_calculation_components=None,
378  state_number=0):
379  """ Read in coordinates of a set of RMF tuples.
380  Returns the coordinates split as requested (all, alignment-only, RMSD-only), plus a
381  dictionary mapping each RMF file name to its rank number and a plain list of those names.
382  @param model The IMP model
383  @param rmf_tuples [score,filename,frame number,original order number, rank]
384  @param alignment_components Tuples to specify what you're aligning on
385  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
386  """
387  all_coordinates = []
388  rmsd_coordinates = []
389  alignment_coordinates = []
390  all_rmf_file_names = []
391  rmf_file_name_index_dict = {} # storing the features
392 
393  for cnt, tpl in enumerate(rmf_tuples):
394  rmf_file = tpl[1]
395  frame_number = tpl[2]
396  if cnt==0:
397  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
398  frame_number,
399  rmf_file)
400  else:
401  IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
402 
403  if not prots:
404  continue
405  if IMP.pmi.get_is_canonical(prots[0]):
406  states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
407  prot = states[state_number]
408  else:
409  prot = prots[state_number]
410 
411  # getting the particles
412  part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
413  all_particles = [pp for key in part_dict for pp in part_dict[key]]
414  all_ps_set = set(all_particles)
415  model_coordinate_dict = {}
416  template_coordinate_dict={}
417  rmsd_coordinate_dict={}
418 
419  for pr in part_dict:
420  model_coordinate_dict[pr] = np.array(
421  [np.array(IMP.core.XYZ(i).get_coordinates()) for i in part_dict[pr]])
422  # for each file, get (as floats) a list of all coordinates
423  # of all requested tuples, organized as dictionaries.
424  for tuple_dict,result_dict in zip((alignment_components,rmsd_calculation_components),
425  (template_coordinate_dict,rmsd_coordinate_dict)):
426 
427  if tuple_dict is None:
428  continue
429 
430  # PMI2: do selection of resolution and name at the same time
431  if IMP.pmi.get_is_canonical(prot):
432  for pr in tuple_dict:
433  ps = IMP.pmi.tools.select_by_tuple_2(prot,tuple_dict[pr],resolution=1)
434  result_dict[pr] = [list(map(float,IMP.core.XYZ(p).get_coordinates()))
435  for p in ps]
436  else:
437  for pr in tuple_dict:
438  if type(tuple_dict[pr]) is str:
439  name=tuple_dict[pr]
440  s=IMP.atom.Selection(prot,molecule=name)
441  elif type(tuple_dict[pr]) is tuple:
442  name=tuple_dict[pr][2]
443  rend=tuple_dict[pr][1]
444  rbegin=tuple_dict[pr][0]
445  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
446  ps=s.get_selected_particles()
447  filtered_particles=[p for p in ps if p in all_ps_set]
448  result_dict[pr] = \
449  [list(map(float,IMP.core.XYZ(p).get_coordinates())) for p in filtered_particles]
450 
451  all_coordinates.append(model_coordinate_dict)
452  alignment_coordinates.append(template_coordinate_dict)
453  rmsd_coordinates.append(rmsd_coordinate_dict)
454  frame_name = rmf_file + '|' + str(frame_number)
455  all_rmf_file_names.append(frame_name)
456  rmf_file_name_index_dict[frame_name] = tpl[4]
457  return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
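A minimal sketch of reading coordinates back from a couple of RMF frames; the file names, the molecule name 'ProtA', and the residue range are hypothetical, and each input tuple follows the [score, rmf file, frame number, original order, rank] layout described above:

@code{.py}
import IMP
import IMP.pmi.io

mdl = IMP.Model()
rmf_tuples = [[1234.5, "output/rmfs/0.rmf3", 0, 0, 0],
              [1250.1, "output/rmfs/1.rmf3", 7, 1, 1]]
(all_coords, align_coords, rmsd_coords,
 name_to_rank, rmf_names) = IMP.pmi.io.read_coordinates_of_rmfs(
    mdl, rmf_tuples,
    alignment_components={"ProtA": "ProtA"},
    rmsd_calculation_components={"ProtA": (1, 100, "ProtA")})
@endcode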
458 
459 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
460  '''
461  @param model The IMP model
462  @param rmf_tuple score,filename,frame number,original order number, rank
463  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
464  '''
465  if rmsd_calculation_components is None:
466  return {}
467 
468  rmf_file = rmf_tuple[1]
469  frame_number = rmf_tuple[2]
470  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
471  frame_number,
472  rmf_file)
473 
474  if IMP.pmi.get_is_canonical(prots[0]):
475  states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
476  prot = states[state_number]
477  else:
478  prot = prots[state_number]
479 
480  rmsd_bead_size_dict = {}
481 
482  # PMI2: do selection of resolution and name at the same time
483  if IMP.pmi.get_is_canonical(prot):
484  for pr in rmsd_calculation_components:
485  ps = IMP.pmi.tools.select_by_tuple_2(prot,rmsd_calculation_components[pr],resolution=1)
486  rmsd_bead_size_dict[pr] = [len(IMP.pmi.tools.get_residue_indexes(p))
487  for p in ps]
488  else:
489  # getting the particles
490  part_dict = IMP.pmi.analysis.get_particles_at_resolution_one(prot)
491  all_particles=[pp for key in part_dict for pp in part_dict[key]]
492  all_ps_set=set(all_particles)
493 
494  # getting the coordinates
495  for pr in rmsd_calculation_components:
496  if type(rmsd_calculation_components[pr]) is str:
497  name=rmsd_calculation_components[pr]
498  s=IMP.atom.Selection(prot,molecule=name)
499  elif type(rmsd_calculation_components[pr]) is tuple:
500  name=rmsd_calculation_components[pr][2]
501  rend=rmsd_calculation_components[pr][1]
502  rbegin=rmsd_calculation_components[pr][0]
503  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
504  ps=s.get_selected_particles()
505  filtered_particles=[p for p in ps if p in all_ps_set]
506  rmsd_bead_size_dict[pr] = \
507  [len(IMP.pmi.tools.get_residue_indexes(p)) for p in filtered_particles]
508 
509  return rmsd_bead_size_dict
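get_bead_sizes reports how many residues each selected bead covers; a sketch reusing the hypothetical names from the previous example:

@code{.py}
bead_sizes = IMP.pmi.io.get_bead_sizes(
    mdl, rmf_tuples[0],
    rmsd_calculation_components={"ProtA": (1, 100, "ProtA")})
@endcode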
510 
511 class RMSDOutput(object):
512  """A helper output based on RMSD to the initial coordinates"""
513  def __init__(self,ps,label,init_coords=None):
514  self.mdl = ps[0].get_model()
515  self.ps = ps
516  if init_coords is None:
517  self.init_coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
518  else:
519  self.init_coords = init_coords
520  self.label = label
521  def get_output(self):
522  self.mdl.update()
523  output = {}
524  coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
525  rmsd = IMP.algebra.get_rmsd(coords,self.init_coords)
526  output["RMSD_"+self.label] = str(rmsd)
527  return output
528 
529 class TotalScoreOutput(object):
530  """A helper output for model evaluation"""
531  def __init__(self,mdl):
532  self.mdl = mdl
533  self.rs = IMP.pmi.tools.get_restraint_set(self.mdl)
534  def get_output(self):
535  self.mdl.update()
536  score = self.rs.evaluate(False)
537  output = {}
538  output["Total_Score"] = str(score)
539  return output
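Both helper classes expose the get_output() dictionary interface used by PMI's stat machinery, so they can also be queried directly; a minimal sketch, assuming an existing model `mdl` with a system hierarchy `hier`:

@code{.py}
import IMP.atom
import IMP.pmi.io

ps = IMP.atom.Selection(hier, resolution=1).get_selected_particles()
rmsd_monitor = IMP.pmi.io.RMSDOutput(ps, "all")     # snapshots the starting coordinates
score_monitor = IMP.pmi.io.TotalScoreOutput(mdl)

# ... after some sampling moves ...
print(rmsd_monitor.get_output())     # e.g. {'RMSD_all': '0.0'}
print(score_monitor.get_output())    # e.g. {'Total_Score': '...'}
@endcode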