IMP logo
IMP Reference Guide  2.7.0
The Integrative Modeling Platform
pmi/io/__init__.py
1 """@namespace IMP.pmi.io
2  Utility classes and functions for reading and storing PMI files
3 """
4 
5 from __future__ import print_function
6 import IMP
7 import IMP.algebra
8 import IMP.atom
9 import IMP.pmi
10 import IMP.rmf
11 import IMP.pmi.analysis
12 import IMP.pmi.output
13 import IMP.pmi.tools
14 import RMF
15 import sys,os
16 import numpy as np
17 import re
18 from collections import defaultdict
19 import itertools
20 
21 def parse_dssp(dssp_fn, limit_to_chains='',name_map=None):
22  """Read a DSSP file, and return secondary structure elements (SSEs).
23  Values are all PDB residue numbering.
24  @param dssp_fn The file to read
25  @param limit_to_chains Only read/return these chain IDs
26  @param name_map If passed, return tuples organized by molecule name
27  (name_map should be a dictionary with chain IDs as keys and
28  molecule names as values).
29 
30  @return a dictionary with keys 'helix', 'beta', 'loop'
31  Each contains a list of SSEs.
32  Each SSE is a list of elements (e.g. strands in a sheet)
33  Each element is a tuple (residue start, residue end, chain)
34 
35  Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
36 
37  @code{.py}
38  ret = { 'helix' : [ [ (5,7,'A') ],... ]
39  'beta' : [ [ (1,3,'A'),
40  (9,11,'A'),...],...]
41  'loop' : same format as helix
42  }
43  @endcode
44  """
45 
46  def convert_chain(ch):
47  if name_map is None:
48  return ch
49  else:
50  try:
51  return name_map[ch]
52  except:
53  return ch
54 
55  # setup
56  helix_classes = 'GHI'
57  strand_classes = 'EB'
58  loop_classes = [' ', '', 'T', 'S']
59  sse_dict = {}
60  for h in helix_classes:
61  sse_dict[h] = 'helix'
62  for s in strand_classes:
63  sse_dict[s] = 'beta'
64  for l in loop_classes:
65  sse_dict[l] = 'loop'
66  sses = {'helix':[],
67  'beta':[],
68  'loop':[]}
69 
70  # read file and parse
71  start = False
72 
73  # temporary beta dictionary indexed by DSSP's ID
74  beta_dict = IMP.pmi.tools.OrderedDefaultDict(list)
75  prev_sstype = None
76  prev_beta_id = None
77 
78  with open(dssp_fn, 'r') as fh:
79  for line in fh:
80  fields = line.split()
81  chain_break = False
82  if len(fields) < 2:
83  continue
84  if fields[1] == "RESIDUE":
85  # Start parsing from here
86  start = True
87  continue
88  if not start:
89  continue
90  if line[9] == " ":
91  chain_break = True
92  elif limit_to_chains != '' and line[11] not in limit_to_chains:
93  continue
94 
95  # gather line info
96  if not chain_break:
97  pdb_res_num = int(line[5:10])
98  chain = line[11]
99  sstype = sse_dict[line[16]]
100  beta_id = line[33]
101 
102  # decide whether to extend or store the SSE
103  if prev_sstype is None:
104  cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
105  elif sstype != prev_sstype or chain_break:
106  # add cur_sse to the right place
107  if prev_sstype in ['helix', 'loop']:
108  sses[prev_sstype].append([cur_sse])
109  elif prev_sstype == 'beta':
110  beta_dict[prev_beta_id].append(cur_sse)
111  cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
112  else:
113  cur_sse[1] = pdb_res_num
114  if chain_break:
115  prev_sstype = None
116  prev_beta_id = None
117  else:
118  prev_sstype = sstype
119  prev_beta_id = beta_id
120 
121  # final SSE processing
122  if not prev_sstype is None:
123  if prev_sstype in ['helix', 'loop']:
124  sses[prev_sstype].append([cur_sse])
125  elif prev_sstype == 'beta':
126  beta_dict[prev_beta_id].append(cur_sse)
127  # gather betas
128  for beta_sheet in beta_dict:
129  sses['beta'].append(beta_dict[beta_sheet])
130  return sses
131 
133  out_dir,
134  stat_files,
135  number_of_best_scoring_models=10,
136  get_every=1,
137  score_key="SimplifiedModel_Total_Score_None",
138  feature_keys=None,
139  rmf_file_key="rmf_file",
140  rmf_file_frame_key="rmf_frame_index",
141  override_rmf_dir=None):
142  """Given a list of stat files, read them all and find the best models.
143  Save to a single RMF along with a stat file.
144  @param mdl The IMP Model
145  @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
146  @param stat_files List of all stat files to collect
147  @param number_of_best_scoring_models Num best models to gather
148  @param get_every Skip frames
149  @param score_key Used for the ranking
150  @param feature_keys Keys to keep around
151  @param rmf_file_key The key that says RMF file name
152  @param rmf_file_frame_key The key that says RMF frame number
153  @param override_rmf_dir For output, change the name of the RMF directory (experiment)
154  """
155 
156  # start by splitting into jobs
157  try:
158  from mpi4py import MPI
159  comm = MPI.COMM_WORLD
160  rank = comm.Get_rank()
161  number_of_processes = comm.size
162  except ImportError:
163  rank = 0
164  number_of_processes = 1
165  my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
166  stat_files,number_of_processes)[rank]
167 
168  # filenames
169  out_stat_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".out")
170  out_rmf_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".rmf3")
171 
172  # extract all the models
173  all_fields=[]
174  for nsf,sf in enumerate(my_stat_files):
175 
176  # get list of keywords
177  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
178  print("getting data from file %s" % sf)
180  all_keys = [score_key,
181  rmf_file_key,
182  rmf_file_frame_key]
183  for k in po.get_keys():
184  for fk in feature_keys:
185  if fk in k:
186  all_keys.append(k)
187  fields = po.get_fields(all_keys,
188  get_every=get_every)
189 
190  # check that all lengths are all equal
191  length_set = set([len(fields[f]) for f in fields])
192  minlen = min(length_set)
193  # if some of the fields are missing, truncate
194  # the feature files to the shortest one
195  if len(length_set) > 1:
196  print("get_best_models: the statfile is not synchronous")
197  minlen = min(length_set)
198  for f in fields:
199  fields[f] = fields[f][0:minlen]
200  if nsf==0:
201  all_fields=fields
202  else:
203  for k in fields:
204  all_fields[k]+=fields[k]
205 
206  if override_rmf_dir is not None:
207  for i in range(minlen):
208  all_fields[rmf_file_key][i]=os.path.join(
209  override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
210 
211  # gather info, sort, write
212  if number_of_processes!=1:
213  comm.Barrier()
214  if rank!=0:
215  comm.send(all_fields, dest=0, tag=11)
216  else:
217  for i in range(1,number_of_processes):
218  data_tmp = comm.recv(source=i, tag=11)
219  for k in all_fields:
220  all_fields[k]+=data_tmp[k]
221 
222  # sort by total score
223  order = sorted(range(len(all_fields[score_key])),
224  key=lambda i: float(all_fields[score_key][i]))
225 
226  # write the stat and RMF files
227  stat = open(out_stat_fn,'w')
228  rh0 = RMF.open_rmf_file_read_only(
229  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
230  prots = IMP.rmf.create_hierarchies(rh0,mdl)
231  del rh0
232  outf = RMF.create_rmf_file(out_rmf_fn)
233  IMP.rmf.add_hierarchies(outf,prots)
234  for nm,i in enumerate(order[:number_of_best_scoring_models]):
235  dline=dict((k,all_fields[k][i]) for k in all_fields)
236  dline['orig_rmf_file']=dline[rmf_file_key]
237  dline['orig_rmf_frame_index']=dline[rmf_file_frame_key]
238  dline[rmf_file_key]=out_rmf_fn
239  dline[rmf_file_frame_key]=nm
240  rh = RMF.open_rmf_file_read_only(
241  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
242  IMP.rmf.link_hierarchies(rh,prots)
244  RMF.FrameID(all_fields[rmf_file_frame_key][i]))
245  IMP.rmf.save_frame(outf)
246  del rh
247  stat.write(str(dline)+'\n')
248  stat.close()
249  print('wrote stats to',out_stat_fn)
250  print('wrote rmfs to',out_rmf_fn)
251 
252 
253 
254 
255 def get_best_models(stat_files,
256  score_key="SimplifiedModel_Total_Score_None",
257  feature_keys=None,
258  rmf_file_key="rmf_file",
259  rmf_file_frame_key="rmf_frame_index",
260  prefiltervalue=None,
261  get_every=1):
262  """ Given a list of stat files, read them all and find the best models.
263  Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
264  """
265  rmf_file_list=[] # best RMF files
266  rmf_file_frame_list=[] # best RMF frames
267  score_list=[] # best scores
268  feature_keyword_list_dict=defaultdict(list) # best values of the feature keys
269  for sf in stat_files:
270  root_directory_of_stat_file = os.path.dirname(os.path.dirname(os.path.abspath(sf)))
271  print("getting data from file %s" % sf)
273 
274  try:
275  file_keywords = po.get_keys()
276  except:
277  continue
278 
279  keywords = [score_key,
280  rmf_file_key,
281  rmf_file_frame_key]
282 
283  # check all requested keys are in the file
284  # this looks weird because searching for "*requested_key*"
285  if feature_keys:
286  for requested_key in feature_keys:
287  for file_k in file_keywords:
288  if requested_key in file_k:
289  keywords.append(file_k)
290 
291  if prefiltervalue is None:
292  fields = po.get_fields(keywords,
293  get_every=get_every)
294  else:
295  fields = po.get_fields(keywords,
296  filtertuple=(score_key,"<",prefiltervalue),
297  get_every=get_every)
298 
299  # check that all lengths are all equal
300  length_set = set()
301  for f in fields:
302  length_set.add(len(fields[f]))
303 
304  # if some of the fields are missing, truncate
305  # the feature files to the shortest one
306  if len(length_set) > 1:
307  print("get_best_models: the statfile is not synchronous")
308  minlen = min(length_set)
309  for f in fields:
310  fields[f] = fields[f][0:minlen]
311 
312  # append to the lists
313  score_list += fields[score_key]
314  for rmf in fields[rmf_file_key]:
315  rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
316 
317  rmf_file_frame_list += fields[rmf_file_frame_key]
318 
319  for k in keywords:
320  feature_keyword_list_dict[k] += fields[k]
321 
322  return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
323 
324 def get_trajectory_models(stat_files,
325  score_key="SimplifiedModel_Total_Score_None",
326  rmf_file_key="rmf_file",
327  rmf_file_frame_key="rmf_frame_index",
328  get_every=1):
329  """ Given a list of stat files, read them all and find a trajectory of models.
330  Returns the rmf filenames, frame numbers, scores, and values for feature keywords
331  """
332  rmf_file_list=[] # best RMF files
333  rmf_file_frame_list=[] # best RMF frames
334  score_list=[] # best scores
335  for sf in stat_files:
336  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
337  print("getting data from file %s" % sf)
339  keywords = po.get_keys()
340 
341  feature_keywords = [score_key,
342  rmf_file_key,
343  rmf_file_frame_key]
344 
345  fields = po.get_fields(feature_keywords,
346  get_every=get_every)
347 
348  # check that all lengths are all equal
349  length_set = set()
350  for f in fields:
351  length_set.add(len(fields[f]))
352 
353  # if some of the fields are missing, truncate
354  # the feature files to the shortest one
355  if len(length_set) > 1:
356  print("get_best_models: the statfile is not synchronous")
357  minlen = min(length_set)
358  for f in fields:
359  fields[f] = fields[f][0:minlen]
360 
361  # append to the lists
362  score_list += fields[score_key]
363  for rmf in fields[rmf_file_key]:
364  rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
365 
366  rmf_file_frame_list += fields[rmf_file_frame_key]
367 
368  return rmf_file_list,rmf_file_frame_list,score_list
369 
370 
372  rmf_tuples,
373  alignment_components=None,
374  rmsd_calculation_components=None,
375  state_number=0):
376  """ Read in coordinates of a set of RMF tuples.
377  Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
378  RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
379  @param model The IMP model
380  @param rmf_tuples [score,filename,frame number,original order number, rank]
381  @param alignment_components Tuples to specify what you're aligning on
382  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
383  """
384  all_coordinates = []
385  rmsd_coordinates = []
386  alignment_coordinates = []
387  all_rmf_file_names = []
388  rmf_file_name_index_dict = {} # storing the features
389 
390  for cnt, tpl in enumerate(rmf_tuples):
391  rmf_file = tpl[1]
392  frame_number = tpl[2]
393  if cnt==0:
394  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
395  frame_number,
396  rmf_file)
397  else:
398  IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
399 
400  if not prots:
401  continue
402  if IMP.pmi.get_is_canonical(prots[0]):
403  states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
404  prot = states[state_number]
405  else:
406  prot = prots[state_number]
407 
408  # getting the particles
410  all_particles = [pp for key in part_dict for pp in part_dict[key]]
411  all_ps_set = set(all_particles)
412  model_coordinate_dict = {}
413  template_coordinate_dict={}
414  rmsd_coordinate_dict={}
415 
416  for pr in part_dict:
417  model_coordinate_dict[pr] = np.array(
418  [np.array(IMP.core.XYZ(i).get_coordinates()) for i in part_dict[pr]])
419  # for each file, get (as floats) a list of all coordinates
420  # of all requested tuples, organized as dictionaries.
421  for tuple_dict,result_dict in zip((alignment_components,rmsd_calculation_components),
422  (template_coordinate_dict,rmsd_coordinate_dict)):
423 
424  if tuple_dict is None:
425  continue
426 
427  # PMI2: do selection of resolution and name at the same time
428  if IMP.pmi.get_is_canonical(prot):
429  for pr in tuple_dict:
430  ps = IMP.pmi.tools.select_by_tuple_2(prot,tuple_dict[pr],resolution=1)
431  result_dict[pr] = [list(map(float,IMP.core.XYZ(p).get_coordinates()))
432  for p in ps]
433  else:
434  for pr in tuple_dict:
435  if type(tuple_dict[pr]) is str:
436  name=tuple_dict[pr]
437  s=IMP.atom.Selection(prot,molecule=name)
438  elif type(tuple_dict[pr]) is tuple:
439  name=tuple_dict[pr][2]
440  rend=tuple_dict[pr][1]
441  rbegin=tuple_dict[pr][0]
442  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
443  ps=s.get_selected_particles()
444  filtered_particles=[p for p in ps if p in all_ps_set]
445  result_dict[pr] = \
446  [list(map(float,IMP.core.XYZ(p).get_coordinates())) for p in filtered_particles]
447 
448  all_coordinates.append(model_coordinate_dict)
449  alignment_coordinates.append(template_coordinate_dict)
450  rmsd_coordinates.append(rmsd_coordinate_dict)
451  frame_name = rmf_file + '|' + str(frame_number)
452  all_rmf_file_names.append(frame_name)
453  rmf_file_name_index_dict[frame_name] = tpl[4]
454  return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
455 
456 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
457  '''
458  @param model The IMP model
459  @param rmf_tuple score,filename,frame number,original order number, rank
460  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
461  '''
462  if rmsd_calculation_components is None:
463  return {}
464 
465  rmf_file = rmf_tuple[1]
466  frame_number = rmf_tuple[2]
467  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
468  frame_number,
469  rmf_file)
470 
471  if IMP.pmi.get_is_canonical(prots[0]):
472  states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
473  prot = states[state_number]
474  else:
475  prot = prots[state_number]
476 
477  rmsd_bead_size_dict = {}
478 
479  # PMI2: do selection of resolution and name at the same time
480  if IMP.pmi.get_is_canonical(prot):
481  for pr in rmsd_calculation_components:
482  ps = IMP.pmi.tools.select_by_tuple_2(prot,rmsd_calculation_components[pr],resolution=1)
483  rmsd_bead_size_dict[pr] = [len(IMP.pmi.tools.get_residue_indexes(p))
484  for p in ps]
485  else:
486  # getting the particles
488  all_particles=[pp for key in part_dict for pp in part_dict[key]]
489  all_ps_set=set(all_particles)
490 
491  # getting the coordinates
492  for pr in rmsd_calculation_components:
493  if type(rmsd_calculation_components[pr]) is str:
494  name=rmsd_calculation_components[pr]
495  s=IMP.atom.Selection(prot,molecule=name)
496  elif type(rmsd_calculation_components[pr]) is tuple:
497  name=rmsd_calculation_components[pr][2]
498  rend=rmsd_calculation_components[pr][1]
499  rbegin=rmsd_calculation_components[pr][0]
500  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
501  ps=s.get_selected_particles()
502  filtered_particles=[p for p in ps if p in all_ps_set]
503  rmsd_bead_size_dict[pr] = \
504  [len(IMP.pmi.tools.get_residue_indexes(p)) for p in filtered_particles]
505 
506  return rmsd_bead_size_dict
507 
508 class RMSDOutput(object):
509  """A helper output based on dist to initial coordinates"""
510  def __init__(self,ps,label,init_coords=None):
511  self.mdl = ps[0].get_model()
512  self.ps = ps
513  if init_coords is None:
514  self.init_coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
515  else:
516  self.init_coords = init_coords
517  self.label = label
518  def get_output(self):
519  self.mdl.update()
520  output = {}
521  coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
522  rmsd = IMP.algebra.get_rmsd(coords,self.init_coords)
523  output["RMSD_"+self.label] = str(rmsd)
524  return output
525 
526 class TotalScoreOutput(object):
527  """A helper output for model evaluation"""
528  def __init__(self,mdl):
529  self.mdl = mdl
530  self.rs = IMP.pmi.tools.get_restraint_set(self.mdl)
531  def get_output(self):
532  self.mdl.update()
533  score = self.rs.evaluate(False)
534  output = {}
535  output["Total_Score"] = str(score)
536  return output
def get_restraint_set
Get a RestraintSet containing all PMI restraints added to the model.
Definition: tools.py:49
A class for reading stat files.
Definition: output.py:696
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
Read a DSSP file, and return secondary structure elements (SSEs).
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
Miscellaneous utilities.
Definition: tools.py:1
A helper output based on dist to initial coordinates.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def select_by_tuple_2
New tuple format: molname OR (start,stop,molname,copynum,statenum) Copy and state are optional...
Definition: tools.py:850
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Definition: utilities.h:91
Classes for writing output files and processing them.
Definition: output.py:1
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:1068
Store objects in order they were added, but with default type.
Definition: tools.py:1561