IMP logo
IMP Reference Guide  2.6.0
The Integrative Modeling Platform
pmi/io/__init__.py
1 """@namespace IMP.pmi.io
2  Utility classes and functions for reading and storing PMI files
3 """
4 
5 from __future__ import print_function
6 import IMP
7 import IMP.algebra
8 import IMP.atom
9 import IMP.pmi
10 import IMP.rmf
11 import IMP.pmi.analysis
12 import IMP.pmi.output
13 import IMP.pmi.tools
14 import RMF
15 import sys,os
16 import numpy as np
17 import re
18 from collections import defaultdict
19 import itertools
20 
21 def parse_dssp(dssp_fn, limit_to_chains=''):
22  """read dssp file, get SSEs. values are all PDB residue numbering.
23  Returns a dictionary with keys helix, beta, loop
24  Each contains a list of SSEs.
25  Each SSE is a list of elements (e.g. strands in a sheet)
26  Each element is a pair (chain,(residue_indexes))
27 
28  Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
29  ret = { 'helix' : [ [ ('A',5,7 ) ],... ]
30  'beta' : [ [ ('A',1,3),
31  ('A',9,11),...],...]
32  'loop' : same format as helix
33  }
34  """
35  # setup
36  helix_classes = 'GHI'
37  strand_classes = 'EB'
38  loop_classes = [' ', '', 'T', 'S']
39  sse_dict = {}
40  for h in helix_classes:
41  sse_dict[h] = 'helix'
42  for s in strand_classes:
43  sse_dict[s] = 'beta'
44  for l in loop_classes:
45  sse_dict[l] = 'loop'
46  sses = {'helix':[],
47  'beta':[],
48  'loop':[]}
49 
50  # read file and parse
51  start = False
52 
53  # temporary beta dictionary indexed by DSSP's ID
54  beta_dict = defaultdict(list)
55  prev_sstype = None
56  cur_sse = ['',[]]
57  prev_beta_id = None
58  for line in open(dssp_fn, 'r'):
59  fields = line.split()
60  chain_break = False
61  if len(fields) < 2:
62  continue
63  if fields[1] == "RESIDUE":
64  # Start parsing from here
65  start = True
66  continue
67  if not start:
68  continue
69  if line[9] == " ":
70  chain_break = True
71  elif limit_to_chains != '' and line[11] not in limit_to_chains:
72  continue
73 
74  # gather line info
75  if not chain_break:
76  pdb_res_num = int(line[5:10])
77  chain = line[11]
78  sstype = sse_dict[line[16]]
79  beta_id = line[33]
80 
81  # decide whether to extend or store the SSE
82  if prev_sstype is None:
83  cur_sse = [chain,pdb_res_num,pdb_res_num]
84  elif sstype != prev_sstype or chain_break:
85  # add cur_sse to the right place
86  if prev_sstype in ['helix', 'loop']:
87  sses[prev_sstype].append([cur_sse])
88  elif prev_sstype == 'beta':
89  beta_dict[prev_beta_id].append(cur_sse)
90  cur_sse = [chain,pdb_res_num,pdb_res_num]
91  else:
92  cur_sse[2] = pdb_res_num
93  if chain_break:
94  prev_sstype = None
95  prev_beta_id = None
96  else:
97  prev_sstype = sstype
98  prev_beta_id = beta_id
99 
100  # final SSE processing
101  if not prev_sstype is None:
102  if prev_sstype in ['helix', 'loop']:
103  sses[prev_sstype].append([cur_sse])
104  elif prev_sstype == 'beta':
105  beta_dict[prev_beta_id].append(cur_sse)
106  # gather betas
107  for beta_sheet in beta_dict:
108  sses['beta'].append(beta_dict[beta_sheet])
109  return sses
110 
112  out_dir,
113  stat_files,
114  number_of_best_scoring_models=10,
115  get_every=1,
116  score_key="SimplifiedModel_Total_Score_None",
117  feature_keys=None,
118  rmf_file_key="rmf_file",
119  rmf_file_frame_key="rmf_frame_index",
120  override_rmf_dir=None):
121  """Given a list of stat files, read them all and find the best models.
122  Save to a single RMF along with a stat file.
123  @param mdl The IMP Model
124  @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
125  @param stat_files List of all stat files to collect
126  @param number_of_best_scoring_models Num best models to gather
127  @param get_every Skip frames
128  @param score_key Used for the ranking
129  @param feature_keys Keys to keep around
130  @param rmf_file_key The key that says RMF file name
131  @param rmf_file_frame_key The key that says RMF frame number
132  @param override_rmf_dir For output, change the name of the RMF directory (experiment)
133  """
134 
135  # start by splitting into jobs
136  try:
137  from mpi4py import MPI
138  comm = MPI.COMM_WORLD
139  rank = comm.Get_rank()
140  number_of_processes = comm.size
141  except ImportError:
142  rank = 0
143  number_of_processes = 1
144  my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
145  stat_files,number_of_processes)[rank]
146 
147  # filenames
148  out_stat_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".out")
149  out_rmf_fn = os.path.join(out_dir,"top_"+str(number_of_best_scoring_models)+".rmf3")
150 
151  # extract all the models
152  all_fields=[]
153  for nsf,sf in enumerate(my_stat_files):
154 
155  # get list of keywords
156  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
157  print("getting data from file %s" % sf)
159  all_keys = [score_key,
160  rmf_file_key,
161  rmf_file_frame_key]
162  for k in po.get_keys():
163  for fk in feature_keys:
164  if fk in k:
165  all_keys.append(k)
166  fields = po.get_fields(all_keys,
167  get_every=get_every)
168 
169  # check that all lengths are all equal
170  length_set = set([len(fields[f]) for f in fields])
171  minlen = min(length_set)
172  # if some of the fields are missing, truncate
173  # the feature files to the shortest one
174  if len(length_set) > 1:
175  print("get_best_models: the statfile is not synchronous")
176  minlen = min(length_set)
177  for f in fields:
178  fields[f] = fields[f][0:minlen]
179  if nsf==0:
180  all_fields=fields
181  else:
182  for k in fields:
183  all_fields[k]+=fields[k]
184 
185  if override_rmf_dir is not None:
186  for i in range(minlen):
187  all_fields[rmf_file_key][i]=os.path.join(
188  override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
189 
190  # gather info, sort, write
191  if number_of_processes!=1:
192  comm.Barrier()
193  if rank!=0:
194  comm.send(all_fields, dest=0, tag=11)
195  else:
196  for i in range(1,number_of_processes):
197  data_tmp = comm.recv(source=i, tag=11)
198  for k in all_fields:
199  all_fields[k]+=data_tmp[k]
200 
201  # sort by total score
202  order = sorted(range(len(all_fields[score_key])),
203  key=lambda i: float(all_fields[score_key][i]))
204 
205  # write the stat and RMF files
206  stat = open(out_stat_fn,'w')
207  rh0 = RMF.open_rmf_file_read_only(
208  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
209  prots = IMP.rmf.create_hierarchies(rh0,mdl)
210  del rh0
211  outf = RMF.create_rmf_file(out_rmf_fn)
212  IMP.rmf.add_hierarchies(outf,prots)
213  for nm,i in enumerate(order[:number_of_best_scoring_models]):
214  dline=dict((k,all_fields[k][i]) for k in all_fields)
215  dline['orig_rmf_file']=dline[rmf_file_key]
216  dline['orig_rmf_frame_index']=dline[rmf_file_frame_key]
217  dline[rmf_file_key]=out_rmf_fn
218  dline[rmf_file_frame_key]=nm
219  rh = RMF.open_rmf_file_read_only(
220  os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
221  IMP.rmf.link_hierarchies(rh,prots)
223  RMF.FrameID(all_fields[rmf_file_frame_key][i]))
224  IMP.rmf.save_frame(outf)
225  del rh
226  stat.write(str(dline)+'\n')
227  stat.close()
228  print('wrote stats to',out_stat_fn)
229  print('wrote rmfs to',out_rmf_fn)
230 
231 
232 
233 
234 def get_best_models(stat_files,
235  score_key="SimplifiedModel_Total_Score_None",
236  feature_keys=None,
237  rmf_file_key="rmf_file",
238  rmf_file_frame_key="rmf_frame_index",
239  prefiltervalue=None,
240  get_every=1):
241  """ Given a list of stat files, read them all and find the best models.
242  Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
243  """
244  rmf_file_list=[] # best RMF files
245  rmf_file_frame_list=[] # best RMF frames
246  score_list=[] # best scores
247  feature_keyword_list_dict=defaultdict(list) # best values of the feature keys
248  for sf in stat_files:
249  root_directory_of_stat_file = os.path.dirname(os.path.dirname(os.path.abspath(sf)))
250  print("getting data from file %s" % sf)
252 
253  try:
254  keywords = po.get_keys()
255  except:
256  continue
257 
258  feature_keywords = [score_key,
259  rmf_file_key,
260  rmf_file_frame_key]
261 
262  for k in keywords:
263  for fk in feature_keys:
264  if fk in k:
265  feature_keywords.append(k)
266 
267  if prefiltervalue is None:
268  fields = po.get_fields(feature_keywords,
269  get_every=get_every)
270  else:
271  fields = po.get_fields(feature_keywords,
272  filtertuple=(score_key,"<",prefiltervalue),
273  get_every=get_every)
274 
275  # check that all lengths are all equal
276  length_set = set()
277  for f in fields:
278  length_set.add(len(fields[f]))
279 
280  # if some of the fields are missing, truncate
281  # the feature files to the shortest one
282  if len(length_set) > 1:
283  print("get_best_models: the statfile is not synchronous")
284  minlen = min(length_set)
285  for f in fields:
286  fields[f] = fields[f][0:minlen]
287 
288  # append to the lists
289  score_list += fields[score_key]
290  for rmf in fields[rmf_file_key]:
291  rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
292 
293  rmf_file_frame_list += fields[rmf_file_frame_key]
294 
295  if feature_keywords is not None:
296  for k in feature_keywords:
297  feature_keyword_list_dict[k] += fields[k]
298 
299  return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
300 
301 def get_trajectory_models(stat_files,
302  score_key="SimplifiedModel_Total_Score_None",
303  rmf_file_key="rmf_file",
304  rmf_file_frame_key="rmf_frame_index",
305  get_every=1):
306  """ Given a list of stat files, read them all and find a trajectory of models.
307  Returns the rmf filenames, frame numbers, scores, and values for feature keywords
308  """
309  rmf_file_list=[] # best RMF files
310  rmf_file_frame_list=[] # best RMF frames
311  score_list=[] # best scores
312  for sf in stat_files:
313  root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
314  print("getting data from file %s" % sf)
316  keywords = po.get_keys()
317 
318  feature_keywords = [score_key,
319  rmf_file_key,
320  rmf_file_frame_key]
321 
322  fields = po.get_fields(feature_keywords,
323  get_every=get_every)
324 
325  # check that all lengths are all equal
326  length_set = set()
327  for f in fields:
328  length_set.add(len(fields[f]))
329 
330  # if some of the fields are missing, truncate
331  # the feature files to the shortest one
332  if len(length_set) > 1:
333  print("get_best_models: the statfile is not synchronous")
334  minlen = min(length_set)
335  for f in fields:
336  fields[f] = fields[f][0:minlen]
337 
338  # append to the lists
339  score_list += fields[score_key]
340  for rmf in fields[rmf_file_key]:
341  rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
342 
343  rmf_file_frame_list += fields[rmf_file_frame_key]
344 
345  return rmf_file_list,rmf_file_frame_list,score_list
346 
347 
349  rmf_tuples,
350  alignment_components=None,
351  rmsd_calculation_components=None,
352  state_number=0):
353  """ Read in coordinates of a set of RMF tuples.
354  Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
355  RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
356  @param model The IMP model
357  @param rmf_tuples [score,filename,frame number,original order number, rank]
358  @param alignment_components Tuples to specify what you're aligning on
359  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
360  """
361  all_coordinates = []
362  rmsd_coordinates = []
363  alignment_coordinates = []
364  all_rmf_file_names = []
365  rmf_file_name_index_dict = {} # storing the features
366 
367  for cnt, tpl in enumerate(rmf_tuples):
368  rmf_file = tpl[1]
369  frame_number = tpl[2]
370  if cnt==0:
371  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
372  frame_number,
373  rmf_file)
374  else:
375  IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
376 
377  if not prots:
378  continue
379  if IMP.pmi.get_is_canonical(prots[0]):
380  states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
381  prot = states[state_number]
382  else:
383  prot = prots[state_number]
384 
385  # getting the particles
387  all_particles = [pp for key in part_dict for pp in part_dict[key]]
388  all_ps_set = set(all_particles)
389  model_coordinate_dict = {}
390  template_coordinate_dict={}
391  rmsd_coordinate_dict={}
392 
393  for pr in part_dict:
394  model_coordinate_dict[pr] = np.array(
395  [np.array(IMP.core.XYZ(i).get_coordinates()) for i in part_dict[pr]])
396  for tuple_dict,result_dict in zip((alignment_components,rmsd_calculation_components),
397  (template_coordinate_dict,rmsd_coordinate_dict)):
398 
399  if tuple_dict is None:
400  continue
401 
402  # PMI2: do selection of resolution and name at the same time
403  if IMP.pmi.get_is_canonical(prot):
404  for pr in tuple_dict:
405  if type(tuple_dict[pr]) is str:
406  name = tuple_dict[pr]
407  s = IMP.atom.Selection(prot,molecule=name,resolution=1)
408  elif type(tuple_dict[pr]) is tuple:
409  rend = tuple_dict[pr][1]
410  rbegin = tuple_dict[pr][0]
411  s = IMP.atom.Selection(prot,molecule=name,resolution=1,
412  residue_indexes=range(rbegin,rend+1))
413  result_dict[pr] = [list(map(float,IMP.core.XYZ(p).get_coordinates()))
414  for p in s.get_selected_particles()]
415  else:
416  for pr in tuple_dict:
417  if type(tuple_dict[pr]) is str:
418  name=tuple_dict[pr]
419  s=IMP.atom.Selection(prot,molecule=name)
420  elif type(tuple_dict[pr]) is tuple:
421  name=tuple_dict[pr][2]
422  rend=tuple_dict[pr][1]
423  rbegin=tuple_dict[pr][0]
424  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
425  ps=s.get_selected_particles()
426  filtered_particles=[p for p in ps if p in all_ps_set]
427  result_dict[pr] = \
428  [list(map(float,IMP.core.XYZ(p).get_coordinates())) for p in filtered_particles]
429 
430  all_coordinates.append(model_coordinate_dict)
431  alignment_coordinates.append(template_coordinate_dict)
432  rmsd_coordinates.append(rmsd_coordinate_dict)
433  frame_name = rmf_file + '|' + str(frame_number)
434  all_rmf_file_names.append(frame_name)
435  rmf_file_name_index_dict[frame_name] = tpl[4]
436  return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
437 
438 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
439  '''
440  @param model The IMP model
441  @param rmf_tuple score,filename,frame number,original order number, rank
442  @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
443  '''
444  if rmsd_calculation_components is None:
445  return {}
446 
447  rmf_file = rmf_tuple[1]
448  frame_number = rmf_tuple[2]
449  prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
450  frame_number,
451  rmf_file)
452 
453  if IMP.pmi.get_is_canonical(prots[0]):
454  states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
455  prot = states[state_number]
456  else:
457  prot = prots[state_number]
458 
459  rmsd_bead_size_dict = {}
460 
461  # PMI2: do selection of resolution and name at the same time
462  if IMP.pmi.get_is_canonical(prot):
463  for pr in rmsd_calculation_components:
464  if type(rmsd_calculation_components[pr]) is str:
465  name = rmsd_calculation_components[pr]
466  s = IMP.atom.Selection(prot,molecule=name,resolution=1)
467  elif type(rmsd_calculation_components[pr]) is tuple:
468  rend = rmsd_calculation_components[pr][1]
469  rbegin = rmsd_calculation_components[pr][0]
470  s = IMP.atom.Selection(prot,molecule=name,resolution=1,
471  residue_indexes=range(rbegin,rend+1))
472  rmsd_bead_size_dict[pr] = [len(IMP.pmi.tools.get_residue_indexes(p))
473  for p in s.get_selected_particles()]
474  else:
475  # getting the particles
477  all_particles=[pp for key in part_dict for pp in part_dict[key]]
478  all_ps_set=set(all_particles)
479 
480  # getting the coordinates
481  for pr in rmsd_calculation_components:
482  if type(rmsd_calculation_components[pr]) is str:
483  name=rmsd_calculation_components[pr]
484  s=IMP.atom.Selection(prot,molecule=name)
485  elif type(rmsd_calculation_components[pr]) is tuple:
486  name=rmsd_calculation_components[pr][2]
487  rend=rmsd_calculation_components[pr][1]
488  rbegin=rmsd_calculation_components[pr][0]
489  s=IMP.atom.Selection(prot,molecule=name,residue_indexes=range(rbegin,rend+1))
490  ps=s.get_selected_particles()
491  filtered_particles=[p for p in ps if p in all_ps_set]
492  rmsd_bead_size_dict[pr] = \
493  [len(IMP.pmi.tools.get_residue_indexes(p)) for p in filtered_particles]
494 
495  return rmsd_bead_size_dict
496 
497 class RMSDOutput(object):
498  """A helper output based on dist to initial coordinates"""
499  def __init__(self,ps,label,init_coords=None):
500  self.mdl = ps[0].get_model()
501  self.ps = ps
502  if init_coords is None:
503  self.init_coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
504  else:
505  self.init_coords = init_coords
506  self.label = label
507  def get_output(self):
508  self.mdl.update()
509  output = {}
510  coords = [IMP.core.XYZ(p).get_coordinates() for p in self.ps]
511  rmsd = IMP.algebra.get_rmsd(coords,self.init_coords)
512  output["RMSD_"+self.label] = str(rmsd)
513  return output
514 
515 class TotalScoreOutput(object):
516  """A helper output for model evaluation"""
517  def __init__(self,mdl):
518  self.mdl = mdl
519  self.rs = IMP.pmi.tools.get_restraint_set(self.mdl)
520  def get_output(self):
521  self.mdl.update()
522  score = self.rs.evaluate(False)
523  output = {}
524  output["Total_Score"] = str(score)
525  return output
def get_restraint_set
Get a RestraintSet containing all PMI restraints added to the model.
Definition: tools.py:49
A class for reading stat files.
Definition: output.py:674
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
read dssp file, get SSEs.
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
Miscellaneous utilities.
Definition: tools.py:1
A helper output based on dist to initial coordinates.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
Definition: pmi/Analysis.py:1
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Definition: utilities.h:98
Classes for writing output files and processing them.
Definition: output.py:1
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
def get_residue_indexes
Retrieve the residue indexes for the given particle.
Definition: tools.py:1000