IMP  2.0.1
The Integrative Modeling Platform
TBLReader.py
1 #!/usr/bin/env python
2 
3 
4 import sys, os
5 import IMP.isd
6 from IMP.isd.utils import Load, read_sequence_file
7 #from Isd.io.nomenclature import IUPAC_CONVENTION
8 IUPAC_CONVENTION='iupac'
9 TYPE_AMINO_ACID = 'AMINO_ACID'
10 pseudoatoms_dict = IMP.isd.get_data_path('CHARMM_pseudoatoms.dict')
11 
12 def del_comment(x):
13 
14  n = x.find('!')
15 
16  if n >= 0:
17  x = x[:n]
18 
19  return x
20 
21 def check_assigns(x):
22  return 'resid' in x and 'name' in x
23 
24 class TBLReader:
25 
26  atom_dict = {'segid': '',
27  'resid': -1,
28  'name': ''}
29 
30  pseudoatom_char = '*', '%', '#'
31 
32  def __init__(self, sequence, ignore_warnings=False, sequence_match=(1,1)):
33 
34  self.sequence = sequence
35  #sequence_match = (a,b) a: NOE numbering, b: sequence numbering
36  self.offset = sequence_match[1]-sequence_match[0]
37  self.ignore = ignore_warnings
38  self.pseudo_dict = Load(pseudoatoms_dict)
39 
40  def extract_contributions(self, contribs):
41 
42  new_contribs = []
43 
44  for c in contribs:
45 
46  if not c:
47  continue
48 
49  c = c[c.find('('):]
50  c = c[:c.rfind(')')+1]
51 
52  new_contribs.append(c)
53 
54  return new_contribs
55 
56  def split_contribution(self, contrib):
57 
58  words = contrib.split('(')
59  atoms = [word.split(')')[0] for word in words if word]
60 
61  return atoms
62 
63  def resolve_pseudoatom(self, residue_type, atom_name):
64 
65  if '*' in atom_name:
66  char = '*'
67  elif '#' in atom_name:
68  char = '#'
69 
70  atom_name = atom_name.replace(char, '%')
71 
72  ## TODO: Assumes that pseudo-atom names are compatible with
73  ## IUPAC name, since pseudo_dict can handle IUPAC
74  ## names only.
75 
76  try:
77  group = self.pseudo_dict[residue_type][atom_name]
78 
79  except:
80 
81  key = atom_name, residue_type
82 
83  if not key in self.missing_atoms:
84 
85  msg = 'Could not resolve pseudoatom %s.%s.' % (residue_type, atom_name)
86 
87  if self.ignore:
88  print msg
89  else:
90  raise KeyError, msg
91 
92  self.missing_atoms.append(key)
93 
94  return atom_name
95 
96  return group
97 
98  def to_iupac(self, residue_type, atom_name):
99 
100  raise NotImplementedError
101 
102  iupac_name = self.thesaurus.convert_atom(residue_type,
103  atom_name,
104  self.naming_system,
105  IUPAC_CONVENTION,
106  TYPE_AMINO_ACID)
107  try:
108  iupac_name = self.thesaurus.convert_atom(residue_type,
109  atom_name,
110  self.naming_system,
111  IUPAC_CONVENTION,
112  TYPE_AMINO_ACID)
113 
114  except:
115 
116  key = atom_name, residue_type
117 
118  if not key in self.missing_atoms:
119 
120  if '*' in atom_name or '#' in atom_name:
121 
122  msg = 'Pseudoatoms not upported: %s' % atom_name
123 
124  if self.ignore:
125  print msg
126 
127  else:
128  raise KeyError, msg
129 
130  elif self.ignore:
131  msg = 'Warning: atom %s not found in residue %s.' % key
132  print msg
133  else:
134  raise KeyError, msg % key
135 
136  self.missing_atoms.append(key)
137 
138  return atom_name
139 
140  return iupac_name
141 
142  def resolve_dihedral_name(self, atoms):
143 
144  raise NotImplementedError
145 
146  names = [a['name'] for a in atoms]
147 
148  try:
149  res_type = self.sequence[atoms[1]['resid']]
150 
151  except IndexError:
152  print 'Residue number overflow in atoms', atoms
153  return ''
154 
155  for dihedral in self.connectivity[res_type].dihedrals.values():
156 
157  keys = [k for k in dihedral.keys() if 'atom' in k]
158  keys.sort()
159 
160  atom_names = []
161 
162  for k in keys:
163  name = dihedral[k]
164  if name[-1] in ('-', '+'):
165  name = name[:-1]
166 
167  atom_names.append(name)
168 
169  if atom_names == names:
170  return dihedral['name']
171 
172  msg = 'Could not determine name of dihedral angles defined by atoms %s.' % str(names)
173 
174  if self.ignore:
175  print msg
176  return ''
177 
178  raise KeyError, msg
179 
180  def extract_atom(self, a):
181 
182  atom = dict(self.atom_dict)
183 
184  words = a.split()
185 
186  skip_next = False
187 
188  ## correct for segid statements
189 
190  words = [x for x in words if x <> '"']
191 
192  for i in range(len(words)):
193 
194  if skip_next:
195  skip_next = False
196  continue
197 
198  word = words[i]
199 
200  if word == 'and':
201  continue
202 
203  for key in atom.keys():
204 
205  if key in word:
206 
207  if key == 'segid':
208  atom[key] = words[i+1][:-1]
209  else:
210  atom[key] = words[i+1]
211 
212  skip_next = True
213  break
214 
215  else:
216  raise KeyError, 'Value or keyword "%s" unknown. Source: "%s", decomposed into "%s"' % \
217  (word, str(a), str(words))
218 
219  atom['resid'] = int(atom['resid']) + self.offset
220  atom['name'] = atom['name'].upper()
221 
222  return atom
223 
224  def build_contributions(self, atoms):
225 
226  groups = []
227 
228  for a in atoms:
229 
230  try:
231  res_type = self.sequence[a['resid']]
232 
233  except IndexError:
234  print 'Residue number overflow in atoms', atoms
235  return []
236 
237  atom_name = a['name']
238 
239  if atom_name[-1] in self.pseudoatom_char:
240  group = self.resolve_pseudoatom(res_type, atom_name)
241 
242  else:
243  #group = [self.to_iupac(res_type, atom_name)]
244  group = [atom_name]
245 
246  groups.append(group)
247 
248  group1, group2 = groups
249 
250  contribs = []
251 
252  res_1 = atoms[0]['resid']
253  res_2 = atoms[1]['resid']
254 
255  for i in range(len(group1)):
256 
257  name_1 = group1[i]
258 
259  for j in range(len(group2)):
260 
261  if (res_1, name_1) <> (res_2, group2[j]):
262  contribs.append(((res_1, name_1), (res_2, group2[j])))
263 
264  return contribs
265 
266  def extract_target_values(self, line):
267 
268  end = line.rfind(')')
269 
270  values = line[end+1:].split()
271 
272  try:
273  distances = [float(x) for x in values[:3]]
274  except:
275  distances = None
276 
277  ## read volume from ARIA 1.x restraint files
278 
279  val = line.split('volume=')
280 
281  if len(val) > 1:
282  volume = float(val[1].split()[0].split(',')[0])
283  else:
284 
285  volume = None
286 
287  return distances, volume
288 
289  def read_contents(self, filename):
290 
291  keywords = 'class',
292 
293  filename = os.path.expanduser(filename)
294 
295  f = open(filename)
296  lines = f.readlines()
297  f.close()
298 
299  all = ''
300 
301  for x in lines:
302 
303  x = x.strip()
304 
305  if not x or x[0] == '!':
306  continue
307 
308  not_valid = [kw for kw in keywords if kw in x]
309 
310  if not_valid:
311  continue
312 
313  all += x.lower() + ' '
314 
315  return [x.strip() for x in all.split('assi')]
316 
317  def find_contributions(self, line):
318 
319  contribs = [del_comment(x).strip() for x in line.split('or')]
320 
321  ## use alternative parser for implicitly listed atom pairs
322 
323  if 1 in [x.count('resid') for x in contribs]:
324 
325  atoms = []
326 
327  while line:
328 
329  start = line.find('(')
330 
331  if start < 0:
332  atoms[-1][-1] += line
333  break
334 
335  stop = line.find(')')
336 
337  selection = [x.strip() for x in line[start:stop+1].split('or')]
338 
339  for i in range(len(selection)):
340 
341  val = selection[i]
342 
343  if not '(' in val:
344  val = '(' + val
345 
346  if not ')' in val:
347  val += ')'
348 
349  selection[i] = val
350 
351  atoms.append(selection)
352 
353  line = line[stop+1:]
354 
355  if len(atoms) <> 2:
356  raise
357 
358  ## find and isolate target distances
359 
360  l = []
361 
362  for i in range(len(atoms)):
363 
364  g = []
365 
366  for atom in atoms[i]:
367 
368  n = atom.rfind(')')
369 
370  if n >= 0 and len(atom[n+1:].strip()) > 3:
371  distances = atom[n+1:].strip()
372  atom = atom[:n+1]
373 
374  g.append(atom)
375 
376  l.append(g)
377 
378  a, b = l
379 
380  if len(a) > len(b):
381  a, b = b, a
382 
383  contribs = []
384 
385  for i in a:
386  for j in b:
387  contribs.append('%s %s' % (i,j))
388 
389  contribs[0] += ' ' + distances
390 
391  return contribs
392 
393  def create_distance_restraint(self, distances, volume, contributions):
394  if distances is None and volume is None:
395  raise ValueError, "could not find either volume or "\
396  "distance: %s %s %s" % (distances,volume,contributions)
397  if distances is None:
398  distances = [volume**(-1./6),0,0]
399  dist = distances[0]
400  if volume is None:
401  volume = dist ** (-6)
402  lower = dist - distances[1]
403  upper = dist + distances[2]
404  return (tuple(contributions), dist, lower, upper, volume)
405 
406  def read_distances(self, filename, key, naming_system=IUPAC_CONVENTION,
407  decompose=False):
408  """reads a tbl file and parses distance restraints.
409  """
410 
411  self.naming_system = naming_system
412 
413  assigns = self.read_contents(filename)
414 
415  restraints = []
416  self.missing_atoms = []
417  seq_number = 0
418 
419  for line in assigns:
420 
421  contribs = self.find_contributions(line)
422 
423  if False in [check_assigns(x) for x in contribs]:
424  continue
425 
426  distances, volume = self.extract_target_values(contribs[0])
427 
428  if (distances is None and volume is None):
429  distances, volume = self.extract_target_values(contribs[-1])
430 
431  new_contribs = self.extract_contributions(contribs)
432 
433  contributions = []
434 
435  for contrib in new_contribs:
436 
437  atoms = self.split_contribution(contrib)
438  atoms = [self.extract_atom(x) for x in atoms]
439 
440  contributions += self.build_contributions(atoms)
441 
442  if contributions:
443  r = self.create_distance_restraint(distances, volume,
444  contributions)
445 
446  restraints.append(r)
447  seq_number += 1
448 
449  if restraints:
450 
451  if decompose:
452 
453  d = decompose_restraints(restraints)
454 
455  for _type in d.keys():
456  if not d[_type]:
457  del d[_type]
458 
459  if len(d) > 1:
460  for _type, val in d.items():
461 
462  if val:
463  new_key = key + '_%s' % _type
464  d[new_key] = val
465 
466  del d[_type]
467 
468  else:
469  d = {key: d.values()[0]}
470  else:
471  d = {key: restraints}
472 
473  return d
474 
475  def read_dihedrals(self, filename, key, naming_system=IUPAC_CONVENTION):
476 
477  self.naming_system = naming_system
478 
479  assigns = self.read_contents(filename)
480 
481  restraints = []
482  self.missing_atoms = []
483  seq_number = 0
484 
485  for line in assigns:
486 
487  contribs = [del_comment(x).strip() for x in line.split('or')]
488 
489  values, volume = self.extract_target_values(contribs[0])
490  new_contribs = self.extract_contributions(contribs)
491 
492  if not new_contribs:
493  continue
494 
495  if len(new_contribs) > 1:
496  raise ValueError, 'Inconsistency in data file, multiple contributions detected.'
497 
498  atoms = self.split_contribution(new_contribs[0])
499  atoms = [self.extract_atom(x) for x in atoms]
500 
501  name = self.resolve_dihedral_name(atoms)
502 
503  r = create_dihedral_restraint(seq_number, name, values, atoms)
504 
505  restraints.append(r)
506  seq_number += 1
507 
508  if restraints:
509  return restraints
510 
511  def read_rdcs(self, filename, key, naming_system=IUPAC_CONVENTION):
512 
513  self.naming_system = naming_system
514 
515  assigns = self.read_contents(filename)
516 
517  restraints = []
518  self.missing_atoms = []
519  seq_number = 0
520 
521  fake_atom_names = ('OO', 'X', 'Y', 'Z')
522 
523  for line in assigns:
524 
525  contribs = [del_comment(x).strip() for x in line.split('or')]
526  distances, volume = self.extract_target_values(contribs[0])
527  new_contribs = self.extract_contributions(contribs)
528 
529  contributions = []
530 
531  for contrib in new_contribs:
532 
533  atoms = self.split_contribution(contrib)
534  atoms = [self.extract_atom(x) for x in atoms]
535 
536  atoms = [a for a in atoms if not a['name'] in fake_atom_names]
537 
538  contributions += self.build_contributions(atoms)
539 
540  if contributions:
541  r = create_rdc_restraint(seq_number, distances[0], contributions)
542 
543  restraints.append(r)
544  seq_number += 1
545 
546  if restraints:
547  return restraints
548 
549 if __name__ == '__main__':
550 
551  noe = 'noe.tbl'
552  sequence = read_sequence_file('seq.dat', first_residue_number=1)
553  reader = TBLReader(sequence, ignore_warnings=True)
554  reader.read_distances(noe, key='test')