IMP Reference Guide 2.7.0
The Integrative Modeling Platform
mmcif.py
"""@namespace IMP.pmi.mmcif
   @brief Support for the mmCIF file format.

   IMP has basic support for writing out files in mmCIF format, for
   deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
   mmCIF files are currently generated by creating an
   IMP.pmi.mmcif.ProtocolOutput object and attaching it to an
   IMP.pmi.representation.Representation object, after which any
   generated models and metadata are collected and output as mmCIF.
"""
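# A minimal usage sketch (assumes an IMP.pmi.representation.Representation
# object `r` built as usual; PMI attaches output handlers with
# add_protocol_output(), and the file name here is hypothetical):
#
#   po = IMP.pmi.mmcif.ProtocolOutput(open('model.cif', 'w'))
#   r.add_protocol_output(po)
#   # ... set up restraints and run sampling as usual ...
#   po.flush()   # write the collected data (flush() is defined later in
#                # this file)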

from __future__ import print_function
import copy
import IMP.core
import IMP.algebra
import IMP.atom
import IMP.em
import IMP.isd
import IMP.pmi.tools
from IMP.pmi.tools import OrderedDict
import IMP.pmi.output
import IMP.pmi.metadata
import re
import sys
import os
import textwrap
import weakref

def _assign_id(obj, seen_objs, obj_by_id):
    """Assign a unique ID to obj, and track all ids in obj_by_id."""
    if obj not in seen_objs:
        if not hasattr(obj, 'id'):
            obj_by_id.append(obj)
            obj.id = len(obj_by_id)
        seen_objs[obj] = obj.id
    else:
        obj.id = seen_objs[obj]

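# For illustration, _assign_id gives each distinct object a stable, 1-based
# ID on first sight and reuses that ID thereafter (objects hypothetical):
#
#   seen, by_id = {}, []
#   _assign_id(a, seen, by_id)   # a.id == 1
#   _assign_id(b, seen, by_id)   # b.id == 2
#   _assign_id(a, seen, by_id)   # a.id is still 1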
class _LineWriter(object):
    def __init__(self, writer, line_len=80):
        self.writer = writer
        self.line_len = line_len
        self.column = 0
    def write(self, val):
        if isinstance(val, str) and '\n' in val:
            self.writer.fh.write("\n;")
            self.writer.fh.write(val)
            if not val.endswith('\n'):
                self.writer.fh.write("\n")
            self.writer.fh.write(";\n")
            self.column = 0
            return
        val = self.writer._repr(val)
        if self.column > 0:
            if self.column + len(val) + 1 > self.line_len:
                self.writer.fh.write("\n")
                self.column = 0
            else:
                self.writer.fh.write(" ")
                self.column += 1
        self.writer.fh.write(val)
        self.column += len(val)


class _CifCategoryWriter(object):
    def __init__(self, writer, category):
        self.writer = writer
        self.category = category
    def write(self, **kwargs):
        self.writer._write(self.category, kwargs)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        pass


class _CifLoopWriter(object):
    def __init__(self, writer, category, keys):
        self.writer = writer
        self.category = category
        self.keys = keys
        # Remove characters that we can't use in Python identifiers
        self.python_keys = [k.replace('[', '').replace(']', '') for k in keys]
        self._empty_loop = True
    def write(self, **kwargs):
        if self._empty_loop:
            f = self.writer.fh
            f.write("#\nloop_\n")
            for k in self.keys:
                f.write("%s.%s\n" % (self.category, k))
            self._empty_loop = False
        l = _LineWriter(self.writer)
        for k in self.python_keys:
            l.write(kwargs.get(k, self.writer.omitted))
        self.writer.fh.write("\n")
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        if not self._empty_loop:
            self.writer.fh.write("#\n")


class _CifWriter(object):
    omitted = '.'
    unknown = '?'
    _boolmap = {False: 'NO', True: 'YES'}

    def __init__(self, fh):
        self.fh = fh
    def category(self, category):
        return _CifCategoryWriter(self, category)
    def loop(self, category, keys):
        return _CifLoopWriter(self, category, keys)
    def write_comment(self, comment):
        for line in textwrap.wrap(comment, 78):
            self.fh.write('# ' + line + '\n')
    def _write(self, category, kwargs):
        for key in kwargs:
            self.fh.write("%s.%s %s\n" % (category, key,
                                          self._repr(kwargs[key])))
    def _repr(self, obj):
        if isinstance(obj, str) and '"' not in obj \
           and "'" not in obj and " " not in obj:
            return obj
        elif isinstance(obj, float):
            return "%.3f" % obj
        elif isinstance(obj, bool):
            return self._boolmap[obj]
        else:
            return repr(obj)

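# A sketch of the output these writer classes produce (category and values
# hypothetical):
#
#   writer = _CifWriter(sys.stdout)
#   with writer.loop("_foo", ["id", "name"]) as l:
#       l.write(id=1, name="bar")
#
# emits:
#
#   #
#   loop_
#   _foo.id
#   _foo.name
#   1 bar
#   #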
def _get_by_residue(p):
    """Determine whether the given particle represents a specific residue
       or a more coarse-grained object."""
    return IMP.atom.Residue.get_is_setup(p) or IMP.atom.Atom.get_is_setup(p)

class _AsymIDMapper(object):
    """Map a Particle to an asym_id (chain ID)"""
    def __init__(self, prot):
        self.o = IMP.pmi.output.Output()
        self.prot = prot
        self.name = 'cif-output'
        self.o.dictionary_pdbs[self.name] = self.prot
        self.o._init_dictchain(self.name, self.prot)

    def __getitem__(self, p):
        protname, is_a_bead = self.o.get_prot_name_from_particle(self.name, p)
        return self.o.dictchain[self.name][protname]

class _ComponentMapper(object):
    """Map a Particle to a component name"""
    def __init__(self, prot):
        self.o = IMP.pmi.output.Output()
        self.prot = prot
        self.name = 'cif-output'
        self.o.dictionary_pdbs[self.name] = self.prot
        self.o._init_dictchain(self.name, self.prot)

    def __getitem__(self, p):
        protname, is_a_bead = self.o.get_prot_name_from_particle(self.name, p)
        return protname

class _Dumper(object):
    """Base class for helpers to dump output to mmCIF"""
    def __init__(self, simo):
        self.simo = weakref.proxy(simo)

    def finalize(self):
        pass

    def finalize_metadata(self):
        pass


class _EntryDumper(_Dumper):
    def dump(self, writer):
        entry_id = 'imp_model'
        # Write CIF header (so this dumper should always be first)
        writer.fh.write("data_%s\n" % entry_id)
        with writer.category("_entry") as l:
            l.write(id=entry_id)


class _SoftwareDumper(_Dumper):
    software = [
        IMP.pmi.metadata.Software(
            name="Integrative Modeling Platform (IMP)",
            version=IMP.__version__,
            classification="integrative model building",
            description="integrative model building",
            url='https://integrativemodeling.org'),
        IMP.pmi.metadata.Software(
            name="IMP PMI module",
            version=IMP.pmi.__version__,
            classification="integrative model building",
            description="integrative model building",
            url='https://integrativemodeling.org')
    ]

    def __init__(self, simo):
        super(_SoftwareDumper, self).__init__(simo)
        self.modeller_used = False

    def set_modeller_used(self, version, date):
        if self.modeller_used:
            return
        self.modeller_used = True
        self.software.append(IMP.pmi.metadata.Software(
                    name='MODELLER', classification='comparative modeling',
                    description='Comparative modeling by satisfaction '
                                'of spatial restraints, build ' + date,
                    url='https://salilab.org/modeller/',
                    version=version))

    def dump(self, writer):
        ordinal = 1
        with writer.loop("_software",
                         ["pdbx_ordinal", "name", "classification", "version",
                          "type", "location"]) as l:
            for m in self.software + self.simo._metadata:
                if isinstance(m, IMP.pmi.metadata.Software):
                    l.write(pdbx_ordinal=ordinal, name=m.name,
                            classification=m.classification, version=m.version,
                            type=m.type, location=m.url)
                    ordinal += 1

class _AuditAuthorDumper(_Dumper):
    """Populate the mmCIF audit_author category (a list of the people who
       authored this mmCIF file; here we assume that's just the authors of
       any associated publications)"""
    def dump(self, writer):
        citations = [m for m in self.simo._metadata
                     if isinstance(m, IMP.pmi.metadata.Citation)]
        seen_authors = {}
        with writer.loop("_audit_author",
                         ["name", "pdbx_ordinal"]) as l:
            ordinal = 1
            for n, c in enumerate(citations):
                for a in c.authors:
                    if a not in seen_authors:
                        seen_authors[a] = None
                        l.write(name=a, pdbx_ordinal=ordinal)
                        ordinal += 1


class _CitationDumper(_Dumper):
    def dump(self, writer):
        citations = [m for m in self.simo._metadata
                     if isinstance(m, IMP.pmi.metadata.Citation)]
        with writer.loop("_citation",
                         ["id", "title", "journal_abbrev", "journal_volume",
                          "page_first", "page_last", "year",
                          "pdbx_database_id_PubMed",
                          "pdbx_database_id_DOI"]) as l:
            for n, c in enumerate(citations):
                if isinstance(c.page_range, (tuple, list)):
                    page_first, page_last = c.page_range
                else:
                    page_first = c.page_range
                    page_last = _CifWriter.omitted
                l.write(id=n+1, title=c.title, journal_abbrev=c.journal,
                        journal_volume=c.volume, page_first=page_first,
                        page_last=page_last, year=c.year,
                        pdbx_database_id_PubMed=c.pmid,
                        pdbx_database_id_DOI=c.doi)

        with writer.loop("_citation_author",
                         ["citation_id", "name", "ordinal"]) as l:
            ordinal = 1
            for n, c in enumerate(citations):
                for a in c.authors:
                    l.write(citation_id=n+1, name=a, ordinal=ordinal)
                    ordinal += 1

class _EntityDumper(_Dumper):
    # todo: we currently only support amino acid sequences here (and
    # then only standard amino acids; need to add support for MSE etc.)
    def dump(self, writer):
        with writer.loop("_entity",
                         ["id", "type", "src_method", "pdbx_description",
                          "formula_weight", "pdbx_number_of_molecules",
                          "details"]) as l:
            for entity in self.simo.entities.get_all():
                l.write(id=entity.id, type='polymer', src_method='man',
                        pdbx_description=entity.first_component,
                        formula_weight=writer.unknown,
                        pdbx_number_of_molecules=1, details=writer.unknown)


class _EntityPolyDumper(_Dumper):
    # todo: we currently only support amino acid sequences here
    def __init__(self, simo):
        super(_EntityPolyDumper, self).__init__(simo)
        self.output = IMP.pmi.output.Output()

    def dump(self, writer):
        with writer.loop("_entity_poly",
                         ["entity_id", "type", "nstd_linkage",
                          "nstd_monomer", "pdbx_strand_id",
                          "pdbx_seq_one_letter_code",
                          "pdbx_seq_one_letter_code_can"]) as l:
            for entity in self.simo.entities.get_all():
                seq = entity.sequence
                # Split into lines to get tidier CIF output
                seq = "\n".join(seq[i:i+70] for i in range(0, len(seq), 70))
                name = entity.first_component
                chain_id = self.simo._get_chain_for_component(name,
                                                              self.output)
                l.write(entity_id=entity.id, type='polypeptide(L)',
                        nstd_linkage='no', nstd_monomer='no',
                        pdbx_strand_id=chain_id,
                        pdbx_seq_one_letter_code=seq,
                        pdbx_seq_one_letter_code_can=seq)


class _ChemCompDumper(_Dumper):
    def dump(self, writer):
        seen = {}
        std = dict.fromkeys(('ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS',
                             'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN',
                             'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR'))
        with writer.loop("_chem_comp", ["id", "type"]) as l:
            for entity in self.simo.entities.get_all():
                seq = entity.sequence
                for num, one_letter_code in enumerate(seq):
                    restyp = IMP.atom.get_residue_type(one_letter_code)
                    resid = restyp.get_string()
                    if resid not in seen:
                        seen[resid] = None
                        l.write(id=resid,
                                type='L-peptide linking' if resid in std
                                     else 'other')

class _EntityPolySeqDumper(_Dumper):
    def dump(self, writer):
        with writer.loop("_entity_poly_seq",
                         ["entity_id", "num", "mon_id", "hetero"]) as l:
            for entity in self.simo.entities.get_all():
                seq = entity.sequence
                for num, one_letter_code in enumerate(seq):
                    restyp = IMP.atom.get_residue_type(one_letter_code)
                    l.write(entity_id=entity.id, num=num + 1,
                            mon_id=restyp.get_string(),
                            hetero=_CifWriter.omitted)

class _StructAsymDumper(_Dumper):
    def __init__(self, simo):
        super(_StructAsymDumper, self).__init__(simo)
        self.output = IMP.pmi.output.Output()

    def dump(self, writer):
        with writer.loop("_struct_asym",
                         ["id", "entity_id", "details"]) as l:
            for comp in self.simo.all_modeled_components:
                entity = self.simo.entities[comp]
                chain_id = self.simo._get_chain_for_component(comp,
                                                              self.output)
                l.write(id=chain_id,
                        entity_id=entity.id,
                        details=comp)

class _PDBFragment(object):
    """Record details about part of a PDB file used as input
       for a component."""
    primitive = 'sphere'
    granularity = 'by-residue'
    num = _CifWriter.omitted
    def __init__(self, m, component, start, end, offset, pdbname, chain, hier):
        self.component, self.start, self.end, self.offset, self.pdbname \
              = component, start, end, offset, pdbname
        self.chain, self.hier = chain, hier
        sel = IMP.atom.NonWaterNonHydrogenPDBSelector() \
              & IMP.atom.ChainPDBSelector(chain)
        self.starting_hier = IMP.atom.read_pdb(pdbname, m, sel)

    def combine(self, other):
        pass

class _BeadsFragment(object):
    """Record details about beads used to represent part of a component."""
    primitive = 'sphere'
    granularity = 'by-feature'
    chain = None
    def __init__(self, m, component, start, end, num, hier):
        self.component, self.start, self.end, self.num, self.hier \
              = component, start, end, num, hier

    def combine(self, other):
        # todo: don't combine if one fragment is rigid and the other flexible
        if type(other) == type(self) and other.start == self.end + 1:
            self.end = other.end
            self.num += other.num
            return True

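# For illustration (values hypothetical): two adjacent bead fragments for the
# same component merge into one, keeping the representation table compact:
#
#   f1 = _BeadsFragment(m, 'Rpb1', 1, 10, 2, h1)
#   f2 = _BeadsFragment(m, 'Rpb1', 11, 20, 2, h2)
#   f1.combine(f2)   # True; f1 now covers residues 1-20 with num == 4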
class _StartingModel(object):
    """Record details about an input model (e.g. comparative modeling
       template) used for a component."""

    source = _CifWriter.unknown
    db_name = _CifWriter.unknown
    db_code = _CifWriter.unknown
    sequence_identity = _CifWriter.unknown

    def __init__(self, fragment):
        self.fragments = [fragment]

class _ModelRepresentationDumper(_Dumper):
    def __init__(self, simo):
        super(_ModelRepresentationDumper, self).__init__(simo)
        # dict of fragments, ordered by component name
        self.fragments = OrderedDict()
        self.output = IMP.pmi.output.Output()

    def add_fragment(self, fragment):
        """Add a model fragment."""
        comp = fragment.component
        if comp not in self.fragments:
            self.fragments[comp] = []
        fragments = self.fragments[comp]
        if len(fragments) == 0 or not fragments[-1].combine(fragment):
            fragments.append(fragment)

    def get_model_mode(self, fragment):
        """Determine the model_mode for a given fragment ('rigid' or
           'flexible')"""
        leaves = IMP.atom.get_leaves(fragment.hier)
        # Assume all leaves are set up as rigid/flexible in the same way
        if IMP.core.RigidMember.get_is_setup(leaves[0]):
            return 'rigid'
        else:
            return 'flexible'

    def dump(self, writer):
        ordinal_id = 1
        segment_id = 1
        with writer.loop("_ihm_model_representation",
                         ["ordinal_id", "representation_id",
                          "segment_id", "entity_id", "entity_description",
                          "entity_asym_id",
                          "seq_id_begin", "seq_id_end",
                          "model_object_primitive", "starting_model_id",
                          "model_mode", "model_granularity",
                          "model_object_count"]) as l:
            for comp, fragments in self.fragments.items():
                chain_id = self.simo._get_chain_for_component(comp,
                                                              self.output)
                for f in fragments:
                    entity = self.simo.entities[f.component]
                    starting_model_id = _CifWriter.omitted
                    if hasattr(f, 'pdbname'):
                        starting_model_id \
                              = self.starting_model[comp, f.pdbname].name
                    # todo: handle multiple representations
                    l.write(ordinal_id=ordinal_id,
                            representation_id=1,
                            segment_id=segment_id,
                            entity_id=entity.id,
                            entity_description=entity.description,
                            entity_asym_id=chain_id,
                            seq_id_begin=f.start,
                            seq_id_end=f.end,
                            model_object_primitive=f.primitive,
                            starting_model_id=starting_model_id,
                            model_mode=self.get_model_mode(f),
                            model_granularity=f.granularity,
                            model_object_count=f.num)
                    ordinal_id += 1
                    segment_id += 1

class _PDBSource(object):
    """An experimental PDB file used as part of a starting model"""
    source = 'experimental model'
    db_name = 'PDB'
    sequence_identity = 100.0

    def __init__(self, model, db_code, chain_id, metadata):
        self.db_code = db_code
        self.chain_id = chain_id
        self.metadata = metadata

    def get_seq_id_range(self, model):
        # Assume the structure covers the entire sequence
        return (model.seq_id_begin, model.seq_id_end)

class _TemplateSource(object):
    """A PDB file used as a template for a comparative starting model"""
    source = 'comparative model'
    db_name = db_code = _CifWriter.omitted
    tm_dataset = None
    # Right now assume all alignments are Modeller alignments, which use
    # the length of the shorter sequence as the denominator for sequence
    # identity
    sequence_identity_denominator = 1

    def __init__(self, tm_code, tm_seq_id_begin, tm_seq_id_end, seq_id_begin,
                 chain_id, seq_id_end, seq_id, model):
        # Assume a code of 1abcX refers to a real PDB structure
        if len(tm_code) == 5:
            self.tm_db_code = tm_code[:4].upper()
            self.tm_chain_id = tm_code[4]
        else:
            self.tm_db_code = self.tm_chain_id = _CifWriter.unknown
        self.sequence_identity = seq_id
        self.tm_seq_id_begin = tm_seq_id_begin
        self.tm_seq_id_end = tm_seq_id_end
        self.chain_id = chain_id
        self._seq_id_begin, self._seq_id_end = seq_id_begin, seq_id_end

    def get_seq_id_range(self, model):
        # The template may cover more than the current starting model
        seq_id_begin = max(model.seq_id_begin, self._seq_id_begin)
        seq_id_end = min(model.seq_id_end, self._seq_id_end)
        return (seq_id_begin, seq_id_end)

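# Worked example of the clipping above (numbers hypothetical): if the
# starting model spans seq_ids 20-100 but the template alignment covers
# only 1-50, get_seq_id_range() reports the intersection, (20, 50).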
class _UnknownSource(object):
    """Part of a starting model from an unknown source"""
    db_code = _CifWriter.unknown
    db_name = _CifWriter.unknown
    chain_id = _CifWriter.unknown
    sequence_identity = _CifWriter.unknown
    # Map dataset types to starting model sources
    _source_map = {'Comparative model': 'comparative model',
                   'Experimental model': 'experimental model'}

    def __init__(self, model, chain):
        self.source = self._source_map[model.dataset._data_type]
        self.chain_id = chain

    def get_seq_id_range(self, model):
        return (model.seq_id_begin, model.seq_id_end)

class _DatasetGroup(object):
    """A group of datasets"""
    def __init__(self, datasets, include_primaries):
        self._datasets = list(datasets)
        self.include_primaries = include_primaries

    def finalize(self):
        """Get final datasets for each restraint and remove duplicates"""
        final_datasets = OrderedDict()
        def add_parents(d):
            for p in d._parents.keys():
                final_datasets[p] = None
                add_parents(p)
        for ds in self._datasets:
            if isinstance(ds, _RestraintDataset):
                d = ds.dataset
            else:
                d = ds
            final_datasets[d] = None
            if self.include_primaries:
                add_parents(d)
        self._datasets = final_datasets.keys()


class _ExternalReference(object):
    """A single externally-referenced file"""
    def __init__(self, location, content_type):
        self.location, self.content_type = location, content_type

    # Pass id through to location
    def __set_id(self, i):
        self.location.id = i
    id = property(lambda x: x.location.id, __set_id)

    def __eq__(self, other):
        return self.location == other.location
    def __hash__(self):
        return hash(self.location)


class _ExternalReferenceDumper(_Dumper):
    """Output information on externally referenced files
       (i.e. anything that refers to a Location that isn't
       a DatabaseLocation)."""

    INPUT_DATA = "Input data or restraints"
    MODELING_OUTPUT = "Modeling or post-processing output"
    WORKFLOW = "Modeling workflow or script"

    class _LocalFiles(object):
        reference_provider = _CifWriter.omitted
        reference_type = 'Supplementary Files'
        reference = _CifWriter.omitted
        refers_to = 'Other'
        associated_url = _CifWriter.omitted

        def __init__(self, top_directory):
            self.top_directory = top_directory

        def _get_full_path(self, path):
            return os.path.relpath(path, start=self.top_directory)

    class _Repository(object):
        reference_provider = _CifWriter.omitted
        reference_type = 'DOI'
        refers_to = 'Other'
        associated_url = _CifWriter.omitted

        def __init__(self, repo):
            self.id = repo.id
            self.reference = repo.doi
            if 'zenodo' in self.reference:
                self.reference_provider = 'Zenodo'
            if repo.url:
                self.associated_url = repo.url
                if repo.url.endswith(".zip"):
                    self.refers_to = 'Archive'
                else:
                    self.refers_to = 'File'

    def __init__(self, simo):
        super(_ExternalReferenceDumper, self).__init__(simo)
        self._refs = []

    def add(self, location, content_type):
        """Add a new location.
           Note that ids are assigned later."""
        self._refs.append(_ExternalReference(location, content_type))
        return location

    def finalize_metadata(self):
        """Register locations for any metadata and add the main script"""
        loc = IMP.pmi.metadata.FileLocation(path=self.simo._main_script,
                              details="The main integrative modeling script")
        main_script = IMP.pmi.metadata.PythonScript(loc)
        self._workflow = [main_script] \
                         + [m for m in self.simo._metadata
                            if isinstance(m, IMP.pmi.metadata.PythonScript)]
        for w in self._workflow:
            self.add(w.location, self.WORKFLOW)

    def finalize_after_datasets(self):
        """Note that this must happen *after* DatasetDumper.finalize()"""
        # Keep only locations that don't point into databases (these are
        # handled elsewhere)
        self._refs = [x for x in self._refs
                      if not isinstance(x.location,
                                        IMP.pmi.metadata.DatabaseLocation)]
        # Assign IDs to all locations and repos (including the None repo,
        # which is for local files)
        seen_refs = {}
        seen_repos = {}
        self._ref_by_id = []
        self._repo_by_id = []
        # Special dummy repo for repo=None (local files)
        self._local_files = self._LocalFiles(self.simo._working_directory)
        for r in self._refs:
            # Update location to point to parent repository, if any
            self.simo._update_location(r.location)
            # Assign a unique ID to the reference
            _assign_id(r, seen_refs, self._ref_by_id)
            # Assign a unique ID to the repository
            _assign_id(r.location.repo or self._local_files, seen_repos,
                       self._repo_by_id)

    def dump(self, writer):
        self.dump_repos(writer)
        self.dump_refs(writer)

    def dump_repos(self, writer):
        def map_repo(r):
            return r if isinstance(r, self._LocalFiles) \
                     else self._Repository(r)
        with writer.loop("_ihm_external_reference_info",
                         ["reference_id", "reference_provider",
                          "reference_type", "reference", "refers_to",
                          "associated_url"]) as l:
            for repo in [map_repo(r) for r in self._repo_by_id]:
                l.write(reference_id=repo.id,
                        reference_provider=repo.reference_provider,
                        reference_type=repo.reference_type,
                        reference=repo.reference, refers_to=repo.refers_to,
                        associated_url=repo.associated_url)

    def dump_refs(self, writer):
        with writer.loop("_ihm_external_files",
                         ["id", "reference_id", "file_path", "content_type",
                          "details"]) as l:
            for r in self._ref_by_id:
                loc = r.location
                repo = loc.repo or self._local_files
                file_path = self._posix_path(repo._get_full_path(loc.path))
                l.write(id=loc.id, reference_id=repo.id,
                        file_path=file_path,
                        content_type=r.content_type,
                        details=loc.details or _CifWriter.omitted)

    # On Windows systems, convert native paths to POSIX-like (/-separated)
    # paths
    if os.sep == '/':
        def _posix_path(self, path):
            return path
    else:
        def _posix_path(self, path):
            return path.replace(os.sep, '/')


class _DatasetDumper(_Dumper):
    def __init__(self, simo):
        super(_DatasetDumper, self).__init__(simo)
        self._datasets = []
        self._dataset_groups = []

    def get_all_group(self, include_primaries=False):
        """Get a _DatasetGroup encompassing all datasets so far"""
        g = _DatasetGroup(self._datasets, include_primaries)
        self._dataset_groups.append(g)
        return g

    def add(self, dataset):
        """Add a new dataset.
           Note that ids are assigned later."""
        self._datasets.append(dataset)
        return dataset

    def finalize(self):
        seen_datasets = {}
        # Assign IDs to all datasets
        self._dataset_by_id = []
        for d in self._flatten_dataset(self._datasets):
            # Register location (need to do that now rather than in add() in
            # order to properly handle _RestraintDataset)
            self.simo.extref_dump.add(d.location,
                                      _ExternalReferenceDumper.INPUT_DATA)
            _assign_id(d, seen_datasets, self._dataset_by_id)

        # Make sure that all datasets are listed, even if they weren't used
        all_group = self.get_all_group(True)
        # Assign IDs to all groups and remove duplicates
        seen_group_ids = {}
        self._dataset_group_by_id = []
        for g in self._dataset_groups:
            g.finalize()
            ids = tuple(sorted(d.id for d in g._datasets))
            if ids not in seen_group_ids:
                self._dataset_group_by_id.append(g)
                g.id = len(self._dataset_group_by_id)
                seen_group_ids[ids] = g
            else:
                g.id = seen_group_ids[ids].id
        # Finalize external references (must happen after dataset finalize)
        self.simo.extref_dump.finalize_after_datasets()

    def _flatten_dataset(self, d):
        if isinstance(d, list):
            for p in d:
                for x in self._flatten_dataset(p):
                    yield x
        elif isinstance(d, _RestraintDataset):
            for x in self._flatten_dataset(d.dataset):
                yield x
        else:
            for p in d._parents.keys():
                for x in self._flatten_dataset(p):
                    yield x
            yield d

    def dump(self, writer):
        ordinal = 1
        with writer.loop("_ihm_dataset_list",
                         ["ordinal_id", "id", "group_id", "data_type",
                          "database_hosted"]) as l:
            for g in self._dataset_group_by_id:
                for d in g._datasets:
                    l.write(ordinal_id=ordinal, id=d.id, group_id=g.id,
                            data_type=d._data_type,
                            database_hosted=isinstance(d.location,
                                         IMP.pmi.metadata.DatabaseLocation))
                    ordinal += 1
        self.dump_other((d for d in self._dataset_by_id
                         if not isinstance(d.location,
                                          IMP.pmi.metadata.DatabaseLocation)),
                        writer)
        self.dump_rel_dbs((d for d in self._dataset_by_id
                           if isinstance(d.location,
                                         IMP.pmi.metadata.DatabaseLocation)),
                          writer)
        self.dump_related(writer)

    def dump_related(self, writer):
        ordinal = 1
        with writer.loop("_ihm_related_datasets",
                         ["ordinal_id", "dataset_list_id_derived",
                          "dataset_list_id_primary"]) as l:
            for derived in self._dataset_by_id:
                for parent in sorted(derived._parents.keys(),
                                     key=lambda d: d.id):
                    l.write(ordinal_id=ordinal,
                            dataset_list_id_derived=derived.id,
                            dataset_list_id_primary=parent.id)
                    ordinal += 1

    def dump_rel_dbs(self, datasets, writer):
        ordinal = 1
        with writer.loop("_ihm_dataset_related_db_reference",
                         ["id", "dataset_list_id", "db_name",
                          "accession_code", "version", "details"]) as l:
            for d in datasets:
                l.write(id=ordinal, dataset_list_id=d.id,
                        db_name=d.location.db_name,
                        accession_code=d.location.access_code,
                        version=d.location.version if d.location.version
                                else _CifWriter.omitted,
                        details=d.location.details if d.location.details
                                else _CifWriter.omitted)
                ordinal += 1

    def dump_other(self, datasets, writer):
        ordinal = 1
        with writer.loop("_ihm_dataset_external_reference",
                         ["id", "dataset_list_id", "file_id"]) as l:
            for d in datasets:
                l.write(id=ordinal, dataset_list_id=d.id,
                        file_id=d.location.id)
                ordinal += 1


class _CrossLinkGroup(object):
    """Group common information for a set of cross links"""
    def __init__(self, pmi_restraint, rdataset):
        self.pmi_restraint, self.rdataset = pmi_restraint, rdataset
        self.label = self.pmi_restraint.label


class _ExperimentalCrossLink(object):
    def __init__(self, r1, c1, r2, c2, length, group):
        self.r1, self.c1, self.r2, self.c2 = r1, c1, r2, c2
        self.length, self.group = length, group

class _CrossLink(object):
    def __init__(self, ex_xl, p1, p2, sigma1, sigma2, psi):
        self.ex_xl, self.sigma1, self.sigma2 = ex_xl, sigma1, sigma2
        self.p1, self.p2 = p1, p2
        self.psi = psi

class _CrossLinkDumper(_Dumper):
    def __init__(self, simo):
        super(_CrossLinkDumper, self).__init__(simo)
        self.cross_links = []
        self.exp_cross_links = []

    def add_experimental(self, cross_link):
        self.exp_cross_links.append(cross_link)
        cross_link.id = len(self.exp_cross_links)

    def add(self, cross_link):
        self.cross_links.append(cross_link)
        cross_link.id = len(self.cross_links)

    def dump(self, writer):
        self.dump_list(writer)
        self.dump_restraint(writer)
        self.dump_results(writer)

    def dump_list(self, writer):
        with writer.loop("_ihm_cross_link_list",
                         ["id", "group_id", "entity_description_1",
                          "entity_id_1", "seq_id_1", "comp_id_1",
                          "entity_description_2",
                          "entity_id_2", "seq_id_2", "comp_id_2", "type",
                          "dataset_list_id"]) as l:
            for xl in self.exp_cross_links:
                entity1 = self.simo.entities[xl.c1]
                entity2 = self.simo.entities[xl.c2]
                seq1 = entity1.sequence
                seq2 = entity2.sequence
                rt1 = IMP.atom.get_residue_type(seq1[xl.r1-1])
                rt2 = IMP.atom.get_residue_type(seq2[xl.r2-1])
                # todo: handle experimental ambiguity (group_id) properly
                l.write(id=xl.id, group_id=xl.id,
                        entity_description_1=entity1.description,
                        entity_id_1=entity1.id,
                        seq_id_1=xl.r1,
                        comp_id_1=rt1.get_string(),
                        entity_description_2=entity2.description,
                        entity_id_2=entity2.id,
                        seq_id_2=xl.r2,
                        comp_id_2=rt2.get_string(),
                        type=xl.group.label,
                        dataset_list_id=xl.group.rdataset.dataset.id)

    def _granularity(self, xl):
        """Determine the granularity of a cross link"""
        if _get_by_residue(xl.p1) and _get_by_residue(xl.p2):
            return 'by-residue'
        else:
            return 'by-feature'

    def dump_restraint(self, writer):
        asym = _AsymIDMapper(self.simo.prot)
        with writer.loop("_ihm_cross_link_restraint",
                         ["id", "group_id", "entity_id_1", "asym_id_1",
                          "seq_id_1", "comp_id_1",
                          "entity_id_2", "asym_id_2", "seq_id_2", "comp_id_2",
                          "type", "conditional_crosslink_flag",
                          "model_granularity", "distance_threshold",
                          "psi", "sigma_1", "sigma_2"]) as l:
            for xl in self.cross_links:
                entity1 = self.simo.entities[xl.ex_xl.c1]
                entity2 = self.simo.entities[xl.ex_xl.c2]
                seq1 = entity1.sequence
                seq2 = entity2.sequence
                rt1 = IMP.atom.get_residue_type(seq1[xl.ex_xl.r1-1])
                rt2 = IMP.atom.get_residue_type(seq2[xl.ex_xl.r2-1])
                l.write(id=xl.id,
                        group_id=xl.ex_xl.id,
                        entity_id_1=entity1.id,
                        asym_id_1=asym[xl.p1],
                        seq_id_1=xl.ex_xl.r1,
                        comp_id_1=rt1.get_string(),
                        entity_id_2=entity2.id,
                        asym_id_2=asym[xl.p2],
                        seq_id_2=xl.ex_xl.r2,
                        comp_id_2=rt2.get_string(),
                        type=xl.ex_xl.group.label,
                        # todo: any circumstances where this could be ANY?
                        conditional_crosslink_flag="ALL",
                        model_granularity=self._granularity(xl),
                        distance_threshold=xl.ex_xl.length,
                        psi=xl.psi.get_scale(),
                        sigma_1=xl.sigma1.get_scale(),
                        sigma_2=xl.sigma2.get_scale())

    def _set_psi_sigma(self, model, g):
        for resolution in g.pmi_restraint.sigma_dictionary:
            statname = 'ISDCrossLinkMS_Sigma_%s_%s' % (resolution, g.label)
            if model.stats and statname in model.stats:
                sigma = float(model.stats[statname])
                p = g.pmi_restraint.sigma_dictionary[resolution][0]
                p.set_scale(sigma)
        for psiindex in g.pmi_restraint.psi_dictionary:
            statname = 'ISDCrossLinkMS_Psi_%s_%s' % (psiindex, g.label)
            if model.stats and statname in model.stats:
                psi = float(model.stats[statname])
                p = g.pmi_restraint.psi_dictionary[psiindex][0]
                p.set_scale(psi)

    def dump_results(self, writer):
        all_groups = {}
        for xl in self.cross_links:
            all_groups[xl.ex_xl.group] = None
        ordinal = 1
        with writer.loop("_ihm_cross_link_result_parameters",
                         ["ordinal_id", "restraint_id", "model_id",
                          "psi", "sigma_1", "sigma_2"]) as l:
            for model in self.models:
                for g in all_groups.keys():
                    self._set_psi_sigma(model, g)
                for xl in self.cross_links:
                    if model.stats:
                        l.write(ordinal_id=ordinal, restraint_id=xl.id,
                                model_id=model.id, psi=xl.psi.get_scale(),
                                sigma_1=xl.sigma1.get_scale(),
                                sigma_2=xl.sigma2.get_scale())
                        ordinal += 1

class _EM2DRestraint(object):
    def __init__(self, rdataset, resolution, pixel_size,
                 image_resolution, projection_number):
        self.rdataset, self.resolution = rdataset, resolution
        self.pixel_size, self.image_resolution = pixel_size, image_resolution
        self.projection_number = projection_number

    def get_num_raw_micrographs(self):
        """Return the number of raw micrographs used, if known.
           This is extracted from the EMMicrographsDataset if any."""
        for d in self.rdataset.dataset._parents.keys():
            if isinstance(d, IMP.pmi.metadata.EMMicrographsDataset):
                return d.number

class _EM2DDumper(_Dumper):
    def __init__(self, simo):
        super(_EM2DDumper, self).__init__(simo)
        self.restraints = []

    def add(self, rsr):
        self.restraints.append(rsr)
        rsr.id = len(self.restraints)

    def dump(self, writer):
        with writer.loop("_ihm_2dem_class_average_restraint",
                         ["id", "dataset_list_id", "number_raw_micrographs",
                          "pixel_size_width", "pixel_size_height",
                          "image_resolution", "image_segment_flag",
                          "number_of_projections", "struct_assembly_id",
                          "details"]) as l:
            for r in self.restraints:
                unk = _CifWriter.unknown
                num_raw = r.get_num_raw_micrographs()
                l.write(id=r.id, dataset_list_id=r.rdataset.dataset.id,
                        number_raw_micrographs=num_raw if num_raw else unk,
                        pixel_size_width=r.pixel_size,
                        pixel_size_height=r.pixel_size,
                        image_resolution=r.image_resolution,
                        number_of_projections=r.projection_number,
                        struct_assembly_id=self.simo.modeled_assembly.id,
                        image_segment_flag=False)

class _EM3DRestraint(object):
    fitting_method = 'Gaussian mixture models'

    def __init__(self, simo, rdataset, target_ps, densities):
        self.rdataset = rdataset
        self.number_of_gaussians = len(target_ps)
        self.assembly = self.get_assembly(densities, simo)

    def get_assembly(self, densities, simo):
        """Get the Assembly that this restraint acts on"""
        cm = _ComponentMapper(simo.prot)
        components = {}
        for d in densities:
            components[cm[d]] = None
        return simo.assembly_dump.get_subassembly(components)


class _EM3DDumper(_Dumper):
    def __init__(self, simo):
        super(_EM3DDumper, self).__init__(simo)
        self.restraints = []

    def add(self, rsr):
        self.restraints.append(rsr)

    def dump(self, writer):
        # todo: support other fields
        ordinal = 1
        with writer.loop("_ihm_3dem_restraint",
                         ["ordinal_id", "dataset_list_id", "fitting_method",
                          "struct_assembly_id",
                          "number_of_gaussians", "model_id",
                          "cross_correlation_coefficient"]) as l:
            for model in self.models:
                for r in self.restraints:
                    # todo: fill in CCC
                    l.write(ordinal_id=ordinal,
                            dataset_list_id=r.rdataset.dataset.id,
                            fitting_method=r.fitting_method,
                            struct_assembly_id=r.assembly.id,
                            number_of_gaussians=r.number_of_gaussians,
                            model_id=model.id)
                    ordinal += 1

class _Assembly(list):
    """A collection of components. Currently simply implemented as a list of
       the component names. These must be in creation order."""
    pass

class _AssemblyDumper(_Dumper):
    def __init__(self, simo):
        super(_AssemblyDumper, self).__init__(simo)
        self.assemblies = []
        self.output = IMP.pmi.output.Output()

    def add(self, a):
        """Add a new assembly. The first such assembly is assumed to contain
           all components."""
        self.assemblies.append(a)
        a.id = len(self.assemblies)
        return a

    def get_subassembly(self, compdict):
        """Get an _Assembly consisting of the given components. Should only
           be called after all components are created."""
        # Put components in creation order
        newa = _Assembly(c for c in self.assemblies[0] if c in compdict)
        for a in self.assemblies:
            if newa == a:  # Note that .id is ignored by ==
                return a
        else:
            return self.add(newa)

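    # For illustration (components hypothetical): if assembly 1 was created
    # with components ['A', 'B', 'C'], then get_subassembly({'A': None,
    # 'B': None}) returns an _Assembly ['A', 'B'] in creation order, reusing
    # an existing assembly with the same contents if one was already added.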
    def dump(self, writer):
        ordinal = 1
        with writer.loop("_ihm_struct_assembly",
                         ["ordinal_id", "assembly_id", "entity_description",
                          "entity_id", "asym_id", "seq_id_begin",
                          "seq_id_end"]) as l:
            for a in self.assemblies:
                for comp in a:
                    entity = self.simo.entities[comp]
                    seq = self.simo.sequence_dict[comp]
                    chain_id = self.simo._get_chain_for_component(comp,
                                                                  self.output)
                    l.write(ordinal_id=ordinal, assembly_id=a.id,
                            entity_description=entity.description,
                            entity_id=entity.id,
                            asym_id=chain_id,
                            seq_id_begin=1,
                            seq_id_end=len(seq))
                    ordinal += 1


class _Protocol(object):
    pass

class _ReplicaExchangeProtocol(_Protocol):
    def __init__(self, rex):
        self.step_name = 'Sampling'
        if rex.monte_carlo_sample_objects is not None:
            self.step_method = 'Replica exchange monte carlo'
        else:
            self.step_method = 'Replica exchange molecular dynamics'
        self.num_models_end = rex.vars["number_of_frames"]

class _ModelGroup(object):
    """Group sets of models"""
    def __init__(self, name):
        self.name = name

class _Model(object):
    def __init__(self, prot, simo, protocol, assembly, group):
        self.group = group
        # The _Protocol which produced this model
        self.protocol = protocol
        self.assembly = assembly
        self.stats = None
        o = IMP.pmi.output.Output()
        name = 'cif-output'
        o.dictionary_pdbs[name] = prot
        o._init_dictchain(name, prot)
        (particle_infos_for_pdb,
         self.geometric_center) = o.get_particle_infos_for_pdb_writing(name)
        self.entity_for_chain = {}
        self.comp_for_chain = {}
        for protname, chain_id in o.dictchain[name].items():
            self.entity_for_chain[chain_id] = simo.entities[protname]
            self.comp_for_chain[chain_id] = protname
        self.spheres = [t for t in particle_infos_for_pdb if t[1] is None]
        self.atoms = [t for t in particle_infos_for_pdb if t[1] is not None]
        self.rmsf = {}
        self.name = _CifWriter.omitted

    def parse_rmsf_file(self, fname, component):
        self.rmsf[component] = rmsf = {}
        with open(fname) as fh:
            for line in fh:
                resnum, blocknum, val = line.split()
                rmsf[int(resnum)] = (int(blocknum), float(val))

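    # parse_rmsf_file() expects whitespace-separated columns of residue
    # number, block number, and RMSF value, one residue per line, e.g.
    # (values hypothetical):
    #
    #   1 1 2.345
    #   2 1 2.410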
    def get_rmsf(self, component, indexes):
        """Get the RMSF value for the given residue indexes."""
        if not self.rmsf:
            return _CifWriter.omitted
        rmsf = self.rmsf[component]
        blocknums = dict.fromkeys(rmsf[ind][0] for ind in indexes)
        if len(blocknums) != 1:
            raise ValueError("Residue indexes %s aren't all in the same block"
                             % str(indexes))
        return rmsf[indexes[0]][1]

class _ModelDumper(_Dumper):
    def __init__(self, simo):
        super(_ModelDumper, self).__init__(simo)
        self.models = []

    def add(self, prot, protocol, assembly, group):
        m = _Model(prot, self.simo, protocol, assembly, group)
        self.models.append(m)
        m.id = len(self.models)
        return m

    def dump(self, writer):
        self.dump_model_list(writer)
        num_atoms = sum(len(m.atoms) for m in self.models)
        num_spheres = sum(len(m.spheres) for m in self.models)
        self.dump_atoms(writer)
        self.dump_spheres(writer)

    def dump_model_list(self, writer):
        ordinal = 1
        with writer.loop("_ihm_model_list",
                         ["ordinal_id", "model_id", "model_group_id",
                          "model_name", "model_group_name", "assembly_id",
                          "protocol_id"]) as l:
            for model in self.models:
                l.write(ordinal_id=ordinal, model_id=model.id,
                        model_group_id=model.group.id,
                        model_name=model.name,
                        model_group_name=model.group.name,
                        assembly_id=model.assembly.id,
                        protocol_id=model.protocol.id)
                ordinal += 1

    def dump_atoms(self, writer):
        ordinal = 1
        with writer.loop("_atom_site",
                         ["id", "label_atom_id", "label_comp_id",
                          "label_seq_id",
                          "label_asym_id", "Cartn_x",
                          "Cartn_y", "Cartn_z", "label_entity_id",
                          "model_id"]) as l:
            for model in self.models:
                for atom in model.atoms:
                    (xyz, atom_type, residue_type, chain_id, residue_index,
                     all_indexes, radius) = atom
                    l.write(id=ordinal, label_atom_id=atom_type.get_string(),
                            label_comp_id=residue_type.get_string(),
                            label_asym_id=chain_id,
                            label_entity_id=model.entity_for_chain[chain_id].id,
                            label_seq_id=residue_index,
                            Cartn_x=xyz[0] - model.geometric_center[0],
                            Cartn_y=xyz[1] - model.geometric_center[1],
                            Cartn_z=xyz[2] - model.geometric_center[2],
                            model_id=model.id)
                    ordinal += 1

    def dump_spheres(self, writer):
        ordinal = 1
        with writer.loop("_ihm_sphere_obj_site",
                         ["ordinal_id", "entity_id", "seq_id_begin",
                          "seq_id_end", "asym_id", "Cartn_x",
                          "Cartn_y", "Cartn_z", "object_radius", "rmsf",
                          "model_id"]) as l:
            for model in self.models:
                for sphere in model.spheres:
                    (xyz, atom_type, residue_type, chain_id, residue_index,
                     all_indexes, radius) = sphere
                    if all_indexes is None:
                        all_indexes = (residue_index,)
                    l.write(ordinal_id=ordinal,
                            entity_id=model.entity_for_chain[chain_id].id,
                            seq_id_begin=all_indexes[0],
                            seq_id_end=all_indexes[-1],
                            asym_id=chain_id,
                            Cartn_x=xyz[0] - model.geometric_center[0],
                            Cartn_y=xyz[1] - model.geometric_center[1],
                            Cartn_z=xyz[2] - model.geometric_center[2],
                            object_radius=radius,
                            rmsf=model.get_rmsf(model.comp_for_chain[chain_id],
                                                all_indexes),
                            model_id=model.id)
                    ordinal += 1


class _ModelProtocolDumper(_Dumper):
    def __init__(self, simo):
        super(_ModelProtocolDumper, self).__init__(simo)
        self.protocols = []

    def add(self, protocol):
        self.protocols.append(protocol)
        # todo: support multiple protocols with multiple steps
        protocol.id = 1
        protocol.step_id = len(self.protocols)
        # Assume that protocol uses all currently-defined datasets
        protocol.dataset_group = self.simo.dataset_dump.get_all_group()

    def get_last_protocol(self):
        """Return the most recently-added _Protocol"""
        return self.protocols[-1]

    def dump(self, writer):
        ordinal = 1
        with writer.loop("_ihm_modeling_protocol",
                         ["ordinal_id", "protocol_id", "step_id",
                          "struct_assembly_id", "dataset_group_id",
                          "struct_assembly_description", "protocol_name",
                          "step_name", "step_method", "num_models_begin",
                          "num_models_end", "multi_scale_flag",
                          "multi_state_flag", "time_ordered_flag"]) as l:
            num_models_begin = 0
            for p in self.protocols:
                l.write(ordinal_id=ordinal, protocol_id=p.id,
                        step_id=p.step_id, step_method=p.step_method,
                        step_name=p.step_name,
                        struct_assembly_id=self.simo.modeled_assembly.id,
                        dataset_group_id=p.dataset_group.id,
                        num_models_begin=num_models_begin,
                        num_models_end=p.num_models_end,
                        # todo: support multiple states, time ordered
                        multi_state_flag=False, time_ordered_flag=False,
                        # all PMI models are multi scale
                        multi_scale_flag=True)
                num_models_begin = p.num_models_end
                ordinal += 1


class _PDBHelix(object):
    """Represent a HELIX record from a PDB file."""
    def __init__(self, line):
        self.helix_id = line[11:14].strip()
        self.start_resnam = line[14:18].strip()
        self.start_asym = line[19]
        self.start_resnum = int(line[21:25])
        self.end_resnam = line[27:30].strip()
        self.end_asym = line[31]
        self.end_resnum = int(line[33:37])
        self.helix_class = int(line[38:40])
        self.length = int(line[71:76])

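# For reference, the slices above correspond to the fixed columns of a PDB
# HELIX record, e.g. (spacing schematic, not column-exact):
#
#   HELIX    1   1 THR A   31  GLY A   38  1                              8
#
# yielding helix_id='1', start THR A 31, end GLY A 38, class 1, length 8.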

class _MSESeqDif(object):
    """Track an MSE -> MET mutation in the starting model sequence"""
    comp_id = 'MET'
    db_comp_id = 'MSE'
    details = 'Conversion of modified residue MSE to MET'
    def __init__(self, res, component, source, offset):
        self.res, self.component, self.source = res, component, source
        self.offset = offset



class _StartingModelDumper(_Dumper):
    def __init__(self, simo):
        super(_StartingModelDumper, self).__init__(simo)
        # dict of PDB fragments, ordered by component name
        self.fragments = OrderedDict()
        # dict of starting models (entire PDB files), collected from fragments
        self.models = OrderedDict()
        # mapping from component+pdbname to starting model
        self.starting_model = {}
        self.output = IMP.pmi.output.Output()

    def add_pdb_fragment(self, fragment):
        """Add a starting model PDB fragment."""
        comp = fragment.component
        if comp not in self.fragments:
            self.fragments[comp] = []
            self.models[comp] = []
        self.fragments[comp].append(fragment)
        models = self.models[comp]
        if len(models) == 0 \
           or models[-1].fragments[0].pdbname != fragment.pdbname:
            model = _StartingModel(fragment)
            models.append(model)
            model.sources = self.get_sources(model, fragment.pdbname,
                                             fragment.chain)
        else:
            models[-1].fragments.append(fragment)

    def get_templates(self, pdbname, model):
        templates = []
        tmpre = re.compile(r'REMARK 6 TEMPLATE: '
                           r'(\S+) (\S+):\S+ \- (\S+):\S+ '
                           r'MODELS (\S+):(\S+) \- (\S+):\S+ AT (\S+)%')

        with open(pdbname) as fh:
            for line in fh:
                if line.startswith('ATOM'):  # Read only the header
                    break
                m = tmpre.match(line)
                if m:
                    templates.append(_TemplateSource(m.group(1),
                                                     int(m.group(2)),
                                                     int(m.group(3)),
                                                     int(m.group(4)),
                                                     m.group(5),
                                                     int(m.group(6)),
                                                     m.group(7), model))
        # Add datasets for templates
        for t in templates:
            # todo: handle templates that aren't in PDB
            if t.tm_db_code:
                l = IMP.pmi.metadata.PDBLocation(t.tm_db_code)
                d = IMP.pmi.metadata.PDBDataset(l)
                d = self.simo.dataset_dump.add(d)
                t.tm_dataset = d
                model.dataset.add_primary(d)

        # Sort by starting residue, then ending residue
        return sorted(templates,
                      key=lambda x: (x._seq_id_begin, x._seq_id_end))

    def _parse_pdb(self, fh, first_line):
        """Extract information from an official PDB"""
        metadata = []
        details = ''
        for line in fh:
            if line.startswith('TITLE'):
                details += line[10:].rstrip()
            elif line.startswith('HELIX'):
                metadata.append(_PDBHelix(line))
        return (first_line[50:59].strip(),
                details if details else _CifWriter.unknown, metadata)

    def get_sources(self, model, pdbname, chain):
        # Attempt to identify PDB file vs. comparative model
        fh = open(pdbname)
        first_line = fh.readline()
        local_file = IMP.pmi.metadata.FileLocation(pdbname)
        file_dataset = self.simo.get_file_dataset(pdbname)
        if first_line.startswith('HEADER'):
            version, details, metadata = self._parse_pdb(fh, first_line)
            source = _PDBSource(model, first_line[62:66].strip(), chain,
                                metadata)
            l = IMP.pmi.metadata.PDBLocation(source.db_code, version, details)
            d = IMP.pmi.metadata.PDBDataset(l)
            model.dataset = self.simo.dataset_dump.add(file_dataset or d)
            return [source]
        elif first_line.startswith('EXPDTA    THEORETICAL MODEL, MODELLER'):
            self.simo.software_dump.set_modeller_used(
                                        *first_line[38:].split(' ', 1))
            d = IMP.pmi.metadata.ComparativeModelDataset(local_file)
            model.dataset = self.simo.dataset_dump.add(file_dataset or d)
            templates = self.get_templates(pdbname, model)

            if templates:
                return templates
            else:
                return [_UnknownSource(model, chain)]
        else:
            # todo: extract Modeller-like template info for Phyre models;
            # revisit assumption that all such unknown source PDBs are
            # comparative models
            d = IMP.pmi.metadata.ComparativeModelDataset(local_file)
            model.dataset = self.simo.dataset_dump.add(file_dataset or d)
            return [_UnknownSource(model, chain)]

    def assign_model_details(self):
        for comp, models in self.models.items():
            for i, model in enumerate(models):
                model.name = "%s-m%d" % (comp, i+1)
                self.starting_model[comp, model.fragments[0].pdbname] = model
                model.seq_id_begin = min(x.start + x.offset
                                         for x in model.fragments)
                model.seq_id_end = max(x.end + x.offset
                                       for x in model.fragments)

    def all_models(self):
        for comp, models in self.models.items():
            for model in models:
                yield model

    def finalize(self):
        self.assign_model_details()

    def dump(self, writer):
        self.dump_details(writer)
        self.dump_comparative(writer)
        seq_dif = self.dump_coords(writer)
        self.dump_seq_dif(writer, seq_dif)

    def dump_seq_dif(self, writer, seq_dif):
        ordinal = 1
        with writer.loop("_ihm_starting_model_seq_dif",
                         ["ordinal_id", "entity_id", "asym_id",
                          "seq_id", "comp_id", "starting_model_ordinal_id",
                          "db_asym_id", "db_seq_id", "db_comp_id",
                          "details"]) as l:
            for sd in seq_dif:
                chain_id = self.simo._get_chain_for_component(
                                        sd.component, self.output)
                entity = self.simo.entities[sd.component]
                l.write(ordinal_id=ordinal, entity_id=entity.id,
                        asym_id=chain_id, seq_id=sd.res.get_index(),
                        comp_id=sd.comp_id,
                        db_asym_id=sd.source.chain_id,
                        db_seq_id=sd.res.get_index() - sd.offset,
                        db_comp_id=sd.db_comp_id,
                        starting_model_ordinal_id=sd.source.id,
                        details=sd.details)
                ordinal += 1

    def dump_comparative(self, writer):
        """Dump details on comparative models. Must be called after
           dump_details() since it uses IDs assigned there."""
        with writer.loop("_ihm_starting_comparative_models",
                         ["starting_model_ordinal_id", "starting_model_id",
                          "template_auth_asym_id", "template_seq_begin",
                          "template_seq_end", "template_sequence_identity",
                          "template_sequence_identity_denominator",
                          "template_dataset_list_id",
                          "alignment_file_id"]) as l:
            ordinal = 1
            for model in self.all_models():
                for template in [s for s in model.sources
                                 if isinstance(s, _TemplateSource)]:
                    denom = template.sequence_identity_denominator
                    l.write(starting_model_ordinal_id=template.id,
                            starting_model_id=model.name,
                            template_auth_asym_id=template.tm_chain_id,
                            template_seq_begin=template.tm_seq_id_begin,
                            template_seq_end=template.tm_seq_id_end,
                            template_sequence_identity=
                                             template.sequence_identity,
                            template_sequence_identity_denominator=denom,
                            template_dataset_list_id=template.tm_dataset.id
                                                     if template.tm_dataset
                                                     else _CifWriter.unknown)

    def dump_details(self, writer):
        writer.write_comment("""IMP will attempt to identify which input models
are crystal structures and which are comparative models, but does not always
have sufficient information to deduce all of the templates used for comparative
modeling. These may need to be added manually below.""")
        with writer.loop("_ihm_starting_model_details",
                         ["ordinal_id", "entity_id", "entity_description",
                          "asym_id", "seq_id_begin",
                          "seq_id_end", "starting_model_source",
                          "starting_model_auth_asym_id",
                          "starting_model_sequence_offset",
                          "starting_model_id",
                          "dataset_list_id"]) as l:
            ordinal = 1
            for model in self.all_models():
                f = model.fragments[0]
                entity = self.simo.entities[f.component]
                chain_id = self.simo._get_chain_for_component(f.component,
                                                              self.output)
                for source in model.sources:
                    source.id = ordinal
                source0 = model.sources[0]
                # Where there are multiple sources (to date, this can only
                # mean multiple templates for a comparative model) consolidate
                # them; template info is given in starting_comparative_models.
                seq_id_begin, seq_id_end = source0.get_seq_id_range(model)
                for source in model.sources[1:]:
                    this_begin, this_end = source.get_seq_id_range(model)
                    seq_id_begin = min(seq_id_begin, this_begin)
                    seq_id_end = max(seq_id_end, this_end)
                l.write(ordinal_id=ordinal,
                        entity_id=entity.id,
                        entity_description=entity.description,
                        asym_id=chain_id,
                        seq_id_begin=seq_id_begin,
                        seq_id_end=seq_id_end,
                        starting_model_auth_asym_id=source0.chain_id,
                        starting_model_id=model.name,
                        starting_model_source=source0.source,
                        starting_model_sequence_offset=f.offset,
                        dataset_list_id=model.dataset.id)
                ordinal += 1

    def dump_coords(self, writer):
        seq_dif = []
        ordinal = 1
        with writer.loop("_ihm_starting_model_coord",
                         ["starting_model_id", "group_PDB", "id",
                          "type_symbol", "atom_id", "comp_id", "entity_id",
                          "asym_id", "seq_id", "Cartn_x",
                          "Cartn_y", "Cartn_z", "B_iso_or_equiv",
                          "ordinal_id"]) as l:
            for model in self.all_models():
                for f in model.fragments:
                    sel = IMP.atom.Selection(f.starting_hier,
                                   residue_indexes=list(range(f.start,
                                                              f.end + 1)))
                    last_res_index = None
                    for a in sel.get_selected_particles():
                        coord = IMP.core.XYZ(a).get_coordinates()
                        atom = IMP.atom.Atom(a)
                        element = atom.get_element()
                        element = IMP.atom.get_element_table().get_name(
                                                                    element)
                        atom_name = atom.get_atom_type().get_string()
                        group_pdb = 'ATOM'
                        if atom_name.startswith('HET:'):
                            group_pdb = 'HETATM'
                            atom_name = atom_name[4:]
                        res = IMP.atom.get_residue(atom)
                        res_name = res.get_residue_type().get_string()
                        # MSE in the original PDB is automatically mutated
                        # by IMP to MET, so reflect that in the output,
                        # and pass back to populate the seq_dif category.
                        if res_name == 'MSE':
                            res_name = 'MET'
                            # Only add one seq_dif record per residue
                            ind = res.get_index()
                            if ind != last_res_index:
                                last_res_index = ind
                                # This should only happen when we're using
                                # a crystal structure as the source (a
                                # comparative model would use MET in
                                # the sequence)
                                assert len(model.sources) == 1
                                seq_dif.append(_MSESeqDif(res, f.component,
                                                          model.sources[0],
                                                          f.offset))
                        chain_id = self.simo._get_chain_for_component(
                                            f.component, self.output)
                        entity = self.simo.entities[f.component]
                        l.write(starting_model_id=model.name,
                                group_PDB=group_pdb,
                                id=atom.get_input_index(),
                                type_symbol=element,
                                atom_id=atom_name, comp_id=res_name,
                                entity_id=entity.id,
                                asym_id=chain_id,
                                seq_id=res.get_index(), Cartn_x=coord[0],
                                Cartn_y=coord[1], Cartn_z=coord[2],
                                B_iso_or_equiv=atom.get_temperature_factor(),
                                ordinal_id=ordinal)
                        ordinal += 1
        return seq_dif

1576 class _StructConfDumper(_Dumper):
1577  def all_rigid_fragments(self):
1578  """Yield all rigid model representation fragments"""
1579  asym = _AsymIDMapper(self.simo.prot)
1580  model_repr = self.simo.model_repr_dump
1581  for comp, fragments in model_repr.fragments.items():
1582  for f in fragments:
1583  if hasattr(f, 'pdbname') \
1584  and model_repr.get_model_mode(f) == 'rigid':
1585  yield (f, model_repr.starting_model[comp, f.pdbname],
1586  asym[f.hier])
1587 
1588  def all_helices(self):
1589  """Yield all helices that overlap with rigid model fragments"""
1590  for f, starting_model, asym_id in self.all_rigid_fragments():
1591  if len(starting_model.sources) == 1 \
1592  and isinstance(starting_model.sources[0], _PDBSource):
1593  pdb = starting_model.sources[0]
1594  for m in pdb.metadata:
1595  if isinstance(m, _PDBHelix) \
1596  and m.start_asym == pdb.chain_id \
1597  and m.end_asym == pdb.chain_id \
1598  and m.start_resnum >= f.start and m.end_resnum <= f.end:
1599  yield (m, max(f.start, m.start_resnum),
1600  min(f.end, m.end_resnum), asym_id)
1601 
1602  def dump(self, writer):
1603  with writer.category("_struct_conf_type") as l:
1604  l.write(id='HELX_P', criteria=_CifWriter.unknown,
1605  reference=_CifWriter.unknown)
1606  # Dump helix information for the model. For any model fragment that
1607  # is rigid, atomic, and uses an experimental PDB structure as the
1608  # starting model, inherit any helix information from that PDB file.
1609  # Note that we can't use the helix id from the original PDB, since
1610  # it has to be unique and this might not be the case if we inherit
1611  # from multiple PDBs.
1612  ordinal = 0
1613  with writer.loop("_struct_conf",
1614  ["id", "conf_type_id", "beg_label_comp_id",
1615  "beg_label_asym_id", "beg_label_seq_id",
1616  "end_label_comp_id", "end_label_asym_id",
1617  "end_label_seq_id",]) as l:
1618  for h, begin, end, asym_id in self.all_helices():
1619  ordinal += 1
1620  l.write(id='HELX_P%d' % ordinal, conf_type_id='HELX_P',
1621  beg_label_comp_id=h.start_resnam,
1622  beg_label_asym_id=asym_id,
1623  beg_label_seq_id=begin,
1624  end_label_comp_id=h.end_resnam,
1625  end_label_asym_id=asym_id,
1626  end_label_seq_id=end)
1627 
1628 
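# A sketch (hypothetical helper) of the interval test in all_helices()
# above: a helix from the source PDB is inherited only if it lies wholly
# within the rigid fragment, and the max/min clamp then keeps the reported
# range inside the fragment.
def _inherited_helix_range(frag_start, frag_end, helix_start, helix_end):
    if helix_start >= frag_start and helix_end <= frag_end:
        return max(frag_start, helix_start), min(frag_end, helix_end)
    return None  # helix not entirely covered by this fragment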
1629 class _PostProcess(object):
1630  """Base class for any post processing"""
1631  pass
1632 
1633 
1634 class _ReplicaExchangeAnalysisPostProcess(_PostProcess):
1635  """Post processing using AnalysisReplicaExchange0 macro"""
1636  type = 'cluster'
1637  feature = 'RMSD'
1638 
1639  def __init__(self, rex, num_models_begin):
1640  self.rex = rex
1641  self.num_models_begin = num_models_begin
1642  self.num_models_end = 0
1643  for fname in self.get_all_stat_files():
1644  with open(fname) as fh:
1645  self.num_models_end += len(fh.readlines())
1646 
1647  def get_stat_file(self, cluster_num):
1648  return os.path.join(self.rex._outputdir, "cluster.%d" % cluster_num,
1649  'stat.out')
1650 
1651  def get_all_stat_files(self):
1652  for i in range(self.rex._number_of_clusters):
1653  yield self.get_stat_file(i)
1654 
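# Each line of a cluster's stat.out corresponds to one model, so the
# constructor above derives num_models_end by counting lines, roughly as
# in this sketch (pp stands for a hypothetical instance):
#
#   num_models_end = 0
#   for fname in pp.get_all_stat_files():
#       with open(fname) as fh:
#           num_models_end += sum(1 for _ in fh)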
1655 class _SimplePostProcess(_PostProcess):
1656  """Simple ad hoc clustering"""
1657  type = 'cluster'
1658  feature = 'RMSD'
1659 
1660  def __init__(self, num_models_begin, num_models_end):
1661  self.num_models_begin = num_models_begin
1662  self.num_models_end = num_models_end
1663 
1664 class _PostProcessDumper(_Dumper):
1665  def __init__(self, simo):
1666  super(_PostProcessDumper, self).__init__(simo)
1667  self.postprocs = []
1668 
1669  def add(self, postproc):
1670  self.postprocs.append(postproc)
1671  postproc.id = len(self.postprocs)
1672 
1673  def dump(self, writer):
1674  with writer.loop("_ihm_modeling_post_process",
1675  ["id", "protocol_id", "analysis_id", "step_id",
1676  "type", "feature", "num_models_begin",
1677  "num_models_end"]) as l:
1678  # todo: handle multiple protocols (e.g. sampling then refinement)
1679  # and multiple analyses
1680  for p in self.postprocs:
1681  l.write(id=p.id, protocol_id=1, analysis_id=1, step_id=p.id,
1682  type=p.type, feature=p.feature,
1683  num_models_begin=p.num_models_begin,
1684  num_models_end=p.num_models_end)
1685 
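# For a single clustering step, _PostProcessDumper emits a loop of this
# shape (values illustrative only):
#
#   loop_
#   _ihm_modeling_post_process.id
#   _ihm_modeling_post_process.protocol_id
#   _ihm_modeling_post_process.analysis_id
#   _ihm_modeling_post_process.step_id
#   _ihm_modeling_post_process.type
#   _ihm_modeling_post_process.feature
#   _ihm_modeling_post_process.num_models_begin
#   _ihm_modeling_post_process.num_models_end
#   1 1 1 1 cluster RMSD 10000 1000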
1686 class _Ensemble(object):
1687  """Base class for any ensemble"""
1688  pass
1689 
1690 
1691 class _ReplicaExchangeAnalysisEnsemble(_Ensemble):
1692  """Ensemble generated using AnalysisReplicaExchange0 macro"""
1693 
1694  def __init__(self, pp, cluster_num, model_group, num_deposit):
1695  self.file = None
1696  self.model_group = model_group
1697  self.cluster_num = cluster_num
1698  self.postproc = pp
1699  self.num_deposit = num_deposit
1700  self.localization_density = {}
1701  with open(pp.get_stat_file(cluster_num)) as fh:
1702  self.num_models = len(fh.readlines())
1703 
1704  def get_rmsf_file(self, component):
1705  return os.path.join(self.postproc.rex._outputdir,
1706  'cluster.%d' % self.cluster_num,
1707  'rmsf.%s.dat' % component)
1708 
1709  def load_rmsf(self, model, component):
1710  fname = self.get_rmsf_file(component)
1711  if os.path.exists(fname):
1712  model.parse_rmsf_file(fname, component)
1713 
1714  def get_localization_density_file(self, component):
1715  # todo: this assumes that the localization density file name matches
1716  # the component name and covers the complete residue range (usually
1717  # true, but not guaranteed)
1718  return os.path.join(self.postproc.rex._outputdir,
1719  'cluster.%d' % self.cluster_num,
1720  '%s.mrc' % component)
1721 
1722  def load_localization_density(self, mdl, component, extref_dump):
1723  fname = self.get_localization_density_file(component)
1724  if os.path.exists(fname):
1725  local_file = IMP.pmi.metadata.FileLocation(fname)
1726  self.localization_density[component] = local_file
1727  extref_dump.add(local_file,
1728  _ExternalReferenceDumper.MODELING_OUTPUT)
1729 
1730  def load_all_models(self, simo):
1731  stat_fname = self.postproc.get_stat_file(self.cluster_num)
1732  model_num = 0
1733  with open(stat_fname) as fh:
1734  stats = eval(fh.readline())
1735  # Construct the path to this model's RMF file in the cluster directory
1736  rmf_file = os.path.join(os.path.dirname(stat_fname),
1737  "%d.rmf3" % model_num)
1738  for c in simo.all_modeled_components:
1739  # todo: this only works with PMI 1
1740  simo._representation.set_coordinates_from_rmf(c, rmf_file, 0,
1741  force_rigid_update=True)
1742  # todo: fill in other data from stat file, e.g. crosslink phi/psi
1743  yield stats
1744  model_num += 1
1745  if model_num >= self.num_deposit:
1746  return
1747 
1748  # todo: also support dRMS precision
1749  def _get_precision(self):
1750  precfile = os.path.join(self.postproc.rex._outputdir,
1751  "precision.%d.%d.out" % (self.cluster_num,
1752  self.cluster_num))
1753  if not os.path.exists(precfile):
1754  return _CifWriter.unknown
1755  # Make sure the precision.x.x.out file actually refers to this cluster
1756  r = re.compile(r'All .*/cluster.%d/ average centroid distance ([\d\.]+)'
1757  % self.cluster_num)
1758  with open(precfile) as fh:
1759  for line in fh:
1760  m = r.match(line)
1761  if m:
1762  return float(m.group(1))
1763 
1764  feature = property(lambda self: self.postproc.feature)
1765  name = property(lambda self: "Cluster %d" % (self.cluster_num + 1))
1766  precision = property(lambda self: self._get_precision())
1767 
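# _get_precision() above expects precision.<i>.<i>.out to contain a line
# such as (illustrative):
#   All foo/cluster.0/ average centroid distance 12.345
# It returns the trailing number as a float, or the CIF 'unknown' value if
# the file is missing.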
1768 class _SimpleEnsemble(_Ensemble):
1769  """Simple manually-created ensemble"""
1770 
1771  feature = 'dRMSD'
1772 
1773  def __init__(self, pp, model_group, num_models, drmsd,
1774  num_models_deposited, ensemble_file):
1775  self.file = ensemble_file
1776  self.postproc = pp
1777  self.model_group = model_group
1778  self.num_deposit = num_models_deposited
1779  self.localization_density = {}
1780  self.num_models = num_models
1781  self.precision = drmsd
1782 
1783  def load_localization_density(self, mdl, component, local_file,
1784  extref_dump):
1785  self.localization_density[component] = local_file
1786  extref_dump.add(local_file,
1787  _ExternalReferenceDumper.MODELING_OUTPUT)
1788 
1789  name = property(lambda self: self.model_group.name)
1790 
1791 
1792 class _EnsembleDumper(_Dumper):
1793  def __init__(self, simo):
1794  super(_EnsembleDumper, self).__init__(simo)
1795  self.ensembles = []
1796 
1797  def add(self, ensemble):
1798  self.ensembles.append(ensemble)
1799  ensemble.id = len(self.ensembles)
1800 
1801  def dump(self, writer):
1802  with writer.loop("_ihm_ensemble_info",
1803  ["ensemble_id", "ensemble_name", "post_process_id",
1804  "model_group_id", "ensemble_clustering_method",
1805  "ensemble_clustering_feature",
1806  "num_ensemble_models",
1807  "num_ensemble_models_deposited",
1808  "ensemble_precision_value",
1809  "ensemble_file_id"]) as l:
1810  for e in self.ensembles:
1811  l.write(ensemble_id=e.id, ensemble_name=e.name,
1812  post_process_id=e.postproc.id,
1813  model_group_id=e.model_group.id,
1814  ensemble_clustering_feature=e.feature,
1815  num_ensemble_models=e.num_models,
1816  num_ensemble_models_deposited=e.num_deposit,
1817  ensemble_precision_value=e.precision,
1818  ensemble_file_id=e.file.id if e.file
1819  else _CifWriter.omitted)
1820 
1821 class _DensityDumper(_Dumper):
1822  """Output localization densities for ensembles"""
1823 
1824  def __init__(self, simo):
1825  super(_DensityDumper, self).__init__(simo)
1826  self.ensembles = []
1827  self.output = IMP.pmi.output.Output()
1828 
1829  def add(self, ensemble):
1830  self.ensembles.append(ensemble)
1831 
1832  def get_density(self, ensemble, component):
1833  return ensemble.localization_density.get(component, None)
1834 
1835  def dump(self, writer):
1836  with writer.loop("_ihm_localization_density_files",
1837  ["id", "file_id", "ensemble_id", "entity_id",
1838  "asym_id", "seq_id_begin", "seq_id_end"]) as l:
1839  ordinal = 1
1840  for ensemble in self.ensembles:
1841  for comp in self.simo.all_modeled_components:
1842  density = self.get_density(ensemble, comp)
1843  if not density:
1844  continue
1845  entity = self.simo.entities[comp]
1846  lenseq = len(entity.sequence)
1847  chain_id = self.simo._get_chain_for_component(comp,
1848  self.output)
1849  l.write(id=ordinal, ensemble_id=ensemble.id,
1850  entity_id=entity.id,
1851  file_id=density.id,
1852  seq_id_begin=1, seq_id_end=lenseq,
1853  asym_id=chain_id)
1854  ordinal += 1
1855 
1856 class _Entity(object):
1857  """Represent a CIF entity (a chain with a unique sequence)"""
1858  def __init__(self, seq, first_component):
1859  self.sequence = seq
1860  self.first_component = first_component
1861  # Use the name of the first component as the description
1862  self.description = first_component
1863 
1864 class _EntityMapper(dict):
1865  """Handle mapping from IMP components to CIF entities.
1866  Multiple components may map to the same entity if they share sequence."""
1867  def __init__(self):
1868  super(_EntityMapper, self).__init__()
1869  self._sequence_dict = {}
1870  self._entities = []
1871 
1872  def add(self, component_name, sequence):
1873  if sequence not in self._sequence_dict:
1874  entity = _Entity(sequence, component_name)
1875  self._entities.append(entity)
1876  entity.id = len(self._entities)
1877  self._sequence_dict[sequence] = entity
1878  self[component_name] = self._sequence_dict[sequence]
1879 
1880  def get_all(self):
1881  """Yield all entities"""
1882  return self._entities
1883 
1884 
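# A minimal sketch of the mapping (hypothetical component names): two
# components with identical sequences share a single entity.
#
#   m = _EntityMapper()
#   m.add('Nup84.1', 'MVLSE')
#   m.add('Nup84.2', 'MVLSE')    # same sequence, so no new entity
#   assert m['Nup84.1'] is m['Nup84.2']
#   assert m['Nup84.1'].id == 1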
1885 class _RestraintDataset(object):
1886  """Wrapper around a dataset associated with a restraint.
1887  This is needed because we need to delay access to the dataset
1888  in case the writer of the PMI script overrides or changes it
1889  after creating the restraint."""
1890  def __init__(self, restraint, num, allow_duplicates):
1891  self.restraint = restraint
1892  self.num, self.allow_duplicates = num, allow_duplicates
1893  self.__dataset = None
1894  def __get_dataset(self):
1895  if self.__dataset:
1896  return self.__dataset
1897  if self.num is not None:
1898  d = copy.deepcopy(self.restraint.datasets[self.num])
1899  else:
1900  d = copy.deepcopy(self.restraint.dataset)
1901  if self.allow_duplicates:
1902  d.location._allow_duplicates = True
1903  # Don't copy again next time we access self.dataset
1904  self.__dataset = d
1905  return d
1906  dataset = property(__get_dataset)
1907 
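# Sketch of the delayed access (hypothetical objects): the restraint's
# dataset is deep-copied on first access and then cached.
#
#   rd = _RestraintDataset(r, num=None, allow_duplicates=False)
#   r.dataset = new_dataset     # the PMI script may replace it later
#   d = rd.dataset              # deep copy of new_dataset
#   assert rd.dataset is d      # cached; no second copy is made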
1908 
1910  """Class to encode a modeling protocol as mmCIF.
1911 
1912  IMP has basic support for writing out files in mmCIF format, for
1913  deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
1914  After creating an instance of this class, attach it to an
1915  IMP.pmi.representation.Representation object. After this, any
1916  generated models and metadata are automatically collected and
1917  output as mmCIF.
1918  """
1919  def __init__(self, fh):
1920  self._main_script = os.path.abspath(sys.argv[0])
1921  self._working_directory = os.getcwd()
1922  self._cif_writer = _CifWriter(fh)
1923  self.entities = _EntityMapper()
1924  self.chains = {}
1925  self._all_components = {}
1926  self.model_groups = []
1927  self.default_model_group = None
1928  self.sequence_dict = {}
1929  self.all_modeled_components = []
1930  self.model_repr_dump = _ModelRepresentationDumper(self)
1931  self.cross_link_dump = _CrossLinkDumper(self)
1932  self.em2d_dump = _EM2DDumper(self)
1933  self.em3d_dump = _EM3DDumper(self)
1934  self.model_prot_dump = _ModelProtocolDumper(self)
1935  self.extref_dump = _ExternalReferenceDumper(self)
1936  self.dataset_dump = _DatasetDumper(self)
1937  self.starting_model_dump = _StartingModelDumper(self)
1938  self.assembly_dump = _AssemblyDumper(self)
1939 
1940  # The assembly of all known components.
1941  self.complete_assembly = _Assembly()
1942  self.assembly_dump.add(self.complete_assembly)
1943 
1944  # The assembly of all components modeled by IMP
1945  # This may be smaller than the complete assembly.
1946  self.modeled_assembly = self.complete_assembly
1947 
1948  self.model_dump = _ModelDumper(self)
1949  self.model_repr_dump.starting_model \
1950  = self.starting_model_dump.starting_model
1951  self.software_dump = _SoftwareDumper(self)
1952  self.post_process_dump = _PostProcessDumper(self)
1953  self.ensemble_dump = _EnsembleDumper(self)
1954  self.density_dump = _DensityDumper(self)
1955 
1956  # Some dumpers add per-model information; give them a pointer to
1957  # the model list
1958  self.cross_link_dump.models = self.model_dump.models
1959  self.em3d_dump.models = self.model_dump.models
1960 
1961  self._dumpers = [_EntryDumper(self), # should always be first
1962  _AuditAuthorDumper(self),
1963  self.software_dump, _CitationDumper(self),
1964  _ChemCompDumper(self),
1965  _EntityDumper(self),
1966  _EntityPolyDumper(self), _EntityPolySeqDumper(self),
1967  _StructAsymDumper(self),
1968  self.assembly_dump,
1969  self.model_repr_dump, self.extref_dump,
1970  self.dataset_dump,
1971  self.cross_link_dump,
1972  self.em2d_dump, self.em3d_dump,
1973  self.starting_model_dump,
1974  # todo: detect atomic models and emit struct_conf
1975  #_StructConfDumper(self),
1976  self.model_prot_dump, self.post_process_dump,
1977  self.ensemble_dump, self.density_dump, self.model_dump]
1978 
1979  def get_file_dataset(self, fname):
1980  return self._file_dataset.get(os.path.abspath(fname), None)
1981 
1982  def _get_chain_for_component(self, name, output):
1983  """Get the chain ID for a component, if any."""
1984  # todo: handle multiple copies
1985  if name in self.chains:
1986  chain = self.chains[name]
1987  return output.chainids[chain]
1988  else:
1989  # A non-modeled component doesn't have a chain ID
1990  return _CifWriter.omitted
1991 
1992  def create_component(self, name, modeled):
1993  self._all_components[name] = None
1994  if modeled:
1995  self.all_modeled_components.append(name)
1996  self.chains[name] = len(self.chains)
1997  if self.modeled_assembly is not self.complete_assembly:
1998  self.modeled_assembly.append(name)
1999  elif self.complete_assembly is self.modeled_assembly:
2000  # If this component is not modeled, we need to start tracking
2001  # the complete and modeled assemblies separately
2002  self.modeled_assembly = _Assembly(self.complete_assembly)
2003  self.assembly_dump.add(self.modeled_assembly)
2004  self.complete_assembly.append(name)
2005 
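 # Note on create_component() above: while every component seen so far is
 # modeled, complete_assembly and modeled_assembly are the same _Assembly
 # object; the first non-modeled component forks them, and from then on
 # the complete and modeled assemblies are tracked separately.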
2006  def add_component_sequence(self, name, seq):
2007  self.sequence_dict[name] = seq
2008  self.entities.add(name, seq)
2009 
2010  def flush(self):
2011  for dumper in self._dumpers:
2012  dumper.finalize_metadata()
2013  for dumper in self._dumpers:
2014  dumper.finalize()
2015  for dumper in self._dumpers:
2016  dumper.dump(self._cif_writer)
2017 
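 # Typical usage, per the class docstring (a sketch assuming the PMI 1
 # Representation API):
 #
 #   po = IMP.pmi.mmcif.ProtocolOutput(open('output.cif', 'w'))
 #   r = IMP.pmi.representation.Representation(m)
 #   r.add_protocol_output(po)
 #   ...build the system and sample...
 #   po.flush()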
2018  def add_pdb_element(self, name, start, end, offset, pdbname, chain, hier):
2019  p = _PDBFragment(self.m, name, start, end, offset, pdbname, chain, hier)
2020  self.model_repr_dump.add_fragment(p)
2021  self.starting_model_dump.add_pdb_fragment(p)
2022 
2023  def add_bead_element(self, name, start, end, num, hier):
2024  b = _BeadsFragment(self.m, name, start, end, num, hier)
2025  self.model_repr_dump.add_fragment(b)
2026 
2027  def _get_restraint_dataset(self, r, num=None, allow_duplicates=False):
2028  """Get a wrapper object for the dataset used by this restraint.
2029  This is needed because the restraint's dataset may be changed
2030  in the PMI script before we get to use it."""
2031  rs = _RestraintDataset(r, num, allow_duplicates)
2032  self.dataset_dump.add(rs)
2033  return rs
2034 
2035  def get_cross_link_group(self, r):
2036  return _CrossLinkGroup(r, self._get_restraint_dataset(r))
2037 
2038  def add_experimental_cross_link(self, r1, c1, r2, c2, length, group):
2039  if c1 not in self._all_components or c2 not in self._all_components:
2040  # Crosslink refers to a component we didn't model
2041  # As a quick hack, just ignore it.
2042  # todo: need to add an entity for this anyway (so will need the
2043  # sequence, and add to struct_assembly)
2044  return None
2045  xl = _ExperimentalCrossLink(r1, c1, r2, c2, length, group)
2046  self.cross_link_dump.add_experimental(xl)
2047  return xl
2048 
2049  def add_cross_link(self, ex_xl, p1, p2, sigma1, sigma2, psi):
2050  self.cross_link_dump.add(_CrossLink(ex_xl, p1, p2, sigma1, sigma2, psi))
2051 
2052  def add_replica_exchange(self, rex):
2053  # todo: allow for metadata to say how many replicas were used in the
2054  # actual experiment, and how many independent runs were carried out
2055  # (use these as multipliers to get the correct total number of
2056  # output models)
2057  self.model_prot_dump.add(_ReplicaExchangeProtocol(rex))
2058 
2059  def add_model_group(self, group):
2060  self.model_groups.append(group)
2061  group.id = len(self.model_groups)
2062  return group
2063 
2064  def _add_simple_postprocessing(self, num_models_begin, num_models_end):
2065  pp = _SimplePostProcess(num_models_begin, num_models_end)
2066  self.post_process_dump.add(pp)
2067  return pp
2068 
2069  def _add_simple_ensemble(self, pp, name, num_models, drmsd,
2070  num_models_deposited, localization_densities,
2071  ensemble_file):
2072  """Add an ensemble generated by ad hoc methods (not using PMI).
2073  This is currently only used by the Nup84 system."""
2074  group = self.add_model_group(_ModelGroup(name))
2075  self.extref_dump.add(ensemble_file,
2076  _ExternalReferenceDumper.MODELING_OUTPUT)
2077  e = _SimpleEnsemble(pp, group, num_models, drmsd, num_models_deposited,
2078  ensemble_file)
2079  self.ensemble_dump.add(e)
2080  self.density_dump.add(e)
2081  for c in self.all_modeled_components:
2082  den = localization_densities.get(c, None)
2083  if den:
2084  e.load_localization_density(self.m, c, den, self.extref_dump)
2085  return e
2086 
2087  def add_replica_exchange_analysis(self, rex):
2088  # todo: add prefilter as an additional postprocess step (complication:
2089  # we don't know how many models it filtered)
2090  # todo: postpone rmsf/density localization extraction until after
2091  # relevant methods are called (currently it is done from the
2092  # constructor)
2093  num_models = self.model_prot_dump.get_last_protocol().num_models_end
2094  pp = _ReplicaExchangeAnalysisPostProcess(rex, num_models)
2095  self.post_process_dump.add(pp)
2096  for i in range(rex._number_of_clusters):
2097  group = self.add_model_group(_ModelGroup('Cluster %d' % (i + 1)))
2098  # todo: make # of models to deposit configurable somewhere
2099  e = _ReplicaExchangeAnalysisEnsemble(pp, i, group, 1)
2100  self.ensemble_dump.add(e)
2101  self.density_dump.add(e)
2102  # Add localization density info if available
2103  for c in self.all_modeled_components:
2104  e.load_localization_density(self.m, c, self.extref_dump)
2105  for stats in e.load_all_models(self):
2106  m = self.add_model(group)
2107  # Since we currently only deposit 1 model, it is the
2108  # best scoring one
2109  m.name = 'Best scoring model'
2110  m.stats = stats
2111  # Don't alter original RMF coordinates
2112  m.geometric_center = [0,0,0]
2113  # Add RMSF info if available
2114  for c in self.all_modeled_components:
2115  e.load_rmsf(m, c)
2116 
2117  def add_em2d_restraint(self, r, i, resolution, pixel_size,
2118  image_resolution, projection_number):
2119  d = self._get_restraint_dataset(r, i)
2120  self.em2d_dump.add(_EM2DRestraint(d, resolution, pixel_size,
2121  image_resolution, projection_number))
2122 
2123  def add_em3d_restraint(self, target_ps, densities, r):
2124  # A 3DEM restraint's dataset ID uniquely defines the restraint, so
2125  # we need to allow duplicates
2126  d = self._get_restraint_dataset(r, allow_duplicates=True)
2127  self.em3d_dump.add(_EM3DRestraint(self, d, target_ps, densities))
2128 
2129  def add_model(self, group=None):
2130  if group is None:
2131  if self.default_model_group is None:
2132  self.default_model_group \
2133  = self.add_model_group(_ModelGroup("All models"))
2134  group = self.default_model_group
2135  return self.model_dump.add(self.prot,
2136  self.model_prot_dump.get_last_protocol(),
2137  self.modeled_assembly, group)
2138 
2139  def _update_location(self, fileloc):
2140  """Update FileLocation to point to a parent repository, if any"""
2141  all_repos = [m for m in self._metadata
2142  if isinstance(m, IMP.pmi.metadata.Repository)]