IMP Reference Guide 2.8.0
The Integrative Modeling Platform
mmcif.py
1 """@namespace IMP.pmi.mmcif
2  @brief Support for the mmCIF file format.
3 
4  IMP has basic support for writing out files in mmCIF format, for
5  deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
6  mmCIF files are currently generated by creating an
7  IMP.pmi.mmcif.ProtocolOutput instance and attaching it to an
8  IMP.pmi.representation.Representation object, after which any
9  generated models and metadata are collected and output as mmCIF.
10 """
11 
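# A minimal usage sketch (hedged: assumes a PMI Representation object named
# `representation` built elsewhere; the output filename is arbitrary):
#
#   import IMP.pmi.mmcif
#   po = IMP.pmi.mmcif.ProtocolOutput(open('output.cif', 'w'))
#   representation.add_protocol_output(po)
#   # ... build the system, add restraints, run sampling ...
#   po.flush()  # collect models and metadata, write the mmCIF file
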
12 from __future__ import print_function
13 import copy
14 import IMP.core
15 import IMP.algebra
16 import IMP.atom
17 import IMP.em
18 import IMP.isd
19 import IMP.pmi.representation
20 import IMP.pmi.tools
21 from IMP.pmi.tools import OrderedDict
22 import IMP.pmi.output
23 import IMP.pmi.metadata
24 import re
25 import ast
26 import sys
27 import os
28 import textwrap
29 import weakref
30 import operator
31 import itertools
32 
33 # Python 3 has no 'long' type, so use 'int' instead
34 if sys.version_info[0] >= 3:
35  _long_type = int
36 else:
37  _long_type = long
38 
39 def _assign_id(obj, seen_objs, obj_by_id):
40  """Assign a unique ID to obj, and track all ids in obj_by_id."""
41  if obj not in seen_objs:
42  if not hasattr(obj, 'id'):
43  obj_by_id.append(obj)
44  obj.id = len(obj_by_id)
45  seen_objs[obj] = obj.id
46  else:
47  obj.id = seen_objs[obj]
48 
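# Illustrative behavior (hypothetical objects obj_a and obj_b): IDs are
# assigned sequentially from 1, and an object equal to one already seen
# reuses the earlier ID instead of getting a new one:
#
#   seen, by_id = {}, []
#   _assign_id(obj_a, seen, by_id)  # obj_a.id == 1
#   _assign_id(obj_b, seen, by_id)  # obj_b.id == 2
#   _assign_id(obj_a, seen, by_id)  # obj_a.id stays 1
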
49 class _LineWriter(object):
50  def __init__(self, writer, line_len=80):
51  self.writer = writer
52  self.line_len = line_len
53  self.column = 0
54  def write(self, val):
55  if isinstance(val, str) and '\n' in val:
56  self.writer.fh.write("\n;")
57  self.writer.fh.write(val)
58  if not val.endswith('\n'):
59  self.writer.fh.write("\n")
60  self.writer.fh.write(";\n")
61  self.column = 0
62  return
63  val = self.writer._repr(val)
64  if self.column > 0:
65  if self.column + len(val) + 1 > self.line_len:
66  self.writer.fh.write("\n")
67  self.column = 0
68  else:
69  self.writer.fh.write(" ")
70  self.column += 1
71  self.writer.fh.write(val)
72  self.column += len(val)
73 
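# For illustration: a value containing newlines is emitted as an mmCIF
# semicolon-delimited text block, so writing "line one\nline two" yields
#
#   ;line one
#   line two
#   ;
#
# while ordinary values are space-separated and wrapped at line_len columns.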
74 
75 class _CifCategoryWriter(object):
76  def __init__(self, writer, category):
77  self.writer = writer
78  self.category = category
79  def write(self, **kwargs):
80  self.writer._write(self.category, kwargs)
81  def __enter__(self):
82  return self
83  def __exit__(self, exc_type, exc_value, traceback):
84  pass
85 
86 
87 class _CifLoopWriter(object):
88  def __init__(self, writer, category, keys):
89  self.writer = writer
90  self.category = category
91  self.keys = keys
92  # Remove characters that we can't use in Python identifiers
93  self.python_keys = [k.replace('[', '').replace(']', '') for k in keys]
94  self._empty_loop = True
95  def write(self, **kwargs):
96  if self._empty_loop:
97  f = self.writer.fh
98  f.write("#\nloop_\n")
99  for k in self.keys:
100  f.write("%s.%s\n" % (self.category, k))
101  self._empty_loop = False
102  l = _LineWriter(self.writer)
103  for k in self.python_keys:
104  l.write(kwargs.get(k, self.writer.omitted))
105  self.writer.fh.write("\n")
106  def __enter__(self):
107  return self
108  def __exit__(self, exc_type, exc_value, traceback):
109  if not self._empty_loop:
110  self.writer.fh.write("#\n")
111 
112 
113 class _CifWriter(object):
114  omitted = '.'
115  unknown = '?'
116  _boolmap = {False: 'NO', True: 'YES'}
117 
118  def __init__(self, fh):
119  self.fh = fh
120  def category(self, category):
121  return _CifCategoryWriter(self, category)
122  def loop(self, category, keys):
123  return _CifLoopWriter(self, category, keys)
124  def write_comment(self, comment):
125  for line in textwrap.wrap(comment, 78):
126  self.fh.write('# ' + line + '\n')
127  def _write(self, category, kwargs):
128  for key in kwargs:
129  self.fh.write("%s.%s %s\n" % (category, key,
130  self._repr(kwargs[key])))
131  def _repr(self, obj):
132  if isinstance(obj, str) and '"' not in obj \
133  and "'" not in obj and " " not in obj:
134  return obj
135  elif isinstance(obj, float):
136  return "%.3f" % obj
137  elif isinstance(obj, bool):
138  return self._boolmap[obj]
139  # Don't use repr(x) if type(x) == long since that adds an 'L' suffix,
140  # which isn't valid mmCIF syntax. _long_type = long only on Python 2.
141  elif isinstance(obj, _long_type):
142  return "%d" % obj
143  else:
144  return repr(obj)
145 
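# Taken together, these writers emit standard mmCIF syntax. For example
# (illustrative output), writer.category("_entry").write(id='imp_model')
# produces "_entry.id imp_model", while loop() produces a loop_ table:
#
#   loop_
#   _software.pdbx_ordinal
#   _software.name
#   1 'Integrative Modeling Platform (IMP)'
#
# _repr() passes bare strings through unquoted, quotes strings containing
# spaces via repr(), writes floats as %.3f, and maps booleans to YES/NO.
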
146 def _get_by_residue(p):
147  """Determine whether the given particle represents a specific residue
148  or a more coarse-grained object."""
149  return IMP.atom.Residue.get_is_setup(p) or IMP.atom.Atom.get_is_setup(p)
150 
151 
152 class _ComponentMapper(object):
153  """Map a Particle to a component name"""
154  def __init__(self, prot):
155  self.o = IMP.pmi.output.Output()
156  self.prot = prot
157  self.name = 'cif-output'
158  self.o.dictionary_pdbs[self.name] = self.prot
159  self.o._init_dictchain(self.name, self.prot)
160 
161  def __getitem__(self, p):
162  protname, is_a_bead = self.o.get_prot_name_from_particle(self.name, p)
163  return protname
164 
165 
166 class _AsymIDMapper(object):
167  """Map a Particle to an asym_id (chain ID)"""
168  def __init__(self, simo, prot):
169  self.simo = simo
170  self._cm = _ComponentMapper(prot)
171 
172  def __getitem__(self, p):
173  protname = self._cm[p]
174  return self.simo._get_chain_for_component(protname, self._cm.o)
175 
176 class _Dumper(object):
177  """Base class for helpers to dump output to mmCIF"""
178  def __init__(self, simo):
179  self.simo = weakref.proxy(simo)
180 
181  def finalize(self):
182  pass
183 
184  def finalize_metadata(self):
185  pass
186 
187 
188 class _EntryDumper(_Dumper):
189  def dump(self, writer):
190  entry_id = 'imp_model'
191  # Write CIF header (so this dumper should always be first)
192  writer.fh.write("data_%s\n" % entry_id)
193  with writer.category("_entry") as l:
194  l.write(id=entry_id)
195 
196 
197 class _SoftwareDumper(_Dumper):
198  def __init__(self, simo):
199  super(_SoftwareDumper, self).__init__(simo)
200  self.modeller_used = self.phyre2_used = False
201  self.software = [
202  IMP.pmi.metadata.Software(
203  name="Integrative Modeling Platform (IMP)",
204  version=IMP.__version__,
205  classification="integrative model building",
206  description="integrative model building",
207  url='https://integrativemodeling.org'),
208  IMP.pmi.metadata.Software(
209  name="IMP PMI module",
210  version=IMP.pmi.__version__,
211  classification="integrative model building",
212  description="integrative model building",
213  url='https://integrativemodeling.org') ]
214 
215  def set_modeller_used(self, version, date):
216  if self.modeller_used:
217  return
218  self.modeller_used = True
219  self.software.append(IMP.pmi.metadata.Software(
220  name='MODELLER', classification='comparative modeling',
221  description='Comparative modeling by satisfaction '
222  'of spatial restraints, build ' + date,
223  url='https://salilab.org/modeller/',
224  version=version))
225 
226  def set_phyre2_used(self):
227  if self.phyre2_used:
228  return
229  self.phyre2_used = True
230  self.software.append(IMP.pmi.metadata.Software(
231  name='Phyre2', classification='protein homology modeling',
232  description='Protein Homology/analogY Recognition '
233  'Engine V 2.0',
234  version='2.0', url='http://www.sbg.bio.ic.ac.uk/~phyre2/'))
235 
236  def dump(self, writer):
237  ordinal = 1
238  with writer.loop("_software",
239  ["pdbx_ordinal", "name", "classification", "version",
240  "type", "location"]) as l:
241  for m in itertools.chain(self.software, self.simo._metadata):
242  if isinstance(m, IMP.pmi.metadata.Software):
243  l.write(pdbx_ordinal=ordinal, name=m.name,
244  classification=m.classification, version=m.version,
245  type=m.type, location=m.url)
246  ordinal += 1
247 
248 class _AuditAuthorDumper(_Dumper):
249  """Populate the mmCIF audit_author category (a list of the people that
250  authored this mmCIF file; here we assume that's just the authors of
251  any associated publications)"""
252  def dump(self, writer):
253  citations = [m for m in self.simo._metadata
254  if isinstance(m, IMP.pmi.metadata.Citation)]
255  seen_authors = {}
256  with writer.loop("_audit_author",
257  ["name", "pdbx_ordinal"]) as l:
258  ordinal = 1
259  for n, c in enumerate(citations):
260  for a in c.authors:
261  if a not in seen_authors:
262  seen_authors[a] = None
263  l.write(name=a, pdbx_ordinal=ordinal)
264  ordinal += 1
265 
266 
267 class _CitationDumper(_Dumper):
268  def dump(self, writer):
269  citations = [m for m in self.simo._metadata
270  if isinstance(m, IMP.pmi.metadata.Citation)]
271  with writer.loop("_citation",
272  ["id", "title", "journal_abbrev", "journal_volume",
273  "page_first", "page_last", "year",
274  "pdbx_database_id_PubMed",
275  "pdbx_database_id_DOI"]) as l:
276  for n, c in enumerate(citations):
277  if isinstance(c.page_range, (tuple, list)):
278  page_first, page_last = c.page_range
279  else:
280  page_first = c.page_range
281  page_last = _CifWriter.omitted
282  l.write(id=n+1, title=c.title, journal_abbrev=c.journal,
283  journal_volume=c.volume, page_first=page_first,
284  page_last=page_last, year=c.year,
285  pdbx_database_id_PubMed=c.pmid,
286  pdbx_database_id_DOI=c.doi)
287 
288  with writer.loop("_citation_author",
289  ["citation_id", "name", "ordinal"]) as l:
290  ordinal = 1
291  for n, c in enumerate(citations):
292  for a in c.authors:
293  l.write(citation_id=n+1, name=a, ordinal=ordinal)
294  ordinal += 1
295 
296 class _EntityDumper(_Dumper):
297  # todo: we currently only support amino acid sequences here (and
298  # then only standard amino acids; need to add support for MSE etc.)
299  def dump(self, writer):
300  with writer.loop("_entity",
301  ["id", "type", "src_method", "pdbx_description",
302  "formula_weight", "pdbx_number_of_molecules",
303  "details"]) as l:
304  for entity in self.simo.entities.get_all():
305  l.write(id=entity.id, type='polymer', src_method='man',
306  pdbx_description=entity.first_component,
307  formula_weight=writer.unknown,
308  pdbx_number_of_molecules=1, details=writer.unknown)
309 
310 
311 class _EntityPolyDumper(_Dumper):
312  # todo: we currently only support amino acid sequences here
313  def __init__(self, simo):
314  super(_EntityPolyDumper, self).__init__(simo)
315  self.output = IMP.pmi.output.Output()
316 
317  def dump(self, writer):
318  with writer.loop("_entity_poly",
319  ["entity_id", "type", "nstd_linkage",
320  "nstd_monomer", "pdbx_strand_id",
321  "pdbx_seq_one_letter_code",
322  "pdbx_seq_one_letter_code_can"]) as l:
323  for entity in self.simo.entities.get_all():
324  seq = entity.sequence
325  # Split into lines to get tidier CIF output
326  seq = "\n".join(seq[i:i+70] for i in range(0, len(seq), 70))
327  name = entity.first_component
328  chain_id = self.simo._get_chain_for_component(name, self.output)
329  l.write(entity_id=entity.id, type='polypeptide(L)',
330  nstd_linkage='no', nstd_monomer='no',
331  pdbx_strand_id=chain_id,
332  pdbx_seq_one_letter_code=seq,
333  pdbx_seq_one_letter_code_can=seq)
334 
335 
336 class _ChemCompDumper(_Dumper):
337  def dump(self, writer):
338  seen = {}
339  std = dict.fromkeys(('ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS',
340  'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER',
341  'THR', 'VAL', 'TRP', 'TYR'))
342  with writer.loop("_chem_comp", ["id", "type"]) as l:
343  for entity in self.simo.entities.get_all():
344  seq = entity.sequence
345  for num, one_letter_code in enumerate(seq):
346  restyp = IMP.atom.get_residue_type(one_letter_code)
347  resid = restyp.get_string()
348  if resid not in seen:
349  seen[resid] = None
350  l.write(id=resid,
351  type='L-peptide linking' if resid in std \
352  else 'other')
353 
354 class _EntityPolySeqDumper(_Dumper):
355  def dump(self, writer):
356  with writer.loop("_entity_poly_seq",
357  ["entity_id", "num", "mon_id", "hetero"]) as l:
358  for entity in self.simo.entities.get_all():
359  seq = entity.sequence
360  for num, one_letter_code in enumerate(seq):
361  restyp = IMP.atom.get_residue_type(one_letter_code)
362  l.write(entity_id=entity.id, num=num + 1,
363  mon_id=restyp.get_string(),
364  hetero=_CifWriter.omitted)
365 
366 class _StructAsymDumper(_Dumper):
367  def __init__(self, simo):
368  super(_StructAsymDumper, self).__init__(simo)
369  self.output = IMP.pmi.output.Output()
370 
371  def dump(self, writer):
372  with writer.loop("_struct_asym",
373  ["id", "entity_id", "details"]) as l:
374  for comp in self.simo.all_modeled_components:
375  entity = self.simo.entities[comp]
376  chain_id = self.simo._get_chain_for_component(comp, self.output)
377  l.write(id=chain_id,
378  entity_id=entity.id,
379  details=comp)
380 
381 class _PDBFragment(object):
382  """Record details about part of a PDB file used as input
383  for a component."""
384  primitive = 'sphere'
385  granularity = 'by-residue'
386  num = _CifWriter.omitted
387  def __init__(self, state, component, start, end, offset, pdbname,
388  chain, hier):
389  self.component, self.start, self.end, self.offset, self.pdbname \
390  = component, start, end, offset, pdbname
391  self.state, self.chain, self.hier = state, chain, hier
392  sel = IMP.atom.NonWaterNonHydrogenPDBSelector() \
393  & IMP.atom.ChainPDBSelector(chain)
394  self.starting_hier = IMP.atom.read_pdb(pdbname, state.m, sel)
395 
396  def combine(self, other):
397  pass
398 
399 class _BeadsFragment(object):
400  """Record details about beads used to represent part of a component."""
401  primitive = 'sphere'
402  granularity = 'by-feature'
403  chain = None
404  def __init__(self, state, component, start, end, num, hier):
405  self.state, self.component, self.start, self.end, self.num, self.hier \
406  = state, component, start, end, num, hier
407 
408  def combine(self, other):
409  # todo: don't combine if one fragment is rigid and the other flexible
410  if type(other) == type(self) and other.start == self.end + 1:
411  self.end = other.end
412  self.num += other.num
413  return True
414 
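# e.g. (hypothetical) a bead fragment covering residues 1-10 followed by one
# covering 11-20 is merged by combine() into a single 1-20 fragment with the
# bead counts summed; non-contiguous fragments are kept separate.
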
415 class _StartingModel(object):
416  """Record details about an input model (e.g. comparative modeling
417  template) used for a component."""
418 
419  source = _CifWriter.unknown
420  db_name = _CifWriter.unknown
421  db_code = _CifWriter.unknown
422  sequence_identity = _CifWriter.unknown
423 
424  def __init__(self, fragment):
425  self.fragments = [fragment]
426 
427 class _ModelRepresentationDumper(_Dumper):
428  def __init__(self, simo):
429  super(_ModelRepresentationDumper, self).__init__(simo)
430  # dict of fragments, ordered by component name and then state
431  self.fragments = OrderedDict()
432  self.output = IMP.pmi.output.Output()
433 
434  def add_fragment(self, state, fragment):
435  """Add a model fragment."""
436  comp = fragment.component
437  if comp not in self.fragments:
438  self.fragments[comp] = OrderedDict()
439  if state not in self.fragments[comp]:
440  self.fragments[comp][state] = []
441  fragments = self.fragments[comp][state]
442  if len(fragments) == 0 or not fragments[-1].combine(fragment):
443  fragments.append(fragment)
444 
445  def get_model_mode(self, fragment):
446  """Determine the model_mode for a given fragment ('rigid' or
447  'flexible')"""
448  leaves = IMP.atom.get_leaves(fragment.hier)
449  # Assume all leaves are set up as rigid/flexible in the same way
450  if IMP.core.RigidMember.get_is_setup(leaves[0]):
451  return 'rigid'
452  else:
453  return 'flexible'
454 
455  def dump(self, writer):
456  ordinal_id = 1
457  segment_id = 1
458  with writer.loop("_ihm_model_representation",
459  ["ordinal_id", "representation_id",
460  "segment_id", "entity_id", "entity_description",
461  "entity_asym_id",
462  "seq_id_begin", "seq_id_end",
463  "model_object_primitive", "starting_model_id",
464  "model_mode", "model_granularity",
465  "model_object_count"]) as l:
466  for comp, statefrag in self.fragments.items():
467  # For now, assume that representation of the same-named
468  # component is the same in all states, so just take the first
469  state = list(statefrag.keys())[0]
470  chain_id = self.simo._get_chain_for_component(comp, self.output)
471  for f in statefrag[state]:
472  entity = self.simo.entities[f.component]
473  starting_model_id = _CifWriter.omitted
474  if hasattr(f, 'pdbname'):
475  starting_model_id \
476  = self.starting_model[state, comp, f.pdbname].name
477  # todo: handle multiple representations
478  l.write(ordinal_id=ordinal_id,
479  representation_id=1,
480  segment_id=segment_id,
481  entity_id=entity.id,
482  entity_description=entity.description,
483  entity_asym_id=chain_id,
484  seq_id_begin=f.start,
485  seq_id_end=f.end,
486  model_object_primitive=f.primitive,
487  starting_model_id=starting_model_id,
488  model_mode=self.get_model_mode(f),
489  model_granularity=f.granularity,
490  model_object_count=f.num)
491  ordinal_id += 1
492  segment_id += 1
493 
494 class _PDBSource(object):
495  """An experimental PDB file used as part of a starting model"""
496  source = 'experimental model'
497  db_name = 'PDB'
498  sequence_identity = 100.0
499 
500  def __init__(self, model, db_code, chain_id, metadata):
501  self.db_code = db_code
502  self.chain_id = chain_id
503  self.metadata = metadata
504 
505  def get_seq_id_range(self, model):
506  # Assume the structure covers the entire sequence
507  return (model.seq_id_begin, model.seq_id_end)
508 
509 class _TemplateSource(object):
510  """A PDB file used as a template for a comparative starting model"""
511  source = 'comparative model'
512  db_name = db_code = _CifWriter.omitted
513  tm_dataset = None
514  # Right now we assume all alignments are Modeller alignments, which use
515  # the length of the shorter sequence as the denominator for sequence
516  # identity
517  sequence_identity_denominator = 1
518 
519  def __init__(self, tm_code, tm_seq_id_begin, tm_seq_id_end, seq_id_begin,
520  chain_id, seq_id_end, seq_id, model):
521  # Assume a code of 1abcX refers to a real PDB structure
522  if len(tm_code) == 5:
523  self._orig_tm_code = None
524  self.tm_db_code = tm_code[:4].upper()
525  self.tm_chain_id = tm_code[4]
526  else:
527  # Otherwise, will need to look up in TEMPLATE PATH remarks
528  self._orig_tm_code = tm_code
529  self.tm_db_code = _CifWriter.omitted
530  self.tm_chain_id = tm_code[-1]
531  self.sequence_identity = seq_id
532  self.tm_seq_id_begin = tm_seq_id_begin
533  self.tm_seq_id_end = tm_seq_id_end
534  self.chain_id = chain_id
535  self._seq_id_begin, self._seq_id_end = seq_id_begin, seq_id_end
536 
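# For example (hypothetical codes): tm_code '1abcA' is treated as PDB entry
# 1ABC, chain A (tm_db_code='1ABC', tm_chain_id='A'), while a longer code
# such as 'templ0B' leaves tm_db_code omitted, takes the chain ID from the
# last character, and is resolved later via the TEMPLATE PATH remarks.
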
537  def get_seq_id_range(self, model):
538  # The template may cover more than the current starting model
539  seq_id_begin = max(model.seq_id_begin, self._seq_id_begin)
540  seq_id_end = min(model.seq_id_end, self._seq_id_end)
541  return (seq_id_begin, seq_id_end)
542 
543 class _UnknownSource(object):
544  """Part of a starting model from an unknown source"""
545  db_code = _CifWriter.unknown
546  db_name = _CifWriter.unknown
547  chain_id = _CifWriter.unknown
548  sequence_identity = _CifWriter.unknown
549  # Map dataset types to starting model sources
550  _source_map = {'Comparative model': 'comparative model',
551  'Experimental model': 'experimental model'}
552 
553  def __init__(self, model, chain):
554  self.source = self._source_map[model.dataset._data_type]
555  self.chain_id = chain
556 
557  def get_seq_id_range(self, model):
558  return (model.seq_id_begin, model.seq_id_end)
559 
560 class _DatasetGroup(object):
561  """A group of datasets"""
562  def __init__(self, datasets):
563  self._datasets = list(datasets)
564 
565  def finalize(self):
566  """Get final datasets for each restraint and remove duplicates"""
567  final_datasets = OrderedDict()
568  for ds in self._datasets:
569  if isinstance(ds, _RestraintDataset):
570  d = ds.dataset
571  else:
572  d = ds
573  final_datasets[d] = None
574  self._datasets = final_datasets.keys()
575 
576 
577 class _ExternalReference(object):
578  """A single externally-referenced file"""
579  def __init__(self, location, content_type):
580  self.location, self.content_type = location, content_type
581 
582  # Pass id through to location
583  def __set_id(self, i):
584  self.location.id = i
585  id = property(lambda x: x.location.id, __set_id)
586  file_size = property(lambda x: x.location.file_size)
587 
588  def __eq__(self, other):
589  return self.location == other.location
590  def __hash__(self):
591  return hash(self.location)
592 
593 
594 class _ExternalReferenceDumper(_Dumper):
595  """Output information on externally referenced files
596  (i.e. anything that refers to a Location that isn't
597  a DatabaseLocation)."""
598 
599  INPUT_DATA = "Input data or restraints"
600  MODELING_OUTPUT = "Modeling or post-processing output"
601  WORKFLOW = "Modeling workflow or script"
602 
603  class _LocalFiles(object):
604  reference_provider = _CifWriter.omitted
605  reference_type = 'Supplementary Files'
606  reference = _CifWriter.omitted
607  refers_to = 'Other'
608  associated_url = _CifWriter.omitted
609 
610  def __init__(self, top_directory):
611  self.top_directory = top_directory
612 
613  def _get_full_path(self, path):
614  return os.path.relpath(path, start=self.top_directory)
615 
616  class _Repository(object):
617  reference_provider = _CifWriter.omitted
618  reference_type = 'DOI'
619  refers_to = 'Other'
620  associated_url = _CifWriter.omitted
621 
622  def __init__(self, repo):
623  self.id = repo.id
624  self.reference = repo.doi
625  if 'zenodo' in self.reference:
626  self.reference_provider = 'Zenodo'
627  if repo.url:
628  self.associated_url = repo.url
629  if repo.url.endswith(".zip"):
630  self.refers_to = 'Archive'
631  else:
632  self.refers_to = 'File'
633 
634  def __init__(self, simo):
635  super(_ExternalReferenceDumper, self).__init__(simo)
636  self._refs = []
637 
638  def add(self, location, content_type):
639  """Add a new location.
640  Note that ids are assigned later."""
641  self._refs.append(_ExternalReference(location, content_type))
642  return location
643 
644  def finalize_metadata(self):
645  """Register locations for any metadata and add the main script"""
646  loc = IMP.pmi.metadata.FileLocation(path=self.simo._main_script,
647  details="The main integrative modeling script")
648  main_script = IMP.pmi.metadata.PythonScript(loc)
649  self._workflow = [main_script] \
650  + [m for m in self.simo._metadata
651  if isinstance(m, IMP.pmi.metadata.PythonScript)]
652  for w in self._workflow:
653  self.add(w.location, self.WORKFLOW)
654 
655  def finalize_after_datasets(self):
656  """Note that this must happen *after* DatasetDumper.finalize()"""
657  # Keep only locations that don't point into databases (these are
658  # handled elsewhere)
659  self._refs = [x for x in self._refs
660  if not isinstance(x.location,
661  IMP.pmi.metadata.DatabaseLocation)]
662  # Assign IDs to all locations and repos (including the None repo, which
663  # is for local files)
664  seen_refs = {}
665  seen_repos = {}
666  self._ref_by_id = []
667  self._repo_by_id = []
668  # Special dummy repo for repo=None (local files)
669  self._local_files = self._LocalFiles(self.simo._working_directory)
670  for r in self._refs:
671  # Update location to point to parent repository, if any
672  self.simo._update_location(r.location)
673  # Assign a unique ID to the reference
674  _assign_id(r, seen_refs, self._ref_by_id)
675  # Assign a unique ID to the repository
676  _assign_id(r.location.repo or self._local_files, seen_repos,
677  self._repo_by_id)
678 
679  def dump(self, writer):
680  self.dump_repos(writer)
681  self.dump_refs(writer)
682 
683  def dump_repos(self, writer):
684  def map_repo(r):
685  return r if isinstance(r, self._LocalFiles) else self._Repository(r)
686  with writer.loop("_ihm_external_reference_info",
687  ["reference_id", "reference_provider",
688  "reference_type", "reference", "refers_to",
689  "associated_url"]) as l:
690  for repo in [map_repo(r) for r in self._repo_by_id]:
691  l.write(reference_id=repo.id,
692  reference_provider=repo.reference_provider,
693  reference_type=repo.reference_type,
694  reference=repo.reference, refers_to=repo.refers_to,
695  associated_url=repo.associated_url)
696 
697  def dump_refs(self, writer):
698  with writer.loop("_ihm_external_files",
699  ["id", "reference_id", "file_path", "content_type",
700  "file_size_bytes", "details"]) as l:
701  for r in self._ref_by_id:
702  loc = r.location
703  repo = loc.repo or self._local_files
704  file_path = self._posix_path(repo._get_full_path(loc.path))
705  if r.file_size is None:
706  file_size = _CifWriter.omitted
707  else:
708  file_size = r.file_size
709  l.write(id=loc.id, reference_id=repo.id,
710  file_path=file_path,
711  content_type=r.content_type,
712  file_size_bytes=file_size,
713  details=loc.details or _CifWriter.omitted)
714 
715  # On Windows systems, convert native paths to POSIX-like (/-separated) paths
716  if os.sep == '/':
717  def _posix_path(self, path):
718  return path
719  else:
720  def _posix_path(self, path):
721  return path.replace(os.sep, '/')
722 
723 
724 class _DatasetDumper(_Dumper):
725  def __init__(self, simo):
726  super(_DatasetDumper, self).__init__(simo)
727  self._datasets = []
728  self._datasets_by_state = {}
729  self._dataset_groups = []
730 
731  def get_all_group(self, state):
732  """Get a _DatasetGroup encompassing all datasets so far in this state"""
733  g = _DatasetGroup(self._datasets_by_state.get(state, []))
734  self._dataset_groups.append(g)
735  return g
736 
737  def add(self, state, dataset):
738  """Add a new dataset.
739  Note that ids are assigned later."""
740  self._datasets.append(dataset)
741  if state not in self._datasets_by_state:
742  self._datasets_by_state[state] = []
743  self._datasets_by_state[state].append(dataset)
744  return dataset
745 
746  def finalize(self):
747  seen_datasets = {}
748  # Assign IDs to all datasets
749  self._dataset_by_id = []
750  for d in self._flatten_dataset(self._datasets):
751  # Register location (need to do that now rather than in add() in
752  # order to properly handle _RestraintDataset)
753  self.simo.extref_dump.add(d.location,
754  _ExternalReferenceDumper.INPUT_DATA)
755  _assign_id(d, seen_datasets, self._dataset_by_id)
756 
757  # Assign IDs to all groups and remove duplicates
758  seen_group_ids = {}
759  self._dataset_group_by_id = []
760  for g in self._dataset_groups:
761  g.finalize()
762  ids = tuple(sorted(d.id for d in g._datasets))
763  if ids not in seen_group_ids:
764  self._dataset_group_by_id.append(g)
765  g.id = len(self._dataset_group_by_id)
766  seen_group_ids[ids] = g
767  else:
768  g.id = seen_group_ids[ids].id
769  # Finalize external references (must happen after dataset finalize)
770  self.simo.extref_dump.finalize_after_datasets()
771 
772  def _flatten_dataset(self, d):
773  if isinstance(d, list):
774  for p in d:
775  for x in self._flatten_dataset(p):
776  yield x
777  elif isinstance(d, _RestraintDataset):
778  for x in self._flatten_dataset(d.dataset):
779  yield x
780  else:
781  for p in d._parents.keys():
782  for x in self._flatten_dataset(p):
783  yield x
784  yield d
785 
786  def dump(self, writer):
787  with writer.loop("_ihm_dataset_list",
788  ["id", "data_type", "database_hosted"]) as l:
789  for d in self._dataset_by_id:
790  l.write(id=d.id, data_type=d._data_type,
791  database_hosted=isinstance(d.location,
792  IMP.pmi.metadata.DatabaseLocation))
793  self.dump_groups(writer)
794  self.dump_other((d for d in self._dataset_by_id
795  if not isinstance(d.location,
796  IMP.pmi.metadata.DatabaseLocation)),
797  writer)
798  self.dump_rel_dbs((d for d in self._dataset_by_id
799  if isinstance(d.location,
800  IMP.pmi.metadata.DatabaseLocation)),
801  writer)
802  self.dump_related(writer)
803 
804  def dump_groups(self, writer):
805  ordinal = 1
806  with writer.loop("_ihm_dataset_group",
807  ["ordinal_id", "group_id", "dataset_list_id"]) as l:
808  for g in self._dataset_group_by_id:
809  for d in g._datasets:
810  l.write(ordinal_id=ordinal, group_id=g.id,
811  dataset_list_id=d.id)
812  ordinal += 1
813 
814  def dump_related(self, writer):
815  ordinal = 1
816  with writer.loop("_ihm_related_datasets",
817  ["ordinal_id", "dataset_list_id_derived",
818  "dataset_list_id_primary"]) as l:
819  for derived in self._dataset_by_id:
820  for parent in sorted(derived._parents.keys(),
821  key=lambda d: d.id):
822  l.write(ordinal_id=ordinal,
823  dataset_list_id_derived=derived.id,
824  dataset_list_id_primary=parent.id)
825  ordinal += 1
826 
827  def dump_rel_dbs(self, datasets, writer):
828  ordinal = 1
829  with writer.loop("_ihm_dataset_related_db_reference",
830  ["id", "dataset_list_id", "db_name",
831  "accession_code", "version", "details"]) as l:
832  for d in datasets:
833  l.write(id=ordinal, dataset_list_id=d.id,
834  db_name=d.location.db_name,
835  accession_code=d.location.access_code,
836  version=d.location.version if d.location.version
837  else _CifWriter.omitted,
838  details=d.location.details if d.location.details
839  else _CifWriter.omitted)
840  ordinal += 1
841 
842  def dump_other(self, datasets, writer):
843  ordinal = 1
844  with writer.loop("_ihm_dataset_external_reference",
845  ["id", "dataset_list_id", "file_id"]) as l:
846  for d in datasets:
847  l.write(id=ordinal, dataset_list_id=d.id, file_id=d.location.id)
848  ordinal += 1
849 
850 
851 class _CrossLinkGroup(object):
852  """Group common information for a set of cross links"""
853  def __init__(self, pmi_restraint, rdataset):
854  self.pmi_restraint, self.rdataset = pmi_restraint, rdataset
855  self.label = self.pmi_restraint.label
856 
857 
858 class _ExperimentalCrossLink(object):
859  def __init__(self, r1, c1, r2, c2, length, group):
860  self.r1, self.c1, self.r2, self.c2 = r1, c1, r2, c2
861  self.length, self.group = length, group
862 
863 class _CrossLink(object):
864  def __init__(self, state, ex_xl, p1, p2, sigma1, sigma2, psi):
865  self.state = state
866  self.ex_xl, self.sigma1, self.sigma2 = ex_xl, sigma1, sigma2
867  self.p1, self.p2 = p1, p2
868  self.psi = psi
869 
870 def get_asym_mapper_for_state(simo, state, asym_map):
871  asym = asym_map.get(state, None)
872  if asym is None:
873  asym = _AsymIDMapper(simo, state.prot)
874  asym_map[state] = asym
875  return asym
876 
877 class _CrossLinkDumper(_Dumper):
878  def __init__(self, simo):
879  super(_CrossLinkDumper, self).__init__(simo)
880  self.cross_links = []
881  self.exp_cross_links = []
882 
883  def add_experimental(self, cross_link):
884  self.exp_cross_links.append(cross_link)
885 
886  def add(self, cross_link):
887  self.cross_links.append(cross_link)
888 
889  def dump(self, writer):
890  self.dump_list(writer)
891  self.dump_restraint(writer)
892  self.dump_results(writer)
893 
894  def dump_list(self, writer):
895  seen_cross_links = {}
896  with writer.loop("_ihm_cross_link_list",
897  ["id", "group_id", "entity_description_1",
898  "entity_id_1", "seq_id_1", "comp_id_1",
899  "entity_description_2",
900  "entity_id_2", "seq_id_2", "comp_id_2", "type",
901  "dataset_list_id"]) as l:
902  xl_id = 0
903  for xl in self.exp_cross_links:
904  # Skip identical cross links
905  sig = (xl.c1, xl.c2, xl.r1, xl.r2, xl.group.label)
906  if sig in seen_cross_links:
907  xl.id = seen_cross_links[sig]
908  continue
909  entity1 = self.simo.entities[xl.c1]
910  entity2 = self.simo.entities[xl.c2]
911  seq1 = entity1.sequence
912  seq2 = entity2.sequence
913  rt1 = IMP.atom.get_residue_type(seq1[xl.r1-1])
914  rt2 = IMP.atom.get_residue_type(seq2[xl.r2-1])
915  # todo: handle experimental ambiguity (group_id) properly
916  xl_id += 1
917  seen_cross_links[sig] = xl_id
918  xl.id = xl_id
919  l.write(id=xl.id, group_id=xl.id,
920  entity_description_1=entity1.description,
921  entity_id_1=entity1.id,
922  seq_id_1=xl.r1,
923  comp_id_1=rt1.get_string(),
924  entity_description_2=entity2.description,
925  entity_id_2=entity2.id,
926  seq_id_2=xl.r2,
927  comp_id_2=rt2.get_string(),
928  type=xl.group.label,
929  dataset_list_id=xl.group.rdataset.dataset.id)
930 
931  def _granularity(self, xl):
932  """Determine the granularity of a cross link"""
933  if _get_by_residue(xl.p1) and _get_by_residue(xl.p2):
934  return 'by-residue'
935  else:
936  return 'by-feature'
937 
938  def dump_restraint(self, writer):
939  seen_cross_links = {}
940  asym_states = {} # AsymIDMapper for each state
941  with writer.loop("_ihm_cross_link_restraint",
942  ["id", "group_id", "entity_id_1", "asym_id_1",
943  "seq_id_1", "comp_id_1",
944  "entity_id_2", "asym_id_2", "seq_id_2", "comp_id_2",
945  "type", "conditional_crosslink_flag",
946  "model_granularity", "distance_threshold",
947  "psi", "sigma_1", "sigma_2"]) as l:
948  xl_id = 0
949  for xl in self.cross_links:
950  asym = get_asym_mapper_for_state(self.simo, xl.state,
951  asym_states)
952  entity1 = self.simo.entities[xl.ex_xl.c1]
953  entity2 = self.simo.entities[xl.ex_xl.c2]
954  seq1 = entity1.sequence
955  seq2 = entity2.sequence
956  rt1 = IMP.atom.get_residue_type(seq1[xl.ex_xl.r1-1])
957  rt2 = IMP.atom.get_residue_type(seq2[xl.ex_xl.r2-1])
958  asym1 = asym[xl.p1]
959  asym2 = asym[xl.p2]
960  # Skip identical cross links
961  sig = (asym1, xl.ex_xl.r1, asym2, xl.ex_xl.r2,
962  xl.ex_xl.group.label)
963  if sig in seen_cross_links:
964  xl.id = seen_cross_links[sig]
965  continue
966  xl_id += 1
967  seen_cross_links[sig] = xl_id
968  xl.id = xl_id
969  l.write(id=xl.id,
970  group_id=xl.ex_xl.id,
971  entity_id_1=entity1.id,
972  asym_id_1=asym1,
973  seq_id_1=xl.ex_xl.r1,
974  comp_id_1=rt1.get_string(),
975  entity_id_2=entity2.id,
976  asym_id_2=asym2,
977  seq_id_2=xl.ex_xl.r2,
978  comp_id_2=rt2.get_string(),
979  type=xl.ex_xl.group.label,
980  # todo: any circumstances where this could be ANY?
981  conditional_crosslink_flag="ALL",
982  model_granularity=self._granularity(xl),
983  distance_threshold=xl.ex_xl.length,
984  psi=xl.psi.get_scale(),
985  sigma_1=xl.sigma1.get_scale(),
986  sigma_2=xl.sigma2.get_scale())
987 
988  def _set_psi_sigma(self, model, g):
989  for resolution in g.pmi_restraint.sigma_dictionary:
990  statname = 'ISDCrossLinkMS_Sigma_%s_%s' % (resolution, g.label)
991  if model.stats and statname in model.stats:
992  sigma = float(model.stats[statname])
993  p = g.pmi_restraint.sigma_dictionary[resolution][0]
994  p.set_scale(sigma)
995  for psiindex in g.pmi_restraint.psi_dictionary:
996  statname = 'ISDCrossLinkMS_Psi_%s_%s' % (psiindex, g.label)
997  if model.stats and statname in model.stats:
998  psi = float(model.stats[statname])
999  p = g.pmi_restraint.psi_dictionary[psiindex][0]
1000  p.set_scale(psi)
1001 
1002  def dump_results(self, writer):
1003  all_groups = {}
1004  for xl in self.cross_links:
1005  all_groups[xl.ex_xl.group] = None
1006  ordinal = 1
1007  with writer.loop("_ihm_cross_link_result_parameters",
1008  ["ordinal_id", "restraint_id", "model_id",
1009  "psi", "sigma_1", "sigma_2"]) as l:
1010  for model in self.models:
1011  for g in all_groups.keys():
1012  self._set_psi_sigma(model, g)
1013  for xl in self.cross_links:
1014  if model.stats:
1015  l.write(ordinal_id=ordinal, restraint_id=xl.id,
1016  model_id=model.id, psi=xl.psi.get_scale(),
1017  sigma_1=xl.sigma1.get_scale(),
1018  sigma_2=xl.sigma2.get_scale())
1019  ordinal += 1
1020 
1021 class _EM2DRestraint(object):
1022  def __init__(self, state, rdataset, pmi_restraint, image_number, resolution,
1023  pixel_size, image_resolution, projection_number):
1024  self.state = state
1025  self.pmi_restraint, self.image_number = pmi_restraint, image_number
1026  self.rdataset, self.resolution = rdataset, resolution
1027  self.pixel_size, self.image_resolution = pixel_size, image_resolution
1028  self.projection_number = projection_number
1029 
1030  def get_transformation(self, model):
1031  """Get the transformation that places the model on the image"""
1032  prefix = 'ElectronMicroscopy2D_%s_Image%d' % (self.pmi_restraint.label,
1033  self.image_number + 1)
1034  r = [float(model.stats[prefix + '_Rotation%d' % i]) for i in range(4)]
1035  t = [float(model.stats[prefix + '_Translation%d' % i])
1036  for i in range(3)]
1037  # If the model coordinates are transformed, we need to back-transform
1038  # them first
1039  inv = model.transform.get_inverse()
1040  return IMP.algebra.Transformation3D(IMP.algebra.Rotation3D(*r),
1041  IMP.algebra.Vector3D(*t)) * inv
1042  def get_cross_correlation(self, model):
1043  """Get the cross correlation coefficient between the model projection
1044  and the image"""
1045  return float(model.stats['ElectronMicroscopy2D_%s_Image%d_CCC'
1046  % (self.pmi_restraint.label,
1047  self.image_number + 1)])
1048 
1049  def get_num_raw_micrographs(self):
1050  """Return the number of raw micrographs used, if known.
1051  This is extracted from the EMMicrographsDataset if any."""
1052  for d in self.rdataset.dataset._parents.keys():
1053  if isinstance(d, IMP.pmi.metadata.EMMicrographsDataset):
1054  return d.number
1055 
1056 class _EM2DDumper(_Dumper):
1057  def __init__(self, simo):
1058  super(_EM2DDumper, self).__init__(simo)
1059  self.restraints = []
1060 
1061  def add(self, rsr):
1062  self.restraints.append(rsr)
1063  rsr.id = len(self.restraints)
1064 
1065  def dump(self, writer):
1066  self.dump_restraints(writer)
1067  self.dump_fitting(writer)
1068 
1069  def dump_restraints(self, writer):
1070  with writer.loop("_ihm_2dem_class_average_restraint",
1071  ["id", "dataset_list_id", "number_raw_micrographs",
1072  "pixel_size_width", "pixel_size_height",
1073  "image_resolution", "image_segment_flag",
1074  "number_of_projections", "struct_assembly_id",
1075  "details"]) as l:
1076  for r in self.restraints:
1077  unk = _CifWriter.unknown
1078  num_raw = r.get_num_raw_micrographs()
1079  l.write(id=r.id, dataset_list_id=r.rdataset.dataset.id,
1080  number_raw_micrographs=num_raw if num_raw else unk,
1081  pixel_size_width=r.pixel_size,
1082  pixel_size_height=r.pixel_size,
1083  image_resolution=r.image_resolution,
1084  number_of_projections=r.projection_number,
1085  # todo: check that the assembly is the same for each
1086  # state
1087  struct_assembly_id=r.state.modeled_assembly.id,
1088  image_segment_flag=False)
1089 
1090  def dump_fitting(self, writer):
1091  ordinal = 1
1092  with writer.loop("_ihm_2dem_class_average_fitting",
1093  ["ordinal_id", "restraint_id", "model_id",
1094  "cross_correlation_coefficient", "rot_matrix[1][1]",
1095  "rot_matrix[2][1]", "rot_matrix[3][1]", "rot_matrix[1][2]",
1096  "rot_matrix[2][2]", "rot_matrix[3][2]", "rot_matrix[1][3]",
1097  "rot_matrix[2][3]", "rot_matrix[3][3]", "tr_vector[1]",
1098  "tr_vector[2]", "tr_vector[3]"]) as l:
1099  for m in self.models:
1100  for r in self.restraints:
1101  trans = r.get_transformation(m)
1102  rot = trans.get_rotation()
1103  # mmCIF writer usually outputs floats to 3 decimal places,
1104  # but we need more precision for rotation matrices
1105  rm = [["%.6f" % e for e in rot.get_rotation_matrix_row(i)]
1106  for i in range(3)]
1107  t = trans.get_translation()
1108  ccc = r.get_cross_correlation(m)
1109  l.write(ordinal_id=ordinal, restraint_id=r.id,
1110  model_id=m.id, cross_correlation_coefficient=ccc,
1111  rot_matrix11=rm[0][0], rot_matrix21=rm[1][0],
1112  rot_matrix31=rm[2][0], rot_matrix12=rm[0][1],
1113  rot_matrix22=rm[1][1], rot_matrix32=rm[2][1],
1114  rot_matrix13=rm[0][2], rot_matrix23=rm[1][2],
1115  rot_matrix33=rm[2][2], tr_vector1=t[0],
1116  tr_vector2=t[1], tr_vector3=t[2])
1117  ordinal += 1
1118 
1119 class _EM3DRestraint(object):
1120  fitting_method = 'Gaussian mixture models'
1121 
1122  def __init__(self, simo, state, rdataset, pmi_restraint, target_ps,
1123  densities):
1124  self.pmi_restraint = pmi_restraint
1125  self.rdataset = rdataset
1126  self.number_of_gaussians = len(target_ps)
1127  self.assembly = self.get_assembly(densities, simo, state)
1128 
1129  def get_assembly(self, densities, simo, state):
1130  """Get the Assembly that this restraint acts on"""
1131  cm = _ComponentMapper(state.prot)
1132  components = {}
1133  for d in densities:
1134  components[cm[d]] = None
1135  return simo.assembly_dump.get_subassembly(components)
1136 
1137  def get_cross_correlation(self, model):
1138  """Get the cross correlation coefficient between the model
1139  and the map"""
1140  return float(model.stats['GaussianEMRestraint_%s_CCC'
1141  % self.pmi_restraint.label])
1142 
1143 
1144 class _EM3DDumper(_Dumper):
1145  def __init__(self, simo):
1146  super(_EM3DDumper, self).__init__(simo)
1147  self.restraints = []
1148 
1149  def add(self, rsr):
1150  self.restraints.append(rsr)
1151 
1152  def dump(self, writer):
1153  # todo: support other fields
1154  ordinal = 1
1155  with writer.loop("_ihm_3dem_restraint",
1156  ["ordinal_id", "dataset_list_id", "fitting_method",
1157  "struct_assembly_id",
1158  "number_of_gaussians", "model_id",
1159  "cross_correlation_coefficient"]) as l:
1160  for model in self.models:
1161  for r in self.restraints:
1162  ccc = r.get_cross_correlation(model)
1163  l.write(ordinal_id=ordinal,
1164  dataset_list_id=r.rdataset.dataset.id,
1165  fitting_method=r.fitting_method,
1166  struct_assembly_id=r.assembly.id,
1167  number_of_gaussians=r.number_of_gaussians,
1168  model_id=model.id,
1169  cross_correlation_coefficient=ccc)
1170  ordinal += 1
1171 
1172 class _Assembly(list):
1173  """A collection of components. Currently simply implemented as a list of
1174  the component names. These must be in creation order."""
1175  def __hash__(self):
1176  # allow putting assemblies in a dict. 'list' isn't hashable
1177  # but 'tuple' is
1178  return hash(tuple(self))
1179 
1180 class _AssemblyDumper(_Dumper):
1181  def __init__(self, simo):
1182  super(_AssemblyDumper, self).__init__(simo)
1183  self.assemblies = []
1184  self.output = IMP.pmi.output.Output()
1185 
1186  def add(self, a):
1187  """Add a new assembly. The first such assembly is assumed to contain
1188  all components. Duplicate assemblies will be pruned at the end."""
1189  self.assemblies.append(a)
1190  return a
1191 
1192  def get_subassembly(self, compdict):
1193  """Get an _Assembly consisting of the given components."""
1194  # Put components in creation order
1195  newa = _Assembly(c for c in self.assemblies[0] if c in compdict)
1196  return self.add(newa)
1197 
1198  def finalize(self):
1199  seen_assemblies = {}
1200  # Assign IDs to all assemblies
1201  self._assembly_by_id = []
1202  for a in self.assemblies:
1203  _assign_id(a, seen_assemblies, self._assembly_by_id)
1204 
1205  def dump(self, writer):
1206  ordinal = 1
1207  with writer.loop("_ihm_struct_assembly",
1208  ["ordinal_id", "assembly_id", "entity_description",
1209  "entity_id", "asym_id", "seq_id_begin",
1210  "seq_id_end"]) as l:
1211  for a in self._assembly_by_id:
1212  for comp in a:
1213  entity = self.simo.entities[comp]
1214  seq = self.simo.sequence_dict[comp]
1215  chain_id = self.simo._get_chain_for_component(comp,
1216  self.output)
1217  l.write(ordinal_id=ordinal, assembly_id=a.id,
1218  entity_description=entity.description,
1219  entity_id=entity.id,
1220  asym_id=chain_id,
1221  seq_id_begin=1,
1222  seq_id_end=len(seq))
1223  ordinal += 1
1224 
1225 
1226 class _Protocol(list):
1227  """A modeling protocol. This can consist of multiple _ProtocolSteps."""
1228  pass
1229 
1230 class _ProtocolStep(object):
1231  """A single step in a _Protocol."""
1232  pass
1233 
1234 class _ReplicaExchangeProtocolStep(_ProtocolStep):
1235  def __init__(self, state, rex):
1236  self.state = state
1237  self.modeled_assembly = state.modeled_assembly
1238  self.name = 'Sampling'
1239  if rex.monte_carlo_sample_objects is not None:
1240  self.method = 'Replica exchange monte carlo'
1241  else:
1242  self.method = 'Replica exchange molecular dynamics'
1243  self.num_models_end = rex.vars["number_of_frames"]
1244 
1245 class _ModelGroup(object):
1246  """Group sets of models"""
1247  def __init__(self, state, name):
1248  self.state = state
1249  self.name = name
1250 
1251 class _Model(object):
1252  def __init__(self, prot, simo, protocol, assembly, group):
1253  # Transformation from IMP coordinates into mmCIF coordinate system.
1254  # Normally we pass through coordinates unchanged, but in some cases
1255  # we may want to translate them (e.g. Nup84, where the deposited PDB
1256  # files are all centered; we want the mmCIF files to match)
1257  self.transform = IMP.algebra.get_identity_transformation_3d()
1258  self.group = group
1259  # The _Protocol which produced this model
1260  self.protocol = protocol
1261  self.assembly = assembly
1262  self.stats = None
1263  o = IMP.pmi.output.Output(atomistic=True)
1264  name = 'cif-output'
1265  o.dictionary_pdbs[name] = prot
1266  o._init_dictchain(name, prot)
1267  (particle_infos_for_pdb,
1268  self.geometric_center) = o.get_particle_infos_for_pdb_writing(name)
1269  self.geometric_center = IMP.algebra.Vector3D(*self.geometric_center)
1270  self.entity_for_chain = {}
1271  self.comp_for_chain = {}
1272  self.correct_chain_id = {}
1273  for protname, chain_id in o.dictchain[name].items():
1274  self.entity_for_chain[chain_id] = simo.entities[protname]
1275  self.comp_for_chain[chain_id] = protname
1276  # When doing multi-state modeling, the chain ID returned here
1277  # (assigned sequentially) might not be correct (states may have
1278  # gaps in the chain IDs). Map it to the correct ID.
1279  self.correct_chain_id[chain_id] = \
1280  simo._get_chain_for_component(protname, o)
1281  self.spheres = [t for t in particle_infos_for_pdb if t[1] is None]
1282  self.atoms = [t for t in particle_infos_for_pdb if t[1] is not None]
1283  self.rmsf = {}
1284  self.name = _CifWriter.omitted
1285 
1286  def parse_rmsf_file(self, fname, component):
1287  self.rmsf[component] = rmsf = {}
1288  with open(fname) as fh:
1289  for line in fh:
1290  resnum, blocknum, val = line.split()
1291  rmsf[int(resnum)] = (int(blocknum), float(val))
1292 
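# The RMSF file format is inferred from the parser above: one record per
# line, "<residue number> <block number> <RMSF value>", e.g. a line such as
# "42 1 3.618" stores an RMSF of 3.618 for residue 42 in block 1.
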
1293  def get_rmsf(self, component, indexes):
1294  """Get the RMSF value for the given residue indexes."""
1295  if not self.rmsf:
1296  return _CifWriter.omitted
1297  rmsf = self.rmsf[component]
1298  blocknums = dict.fromkeys(rmsf[ind][0] for ind in indexes)
1299  if len(blocknums) != 1:
1300  raise ValueError("Residue indexes %s aren't all in the same block"
1301  % str(indexes))
1302  return rmsf[indexes[0]][1]
1303 
1304 class _ModelDumper(_Dumper):
1305  def __init__(self, simo):
1306  super(_ModelDumper, self).__init__(simo)
1307  self.models = []
1308 
1309  def add(self, prot, protocol, assembly, group):
1310  m = _Model(prot, self.simo, protocol, assembly, group)
1311  self.models.append(m)
1312  m.id = len(self.models)
1313  return m
1314 
1315  def dump(self, writer):
1316  self.dump_model_list(writer)
1317  num_atoms = sum(len(m.atoms) for m in self.models)
1318  num_spheres = sum(len(m.spheres) for m in self.models)
1319  self.dump_atoms(writer)
1320  self.dump_spheres(writer)
1321 
1322  def dump_model_list(self, writer):
1323  ordinal = 1
1324  with writer.loop("_ihm_model_list",
1325  ["ordinal_id", "model_id", "model_group_id",
1326  "model_name", "model_group_name", "assembly_id",
1327  "protocol_id"]) as l:
1328  for model in self.models:
1329  state = model.group.state
1330  group_name = state.get_prefixed_name(model.group.name)
1331  l.write(ordinal_id=ordinal, model_id=model.id,
1332  model_group_id=model.group.id,
1333  model_name=model.name,
1334  model_group_name=group_name,
1335  assembly_id=model.assembly.id,
1336  protocol_id=model.protocol.id)
1337  ordinal += 1
1338 
1339  def dump_atoms(self, writer):
1340  ordinal = 1
1341  with writer.loop("_atom_site",
1342  ["id", "label_atom_id", "label_comp_id",
1343  "label_seq_id",
1344  "label_asym_id", "Cartn_x",
1345  "Cartn_y", "Cartn_z", "label_entity_id",
1346  "model_id"]) as l:
1347  for model in self.models:
1348  for atom in model.atoms:
1349  (xyz, atom_type, residue_type, chain_id, residue_index,
1350  all_indexes, radius) = atom
1351  pt = model.transform * xyz
1352  l.write(id=ordinal, label_atom_id=atom_type.get_string(),
1353  label_comp_id=residue_type.get_string(),
1354  label_asym_id=model.correct_chain_id[chain_id],
1355  label_entity_id=model.entity_for_chain[chain_id].id,
1356  label_seq_id=residue_index,
1357  Cartn_x=pt[0], Cartn_y=pt[1], Cartn_z=pt[2],
1358  model_id=model.id)
1359  ordinal += 1
1360 
1361  def dump_spheres(self, writer):
1362  ordinal = 1
1363  with writer.loop("_ihm_sphere_obj_site",
1364  ["ordinal_id", "entity_id", "seq_id_begin",
1365  "seq_id_end", "asym_id", "Cartn_x",
1366  "Cartn_y", "Cartn_z", "object_radius", "rmsf",
1367  "model_id"]) as l:
1368  for model in self.models:
1369  for sphere in model.spheres:
1370  (xyz, atom_type, residue_type, chain_id, residue_index,
1371  all_indexes, radius) = sphere
1372  if all_indexes is None:
1373  all_indexes = (residue_index,)
1374  pt = model.transform * xyz
1375  l.write(ordinal_id=ordinal,
1376  entity_id=model.entity_for_chain[chain_id].id,
1377  seq_id_begin = all_indexes[0],
1378  seq_id_end = all_indexes[-1],
1379  asym_id=model.correct_chain_id[chain_id],
1380  Cartn_x=pt[0], Cartn_y=pt[1], Cartn_z=pt[2],
1381  object_radius=radius,
1382  rmsf=model.get_rmsf(model.comp_for_chain[chain_id],
1383  all_indexes),
1384  model_id=model.id)
1385  ordinal += 1
1386 
1387 
1388 class _ModelProtocolDumper(_Dumper):
1389  def __init__(self, simo):
1390  super(_ModelProtocolDumper, self).__init__(simo)
1391  # Protocols by state
1392  self.protocols = OrderedDict()
1393 
1394  def add(self, step):
1395  state = step.state
1396  if state not in self.protocols:
1397  # Currently support only a single protocol per state
1398  self.protocols[state] = _Protocol()
1399  self.protocols[state].id = len(self.protocols)
1400  step.num_models_begin = 0
1401  else:
1402  step.num_models_begin = self.protocols[state][-1].num_models_end
1403  self.protocols[state].append(step)
1404  # todo: support multiple protocols
1405  step.id = len(self.protocols[state])
1406  # Assume that protocol uses all currently-defined datasets
1407  step.dataset_group = self.simo.dataset_dump.get_all_group(state)
1408 
1409  def get_last_protocol(self, state):
1410  """Return the most recently-added _Protocol"""
1411  # For now, we only support a single protocol per state
1412  return self.protocols[state]
1413 
1414  def dump(self, writer):
1415  ordinal = 1
1416  with writer.loop("_ihm_modeling_protocol",
1417  ["ordinal_id", "protocol_id", "step_id",
1418  "struct_assembly_id", "dataset_group_id",
1419  "struct_assembly_description", "protocol_name",
1420  "step_name", "step_method", "num_models_begin",
1421  "num_models_end", "multi_scale_flag",
1422  "multi_state_flag", "time_ordered_flag"]) as l:
1423  for p in self.protocols.values():
1424  for step in p:
1425  l.write(ordinal_id=ordinal, protocol_id=p.id,
1426  step_id=step.id, step_method=step.method,
1427  step_name=step.name,
1428  struct_assembly_id=step.modeled_assembly.id,
1429  dataset_group_id=step.dataset_group.id,
1430  num_models_begin=step.num_models_begin,
1431  num_models_end=step.num_models_end,
1432  # todo: support multiple states, time ordered
1433  multi_state_flag=False, time_ordered_flag=False,
1434  # all PMI models are multi scale
1435  multi_scale_flag=True)
1436  ordinal += 1
1437 
1438 
1439 class _PDBHelix(object):
1440  """Represent a HELIX record from a PDB file."""
1441  def __init__(self, line):
1442  self.helix_id = line[11:14].strip()
1443  self.start_resnam = line[14:18].strip()
1444  self.start_asym = line[19]
1445  self.start_resnum = int(line[21:25])
1446  self.end_resnam = line[27:30].strip()
1447  self.end_asym = line[31]
1448  self.end_resnum = int(line[33:37])
1449  self.helix_class = int(line[38:40])
1450  self.length = int(line[71:76])
1451 
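# For reference, a HELIX record looks like the following (illustrative line;
# fixed columns per the PDB format specification):
#
#   HELIX    1  H1 ILE A   14  PRO A   19  1                                6
#
# from which the slices above give helix_id='H1', start ILE A 14, end
# PRO A 19, helix_class=1 and length=6.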
1452 
1453 class _MSESeqDif(object):
1454  """Track an MSE -> MET mutation in the starting model sequence"""
1455  comp_id = 'MET'
1456  db_comp_id = 'MSE'
1457  details = 'Conversion of modified residue MSE to MET'
1458  def __init__(self, res, component, source, model, offset):
1459  self.res, self.component, self.source = res, component, source
1460  self.model = model
1461  self.offset = offset
1462 
1463 
1464 class _StartingModelDumper(_Dumper):
1465  def __init__(self, simo):
1466  super(_StartingModelDumper, self).__init__(simo)
1467  # dict of starting models (entire PDB files), collected from fragments,
1468  # ordered by component name and state
1469  self.models = OrderedDict()
1470  # mapping from state+component+pdbname to starting model
1471  self.starting_model = {}
1472  self.output = IMP.pmi.output.Output()
1473 
1474  def add_pdb_fragment(self, fragment):
1475  """Add a starting model PDB fragment."""
1476  comp = fragment.component
1477  state = fragment.state
1478  if comp not in self.models:
1479  self.models[comp] = OrderedDict()
1480  if state not in self.models[comp]:
1481  self.models[comp][state] = []
1482  models = self.models[comp][state]
1483  if len(models) == 0 \
1484  or models[-1].fragments[0].pdbname != fragment.pdbname:
1485  model = _StartingModel(fragment)
1486  models.append(model)
1487  model.sources = self.get_sources(model, fragment.pdbname,
1488  fragment.chain)
1489  else:
1490  models[-1].fragments.append(fragment)
1491 
1492  def get_templates(self, pdbname, model):
1493  template_path_map = {}
1494  templates = []
1495  alnfile = None
1496  alnfilere = re.compile('REMARK 6 ALIGNMENT: (\S+)')
1497  tmppathre = re.compile('REMARK 6 TEMPLATE PATH (\S+) (\S+)')
1498  tmpre = re.compile('REMARK 6 TEMPLATE: '
1499  '(\S+) (\S+):\S+ \- (\S+):\S+ '
1500  'MODELS (\S+):(\S+) \- (\S+):\S+ AT (\S+)%')
1501 
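# A header line matched by tmpre would look like this (hypothetical values;
# Modeller writes such REMARK 6 records into its output PDB files):
#
#   REMARK 6 TEMPLATE: 1abcA 10:A - 100:A MODELS 5:A - 95:A AT 38.0%
#
# i.e. template 1abcA, template residues 10-100, aligned to model residues
# 5-95 on chain A, at 38.0% sequence identity.
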
1502  with open(pdbname) as fh:
1503  for line in fh:
1504  if line.startswith('ATOM'): # Read only the header
1505  break
1506  m = tmppathre.match(line)
1507  if m:
1508  template_path_map[m.group(1)] = \
1509  IMP.get_relative_path(pdbname, m.group(2))
1510  m = alnfilere.match(line)
1511  if m:
1512  # Path to alignment is relative to that of the PDB file
1513  alnfile = IMP.get_relative_path(pdbname, m.group(1))
1514  m = tmpre.match(line)
1515  if m:
1516  templates.append(_TemplateSource(m.group(1),
1517  int(m.group(2)),
1518  int(m.group(3)),
1519  int(m.group(4)),
1520  m.group(5),
1521  int(m.group(6)),
1522  m.group(7), model))
1523  # Add datasets for templates
1524  for t in templates:
1525  if t._orig_tm_code:
1526  fname = template_path_map[t._orig_tm_code]
1527  l = IMP.pmi.metadata.FileLocation(fname,
1528  details="Template for comparative modeling")
1529  else:
1530  l = IMP.pmi.metadata.PDBLocation(t.tm_db_code)
1531  d = IMP.pmi.metadata.ComparativeModelDataset(l)
1532  d = self.simo._add_dataset(d)
1533  t.tm_dataset = d
1534  model.dataset.add_parent(d)
1535 
1536  # Sort by starting residue, then ending residue
1537  return (sorted(templates,
1538  key=lambda x: (x._seq_id_begin, x._seq_id_end)),
1539  alnfile)
1540 
1541  def _parse_pdb(self, fh, first_line):
1542  """Extract information from an official PDB"""
1543  metadata = []
1544  details = ''
1545  for line in fh:
1546  if line.startswith('TITLE'):
1547  details += line[10:].rstrip()
1548  elif line.startswith('HELIX'):
1549  metadata.append(_PDBHelix(line))
1550  return (first_line[50:59].strip(),
1551  details if details else _CifWriter.unknown, metadata)
1552 
1553  def _parse_details(self, fh):
1554  """Extract TITLE records from a PDB file"""
1555  details = ''
1556  for line in fh:
1557  if line.startswith('TITLE'):
1558  details += line[10:].rstrip()
1559  elif line.startswith('ATOM'):
1560  break
1561  return details
1562 
1563  def get_sources(self, model, pdbname, chain):
1564  # Attempt to identify PDB file vs. comparative model
1565  fh = open(pdbname)
1566  first_line = fh.readline()
1567  local_file = IMP.pmi.metadata.FileLocation(pdbname,
1568  details="Starting model structure")
1569  file_dataset = self.simo.get_file_dataset(pdbname)
1570  if first_line.startswith('HEADER'):
1571  version, details, metadata = self._parse_pdb(fh, first_line)
1572  source = _PDBSource(model, first_line[62:66].strip(), chain,
1573  metadata)
1574  l = IMP.pmi.metadata.PDBLocation(source.db_code, version, details)
1575  d = IMP.pmi.metadata.PDBDataset(l)
1576  model.dataset = self.simo._add_dataset(file_dataset or d)
1577  return [source]
1578  elif first_line.startswith('EXPDTA DERIVED FROM PDB:'):
1579  # Model derived from a PDB structure; treat as a local experimental
1580  # model with the official PDB as a parent
1581  local_file.details = self._parse_details(fh)
1582  db_code = first_line[27:].strip()
1583  d = IMP.pmi.metadata.PDBDataset(local_file)
1584  pdb_loc = IMP.pmi.metadata.PDBLocation(db_code)
1585  parent = IMP.pmi.metadata.PDBDataset(pdb_loc)
1586  d.add_parent(parent)
1587  model.dataset = self.simo._add_dataset(file_dataset or d)
1588  return [_UnknownSource(model, chain)]
1589  elif first_line.startswith('EXPDTA DERIVED FROM COMPARATIVE '
1590  'MODEL, DOI:'):
1591  # Model derived from a comparative model; link back to the original
1592  # model as a parent
1593  local_file.details = self._parse_details(fh)
1594  d = IMP.pmi.metadata.ComparativeModelDataset(local_file)
1595  repo = IMP.pmi.metadata.Repository(doi=first_line[46:].strip())
1596  # todo: better specify an unknown path
1597  orig_loc = IMP.pmi.metadata.FileLocation(repo=repo, path='.',
1598  details="Starting comparative model structure")
1599  parent = IMP.pmi.metadata.ComparativeModelDataset(orig_loc)
1600  d.add_parent(parent)
1601  model.dataset = self.simo._add_dataset(file_dataset or d)
1602  return [_UnknownSource(model, chain)]
1603  elif first_line.startswith('EXPDTA THEORETICAL MODEL, MODELLER'):
1604  self.simo.software_dump.set_modeller_used(
1605  *first_line[38:].split(' ', 1))
1606  return self.handle_comparative_model(local_file, file_dataset,
1607  pdbname, model, chain)
1608  elif first_line.startswith('REMARK 99 Chain ID :'):
1609  # Model generated by Phyre2
1610  self.simo.software_dump.set_phyre2_used()
1611  return self.handle_comparative_model(local_file, file_dataset,
1612  pdbname, model, chain)
1613  else:
1614  # todo: extract Modeller-like template info for Phyre models;
1615  # revisit assumption that all such unknown source PDBs are
1616  # comparative models
1617  return self.handle_comparative_model(local_file, file_dataset,
1618  pdbname, model, chain)
1619 
1620  def handle_comparative_model(self, local_file, file_dataset, pdbname,
 1621  model, chain):
 1622  d = IMP.pmi.metadata.ComparativeModelDataset(local_file)
 1623  model.dataset = self.simo._add_dataset(file_dataset or d)
1624  templates, alnfile = self.get_templates(pdbname, model)
1625  if alnfile:
1626  model.alignment_file = IMP.pmi.metadata.FileLocation(alnfile,
1627  details="Alignment for starting "
1628  "comparative model")
1629  self.simo.extref_dump.add(model.alignment_file,
1630  _ExternalReferenceDumper.INPUT_DATA)
1631 
1632  if templates:
1633  return templates
1634  else:
1635  return [_UnknownSource(model, chain)]
1636 
1637  def assign_model_details(self):
1638  for comp, states in self.models.items():
1639  model_id = 0
1640  for state in states:
1641  for model in states[state]:
1642  model_id += 1
1643  model.name = "%s-m%d" % (comp, model_id)
1644  self.starting_model[state, comp,
1645  model.fragments[0].pdbname] = model
1646  model.seq_id_begin = min(x.start + x.offset
1647  for x in model.fragments)
1648  model.seq_id_end = max(x.end + x.offset
1649  for x in model.fragments)
1650 
1651  def all_models(self):
1652  for comp, states in self.models.items():
 1653  # For now, assume that the starting model of the same-named
 1654  # component is the same in all states, so just take the first
1655  first_state = list(states.keys())[0]
1656  for model in states[first_state]:
1657  yield model
1658 
1659  def finalize(self):
1660  self.assign_model_details()
1661 
1662  def dump(self, writer):
1663  self.dump_details(writer)
1664  self.dump_comparative(writer)
1665  seq_dif = self.dump_coords(writer)
1666  self.dump_seq_dif(writer, seq_dif)
1667 
1668  def dump_seq_dif(self, writer, seq_dif):
1669  ordinal = 1
1670  with writer.loop("_ihm_starting_model_seq_dif",
1671  ["ordinal_id", "entity_id", "asym_id",
1672  "seq_id", "comp_id", "starting_model_id",
1673  "db_asym_id", "db_seq_id", "db_comp_id",
1674  "details"]) as l:
1675  for sd in seq_dif:
1676  chain_id = self.simo._get_chain_for_component(
1677  sd.component, self.output)
1678  entity = self.simo.entities[sd.component]
1679  l.write(ordinal_id=ordinal, entity_id=entity.id,
1680  asym_id=chain_id, seq_id=sd.res.get_index(),
1681  comp_id=sd.comp_id,
1682  db_asym_id=sd.source.chain_id,
1683  db_seq_id=sd.res.get_index() - sd.offset,
1684  db_comp_id=sd.db_comp_id,
1685  starting_model_id=sd.model.name,
1686  details=sd.details)
1687  ordinal += 1
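 # In the written mmCIF this renders as a loop_ over the keys above with
 # one row per modified residue, e.g. (all values illustrative):
 #   1 1 A 42 MET Nup84-m1 A 42 MSE 'Conversion of MSE to MET'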
1688 
1689  def dump_comparative(self, writer):
1690  """Dump details on comparative models. Must be called after
1691  dump_details() since it uses IDs assigned there."""
1692  with writer.loop("_ihm_starting_comparative_models",
1693  ["ordinal_id", "starting_model_id",
1694  "starting_model_auth_asym_id",
1695  "starting_model_seq_id_begin",
1696  "starting_model_seq_id_end",
1697  "template_auth_asym_id", "template_seq_id_begin",
1698  "template_seq_id_end", "template_sequence_identity",
1699  "template_sequence_identity_denominator",
1700  "template_dataset_list_id",
1701  "alignment_file_id"]) as l:
1702  ordinal = 1
1703  for model in self.all_models():
1704  for template in [s for s in model.sources
1705  if isinstance(s, _TemplateSource)]:
1706  seq_id_begin, seq_id_end = template.get_seq_id_range(model)
1707  denom = template.sequence_identity_denominator
1708  l.write(ordinal_id=ordinal,
1709  starting_model_id=model.name,
1710  starting_model_auth_asym_id=template.chain_id,
1711  starting_model_seq_id_begin=seq_id_begin,
1712  starting_model_seq_id_end=seq_id_end,
1713  template_auth_asym_id=template.tm_chain_id,
1714  template_seq_id_begin=template.tm_seq_id_begin,
1715  template_seq_id_end=template.tm_seq_id_end,
1716  template_sequence_identity=template.sequence_identity,
1717  template_sequence_identity_denominator=denom,
1718  template_dataset_list_id=template.tm_dataset.id
1719  if template.tm_dataset
1720  else _CifWriter.unknown,
1721  alignment_file_id=model.alignment_file.id
1722  if hasattr(model, 'alignment_file')
1723  else _CifWriter.unknown)
1724  ordinal += 1
1725 
1726  def dump_details(self, writer):
1727  writer.write_comment("""IMP will attempt to identify which input models
1728 are crystal structures and which are comparative models, but does not always
1729 have sufficient information to deduce all of the templates used for comparative
1730 modeling. These may need to be added manually below.""")
1731  with writer.loop("_ihm_starting_model_details",
1732  ["starting_model_id", "entity_id", "entity_description",
1733  "asym_id", "seq_id_begin",
1734  "seq_id_end", "starting_model_source",
1735  "starting_model_auth_asym_id",
1736  "starting_model_sequence_offset",
1737  "dataset_list_id"]) as l:
1738  for model in self.all_models():
1739  f = model.fragments[0]
1740  entity = self.simo.entities[f.component]
1741  chain_id = self.simo._get_chain_for_component(f.component,
1742  self.output)
1743  source0 = model.sources[0]
 1744  # Where there are multiple sources (to date, this can only
 1745  # mean multiple templates for a comparative model), consolidate
 1746  # them; template info is given in starting_comparative_models.
1747  seq_id_begin, seq_id_end = source0.get_seq_id_range(model)
1748  for source in model.sources[1:]:
1749  this_begin, this_end = source.get_seq_id_range(model)
1750  seq_id_begin = min(seq_id_begin, this_begin)
1751  seq_id_end = max(seq_id_end, this_end)
1752  l.write(entity_id=entity.id,
1753  entity_description=entity.description,
1754  asym_id=chain_id,
1755  seq_id_begin=seq_id_begin,
1756  seq_id_end=seq_id_end,
1757  starting_model_auth_asym_id=source0.chain_id,
1758  starting_model_id=model.name,
1759  starting_model_source=source0.source,
1760  starting_model_sequence_offset=f.offset,
1761  dataset_list_id=model.dataset.id)
1762 
1763  def dump_coords(self, writer):
1764  seq_dif = []
1765  ordinal = 1
1766  with writer.loop("_ihm_starting_model_coord",
1767  ["starting_model_id", "group_PDB", "id", "type_symbol",
1768  "atom_id", "comp_id", "entity_id", "asym_id",
1769  "seq_id", "Cartn_x",
1770  "Cartn_y", "Cartn_z", "B_iso_or_equiv",
1771  "ordinal_id"]) as l:
1772  for model in self.all_models():
1773  for f in model.fragments:
1774  sel = IMP.atom.Selection(f.starting_hier,
1775  residue_indexes=list(range(f.start, f.end + 1)))
1776  last_res_index = None
1777  for a in sel.get_selected_particles():
1778  coord = IMP.core.XYZ(a).get_coordinates()
1779  atom = IMP.atom.Atom(a)
1780  element = atom.get_element()
1781  element = IMP.atom.get_element_table().get_name(element)
1782  atom_name = atom.get_atom_type().get_string()
1783  group_pdb = 'ATOM'
1784  if atom_name.startswith('HET:'):
1785  group_pdb = 'HETATM'
1786  atom_name = atom_name[4:]
1787  res = IMP.atom.get_residue(atom)
1788  res_name = res.get_residue_type().get_string()
1789  # MSE in the original PDB is automatically mutated
1790  # by IMP to MET, so reflect that in the output,
1791  # and pass back to populate the seq_dif category.
1792  if res_name == 'MSE':
1793  res_name = 'MET'
1794  # Only add one seq_dif record per residue
1795  ind = res.get_index()
1796  if ind != last_res_index:
1797  last_res_index = ind
1798  # This should only happen when we're using
1799  # a crystal structure as the source (a
1800  # comparative model would use MET in
1801  # the sequence)
 1802  assert len(model.sources) == 1
1803  seq_dif.append(_MSESeqDif(res, f.component,
1804  model.sources[0],
1805  model, f.offset))
1806  chain_id = self.simo._get_chain_for_component(
1807  f.component, self.output)
1808  entity = self.simo.entities[f.component]
1809  l.write(starting_model_id=model.name,
1810  group_PDB=group_pdb,
1811  id=atom.get_input_index(), type_symbol=element,
1812  atom_id=atom_name, comp_id=res_name,
1813  entity_id=entity.id,
1814  asym_id=chain_id,
1815  seq_id=res.get_index(), Cartn_x=coord[0],
1816  Cartn_y=coord[1], Cartn_z=coord[2],
1817  B_iso_or_equiv=atom.get_temperature_factor(),
1818  ordinal_id=ordinal)
1819  ordinal += 1
1820  return seq_dif
1821 
1822 class _StructConfDumper(_Dumper):
1823  def all_rigid_fragments(self):
1824  """Yield all rigid model representation fragments"""
1825  asym_states = {}
1826  model_repr = self.simo.model_repr_dump
1827  for comp, statefrag in model_repr.fragments.items():
 1828  # For now, assume that the representation of the same-named
 1829  # component is the same in all states, so just take the first
1830  state = list(statefrag.keys())[0]
1831  for f in statefrag[state]:
1832  if hasattr(f, 'pdbname') \
1833  and model_repr.get_model_mode(f) == 'rigid':
1834  asym = get_asym_mapper_for_state(self.simo, f.state,
1835  asym_states)
1836  yield (f, model_repr.starting_model[state, comp, f.pdbname],
1837  asym[f.hier])
1838 
1839  def all_helices(self):
1840  """Yield all helices that overlap with rigid model fragments"""
1841  for f, starting_model, asym_id in self.all_rigid_fragments():
1842  if len(starting_model.sources) == 1 \
1843  and isinstance(starting_model.sources[0], _PDBSource):
1844  pdb = starting_model.sources[0]
1845  for m in pdb.metadata:
1846  if isinstance(m, _PDBHelix) \
1847  and m.start_asym == pdb.chain_id \
1848  and m.end_asym == pdb.chain_id \
1849  and m.start_resnum >= f.start and m.end_resnum <= f.end:
1850  yield (m, max(f.start, m.start_resnum),
1851  min(f.end, m.end_resnum), asym_id)
1852 
1853  def dump(self, writer):
1854  with writer.category("_struct_conf_type") as l:
1855  l.write(id='HELX_P', criteria=_CifWriter.unknown,
1856  reference=_CifWriter.unknown)
1857  # Dump helix information for the model. For any model fragment that
1858  # is rigid, atomic, and uses an experimental PDB structure as the
1859  # starting model, inherit any helix information from that PDB file.
1860  # Note that we can't use the helix id from the original PDB, since
1861  # it has to be unique and this might not be the case if we inherit
1862  # from multiple PDBs.
1863  ordinal = 0
1864  with writer.loop("_struct_conf",
1865  ["id", "conf_type_id", "beg_label_comp_id",
1866  "beg_label_asym_id", "beg_label_seq_id",
1867  "end_label_comp_id", "end_label_asym_id",
1868  "end_label_seq_id",]) as l:
1869  for h, begin, end, asym_id in self.all_helices():
1870  ordinal += 1
1871  l.write(id='HELX_P%d' % ordinal, conf_type_id='HELX_P',
1872  beg_label_comp_id=h.start_resnam,
1873  beg_label_asym_id=asym_id,
1874  beg_label_seq_id=begin,
1875  end_label_comp_id=h.end_resnam,
1876  end_label_asym_id=asym_id,
1877  end_label_seq_id=end)
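 # e.g. a single inherited helix spanning residues 10-25 of asym A would
 # appear (values illustrative) as the row:
 #   HELX_P1 HELX_P ALA A 10 LEU A 25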
1878 
1879 
1880 class _PostProcess(object):
1881  """Base class for any post processing"""
1882  pass
1883 
1884 class _ReplicaExchangeAnalysisPostProcess(_PostProcess):
1885  """Post processing using AnalysisReplicaExchange0 macro"""
1886  type = 'cluster'
1887  feature = 'RMSD'
1888 
1889  def __init__(self, protocol, rex, num_models_begin):
1890  self.protocol = protocol
1891  self.rex = rex
1892  self.num_models_begin = num_models_begin
1893  self.num_models_end = 0
1894  for fname in self.get_all_stat_files():
1895  with open(fname) as fh:
1896  self.num_models_end += len(fh.readlines())
1897 
1898  def get_stat_file(self, cluster_num):
1899  return os.path.join(self.rex._outputdir, "cluster.%d" % cluster_num,
1900  'stat.out')
1901 
1902  def get_all_stat_files(self):
1903  for i in range(self.rex._number_of_clusters):
1904  yield self.get_stat_file(i)
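 # Expected layout under the AnalysisReplicaExchange0 output directory
 # (a sketch; the cluster count comes from rex._number_of_clusters):
 #   <outputdir>/cluster.0/stat.out
 #   <outputdir>/cluster.1/stat.out
 # num_models_end above is the total line (= model) count of these files.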
1905 
1906 class _SimplePostProcess(_PostProcess):
1907  """Simple ad hoc clustering"""
1908  type = 'cluster'
1909  feature = 'RMSD'
1910 
1911  def __init__(self, protocol, num_models_begin, num_models_end):
1912  self.protocol = protocol
1913  self.num_models_begin = num_models_begin
1914  self.num_models_end = num_models_end
1915 
1916 class _PostProcessDumper(_Dumper):
1917  def __init__(self, simo):
1918  super(_PostProcessDumper, self).__init__(simo)
1919  self.postprocs = []
1920 
1921  def add(self, postproc):
1922  protocol = postproc.protocol
1923  self.postprocs.append(postproc)
1924  postproc.id = len(self.postprocs)
1925 
1926  def finalize(self):
1927  # Assign step IDs per protocol
1928  pp_by_protocol = {}
1929  for p in self.postprocs:
1930  protocol = p.protocol
1931  if id(protocol) not in pp_by_protocol:
1932  pp_by_protocol[id(protocol)] = []
1933  by_prot = pp_by_protocol[id(protocol)]
1934  by_prot.append(p)
1935  p.step_id = len(by_prot)
1936 
1937  def dump(self, writer):
1938  with writer.loop("_ihm_modeling_post_process",
1939  ["id", "protocol_id", "analysis_id", "step_id",
1940  "type", "feature", "num_models_begin",
1941  "num_models_end"]) as l:
1942  # todo: handle multiple analyses
1943  for p in self.postprocs:
1944  l.write(id=p.id, protocol_id=p.protocol.id, analysis_id=1,
1945  step_id=p.step_id, type=p.type, feature=p.feature,
1946  num_models_begin=p.num_models_begin,
1947  num_models_end=p.num_models_end)
1948 
1949 class _Ensemble(object):
1950  """Base class for any ensemble"""
1951  pass
1952 
1953 
1954 class _ReplicaExchangeAnalysisEnsemble(_Ensemble):
1955  """Ensemble generated using AnalysisReplicaExchange0 macro"""
1956 
1957  def __init__(self, pp, cluster_num, model_group, num_deposit):
1958  self.file = None
1959  self.model_group = model_group
1960  self.cluster_num = cluster_num
1961  self.postproc = pp
1962  self.num_deposit = num_deposit
1963  self.localization_density = {}
1964  with open(pp.get_stat_file(cluster_num)) as fh:
1965  self.num_models = len(fh.readlines())
1966 
1967  def get_rmsf_file(self, component):
1968  return os.path.join(self.postproc.rex._outputdir,
1969  'cluster.%d' % self.cluster_num,
1970  'rmsf.%s.dat' % component)
1971 
1972  def load_rmsf(self, model, component):
1973  fname = self.get_rmsf_file(component)
1974  if os.path.exists(fname):
1975  model.parse_rmsf_file(fname, component)
1976 
1977  def get_localization_density_file(self, component):
 1978  # todo: this assumes that the localization density file name matches
 1979  # the component name and covers the complete residue range (usually
 1980  # this is the case, but it doesn't have to be)
1981  return os.path.join(self.postproc.rex._outputdir,
1982  'cluster.%d' % self.cluster_num,
1983  '%s.mrc' % component)
1984 
1985  def load_localization_density(self, state, component, extref_dump):
1986  fname = self.get_localization_density_file(component)
1987  if os.path.exists(fname):
1988  details = "Localization density for %s %s" \
1989  % (component, self.model_group.name)
1990  local_file = IMP.pmi.metadata.FileLocation(fname,
1991  details=state.get_postfixed_name(details))
1992  self.localization_density[component] = local_file
1993  extref_dump.add(local_file,
1994  _ExternalReferenceDumper.MODELING_OUTPUT)
1995 
1996  def load_all_models(self, simo, state):
1997  stat_fname = self.postproc.get_stat_file(self.cluster_num)
1998  model_num = 0
1999  with open(stat_fname) as fh:
2000  stats = ast.literal_eval(fh.readline())
 2001  # Construct the correct path to the corresponding RMF file
2002  rmf_file = os.path.join(os.path.dirname(stat_fname),
2003  "%d.rmf3" % model_num)
2004  for c in state.all_modeled_components:
2005  # todo: this only works with PMI 1
2006  state._pmi_object.set_coordinates_from_rmf(c, rmf_file, 0,
2007  force_rigid_update=True)
2008  # todo: fill in other data from stat file, e.g. crosslink phi/psi
2009  yield stats
2010  model_num += 1
2011  if model_num >= self.num_deposit:
2012  return
2013 
2014  # todo: also support dRMS precision
2015  def _get_precision(self):
2016  precfile = os.path.join(self.postproc.rex._outputdir,
2017  "precision.%d.%d.out" % (self.cluster_num,
2018  self.cluster_num))
2019  if not os.path.exists(precfile):
2020  return _CifWriter.unknown
 2021  # Return nothing if the precision.x.x.out file doesn't match the cluster
2022  r = re.compile('All .*/cluster.%d/ average centroid distance ([\d\.]+)'
2023  % self.cluster_num)
2024  with open(precfile) as fh:
2025  for line in fh:
2026  m = r.match(line)
2027  if m:
2028  return float(m.group(1))
2029 
2030  feature = property(lambda self: self.postproc.feature)
2031  name = property(lambda self: "cluster %d" % (self.cluster_num + 1))
2032  precision = property(lambda self: self._get_precision())
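 # _get_precision() expects a line like (path and value illustrative):
 #   All models in /path/to/cluster.0/ average centroid distance 11.2
 # and returns the final number; if precision.N.N.out is missing, the
 # precision is reported as unknown ('?').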
2033 
2034 class _SimpleEnsemble(_Ensemble):
2035  """Simple manually-created ensemble"""
2036 
2037  feature = 'dRMSD'
2038 
2039  def __init__(self, pp, model_group, num_models, drmsd,
2040  num_models_deposited, ensemble_file):
2041  self.file = ensemble_file
2042  self.postproc = pp
2043  self.model_group = model_group
2044  self.num_deposit = num_models_deposited
2045  self.localization_density = {}
2046  self.num_models = num_models
2047  self.precision = drmsd
2048 
2049  def load_localization_density(self, state, component, local_file,
2050  extref_dump):
2051  self.localization_density[component] = local_file
2052  extref_dump.add(local_file,
2053  _ExternalReferenceDumper.MODELING_OUTPUT)
2054 
2055  name = property(lambda self: self.model_group.name)
2056 
2057 
2058 class _EnsembleDumper(_Dumper):
2059  def __init__(self, simo):
2060  super(_EnsembleDumper, self).__init__(simo)
2061  self.ensembles = []
2062 
2063  def add(self, ensemble):
2064  self.ensembles.append(ensemble)
2065  ensemble.id = len(self.ensembles)
2066 
2067  def dump(self, writer):
2068  with writer.loop("_ihm_ensemble_info",
2069  ["ensemble_id", "ensemble_name", "post_process_id",
2070  "model_group_id", "ensemble_clustering_method",
2071  "ensemble_clustering_feature",
2072  "num_ensemble_models",
2073  "num_ensemble_models_deposited",
2074  "ensemble_precision_value",
2075  "ensemble_file_id"]) as l:
2076  for e in self.ensembles:
2077  state = e.model_group.state
2078  l.write(ensemble_id=e.id,
2079  ensemble_name=state.get_prefixed_name(e.name),
2080  post_process_id=e.postproc.id,
2081  model_group_id=e.model_group.id,
2082  ensemble_clustering_feature=e.feature,
2083  num_ensemble_models=e.num_models,
2084  num_ensemble_models_deposited=e.num_deposit,
2085  ensemble_precision_value=e.precision,
2086  ensemble_file_id=e.file.id if e.file
2087  else _CifWriter.omitted)
2088 
2089 class _DensityDumper(_Dumper):
2090  """Output localization densities for ensembles"""
2091 
2092  def __init__(self, simo):
2093  super(_DensityDumper, self).__init__(simo)
2094  self.ensembles = []
2095  self.output = IMP.pmi.output.Output()
2096 
2097  def add(self, ensemble):
2098  self.ensembles.append(ensemble)
2099 
2100  def get_density(self, ensemble, component):
2101  return ensemble.localization_density.get(component, None)
2102 
2103  def dump(self, writer):
2104  with writer.loop("_ihm_localization_density_files",
2105  ["id", "file_id", "ensemble_id", "entity_id",
2106  "asym_id", "seq_id_begin", "seq_id_end"]) as l:
2107  ordinal = 1
2108  for ensemble in self.ensembles:
2109  for comp in self.simo.all_modeled_components:
2110  density = self.get_density(ensemble, comp)
2111  if not density:
2112  continue
2113  entity = self.simo.entities[comp]
2114  lenseq = len(entity.sequence)
2115  chain_id = self.simo._get_chain_for_component(comp,
2116  self.output)
2117  l.write(id=ordinal, ensemble_id=ensemble.id,
2118  entity_id=entity.id,
2119  file_id=density.id,
2120  seq_id_begin=1, seq_id_end=lenseq,
2121  asym_id=chain_id)
2122  ordinal += 1
2123 
2124 
2125 class _MultiStateDumper(_Dumper):
2126  """Output information on multiple states"""
2127 
2128  def dump(self, writer):
2129  states = sorted(self.simo._states.keys(),
2130  key=operator.attrgetter('id'))
2131  # Nothing to do for single state modeling
2132  if len(states) <= 1:
2133  return
2134  # Sort all model groups first by state, then by their own ID
2135  groups = sorted(self.simo.model_groups,
2136  key=lambda g: (g.state.id, g.id))
2137  with writer.loop("_ihm_multi_state_modeling",
2138  ["ordinal_id", "state_id", "state_group_id",
2139  "population_fraction", "state_type", "state_name",
2140  "model_group_id", "experiment_type", "details"]) as l:
2141  for n, group in enumerate(groups):
2142  state = group.state
2143  l.write(ordinal_id=n+1, state_id=state.id,
2144  state_group_id=state.id,
2145  model_group_id=group.id,
2146  state_name=state.long_name if state.long_name
2147  else _CifWriter.omitted,
2148  # No IMP models are currently single molecule
2149  experiment_type='Fraction of bulk',
2150  details=state.get_prefixed_name(group.name))
2151 
2152 
2153 class _Entity(object):
2154  """Represent a CIF entity (a chain with a unique sequence)"""
2155  def __init__(self, seq, first_component):
2156  self.sequence = seq
2157  self.first_component = first_component
2158  # Use the name of the first component as the description
2159  self.description = first_component
2160 
2161 class _EntityMapper(dict):
2162  """Handle mapping from IMP components to CIF entities.
2163  Multiple components may map to the same entity if they share sequence."""
2164  def __init__(self):
2165  super(_EntityMapper, self).__init__()
2166  self._sequence_dict = {}
2167  self._entities = []
2168 
2169  def add(self, component_name, sequence):
2170  if sequence not in self._sequence_dict:
2171  entity = _Entity(sequence, component_name)
2172  self._entities.append(entity)
2173  entity.id = len(self._entities)
2174  self._sequence_dict[sequence] = entity
2175  self[component_name] = self._sequence_dict[sequence]
2176 
2177  def get_all(self):
2178  """Yield all entities"""
2179  return self._entities
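 # A sketch of the mapping behavior (component names hypothetical):
 #   em = _EntityMapper()
 #   em.add('Rpb4.1', 'MSEQVK')
 #   em.add('Rpb4.2', 'MSEQVK')    # identical sequence
 #   em['Rpb4.1'] is em['Rpb4.2']  # True: both map to the same _Entity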
2180 
2181 
2182 class _RestraintDataset(object):
2183  """Wrapper around a dataset associated with a restraint.
2184  This is needed because we need to delay access to the dataset
2185  in case the writer of the PMI script overrides or changes it
2186  after creating the restraint."""
2187  def __init__(self, restraint, num, allow_duplicates):
2188  self.restraint = restraint
2189  self.num, self.allow_duplicates = num, allow_duplicates
2190  self.__dataset = None
2191  def __get_dataset(self):
2192  if self.__dataset:
2193  return self.__dataset
2194  if self.num is not None:
2195  d = copy.deepcopy(self.restraint.datasets[self.num])
2196  else:
2197  d = copy.deepcopy(self.restraint.dataset)
2198  if self.allow_duplicates:
2199  d.location._allow_duplicates = True
2200  # Don't copy again next time we access self.dataset
2201  self.__dataset = d
2202  return d
2203  dataset = property(__get_dataset)
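 # Sketch of the intended laziness: the deep copy happens only on first
 # access, so changes made in the PMI script beforehand are honored:
 #   rd = _RestraintDataset(r, num=None, allow_duplicates=False)
 #   r.dataset = other_dataset  # still picked up...
 #   d = rd.dataset             # ...because the copy happens here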
2204 
2205 
2206 class _State(object):
2207  """Representation of a single state in the system."""
2208 
2209  def __init__(self, pmi_object, po):
2210  # Point to the PMI object for this state. Use a weak reference
2211  # since the state object typically points to us too, so we need
2212  # to break the reference cycle. In PMI1 this will be a
2213  # Representation object.
2214  self._pmi_object = weakref.proxy(pmi_object)
2215  self._pmi_state = pmi_object.state
2216 
2217  # The assembly of all components modeled by IMP in this state.
2218  # This may be smaller than the complete assembly.
2219  self.modeled_assembly = _Assembly()
2220  po.assembly_dump.add(self.modeled_assembly)
2221 
2222  self.all_modeled_components = []
2223 
2224  def get_prefixed_name(self, name):
2225  """Prefix the given name with the state name, if available."""
2226  if self.short_name:
2227  return self.short_name + ' ' + name
2228  else:
2229  return name.capitalize()
2230 
2231  def get_postfixed_name(self, name):
2232  """Postfix the given name with the state name, if available."""
2233  if self.short_name:
2234  return "%s in state %s" % (name, self.short_name)
2235  else:
2236  return name
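 # For a state with short_name 'open' (hypothetical):
 #   get_prefixed_name('cluster 1') -> 'open cluster 1'
 #   get_postfixed_name('Density for Rpb4')
 #       -> 'Density for Rpb4 in state open'
 # Without a short name, the name is capitalized or returned unchanged.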
2237 
2238  short_name = property(lambda self: self._pmi_state.short_name)
2239  long_name = property(lambda self: self._pmi_state.long_name)
2240 
 2241 
 2242 class ProtocolOutput(object):
 2243  """Class to encode a modeling protocol as mmCIF.
2244 
2245  IMP has basic support for writing out files in mmCIF format, for
2246  deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
2247  After creating an instance of this class, attach it to an
2248  IMP.pmi.representation.Representation object. After this, any
2249  generated models and metadata are automatically collected and
2250  output as mmCIF.
2251  """
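 # A minimal usage sketch (PMI 1 style; the model 'm' and the output file
 # name are assumptions, not fixed API):
 #   po = IMP.pmi.mmcif.ProtocolOutput(open('system.cif', 'w'))
 #   simo = IMP.pmi.representation.Representation(m)
 #   simo.add_protocol_output(po)
 #   # ... normal PMI modeling here ...
 #   po.flush()  # write all collected data as mmCIF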
2252  def __init__(self, fh):
2253  self._state_ensemble_offset = 0
2254  self._each_metadata = [] # list of metadata for each representation
2255  self._file_datasets = []
2256  self._main_script = os.path.abspath(sys.argv[0])
2257  self._states = {}
2258  self._working_directory = os.getcwd()
2259  self._cif_writer = _CifWriter(fh)
2260  self.entities = _EntityMapper()
2261  self.chains = {}
2262  self._all_components = {}
2263  self.all_modeled_components = []
2264  self.model_groups = []
2265  self.default_model_group = None
2266  self.sequence_dict = {}
2267  self.model_repr_dump = _ModelRepresentationDumper(self)
2268  self.cross_link_dump = _CrossLinkDumper(self)
2269  self.em2d_dump = _EM2DDumper(self)
2270  self.em3d_dump = _EM3DDumper(self)
2271  self.model_prot_dump = _ModelProtocolDumper(self)
2272  self.extref_dump = _ExternalReferenceDumper(self)
2273  self.dataset_dump = _DatasetDumper(self)
2274  self.starting_model_dump = _StartingModelDumper(self)
2275  self.assembly_dump = _AssemblyDumper(self)
2276 
2277  # The assembly of all known components.
2278  self.complete_assembly = _Assembly()
2279  self.assembly_dump.add(self.complete_assembly)
2280 
2281  self.model_dump = _ModelDumper(self)
2282  self.model_repr_dump.starting_model \
2283  = self.starting_model_dump.starting_model
2284  self.software_dump = _SoftwareDumper(self)
2285  self.post_process_dump = _PostProcessDumper(self)
2286  self.ensemble_dump = _EnsembleDumper(self)
2287  self.density_dump = _DensityDumper(self)
2288 
2289  # Some dumpers add per-model information; give them a pointer to
2290  # the model list
2291  self.cross_link_dump.models = self.model_dump.models
2292  self.em3d_dump.models = self.model_dump.models
2293  self.em2d_dump.models = self.model_dump.models
2294 
2295  self._dumpers = [_EntryDumper(self), # should always be first
2296  _AuditAuthorDumper(self),
2297  self.software_dump, _CitationDumper(self),
2298  _ChemCompDumper(self),
2299  _EntityDumper(self),
2300  _EntityPolyDumper(self), _EntityPolySeqDumper(self),
2301  _StructAsymDumper(self),
2302  self.assembly_dump,
2303  self.model_repr_dump, self.extref_dump,
2304  self.dataset_dump,
2305  self.cross_link_dump,
2306  self.em2d_dump, self.em3d_dump,
2307  self.starting_model_dump,
2308  # todo: detect atomic models and emit struct_conf
2309  #_StructConfDumper(self),
2310  self.model_prot_dump, self.post_process_dump,
2311  self.ensemble_dump, self.density_dump, self.model_dump,
2312  _MultiStateDumper(self)]
2313 
2314  def _add_state(self, state):
2315  """Create a new state and return a pointer to it."""
2316  self._state_ensemble_offset = len(self.ensemble_dump.ensembles)
2317  s = _State(state, self)
2318  if not self._states:
2319  self._first_state = s
2320  self._states[s] = None
2321  s.id = len(self._states)
2322  self._last_state = s
2323  return s
2324 
2325  def get_file_dataset(self, fname):
2326  for d in self._file_datasets:
2327  fd = d.get(os.path.abspath(fname), None)
2328  if fd:
2329  return fd
2330 
2331  def _get_chain_for_component(self, name, output):
2332  """Get the chain ID for a component, if any."""
2333  # todo: handle multiple copies
2334  if name in self.chains:
2335  chain = self.chains[name]
2336  return output.chainids[chain]
2337  else:
2338  # A non-modeled component doesn't have a chain ID
2339  return _CifWriter.omitted
2340 
2341  def create_component(self, state, name, modeled):
2342  new_comp = name not in self._all_components
2343  self._all_components[name] = None
2344  if modeled:
2345  state.all_modeled_components.append(name)
2346  if new_comp:
2347  self.chains[name] = len(self.chains)
2348  self.all_modeled_components.append(name)
2349  state.modeled_assembly.append(name)
2350  if new_comp:
2351  self.complete_assembly.append(name)
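 # Chain IDs are handed out in order of component creation: the n-th
 # modeled component maps to output.chainids[n] (typically 'A', 'B', ...),
 # while non-modeled components are written as omitted ('.').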
2352 
2353  def add_component_sequence(self, name, seq):
2354  if name in self.sequence_dict:
2355  if self.sequence_dict[name] != seq:
2356  raise ValueError("Sequence mismatch for component %s" % name)
2357  else:
2358  self.sequence_dict[name] = seq
2359  self.entities.add(name, seq)
2360 
2361  def flush(self):
2362  for dumper in self._dumpers:
2363  dumper.finalize_metadata()
2364  for dumper in self._dumpers:
2365  dumper.finalize()
2366  for dumper in self._dumpers:
2367  dumper.dump(self._cif_writer)
2368 
2369  def add_pdb_element(self, state, name, start, end, offset, pdbname,
2370  chain, hier):
2371  p = _PDBFragment(state, name, start, end, offset, pdbname, chain,
2372  hier)
2373  self.model_repr_dump.add_fragment(state, p)
2374  self.starting_model_dump.add_pdb_fragment(p)
2375 
2376  def add_bead_element(self, state, name, start, end, num, hier):
2377  b = _BeadsFragment(state, name, start, end, num, hier)
2378  self.model_repr_dump.add_fragment(state, b)
2379 
2380  def _get_restraint_dataset(self, r, num=None, allow_duplicates=False):
2381  """Get a wrapper object for the dataset used by this restraint.
2382  This is needed because the restraint's dataset may be changed
2383  in the PMI script before we get to use it."""
2384  rs = _RestraintDataset(r, num, allow_duplicates)
2385  self._add_dataset(rs)
2386  return rs
2387 
2388  def get_cross_link_group(self, r):
2389  return _CrossLinkGroup(r, self._get_restraint_dataset(r))
2390 
2391  def add_experimental_cross_link(self, r1, c1, r2, c2, length, group):
2392  if c1 not in self._all_components or c2 not in self._all_components:
2393  # Crosslink refers to a component we didn't model
2394  # As a quick hack, just ignore it.
2395  # todo: need to add an entity for this anyway (so will need the
2396  # sequence, and add to struct_assembly)
2397  return None
2398  xl = _ExperimentalCrossLink(r1, c1, r2, c2, length, group)
2399  self.cross_link_dump.add_experimental(xl)
2400  return xl
2401 
2402  def add_cross_link(self, state, ex_xl, p1, p2, sigma1, sigma2, psi):
2403  self.cross_link_dump.add(_CrossLink(state, ex_xl, p1, p2, sigma1,
2404  sigma2, psi))
2405 
2406  def add_replica_exchange(self, state, rex):
2407  # todo: allow for metadata to say how many replicas were used in the
2408  # actual experiment, and how many independent runs were carried out
2409  # (use these as multipliers to get the correct total number of
2410  # output models)
2411  self.model_prot_dump.add(_ReplicaExchangeProtocolStep(state, rex))
2412 
2413  def _add_dataset(self, dataset):
2414  return self.dataset_dump.add(self._last_state, dataset)
2415 
2416  def add_model_group(self, group):
2417  self.model_groups.append(group)
2418  group.id = len(self.model_groups)
2419  return group
2420 
2421  def _add_simple_postprocessing(self, num_models_begin, num_models_end):
2422  # Always assumed that we're dealing with the last state
2423  state = self._last_state
2424  protocol = self.model_prot_dump.get_last_protocol(state)
2425  pp = _SimplePostProcess(protocol, num_models_begin, num_models_end)
2426  self.post_process_dump.add(pp)
2427  return pp
2428 
2429  def _add_simple_ensemble(self, pp, name, num_models, drmsd,
2430  num_models_deposited, localization_densities,
2431  ensemble_file):
2432  """Add an ensemble generated by ad hoc methods (not using PMI).
2433  This is currently only used by the Nup84 system."""
2434  # Always assumed that we're dealing with the last state
2435  state = self._last_state
2436  group = self.add_model_group(_ModelGroup(state, name))
2437  self.extref_dump.add(ensemble_file,
2438  _ExternalReferenceDumper.MODELING_OUTPUT)
2439  e = _SimpleEnsemble(pp, group, num_models, drmsd, num_models_deposited,
2440  ensemble_file)
2441  self.ensemble_dump.add(e)
2442  self.density_dump.add(e)
2443  for c in state.all_modeled_components:
2444  den = localization_densities.get(c, None)
2445  if den:
2446  e.load_localization_density(state, c, den, self.extref_dump)
2447  return e
2448 
2449  def set_ensemble_file(self, i, location):
2450  """Point a previously-created ensemble to an 'all-models' file.
2451  This could be a trajectory such as DCD, an RMF, or a multimodel
2452  PDB file."""
2453  self.extref_dump.add(location,
2454  _ExternalReferenceDumper.MODELING_OUTPUT)
2455  # Ensure that we point to an ensemble related to the current state
2456  ind = i + self._state_ensemble_offset
2457  self.ensemble_dump.ensembles[ind].file = location
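 # e.g. (a sketch; the index and file name are assumptions):
 #   po.set_ensemble_file(0, IMP.pmi.metadata.FileLocation('cluster0.dcd'))
 # attaches a DCD trajectory to the first ensemble of the current state.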
2458 
2459  def add_replica_exchange_analysis(self, state, rex):
2460  # todo: add prefilter as an additional postprocess step (complication:
2461  # we don't know how many models it filtered)
2462  # todo: postpone rmsf/density localization extraction until after
2463  # relevant methods are called (currently it is done from the
2464  # constructor)
2465  protocol = self.model_prot_dump.get_last_protocol(state)
2466  num_models = protocol[-1].num_models_end
2467  pp = _ReplicaExchangeAnalysisPostProcess(protocol, rex, num_models)
2468  self.post_process_dump.add(pp)
2469  for i in range(rex._number_of_clusters):
2470  group = self.add_model_group(_ModelGroup(state,
2471  'cluster %d' % (i + 1)))
2472  # todo: make # of models to deposit configurable somewhere
2473  e = _ReplicaExchangeAnalysisEnsemble(pp, i, group, 1)
2474  self.ensemble_dump.add(e)
2475  self.density_dump.add(e)
2476  # Add localization density info if available
2477  for c in state.all_modeled_components:
2478  e.load_localization_density(state, c, self.extref_dump)
2479  for stats in e.load_all_models(self, state):
2480  m = self.add_model(group)
2481  # Since we currently only deposit 1 model, it is the
2482  # best scoring one
2483  m.name = 'Best scoring model'
2484  m.stats = stats
2485  # Add RMSF info if available
2486  for c in state.all_modeled_components:
2487  e.load_rmsf(m, c)
2488 
2489  def add_em2d_restraint(self, state, r, i, resolution, pixel_size,
2490  image_resolution, projection_number):
2491  d = self._get_restraint_dataset(r, i)
2492  self.em2d_dump.add(_EM2DRestraint(state, d, r, i, resolution,
2493  pixel_size, image_resolution,
2494  projection_number))
2495 
2496  def add_em3d_restraint(self, state, target_ps, densities, r):
2497  # A 3DEM restraint's dataset ID uniquely defines the restraint, so
2498  # we need to allow duplicates
2499  d = self._get_restraint_dataset(r, allow_duplicates=True)
2500  self.em3d_dump.add(_EM3DRestraint(self, state, d, r, target_ps,
2501  densities))
2502 
2503  def add_model(self, group):
2504  state = group.state
2505  return self.model_dump.add(state.prot,
2506  self.model_prot_dump.get_last_protocol(state),
2507  state.modeled_assembly, group)
2508 
2509  def _update_location(self, fileloc):
2510  """Update FileLocation to point to a parent repository, if any"""
2511  all_repos = [m for m in self._metadata
 2512  if isinstance(m, IMP.pmi.metadata.Repository)]
 2513  IMP.pmi.metadata.Repository.update_in_repos(fileloc, all_repos)
 2514 
2515  _metadata = property(lambda self:
2516  itertools.chain.from_iterable(self._each_metadata))