1 """@namespace IMP.pmi.mmcif
2 @brief Support for the mmCIF file format.
4 IMP has basic support for writing out files in mmCIF format, for
5 deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
6 mmCIF files are currently generated by creating an
7 IMP.pmi.mmcif.ProtocolOutput class, and attaching it to an
8 IMP.pmi.representation.Representation object, after which any
9 generated models and metadata are collected and output as mmCIF.
12 from __future__
import print_function
34 if sys.version_info[0] >= 3:
39 def _assign_id(obj, seen_objs, obj_by_id):
40 """Assign a unique ID to obj, and track all ids in obj_by_id."""
41 if obj
not in seen_objs:
42 if not hasattr(obj,
'id'):
44 obj.id = len(obj_by_id)
45 seen_objs[obj] = obj.id
47 obj.id = seen_objs[obj]
49 class _LineWriter(object):
50 def __init__(self, writer, line_len=80):
52 self.line_len = line_len
55 if isinstance(val, str)
and '\n' in val:
56 self.writer.fh.write(
"\n;")
57 self.writer.fh.write(val)
58 if not val.endswith(
'\n'):
59 self.writer.fh.write(
"\n")
60 self.writer.fh.write(
";\n")
63 val = self.writer._repr(val)
65 if self.column + len(val) + 1 > self.line_len:
66 self.writer.fh.write(
"\n")
69 self.writer.fh.write(
" ")
71 self.writer.fh.write(val)
72 self.column += len(val)
75 class _CifCategoryWriter(object):
76 def __init__(self, writer, category):
78 self.category = category
79 def write(self, **kwargs):
80 self.writer._write(self.category, kwargs)
83 def __exit__(self, exc_type, exc_value, traceback):
87 class _CifLoopWriter(object):
88 def __init__(self, writer, category, keys):
90 self.category = category
93 self.python_keys = [k.replace(
'[',
'').replace(
']',
'')
for k
in keys]
94 self._empty_loop =
True
95 def write(self, **kwargs):
100 f.write(
"%s.%s\n" % (self.category, k))
101 self._empty_loop =
False
102 l = _LineWriter(self.writer)
103 for k
in self.python_keys:
104 l.write(kwargs.get(k, self.writer.omitted))
105 self.writer.fh.write(
"\n")
108 def __exit__(self, exc_type, exc_value, traceback):
109 if not self._empty_loop:
110 self.writer.fh.write(
"#\n")
113 class _CifWriter(object):
116 _boolmap = {
False:
'NO',
True:
'YES'}
118 def __init__(self, fh):
120 def category(self, category):
121 return _CifCategoryWriter(self, category)
122 def loop(self, category, keys):
123 return _CifLoopWriter(self, category, keys)
124 def write_comment(self, comment):
125 for line
in textwrap.wrap(comment, 78):
126 self.fh.write(
'# ' + line +
'\n')
127 def _write(self, category, kwargs):
129 self.fh.write(
"%s.%s %s\n" % (category, key,
130 self._repr(kwargs[key])))
131 def _repr(self, obj):
132 if isinstance(obj, str)
and '"' not in obj \
133 and "'" not in obj
and " " not in obj:
135 elif isinstance(obj, float):
137 elif isinstance(obj, bool):
138 return self._boolmap[obj]
141 elif isinstance(obj, _long_type):
def _get_by_residue(p):
    """Determine whether the given particle represents a specific residue
       or a more coarse-grained object."""
    # NOTE(review): body reconstructed from elided lines — confirm against
    # upstream; a particle is per-residue if set up as a Residue or as an
    # atom within a residue
    return IMP.atom.Residue.get_is_setup(p) or IMP.atom.Atom.get_is_setup(p)
class _ComponentMapper(object):
    """Map a Particle to a component name"""
    def __init__(self, prot):
        # Reuse PMI's Output machinery to map particles to components
        self.o = IMP.pmi.output.Output()
        self.prot = prot
        self.name = 'cif-output'
        self.o.dictionary_pdbs[self.name] = self.prot
        self.o._init_dictchain(self.name, self.prot)

    def __getitem__(self, p):
        protname, is_a_bead = self.o.get_prot_name_from_particle(self.name, p)
        return protname
class _AsymIDMapper(object):
    """Map a Particle to an asym_id (chain ID)"""
    def __init__(self, simo, prot):
        self.simo = simo
        self._cm = _ComponentMapper(prot)

    def __getitem__(self, p):
        # First map the particle to a component, then to that
        # component's chain
        protname = self._cm[p]
        return self.simo._get_chain_for_component(protname, self._cm.o)
176 class _Dumper(object):
177 """Base class for helpers to dump output to mmCIF"""
178 def __init__(self, simo):
179 self.simo = weakref.proxy(simo)
184 def finalize_metadata(self):
class _EntryDumper(_Dumper):
    """Write the mmCIF data_ header and _entry category."""
    def dump(self, writer):
        entry_id = 'imp_model'
        # Write the CIF data header (so this dumper should always be first)
        writer.fh.write("data_%s\n" % entry_id)
        with writer.category("_entry") as l:
            l.write(id=entry_id)
class _SoftwareDumper(_Dumper):
    """Populate the mmCIF _software category with IMP/PMI plus any
       extra modeling software (MODELLER, Phyre2) that was used."""
    def __init__(self, simo):
        super(_SoftwareDumper, self).__init__(simo)
        self.modeller_used = self.phyre2_used = False
        self.software = [
            IMP.pmi.metadata.Software(
                name="Integrative Modeling Platform (IMP)",
                version=IMP.__version__,
                classification="integrative model building",
                description="integrative model building",
                url='https://integrativemodeling.org'),
            IMP.pmi.metadata.Software(
                name="IMP PMI module",
                version=IMP.pmi.__version__,
                classification="integrative model building",
                description="integrative model building",
                url='https://integrativemodeling.org')]

    def set_modeller_used(self, version, date):
        # Only record MODELLER once
        if self.modeller_used:
            return
        self.modeller_used = True
        self.software.append(IMP.pmi.metadata.Software(
            name='MODELLER', classification='comparative modeling',
            description='Comparative modeling by satisfaction '
                        'of spatial restraints, build ' + date,
            url='https://salilab.org/modeller/',
            version=version))

    def set_phyre2_used(self):
        # Only record Phyre2 once
        if self.phyre2_used:
            return
        self.phyre2_used = True
        self.software.append(IMP.pmi.metadata.Software(
            name='Phyre2', classification='protein homology modeling',
            description='Protein Homology/analogY Recognition '
                        'Engine V 2.0',
            version='2.0', url='http://www.sbg.bio.ic.ac.uk/~phyre2/'))

    def dump(self, writer):
        ordinal = 1
        with writer.loop("_software",
                         ["pdbx_ordinal", "name", "classification", "version",
                          "type", "location"]) as l:
            for m in itertools.chain(self.software, self.simo._metadata):
                # Only Software objects from the metadata list belong here
                if isinstance(m, IMP.pmi.metadata.Software):
                    l.write(pdbx_ordinal=ordinal, name=m.name,
                            classification=m.classification, version=m.version,
                            type=m.type, location=m.url)
                    ordinal += 1
class _AuditAuthorDumper(_Dumper):
    """Populate the mmCIF audit_author category (a list of the people that
       authored this mmCIF file; here we assume that's just the authors of
       any associated publications)"""
    def dump(self, writer):
        citations = [m for m in self.simo._metadata
                     if isinstance(m, IMP.pmi.metadata.Citation)]
        seen_authors = {}
        with writer.loop("_audit_author",
                         ["name", "pdbx_ordinal"]) as l:
            ordinal = 1
            for n, c in enumerate(citations):
                for a in c.authors:
                    # Write each author only once, in first-seen order
                    if a not in seen_authors:
                        seen_authors[a] = None
                        l.write(name=a, pdbx_ordinal=ordinal)
                        ordinal += 1
class _CitationDumper(_Dumper):
    """Populate the mmCIF _citation and _citation_author categories from
       any Citation metadata objects."""
    def dump(self, writer):
        citations = [m for m in self.simo._metadata
                     if isinstance(m, IMP.pmi.metadata.Citation)]
        with writer.loop("_citation",
                         ["id", "title", "journal_abbrev", "journal_volume",
                          "page_first", "page_last", "year",
                          "pdbx_database_id_PubMed",
                          "pdbx_database_id_DOI"]) as l:
            for n, c in enumerate(citations):
                # page_range may be a single page or a (first, last) pair
                if isinstance(c.page_range, (tuple, list)):
                    page_first, page_last = c.page_range
                else:
                    page_first = c.page_range
                    page_last = _CifWriter.omitted
                l.write(id=n+1, title=c.title, journal_abbrev=c.journal,
                        journal_volume=c.volume, page_first=page_first,
                        page_last=page_last, year=c.year,
                        pdbx_database_id_PubMed=c.pmid,
                        pdbx_database_id_DOI=c.doi)

        with writer.loop("_citation_author",
                         ["citation_id", "name", "ordinal"]) as l:
            ordinal = 1
            for n, c in enumerate(citations):
                for a in c.authors:
                    l.write(citation_id=n+1, name=a, ordinal=ordinal)
                    ordinal += 1
class _EntityDumper(_Dumper):
    """Populate the mmCIF _entity category, one row per unique sequence."""
    # todo: currently only polymer (protein) entities are supported
    def dump(self, writer):
        with writer.loop("_entity",
                         ["id", "type", "src_method", "pdbx_description",
                          "formula_weight", "pdbx_number_of_molecules",
                          "details"]) as l:
            for entity in self.simo.entities.get_all():
                l.write(id=entity.id, type='polymer', src_method='man',
                        pdbx_description=entity.first_component,
                        formula_weight=writer.unknown,
                        pdbx_number_of_molecules=1, details=writer.unknown)
class _EntityPolyDumper(_Dumper):
    """Populate the mmCIF _entity_poly category (polymer sequences)."""
    # todo: currently only amino acid sequences are supported
    def __init__(self, simo):
        super(_EntityPolyDumper, self).__init__(simo)
        self.output = IMP.pmi.output.Output()

    def dump(self, writer):
        with writer.loop("_entity_poly",
                         ["entity_id", "type", "nstd_linkage",
                          "nstd_monomer", "pdbx_strand_id",
                          "pdbx_seq_one_letter_code",
                          "pdbx_seq_one_letter_code_can"]) as l:
            for entity in self.simo.entities.get_all():
                seq = entity.sequence
                # Split into lines of 70 characters for tidier mmCIF output
                seq = "\n".join(seq[i:i+70] for i in range(0, len(seq), 70))
                name = entity.first_component
                chain_id = self.simo._get_chain_for_component(name, self.output)
                l.write(entity_id=entity.id, type='polypeptide(L)',
                        nstd_linkage='no', nstd_monomer='no',
                        pdbx_strand_id=chain_id,
                        pdbx_seq_one_letter_code=seq,
                        pdbx_seq_one_letter_code_can=seq)
class _ChemCompDumper(_Dumper):
    """Populate the mmCIF _chem_comp category with every residue type
       used by any entity's sequence."""
    def dump(self, writer):
        seen = {}
        # The 20 standard amino acids, which get 'L-peptide linking' type
        std = dict.fromkeys(('ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS',
                             'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN',
                             'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR'))
        with writer.loop("_chem_comp", ["id", "type"]) as l:
            for entity in self.simo.entities.get_all():
                seq = entity.sequence
                for num, one_letter_code in enumerate(seq):
                    restyp = IMP.atom.get_residue_type(one_letter_code)
                    resid = restyp.get_string()
                    # Write each residue type only once
                    if resid not in seen:
                        seen[resid] = None
                        l.write(id=resid,
                                type='L-peptide linking' if resid in std
                                     else 'other')
class _EntityPolySeqDumper(_Dumper):
    """Populate the mmCIF _entity_poly_seq category (one row per residue
       of each entity's sequence)."""
    def dump(self, writer):
        with writer.loop("_entity_poly_seq",
                         ["entity_id", "num", "mon_id", "hetero"]) as l:
            for entity in self.simo.entities.get_all():
                seq = entity.sequence
                for num, one_letter_code in enumerate(seq):
                    restyp = IMP.atom.get_residue_type(one_letter_code)
                    # seq_id numbering is 1-based
                    l.write(entity_id=entity.id, num=num + 1,
                            mon_id=restyp.get_string(),
                            hetero=_CifWriter.omitted)
class _StructAsymDumper(_Dumper):
    """Populate the mmCIF _struct_asym category (one row per chain)."""
    def __init__(self, simo):
        super(_StructAsymDumper, self).__init__(simo)
        self.output = IMP.pmi.output.Output()

    def dump(self, writer):
        with writer.loop("_struct_asym",
                         ["id", "entity_id", "details"]) as l:
            for comp in self.simo.all_modeled_components:
                entity = self.simo.entities[comp]
                chain_id = self.simo._get_chain_for_component(comp, self.output)
                l.write(id=chain_id,
                        entity_id=entity.id,
                        details=comp)
class _PDBFragment(object):
    """Record details about part of a PDB file used as input
       for a component."""
    primitive = 'atomistic'
    granularity = 'by-residue'
    num = _CifWriter.omitted

    def __init__(self, state, component, start, end, offset, pdbname,
                 chain, hier):
        self.component, self.start, self.end, self.offset, self.pdbname \
              = component, start, end, offset, pdbname
        self.state, self.chain, self.hier = state, chain, hier
        # NOTE(review): reconstructed — upstream also reads the named chain
        # from the PDB file here to make the starting model available;
        # confirm selector details against the original source
        sel = IMP.atom.NonWaterNonHydrogenPDBSelector() \
              & IMP.atom.ChainPDBSelector(chain)
        self.starting_hier = IMP.atom.read_pdb(pdbname, state.m, sel)

    def combine(self, other):
        # PDB fragments are never merged with their neighbors
        pass
399 class _BeadsFragment(object):
400 """Record details about beads used to represent part of a component."""
402 granularity =
'by-feature'
404 def __init__(self, state, component, start, end, num, hier):
405 self.state, self.component, self.start, self.end, self.num, self.hier \
406 = state, component, start, end, num, hier
408 def combine(self, other):
410 if type(other) == type(self)
and other.start == self.end + 1:
412 self.num += other.num
class _StartingModel(object):
    """Record details about an input model (e.g. comparative modeling
       template) used for a component."""

    # Defaults; overwritten once the model's source is determined
    source = _CifWriter.unknown
    db_name = _CifWriter.unknown
    db_code = _CifWriter.unknown
    sequence_identity = _CifWriter.unknown

    def __init__(self, fragment):
        self.fragments = [fragment]
class _ModelRepresentationDumper(_Dumper):
    """Populate the mmCIF _ihm_model_representation category, describing
       how each component is represented (atomic/bead, rigid/flexible)."""
    def __init__(self, simo):
        super(_ModelRepresentationDumper, self).__init__(simo)
        # dict of fragments, ordered by component name and then state
        self.fragments = OrderedDict()
        self.output = IMP.pmi.output.Output()

    def add_fragment(self, state, fragment):
        """Add a model fragment."""
        comp = fragment.component
        if comp not in self.fragments:
            self.fragments[comp] = OrderedDict()
        if state not in self.fragments[comp]:
            self.fragments[comp][state] = []
        fragments = self.fragments[comp][state]
        # Merge with the previous fragment if contiguous and compatible
        if len(fragments) == 0 or not fragments[-1].combine(fragment):
            fragments.append(fragment)

    def get_model_mode(self, fragment):
        """Determine the model_mode for a given fragment ('rigid' or
           'flexible')"""
        if hasattr(fragment, 'pdbname'):
            leaves = IMP.atom.get_leaves(fragment.hier)
        else:
            leaves = [fragment.hier]
        # Assume all leaves are set up as rigid/flexible in the same way
        if IMP.core.RigidMember.get_is_setup(leaves[0]):
            return 'rigid'
        else:
            return 'flexible'

    def dump(self, writer):
        ordinal_id = 1
        segment_id = 1
        with writer.loop("_ihm_model_representation",
                         ["ordinal_id", "representation_id",
                          "segment_id", "entity_id", "entity_description",
                          "entity_asym_id",
                          "seq_id_begin", "seq_id_end",
                          "model_object_primitive", "starting_model_id",
                          "model_mode", "model_granularity",
                          "model_object_count"]) as l:
            for comp, statefrag in self.fragments.items():
                # Assume representation is the same in all states, so just
                # use the first state's fragments
                state = list(statefrag.keys())[0]
                chain_id = self.simo._get_chain_for_component(comp, self.output)
                for f in statefrag[state]:
                    entity = self.simo.entities[f.component]
                    starting_model_id = _CifWriter.omitted
                    if hasattr(f, 'pdbname'):
                        starting_model_id \
                              = self.starting_model[state, comp, f.pdbname].name
                    # todo: handle multiple representations
                    l.write(ordinal_id=ordinal_id,
                            representation_id=1,
                            segment_id=segment_id,
                            entity_id=entity.id,
                            entity_description=entity.description,
                            entity_asym_id=chain_id,
                            seq_id_begin=f.start,
                            seq_id_end=f.end,
                            model_object_primitive=f.primitive,
                            starting_model_id=starting_model_id,
                            model_mode=self.get_model_mode(f),
                            model_granularity=f.granularity,
                            model_object_count=f.num)
                    ordinal_id += 1
                    segment_id += 1
494 class _PDBSource(object):
495 """An experimental PDB file used as part of a starting model"""
496 source =
'experimental model'
498 sequence_identity = 100.0
500 def __init__(self, model, db_code, chain_id, metadata):
501 self.db_code = db_code
502 self.chain_id = chain_id
503 self.metadata = metadata
505 def get_seq_id_range(self, model):
507 return (model.seq_id_begin, model.seq_id_end)
class _TemplateSource(object):
    """A PDB file used as a template for a comparative starting model"""
    source = 'comparative model'
    db_name = db_code = _CifWriter.omitted
    tm_db_name = 'PDB'
    # Right now, assume PMI alignments are always rigid; estimate of
    # sequence identity is out of 100 (denominator 1)
    sequence_identity_denominator = 1

    def __init__(self, tm_code, tm_seq_id_begin, tm_seq_id_end, seq_id_begin,
                 chain_id, seq_id_end, seq_id, model):
        # A 5-character code of the form 1abcX names a real PDB structure
        # plus chain; otherwise the code must be resolved later
        if len(tm_code) == 5:
            self._orig_tm_code = None
            self.tm_db_code = tm_code[:4].upper()
            self.tm_chain_id = tm_code[4]
        else:
            # Otherwise, will need to look up in TEMPLATE PATH remarks
            self._orig_tm_code = tm_code
            self.tm_db_code = _CifWriter.omitted
            self.tm_chain_id = tm_code[-1]
        self.sequence_identity = seq_id
        self.tm_seq_id_begin = tm_seq_id_begin
        self.tm_seq_id_end = tm_seq_id_end
        self.chain_id = chain_id
        self._seq_id_begin, self._seq_id_end = seq_id_begin, seq_id_end

    def get_seq_id_range(self, model):
        # The template may cover more than the current starting model
        seq_id_begin = max(model.seq_id_begin, self._seq_id_begin)
        seq_id_end = min(model.seq_id_end, self._seq_id_end)
        return (seq_id_begin, seq_id_end)
class _UnknownSource(object):
    """Part of a starting model from an unknown source"""
    db_code = _CifWriter.unknown
    db_name = _CifWriter.unknown
    chain_id = _CifWriter.unknown
    sequence_identity = _CifWriter.unknown
    # Map dataset types to starting model source names
    _source_map = {'Comparative model': 'comparative model',
                   'Experimental model': 'experimental model'}

    def __init__(self, model, chain):
        self.source = self._source_map[model.dataset._data_type]
        self.chain_id = chain

    def get_seq_id_range(self, model):
        return (model.seq_id_begin, model.seq_id_end)
class _DatasetGroup(object):
    """A group of datasets"""
    def __init__(self, datasets):
        self._datasets = list(datasets)

    def finalize(self):
        """Get final datasets for each restraint and remove duplicates"""
        final_datasets = OrderedDict()
        for ds in self._datasets:
            # Resolve _RestraintDataset wrappers to their actual dataset
            if isinstance(ds, _RestraintDataset):
                d = ds.dataset
            else:
                d = ds
            final_datasets[d] = None
        self._datasets = final_datasets.keys()
577 class _ExternalReference(object):
578 """A single externally-referenced file"""
579 def __init__(self, location, content_type):
580 self.location, self.content_type = location, content_type
583 def __set_id(self, i):
585 id = property(
lambda x: x.location.id, __set_id)
586 file_size = property(
lambda x: x.location.file_size)
588 def __eq__(self, other):
589 return self.location == other.location
591 return hash(self.location)
594 class _ExternalReferenceDumper(_Dumper):
595 """Output information on externally referenced files
596 (i.e. anything that refers to a Location that isn't
597 a DatabaseLocation)."""
599 INPUT_DATA =
"Input data or restraints"
600 MODELING_OUTPUT =
"Modeling or post-processing output"
601 WORKFLOW =
"Modeling workflow or script"
603 class _LocalFiles(object):
604 reference_provider = _CifWriter.omitted
605 reference_type =
'Supplementary Files'
606 reference = _CifWriter.omitted
608 associated_url = _CifWriter.omitted
610 def __init__(self, top_directory):
611 self.top_directory = top_directory
613 def _get_full_path(self, path):
614 return os.path.relpath(path, start=self.top_directory)
616 class _Repository(object):
617 reference_provider = _CifWriter.omitted
618 reference_type =
'DOI'
620 associated_url = _CifWriter.omitted
622 def __init__(self, repo):
624 self.reference = repo.doi
625 if 'zenodo' in self.reference:
626 self.reference_provider =
'Zenodo'
628 self.associated_url = repo.url
629 if repo.url.endswith(
".zip"):
630 self.refers_to =
'Archive'
632 self.refers_to =
'File'
634 def __init__(self, simo):
635 super(_ExternalReferenceDumper, self).__init__(simo)
638 def add(self, location, content_type):
639 """Add a new location.
640 Note that ids are assigned later."""
641 self._refs.append(_ExternalReference(location, content_type))
644 def finalize_metadata(self):
645 """Register locations for any metadata and add the main script"""
647 details=
"The main integrative modeling script")
649 self._workflow = [main_script] \
650 + [m
for m
in self.simo._metadata
652 for w
in self._workflow:
653 self.add(w.location, self.WORKFLOW)
655 def finalize_after_datasets(self):
656 """Note that this must happen *after* DatasetDumper.finalize()"""
659 self._refs = [x
for x
in self._refs
660 if not isinstance(x.location,
667 self._repo_by_id = []
669 self._local_files = self._LocalFiles(self.simo._working_directory)
672 self.simo._update_location(r.location)
674 _assign_id(r, seen_refs, self._ref_by_id)
676 _assign_id(r.location.repo
or self._local_files, seen_repos,
679 def dump(self, writer):
680 self.dump_repos(writer)
681 self.dump_refs(writer)
683 def dump_repos(self, writer):
685 return r
if isinstance(r, self._LocalFiles)
else self._Repository(r)
686 with writer.loop(
"_ihm_external_reference_info",
687 [
"reference_id",
"reference_provider",
688 "reference_type",
"reference",
"refers_to",
689 "associated_url"])
as l:
690 for repo
in [map_repo(r)
for r
in self._repo_by_id]:
691 l.write(reference_id=repo.id,
692 reference_provider=repo.reference_provider,
693 reference_type=repo.reference_type,
694 reference=repo.reference, refers_to=repo.refers_to,
695 associated_url=repo.associated_url)
697 def dump_refs(self, writer):
698 with writer.loop(
"_ihm_external_files",
699 [
"id",
"reference_id",
"file_path",
"content_type",
700 "file_size_bytes",
"details"])
as l:
701 for r
in self._ref_by_id:
703 repo = loc.repo
or self._local_files
704 file_path=self._posix_path(repo._get_full_path(loc.path))
705 if r.file_size
is None:
706 file_size = _CifWriter.omitted
708 file_size = r.file_size
709 l.write(id=loc.id, reference_id=repo.id,
711 content_type=r.content_type,
712 file_size_bytes=file_size,
713 details=loc.details
or _CifWriter.omitted)
717 def _posix_path(self, path):
720 def _posix_path(self, path):
721 return path.replace(os.sep,
'/')
724 class _DatasetDumper(_Dumper):
725 def __init__(self, simo):
726 super(_DatasetDumper, self).__init__(simo)
728 self._datasets_by_state = {}
729 self._dataset_groups = []
731 def get_all_group(self, state):
732 """Get a _DatasetGroup encompassing all datasets so far in this state"""
733 g = _DatasetGroup(self._datasets_by_state.get(state, []))
734 self._dataset_groups.append(g)
737 def add(self, state, dataset):
738 """Add a new dataset.
739 Note that ids are assigned later."""
740 self._datasets.append(dataset)
741 if state
not in self._datasets_by_state:
742 self._datasets_by_state[state] = []
743 self._datasets_by_state[state].append(dataset)
749 self._dataset_by_id = []
750 for d
in self._flatten_dataset(self._datasets):
753 self.simo.extref_dump.add(d.location,
754 _ExternalReferenceDumper.INPUT_DATA)
755 _assign_id(d, seen_datasets, self._dataset_by_id)
759 self._dataset_group_by_id = []
760 for g
in self._dataset_groups:
762 ids = tuple(sorted(d.id
for d
in g._datasets))
763 if ids
not in seen_group_ids:
764 self._dataset_group_by_id.append(g)
765 g.id = len(self._dataset_group_by_id)
766 seen_group_ids[ids] = g
768 g.id = seen_group_ids[ids].id
770 self.simo.extref_dump.finalize_after_datasets()
772 def _flatten_dataset(self, d):
773 if isinstance(d, list):
775 for x
in self._flatten_dataset(p):
777 elif isinstance(d, _RestraintDataset):
778 for x
in self._flatten_dataset(d.dataset):
781 for p
in d._parents.keys():
782 for x
in self._flatten_dataset(p):
786 def dump(self, writer):
787 with writer.loop(
"_ihm_dataset_list",
788 [
"id",
"data_type",
"database_hosted"])
as l:
789 for d
in self._dataset_by_id:
790 l.write(id=d.id, data_type=d._data_type,
791 database_hosted=isinstance(d.location,
793 self.dump_groups(writer)
794 self.dump_other((d
for d
in self._dataset_by_id
795 if not isinstance(d.location,
798 self.dump_rel_dbs((d
for d
in self._dataset_by_id
799 if isinstance(d.location,
802 self.dump_related(writer)
804 def dump_groups(self, writer):
806 with writer.loop(
"_ihm_dataset_group",
807 [
"ordinal_id",
"group_id",
"dataset_list_id"])
as l:
808 for g
in self._dataset_group_by_id:
809 for d
in g._datasets:
810 l.write(ordinal_id=ordinal, group_id=g.id,
811 dataset_list_id=d.id)
814 def dump_related(self, writer):
816 with writer.loop(
"_ihm_related_datasets",
817 [
"ordinal_id",
"dataset_list_id_derived",
818 "dataset_list_id_primary"])
as l:
819 for derived
in self._dataset_by_id:
820 for parent
in sorted(derived._parents.keys(),
822 l.write(ordinal_id=ordinal,
823 dataset_list_id_derived=derived.id,
824 dataset_list_id_primary=parent.id)
827 def dump_rel_dbs(self, datasets, writer):
829 with writer.loop(
"_ihm_dataset_related_db_reference",
830 [
"id",
"dataset_list_id",
"db_name",
831 "accession_code",
"version",
"details"])
as l:
833 l.write(id=ordinal, dataset_list_id=d.id,
834 db_name=d.location.db_name,
835 accession_code=d.location.access_code,
836 version=d.location.version
if d.location.version
837 else _CifWriter.omitted,
838 details=d.location.details
if d.location.details
839 else _CifWriter.omitted)
842 def dump_other(self, datasets, writer):
844 with writer.loop(
"_ihm_dataset_external_reference",
845 [
"id",
"dataset_list_id",
"file_id"])
as l:
847 l.write(id=ordinal, dataset_list_id=d.id, file_id=d.location.id)
851 class _CrossLinkGroup(object):
852 """Group common information for a set of cross links"""
853 def __init__(self, pmi_restraint, rdataset):
854 self.pmi_restraint, self.rdataset = pmi_restraint, rdataset
855 self.label = self.pmi_restraint.label
858 class _ExperimentalCrossLink(object):
859 def __init__(self, r1, c1, r2, c2, length, group):
860 self.r1, self.c1, self.r2, self.c2 = r1, c1, r2, c2
861 self.length, self.group = length, group
863 class _CrossLink(object):
864 def __init__(self, state, ex_xl, p1, p2, sigma1, sigma2, psi):
866 self.ex_xl, self.sigma1, self.sigma2 = ex_xl, sigma1, sigma2
867 self.p1, self.p2 = p1, p2
def get_asym_mapper_for_state(simo, state, asym_map):
    """Get (or lazily create and cache) the _AsymIDMapper for a state."""
    asym = asym_map.get(state, None)
    if asym is None:
        asym = _AsymIDMapper(simo, state.prot)
        asym_map[state] = asym
    return asym
877 class _CrossLinkDumper(_Dumper):
878 def __init__(self, simo):
879 super(_CrossLinkDumper, self).__init__(simo)
880 self.cross_links = []
881 self.exp_cross_links = []
883 def add_experimental(self, cross_link):
884 self.exp_cross_links.append(cross_link)
886 def add(self, cross_link):
887 self.cross_links.append(cross_link)
889 def dump(self, writer):
890 self.dump_list(writer)
891 self.dump_restraint(writer)
892 self.dump_results(writer)
894 def dump_list(self, writer):
895 seen_cross_links = {}
896 with writer.loop(
"_ihm_cross_link_list",
897 [
"id",
"group_id",
"entity_description_1",
898 "entity_id_1",
"seq_id_1",
"comp_id_1",
899 "entity_description_2",
900 "entity_id_2",
"seq_id_2",
"comp_id_2",
"type",
901 "dataset_list_id"])
as l:
903 for xl
in self.exp_cross_links:
905 sig = (xl.c1, xl.c2, xl.r1, xl.r2, xl.group.label)
906 if sig
in seen_cross_links:
907 xl.id = seen_cross_links[sig]
909 entity1 = self.simo.entities[xl.c1]
910 entity2 = self.simo.entities[xl.c2]
911 seq1 = entity1.sequence
912 seq2 = entity2.sequence
917 seen_cross_links[sig] = xl_id
919 l.write(id=xl.id, group_id=xl.id,
920 entity_description_1=entity1.description,
921 entity_id_1=entity1.id,
923 comp_id_1=rt1.get_string(),
924 entity_description_2=entity2.description,
925 entity_id_2=entity2.id,
927 comp_id_2=rt2.get_string(),
929 dataset_list_id=xl.group.rdataset.dataset.id)
931 def _granularity(self, xl):
932 """Determine the granularity of a cross link"""
933 if _get_by_residue(xl.p1)
and _get_by_residue(xl.p2):
938 def dump_restraint(self, writer):
939 seen_cross_links = {}
941 with writer.loop(
"_ihm_cross_link_restraint",
942 [
"id",
"group_id",
"entity_id_1",
"asym_id_1",
943 "seq_id_1",
"comp_id_1",
944 "entity_id_2",
"asym_id_2",
"seq_id_2",
"comp_id_2",
945 "type",
"conditional_crosslink_flag",
946 "model_granularity",
"distance_threshold",
947 "psi",
"sigma_1",
"sigma_2"])
as l:
949 for xl
in self.cross_links:
950 asym = get_asym_mapper_for_state(self.simo, xl.state,
952 entity1 = self.simo.entities[xl.ex_xl.c1]
953 entity2 = self.simo.entities[xl.ex_xl.c2]
954 seq1 = entity1.sequence
955 seq2 = entity2.sequence
961 sig = (asym1, xl.ex_xl.r1, asym2, xl.ex_xl.r2,
962 xl.ex_xl.group.label)
963 if sig
in seen_cross_links:
964 xl.id = seen_cross_links[sig]
967 seen_cross_links[sig] = xl_id
970 group_id=xl.ex_xl.id,
971 entity_id_1=entity1.id,
973 seq_id_1=xl.ex_xl.r1,
974 comp_id_1=rt1.get_string(),
975 entity_id_2=entity2.id,
977 seq_id_2=xl.ex_xl.r2,
978 comp_id_2=rt2.get_string(),
979 type=xl.ex_xl.group.label,
981 conditional_crosslink_flag=
"ALL",
982 model_granularity=self._granularity(xl),
983 distance_threshold=xl.ex_xl.length,
984 psi=xl.psi.get_scale(),
985 sigma_1=xl.sigma1.get_scale(),
986 sigma_2=xl.sigma2.get_scale())
988 def _set_psi_sigma(self, model, g):
989 for resolution
in g.pmi_restraint.sigma_dictionary:
990 statname =
'ISDCrossLinkMS_Sigma_%s_%s' % (resolution, g.label)
991 if model.stats
and statname
in model.stats:
992 sigma = float(model.stats[statname])
993 p = g.pmi_restraint.sigma_dictionary[resolution][0]
995 for psiindex
in g.pmi_restraint.psi_dictionary:
996 statname =
'ISDCrossLinkMS_Psi_%s_%s' % (psiindex, g.label)
997 if model.stats
and statname
in model.stats:
998 psi = float(model.stats[statname])
999 p = g.pmi_restraint.psi_dictionary[psiindex][0]
1002 def dump_results(self, writer):
1004 for xl
in self.cross_links:
1005 all_groups[xl.ex_xl.group] =
None
1007 with writer.loop(
"_ihm_cross_link_result_parameters",
1008 [
"ordinal_id",
"restraint_id",
"model_id",
1009 "psi",
"sigma_1",
"sigma_2"])
as l:
1010 for model
in self.models:
1011 for g
in all_groups.keys():
1012 self._set_psi_sigma(model, g)
1013 for xl
in self.cross_links:
1015 l.write(ordinal_id=ordinal, restraint_id=xl.id,
1016 model_id=model.id, psi=xl.psi.get_scale(),
1017 sigma_1=xl.sigma1.get_scale(),
1018 sigma_2=xl.sigma2.get_scale())
class _EM2DRestraint(object):
    """Record an EM 2D class average restraint and provide access to the
       per-model fit statistics recorded in stat files."""
    def __init__(self, state, rdataset, pmi_restraint, image_number, resolution,
                 pixel_size, image_resolution, projection_number):
        self.state = state
        self.pmi_restraint, self.image_number = pmi_restraint, image_number
        self.rdataset, self.resolution = rdataset, resolution
        self.pixel_size, self.image_resolution = pixel_size, image_resolution
        self.projection_number = projection_number

    def get_transformation(self, model):
        """Get the transformation that places the model on the image"""
        prefix = 'ElectronMicroscopy2D_%s_Image%d' % (self.pmi_restraint.label,
                                                      self.image_number + 1)
        r = [float(model.stats[prefix + '_Rotation%d' % i]) for i in range(4)]
        t = [float(model.stats[prefix + '_Translation%d' % i])
             for i in range(3)]
        # If the model coordinates were transformed, need to apply the
        # inverse transformation to the image fit
        inv = model.transform.get_inverse()
        return IMP.algebra.Transformation3D(IMP.algebra.Rotation3D(*r),
                                            IMP.algebra.Vector3D(*t)) * inv

    def get_cross_correlation(self, model):
        """Get the cross correlation coefficient between the model projection
           and the image"""
        return float(model.stats['ElectronMicroscopy2D_%s_Image%d_CCC'
                                 % (self.pmi_restraint.label,
                                    self.image_number + 1)])

    def get_num_raw_micrographs(self):
        """Return the number of raw micrographs used, if known.
           This is extracted from the EMMicrographsDataset if any."""
        for d in self.rdataset.dataset._parents.keys():
            if isinstance(d, IMP.pmi.metadata.EMMicrographsDataset):
                return d.number
class _EM2DDumper(_Dumper):
    """Populate the mmCIF 2DEM restraint and fitting categories."""
    def __init__(self, simo):
        super(_EM2DDumper, self).__init__(simo)
        self.restraints = []

    def add(self, rsr):
        self.restraints.append(rsr)
        # ids are 1-based, in order of addition
        rsr.id = len(self.restraints)

    def dump(self, writer):
        self.dump_restraints(writer)
        self.dump_fitting(writer)

    def dump_restraints(self, writer):
        with writer.loop("_ihm_2dem_class_average_restraint",
                         ["id", "dataset_list_id", "number_raw_micrographs",
                          "pixel_size_width", "pixel_size_height",
                          "image_resolution", "image_segment_flag",
                          "number_of_projections", "struct_assembly_id",
                          "details"]) as l:
            for r in self.restraints:
                unk = _CifWriter.unknown
                num_raw = r.get_num_raw_micrographs()
                l.write(id=r.id, dataset_list_id=r.rdataset.dataset.id,
                        number_raw_micrographs=num_raw if num_raw else unk,
                        pixel_size_width=r.pixel_size,
                        pixel_size_height=r.pixel_size,
                        image_resolution=r.image_resolution,
                        number_of_projections=r.projection_number,
                        # For now, assume the restraint acts on the
                        # full modeled assembly of its state
                        struct_assembly_id=r.state.modeled_assembly.id,
                        image_segment_flag=False)

    def dump_fitting(self, writer):
        ordinal = 1
        with writer.loop("_ihm_2dem_class_average_fitting",
                ["ordinal_id", "restraint_id", "model_id",
                 "cross_correlation_coefficient", "rot_matrix[1][1]",
                 "rot_matrix[2][1]", "rot_matrix[3][1]", "rot_matrix[1][2]",
                 "rot_matrix[2][2]", "rot_matrix[3][2]", "rot_matrix[1][3]",
                 "rot_matrix[2][3]", "rot_matrix[3][3]", "tr_vector[1]",
                 "tr_vector[2]", "tr_vector[3]"]) as l:
            for m in self.models:
                for r in self.restraints:
                    trans = r.get_transformation(m)
                    rot = trans.get_rotation()
                    # Use more precision than the default float formatting
                    # for rotation matrix elements
                    rm = [["%.6f" % e for e in rot.get_rotation_matrix_row(i)]
                          for i in range(3)]
                    t = trans.get_translation()
                    ccc = r.get_cross_correlation(m)
                    l.write(ordinal_id=ordinal, restraint_id=r.id,
                            model_id=m.id, cross_correlation_coefficient=ccc,
                            rot_matrix11=rm[0][0], rot_matrix21=rm[1][0],
                            rot_matrix31=rm[2][0], rot_matrix12=rm[0][1],
                            rot_matrix22=rm[1][1], rot_matrix32=rm[2][1],
                            rot_matrix13=rm[0][2], rot_matrix23=rm[1][2],
                            rot_matrix33=rm[2][2], tr_vector1=t[0],
                            tr_vector2=t[1], tr_vector3=t[2])
                    ordinal += 1
class _EM3DRestraint(object):
    """Record an EM 3D (Gaussian mixture model) restraint."""
    fitting_method = 'Gaussian mixture models'

    def __init__(self, simo, state, rdataset, pmi_restraint, target_ps,
                 densities):
        self.pmi_restraint = pmi_restraint
        self.rdataset = rdataset
        self.number_of_gaussians = len(target_ps)
        self.assembly = self.get_assembly(densities, simo, state)

    def get_assembly(self, densities, simo, state):
        """Get the Assembly that this restraint acts on"""
        cm = _ComponentMapper(state.prot)
        components = {}
        for d in densities:
            components[cm[d]] = None
        return simo.assembly_dump.get_subassembly(components)

    def get_cross_correlation(self, model):
        """Get the cross correlation coefficient between the model
           and the map, from the stat file"""
        return float(model.stats['GaussianEMRestraint_%s_CCC'
                                 % self.pmi_restraint.label])
class _EM3DDumper(_Dumper):
    """Populate the mmCIF _ihm_3dem_restraint category."""
    def __init__(self, simo):
        super(_EM3DDumper, self).__init__(simo)
        self.restraints = []

    def add(self, rsr):
        self.restraints.append(rsr)

    def dump(self, writer):
        ordinal = 1
        with writer.loop("_ihm_3dem_restraint",
                         ["ordinal_id", "dataset_list_id", "fitting_method",
                          "struct_assembly_id",
                          "number_of_gaussians", "model_id",
                          "cross_correlation_coefficient"]) as l:
            # One row per (model, restraint) pair
            for model in self.models:
                for r in self.restraints:
                    ccc = r.get_cross_correlation(model)
                    l.write(ordinal_id=ordinal,
                            dataset_list_id=r.rdataset.dataset.id,
                            fitting_method=r.fitting_method,
                            struct_assembly_id=r.assembly.id,
                            number_of_gaussians=r.number_of_gaussians,
                            model_id=model.id,
                            cross_correlation_coefficient=ccc)
                    ordinal += 1
1172 class _Assembly(list):
1173 """A collection of components. Currently simply implemented as a list of
1174 the component names. These must be in creation order."""
1178 return hash(tuple(self))
class _AssemblyDumper(_Dumper):
    """Populate the mmCIF _ihm_struct_assembly category and manage
       assembly ids/deduplication."""
    def __init__(self, simo):
        super(_AssemblyDumper, self).__init__(simo)
        self.assemblies = []
        self.output = IMP.pmi.output.Output()

    def add(self, a):
        """Add a new assembly. The first such assembly is assumed to contain
           all components. Duplicate assemblies will be pruned at the end."""
        self.assemblies.append(a)
        return a

    def get_subassembly(self, compdict):
        """Get an _Assembly consisting of the given components."""
        # Put components in creation order (that of the full assembly)
        newa = _Assembly(c for c in self.assemblies[0] if c in compdict)
        return self.add(newa)

    def finalize(self):
        seen_assemblies = {}
        # Assign IDs to all assemblies, reusing ids for duplicates
        self._assembly_by_id = []
        for a in self.assemblies:
            _assign_id(a, seen_assemblies, self._assembly_by_id)

    def dump(self, writer):
        ordinal = 1
        with writer.loop("_ihm_struct_assembly",
                         ["ordinal_id", "assembly_id", "entity_description",
                          "entity_id", "asym_id", "seq_id_begin",
                          "seq_id_end"]) as l:
            for a in self._assembly_by_id:
                for comp in a:
                    entity = self.simo.entities[comp]
                    seq = self.simo.sequence_dict[comp]
                    chain_id = self.simo._get_chain_for_component(comp,
                                                                  self.output)
                    l.write(ordinal_id=ordinal, assembly_id=a.id,
                            entity_description=entity.description,
                            entity_id=entity.id,
                            asym_id=chain_id,
                            seq_id_begin=1,
                            seq_id_end=len(seq))
                    ordinal += 1
1226 class _Protocol(list):
1227 """A modeling protocol. This can consist of multiple _ProtocolSteps."""
1230 class _ProtocolStep(object):
1231 """A single step in a _Protocol."""
# Protocol step recorded from a PMI ReplicaExchange0 macro run.
# NOTE(review): mangled extraction; original lines 1236 and 1241 (the 'else:'
# between the two self.method assignments) are missing.
1234 class _ReplicaExchangeProtocolStep(_ProtocolStep):
1235 def __init__(self, state, rex):
1237 self.modeled_assembly = state.modeled_assembly
1238 self.name =
'Sampling'
# Monte Carlo movers configured -> MC sampling; otherwise molecular dynamics.
1239 if rex.monte_carlo_sample_objects
is not None:
1240 self.method =
'Replica exchange monte carlo'
1242 self.method =
'Replica exchange molecular dynamics'
# Frames requested from the macro = models produced by this step.
1243 self.num_models_end = rex.vars[
"number_of_frames"]
1245 class _ModelGroup(object):
1246 """Group sets of models"""
1247 def __init__(self, state, name):
# NOTE(review): the constructor body (original lines 1248-1250, presumably
# storing state and name) is missing from this extraction.
# Snapshot of one output model: per-chain entity/component maps plus the
# particle coordinates split into atomic records and coarse-grained spheres.
# NOTE(review): mangled extraction; original lines 1253-1259, 1262-1264, 1269,
# 1276-1278, 1283, 1285, 1289, 1292, 1295 and 1301 are missing.
1251 class _Model(object):
1252 def __init__(self, prot, simo, protocol, assembly, group):
1260 self.protocol = protocol
1261 self.assembly = assembly
# 'o' is presumably an IMP.pmi Output helper created on a missing line —
# TODO confirm against the original source.
1265 o.dictionary_pdbs[name] = prot
1266 o._init_dictchain(name, prot)
1267 (particle_infos_for_pdb,
1268 self.geometric_center) = o.get_particle_infos_for_pdb_writing(name)
# Build per-chain lookup tables used when writing atom/sphere records.
1270 self.entity_for_chain = {}
1271 self.comp_for_chain = {}
1272 self.correct_chain_id = {}
1273 for protname, chain_id
in o.dictchain[name].items():
1274 self.entity_for_chain[chain_id] = simo.entities[protname]
1275 self.comp_for_chain[chain_id] = protname
1279 self.correct_chain_id[chain_id] = \
1280 simo._get_chain_for_component(protname, o)
# Particle tuples with a None atom type are coarse-grained spheres;
# the rest are atomic records.
1281 self.spheres = [t
for t
in particle_infos_for_pdb
if t[1]
is None]
1282 self.atoms = [t
for t
in particle_infos_for_pdb
if t[1]
is not None]
1284 self.name = _CifWriter.omitted
1286 def parse_rmsf_file(self, fname, component):
# Maps residue number -> (block number, RMSF value) parsed from the file.
1287 self.rmsf[component] = rmsf = {}
1288 with open(fname)
as fh:
1290 resnum, blocknum, val = line.split()
1291 rmsf[int(resnum)] = (int(blocknum), float(val))
1293 def get_rmsf(self, component, indexes):
1294 """Get the RMSF value for the given residue indexes."""
1296 return _CifWriter.omitted
1297 rmsf = self.rmsf[component]
# All requested residues must fall in the same averaging block.
1298 blocknums = dict.fromkeys(rmsf[ind][0]
for ind
in indexes)
1299 if len(blocknums) != 1:
1300 raise ValueError(
"Residue indexes %s aren't all in the same block"
1302 return rmsf[indexes[0]][1]
# Dumper for output models: writes _ihm_model_list, _atom_site and
# _ihm_sphere_obj_site categories.
# NOTE(review): mangled extraction; original lines 1307-1308, 1313-1314, 1321,
# 1323, 1337-1338, 1340, 1343, 1346, 1358-1360, 1362 and 1367 are missing.
1304 class _ModelDumper(_Dumper):
1305 def __init__(self, simo):
1306 super(_ModelDumper, self).__init__(simo)
1309 def add(self, prot, protocol, assembly, group):
1310 m = _Model(prot, self.simo, protocol, assembly, group)
1311 self.models.append(m)
# IDs are 1-based, assigned in insertion order.
1312 m.id = len(self.models)
1315 def dump(self, writer):
1316 self.dump_model_list(writer)
1317 num_atoms = sum(len(m.atoms)
for m
in self.models)
1318 num_spheres = sum(len(m.spheres)
for m
in self.models)
1319 self.dump_atoms(writer)
1320 self.dump_spheres(writer)
1322 def dump_model_list(self, writer):
1324 with writer.loop(
"_ihm_model_list",
1325 [
"ordinal_id",
"model_id",
"model_group_id",
1326 "model_name",
"model_group_name",
"assembly_id",
1327 "protocol_id"])
as l:
1328 for model
in self.models:
1329 state = model.group.state
# Group names are prefixed with the state's short name for disambiguation.
1330 group_name = state.get_prefixed_name(model.group.name)
1331 l.write(ordinal_id=ordinal, model_id=model.id,
1332 model_group_id=model.group.id,
1333 model_name=model.name,
1334 model_group_name=group_name,
1335 assembly_id=model.assembly.id,
1336 protocol_id=model.protocol.id)
1339 def dump_atoms(self, writer):
1341 with writer.loop(
"_atom_site",
1342 [
"id",
"label_atom_id",
"label_comp_id",
1344 "label_asym_id",
"Cartn_x",
1345 "Cartn_y",
"Cartn_z",
"label_entity_id",
1347 for model
in self.models:
1348 for atom
in model.atoms:
1349 (xyz, atom_type, residue_type, chain_id, residue_index,
1350 all_indexes, radius) = atom
# Apply the model's rigid transform before writing coordinates.
1351 pt = model.transform * xyz
1352 l.write(id=ordinal, label_atom_id=atom_type.get_string(),
1353 label_comp_id=residue_type.get_string(),
1354 label_asym_id=model.correct_chain_id[chain_id],
1355 label_entity_id=model.entity_for_chain[chain_id].id,
1356 label_seq_id=residue_index,
1357 Cartn_x=pt[0], Cartn_y=pt[1], Cartn_z=pt[2],
1361 def dump_spheres(self, writer):
1363 with writer.loop(
"_ihm_sphere_obj_site",
1364 [
"ordinal_id",
"entity_id",
"seq_id_begin",
1365 "seq_id_end",
"asym_id",
"Cartn_x",
1366 "Cartn_y",
"Cartn_z",
"object_radius",
"rmsf",
1368 for model
in self.models:
1369 for sphere
in model.spheres:
1370 (xyz, atom_type, residue_type, chain_id, residue_index,
1371 all_indexes, radius) = sphere
# A single-residue bead has no index list; synthesize one.
1372 if all_indexes
is None:
1373 all_indexes = (residue_index,)
1374 pt = model.transform * xyz
1375 l.write(ordinal_id=ordinal,
1376 entity_id=model.entity_for_chain[chain_id].id,
1377 seq_id_begin = all_indexes[0],
1378 seq_id_end = all_indexes[-1],
1379 asym_id=model.correct_chain_id[chain_id],
1380 Cartn_x=pt[0], Cartn_y=pt[1], Cartn_z=pt[2],
1381 object_radius=radius,
1382 rmsf=model.get_rmsf(model.comp_for_chain[chain_id],
# Dumper for the _ihm_modeling_protocol category; tracks one _Protocol
# (ordered list of steps) per state.
# NOTE(review): mangled extraction; original lines 1391, 1393, 1395, 1397,
# 1401 ('else:'), 1404, 1406, 1408, 1411, 1413, 1415, 1424, 1432 and 1434
# are missing.
1388 class _ModelProtocolDumper(_Dumper):
1389 def __init__(self, simo):
1390 super(_ModelProtocolDumper, self).__init__(simo)
# Keyed by state, in insertion order.
1392 self.protocols = OrderedDict()
1394 def add(self, step):
1396 if state
not in self.protocols:
1398 self.protocols[state] = _Protocol()
1399 self.protocols[state].id = len(self.protocols)
# First step of a protocol starts from zero models; later steps continue
# from the previous step's model count.
1400 step.num_models_begin = 0
1402 step.num_models_begin = self.protocols[state][-1].num_models_end
1403 self.protocols[state].append(step)
# Step IDs are 1-based within each protocol.
1405 step.id = len(self.protocols[state])
# By default each step uses the union of all datasets for the state.
1407 step.dataset_group = self.simo.dataset_dump.get_all_group(state)
1409 def get_last_protocol(self, state):
1410 """Return the most recently-added _Protocol"""
1412 return self.protocols[state]
1414 def dump(self, writer):
1416 with writer.loop(
"_ihm_modeling_protocol",
1417 [
"ordinal_id",
"protocol_id",
"step_id",
1418 "struct_assembly_id",
"dataset_group_id",
1419 "struct_assembly_description",
"protocol_name",
1420 "step_name",
"step_method",
"num_models_begin",
1421 "num_models_end",
"multi_scale_flag",
1422 "multi_state_flag",
"time_ordered_flag"])
as l:
1423 for p
in self.protocols.values():
1425 l.write(ordinal_id=ordinal, protocol_id=p.id,
1426 step_id=step.id, step_method=step.method,
1427 step_name=step.name,
1428 struct_assembly_id=step.modeled_assembly.id,
1429 dataset_group_id=step.dataset_group.id,
1430 num_models_begin=step.num_models_begin,
1431 num_models_end=step.num_models_end,
# Flags are hard-coded: multi-scale modeling, single state, not time-ordered.
1433 multi_state_flag=
False, time_ordered_flag=
False,
1435 multi_scale_flag=
True)
1439 class _PDBHelix(object):
1440 """Represent a HELIX record from a PDB file."""
1441 def __init__(self, line):
1442 self.helix_id = line[11:14].strip()
1443 self.start_resnam = line[14:18].strip()
1444 self.start_asym = line[19]
1445 self.start_resnum = int(line[21:25])
1446 self.end_resnam = line[27:30].strip()
1447 self.end_asym = line[31]
1448 self.end_resnum = int(line[33:37])
1449 self.helix_class = int(line[38:40])
1450 self.length = int(line[71:76])
# Records an MSE (selenomethionine) -> MET substitution applied when the
# starting-model sequence is written out; one _ihm_starting_model_seq_dif row.
# NOTE(review): mangled extraction; original lines 1455-1456 and 1460
# (presumably storing 'model') are missing.
1453 class _MSESeqDif(object):
1454 """Track an MSE -> MET mutation in the starting model sequence"""
# Fixed explanation string written to the seq_dif 'details' field.
1457 details =
'Conversion of modified residue MSE to MET'
1458 def __init__(self, res, component, source, model, offset):
1459 self.res, self.component, self.source = res, component, source
1461 self.offset = offset
# Dumper for starting (input) models: details, comparative-model templates,
# coordinates and sequence differences.
# NOTE(review): this block is a mangled extraction with many original lines
# missing throughout (e.g. 1467-1468, 1470, 1472-1473, 1488-1489, 1491,
# 1494-1495, 1501, 1503, 1505, 1507, 1509, 1511-1513, 1515, 1517-1525,
# 1527-1536, 1539-1540, 1543-1545, 1552, 1555-1556, 1560-1562, 1564-1565,
# 1567, 1573-1575, 1577-1580, 1583-1585, 1590-1592, 1594-1599, 1609,
# 1613-1616, 1619, 1621-1622, 1625-1626, 1631-1634, 1636, 1639, 1642,
# 1649-1650, 1653-1654, 1657-1659, 1661, 1667, 1669, 1674-1675, 1681,
# 1686-1688, 1702, 1724-1725, 1742, 1744-1746, 1754, 1762, 1764-1765,
# 1774, 1778-1779, 1781, 1783, 1787, 1789-1791, 1793-1794, 1798-1801,
# 1804-1805, 1814, 1818-1821); statements below are fragments.
1464 class _StartingModelDumper(_Dumper):
1465 def __init__(self, simo):
1466 super(_StartingModelDumper, self).__init__(simo)
# Nested mapping: component -> state -> list of _StartingModel.
1469 self.models = OrderedDict()
# Lookup of (state, component, pdbname) -> _StartingModel.
1471 self.starting_model = {}
1474 def add_pdb_fragment(self, fragment):
1475 """Add a starting model PDB fragment."""
1476 comp = fragment.component
1477 state = fragment.state
1478 if comp
not in self.models:
1479 self.models[comp] = OrderedDict()
1480 if state
not in self.models[comp]:
1481 self.models[comp][state] = []
1482 models = self.models[comp][state]
# Consecutive fragments from the same PDB file share one starting model.
1483 if len(models) == 0 \
1484 or models[-1].fragments[0].pdbname != fragment.pdbname:
1485 model = _StartingModel(fragment)
1486 models.append(model)
1487 model.sources = self.get_sources(model, fragment.pdbname,
1490 models[-1].fragments.append(fragment)
1492 def get_templates(self, pdbname, model):
# Parse MODELLER 'REMARK 6' headers to recover template info.
1493 template_path_map = {}
1496 alnfilere = re.compile(
'REMARK 6 ALIGNMENT: (\S+)')
1497 tmppathre = re.compile(
'REMARK 6 TEMPLATE PATH (\S+) (\S+)')
1498 tmpre = re.compile(
'REMARK 6 TEMPLATE: '
1499 '(\S+) (\S+):\S+ \- (\S+):\S+ '
1500 'MODELS (\S+):(\S+) \- (\S+):\S+ AT (\S+)%')
1502 with open(pdbname)
as fh:
# Headers precede coordinates; stop scanning at the first ATOM record.
1504 if line.startswith(
'ATOM'):
1506 m = tmppathre.match(line)
1508 template_path_map[m.group(1)] = \
1510 m = alnfilere.match(line)
1514 m = tmpre.match(line)
1516 templates.append(_TemplateSource(m.group(1),
1526 fname = template_path_map[t._orig_tm_code]
1528 details=
"Template for comparative modeling")
# Register template datasets as parents of the model's dataset.
1532 d = self.simo._add_dataset(d)
1534 model.dataset.add_parent(d)
# Sort templates by sequence range for stable output.
1537 return(sorted(templates,
1538 key=
lambda x: (x._seq_id_begin, x._seq_id_end)),
1541 def _parse_pdb(self, fh, first_line):
1542 """Extract information from an official PDB"""
1546 if line.startswith(
'TITLE'):
1547 details += line[10:].rstrip()
1548 elif line.startswith(
'HELIX'):
1549 metadata.append(_PDBHelix(line))
# Returns (version from HEADER columns 50-59, title or unknown, metadata).
1550 return (first_line[50:59].strip(),
1551 details
if details
else _CifWriter.unknown, metadata)
1553 def _parse_details(self, fh):
1554 """Extract TITLE records from a PDB file"""
1557 if line.startswith(
'TITLE'):
1558 details += line[10:].rstrip()
1559 elif line.startswith(
'ATOM'):
1563 def get_sources(self, model, pdbname, chain):
# Classify the input PDB by its first line and build matching dataset/source.
1566 first_line = fh.readline()
1568 details=
"Starting model structure")
1569 file_dataset = self.simo.get_file_dataset(pdbname)
# Official PDB entry: code is in HEADER columns 62-66.
1570 if first_line.startswith(
'HEADER'):
1571 version, details, metadata = self._parse_pdb(fh, first_line)
1572 source = _PDBSource(model, first_line[62:66].strip(), chain,
1576 model.dataset = self.simo._add_dataset(file_dataset
or d)
# File derived from an official PDB entry.
1578 elif first_line.startswith(
'EXPDTA DERIVED FROM PDB:'):
1581 local_file.details = self._parse_details(fh)
1582 db_code = first_line[27:].strip()
1586 d.add_parent(parent)
1587 model.dataset = self.simo._add_dataset(file_dataset
or d)
1588 return [_UnknownSource(model, chain)]
# File derived from another comparative model.
1589 elif first_line.startswith(
'EXPDTA DERIVED FROM COMPARATIVE '
1593 local_file.details = self._parse_details(fh)
1598 details=
"Starting comparative model structure")
1600 d.add_parent(parent)
1601 model.dataset = self.simo._add_dataset(file_dataset
or d)
1602 return [_UnknownSource(model, chain)]
# MODELLER output: record the software version used.
1603 elif first_line.startswith(
'EXPDTA THEORETICAL MODEL, MODELLER'):
1604 self.simo.software_dump.set_modeller_used(
1605 *first_line[38:].split(
' ', 1))
1606 return self.handle_comparative_model(local_file, file_dataset,
1607 pdbname, model, chain)
# Phyre2 output marker.
1608 elif first_line.startswith(
'REMARK 99 Chain ID :'):
1610 self.simo.software_dump.set_phyre2_used()
1611 return self.handle_comparative_model(local_file, file_dataset,
1612 pdbname, model, chain)
# Fallback: treat as a comparative model of unknown provenance.
1617 return self.handle_comparative_model(local_file, file_dataset,
1618 pdbname, model, chain)
1620 def handle_comparative_model(self, local_file, file_dataset, pdbname,
1623 model.dataset = self.simo._add_dataset(file_dataset
or d)
1624 templates, alnfile = self.get_templates(pdbname, model)
1627 details=
"Alignment for starting "
1628 "comparative model")
1629 self.simo.extref_dump.add(model.alignment_file,
1630 _ExternalReferenceDumper.INPUT_DATA)
1635 return [_UnknownSource(model, chain)]
1637 def assign_model_details(self):
# Name each model "<component>-m<N>" and compute its covered seq_id range.
1638 for comp, states
in self.models.items():
1640 for state
in states:
1641 for model
in states[state]:
1643 model.name =
"%s-m%d" % (comp, model_id)
1644 self.starting_model[state, comp,
1645 model.fragments[0].pdbname] = model
1646 model.seq_id_begin = min(x.start + x.offset
for x
in model.fragments)
1648 model.seq_id_end = max(x.end + x.offset
for x
in model.fragments)
1651 def all_models(self):
# Models are per-component; only the first state's copies are yielded.
1652 for comp, states
in self.models.items():
1655 first_state = list(states.keys())[0]
1656 for model
in states[first_state]:
1660 self.assign_model_details()
1662 def dump(self, writer):
1663 self.dump_details(writer)
1664 self.dump_comparative(writer)
1665 seq_dif = self.dump_coords(writer)
1666 self.dump_seq_dif(writer, seq_dif)
1668 def dump_seq_dif(self, writer, seq_dif):
1670 with writer.loop(
"_ihm_starting_model_seq_dif",
1671 [
"ordinal_id",
"entity_id",
"asym_id",
1672 "seq_id",
"comp_id",
"starting_model_id",
1673 "db_asym_id",
"db_seq_id",
"db_comp_id",
1676 chain_id = self.simo._get_chain_for_component(
1677 sd.component, self.output)
1678 entity = self.simo.entities[sd.component]
1679 l.write(ordinal_id=ordinal, entity_id=entity.id,
1680 asym_id=chain_id, seq_id=sd.res.get_index(),
1682 db_asym_id=sd.source.chain_id,
# Database numbering undoes the fragment offset.
1683 db_seq_id=sd.res.get_index() - sd.offset,
1684 db_comp_id=sd.db_comp_id,
1685 starting_model_id=sd.model.name,
1689 def dump_comparative(self, writer):
1690 """Dump details on comparative models. Must be called after
1691 dump_details() since it uses IDs assigned there."""
1692 with writer.loop(
"_ihm_starting_comparative_models",
1693 [
"ordinal_id",
"starting_model_id",
1694 "starting_model_auth_asym_id",
1695 "starting_model_seq_id_begin",
1696 "starting_model_seq_id_end",
1697 "template_auth_asym_id",
"template_seq_id_begin",
1698 "template_seq_id_end",
"template_sequence_identity",
1699 "template_sequence_identity_denominator",
1700 "template_dataset_list_id",
1701 "alignment_file_id"])
as l:
1703 for model
in self.all_models():
# Only template-derived sources produce rows here.
1704 for template
in [s
for s
in model.sources
1705 if isinstance(s, _TemplateSource)]:
1706 seq_id_begin, seq_id_end = template.get_seq_id_range(model)
1707 denom = template.sequence_identity_denominator
1708 l.write(ordinal_id=ordinal,
1709 starting_model_id=model.name,
1710 starting_model_auth_asym_id=template.chain_id,
1711 starting_model_seq_id_begin=seq_id_begin,
1712 starting_model_seq_id_end=seq_id_end,
1713 template_auth_asym_id=template.tm_chain_id,
1714 template_seq_id_begin=template.tm_seq_id_begin,
1715 template_seq_id_end=template.tm_seq_id_end,
1716 template_sequence_identity=template.sequence_identity,
1717 template_sequence_identity_denominator=denom,
1718 template_dataset_list_id=template.tm_dataset.id
1719 if template.tm_dataset
1720 else _CifWriter.unknown,
1721 alignment_file_id=model.alignment_file.id
1722 if hasattr(model,
'alignment_file')
1723 else _CifWriter.unknown)
1726 def dump_details(self, writer):
1727 writer.write_comment(
"""IMP will attempt to identify which input models
1728 are crystal structures and which are comparative models, but does not always
1729 have sufficient information to deduce all of the templates used for comparative
1730 modeling. These may need to be added manually below.""")
1731 with writer.loop(
"_ihm_starting_model_details",
1732 [
"starting_model_id",
"entity_id",
"entity_description",
1733 "asym_id",
"seq_id_begin",
1734 "seq_id_end",
"starting_model_source",
1735 "starting_model_auth_asym_id",
1736 "starting_model_sequence_offset",
1737 "dataset_list_id"])
as l:
1738 for model
in self.all_models():
1739 f = model.fragments[0]
1740 entity = self.simo.entities[f.component]
1741 chain_id = self.simo._get_chain_for_component(f.component,
1743 source0 = model.sources[0]
# Merge the seq_id ranges of all sources into one covering range.
1747 seq_id_begin, seq_id_end = source0.get_seq_id_range(model)
1748 for source
in model.sources[1:]:
1749 this_begin, this_end = source.get_seq_id_range(model)
1750 seq_id_begin = min(seq_id_begin, this_begin)
1751 seq_id_end = max(seq_id_end, this_end)
1752 l.write(entity_id=entity.id,
1753 entity_description=entity.description,
1755 seq_id_begin=seq_id_begin,
1756 seq_id_end=seq_id_end,
1757 starting_model_auth_asym_id=source0.chain_id,
1758 starting_model_id=model.name,
1759 starting_model_source=source0.source,
1760 starting_model_sequence_offset=f.offset,
1761 dataset_list_id=model.dataset.id)
1763 def dump_coords(self, writer):
1766 with writer.loop(
"_ihm_starting_model_coord",
1767 [
"starting_model_id",
"group_PDB",
"id",
"type_symbol",
1768 "atom_id",
"comp_id",
"entity_id",
"asym_id",
1769 "seq_id",
"Cartn_x",
1770 "Cartn_y",
"Cartn_z",
"B_iso_or_equiv",
1771 "ordinal_id"])
as l:
1772 for model
in self.all_models():
1773 for f
in model.fragments:
1775 residue_indexes=list(range(f.start, f.end + 1)))
1776 last_res_index =
None
1777 for a
in sel.get_selected_particles():
1780 element = atom.get_element()
1782 atom_name = atom.get_atom_type().get_string()
# Heteroatoms are prefixed 'HET:'; strip the prefix and mark as HETATM.
1784 if atom_name.startswith(
'HET:'):
1785 group_pdb =
'HETATM'
1786 atom_name = atom_name[4:]
1788 res_name = res.get_residue_type().get_string()
# MSE residues are recorded as sequence differences (MSE -> MET).
1792 if res_name ==
'MSE':
1795 ind = res.get_index()
1796 if ind != last_res_index:
1797 last_res_index = ind
1802 assert(len(model.sources) == 1)
1803 seq_dif.append(_MSESeqDif(res, f.component,
1806 chain_id = self.simo._get_chain_for_component(
1807 f.component, self.output)
1808 entity = self.simo.entities[f.component]
1809 l.write(starting_model_id=model.name,
1810 group_PDB=group_pdb,
1811 id=atom.get_input_index(), type_symbol=element,
1812 atom_id=atom_name, comp_id=res_name,
1813 entity_id=entity.id,
1815 seq_id=res.get_index(), Cartn_x=coord[0],
1816 Cartn_y=coord[1], Cartn_z=coord[2],
1817 B_iso_or_equiv=atom.get_temperature_factor(),
# Dumper for _struct_conf / _struct_conf_type: helices from rigid starting
# models that survive into the output representation.
# NOTE(review): mangled extraction; original lines 1825, 1828-1829, 1835,
# 1837-1838, 1852, 1857-1863 and 1870 are missing.
1822 class _StructConfDumper(_Dumper):
1823 def all_rigid_fragments(self):
1824 """Yield all rigid model representation fragments"""
1826 model_repr = self.simo.model_repr_dump
1827 for comp, statefrag
in model_repr.fragments.items():
# Only the first state's fragments are considered.
1830 state = list(statefrag.keys())[0]
1831 for f
in statefrag[state]:
1832 if hasattr(f,
'pdbname') \
1833 and model_repr.get_model_mode(f) ==
'rigid':
1834 asym = get_asym_mapper_for_state(self.simo, f.state,
1836 yield (f, model_repr.starting_model[state, comp, f.pdbname],
1839 def all_helices(self):
1840 """Yield all helices that overlap with rigid model fragments"""
# Only single-source official-PDB models carry trusted HELIX metadata.
1841 for f, starting_model, asym_id
in self.all_rigid_fragments():
1842 if len(starting_model.sources) == 1 \
1843 and isinstance(starting_model.sources[0], _PDBSource):
1844 pdb = starting_model.sources[0]
1845 for m
in pdb.metadata:
1846 if isinstance(m, _PDBHelix) \
1847 and m.start_asym == pdb.chain_id \
1848 and m.end_asym == pdb.chain_id \
1849 and m.start_resnum >= f.start
and m.end_resnum <= f.end:
# Clamp the helix range to the fragment's residue range.
1850 yield (m, max(f.start, m.start_resnum),
1851 min(f.end, m.end_resnum), asym_id)
1853 def dump(self, writer):
1854 with writer.category(
"_struct_conf_type")
as l:
1855 l.write(id=
'HELX_P', criteria=_CifWriter.unknown,
1856 reference=_CifWriter.unknown)
1864 with writer.loop(
"_struct_conf",
1865 [
"id",
"conf_type_id",
"beg_label_comp_id",
1866 "beg_label_asym_id",
"beg_label_seq_id",
1867 "end_label_comp_id",
"end_label_asym_id",
1868 "end_label_seq_id",])
as l:
1869 for h, begin, end, asym_id
in self.all_helices():
1871 l.write(id=
'HELX_P%d' % ordinal, conf_type_id=
'HELX_P',
1872 beg_label_comp_id=h.start_resnam,
1873 beg_label_asym_id=asym_id,
1874 beg_label_seq_id=begin,
1875 end_label_comp_id=h.end_resnam,
1876 end_label_asym_id=asym_id,
1877 end_label_seq_id=end)
1880 class _PostProcess(object):
1881 """Base class for any post processing"""
# Post-processing step reconstructed from AnalysisReplicaExchange0 output:
# the model count after clustering is the total of all cluster stat files.
# NOTE(review): mangled extraction; original lines 1886-1888, 1891, 1897
# and 1900-1901 are missing.
1884 class _ReplicaExchangeAnalysisPostProcess(_PostProcess):
1885 """Post processing using AnalysisReplicaExchange0 macro"""
1889 def __init__(self, protocol, rex, num_models_begin):
1890 self.protocol = protocol
1892 self.num_models_begin = num_models_begin
# Count one model per line across all cluster stat files.
1893 self.num_models_end = 0
1894 for fname
in self.get_all_stat_files():
1895 with open(fname)
as fh:
1896 self.num_models_end += len(fh.readlines())
1898 def get_stat_file(self, cluster_num):
1899 return os.path.join(self.rex._outputdir,
"cluster.%d" % cluster_num,
1902 def get_all_stat_files(self):
1903 for i
in range(self.rex._number_of_clusters):
1904 yield self.get_stat_file(i)
# Minimal post-processing record for manually-specified clustering.
# NOTE(review): original lines 1908-1910 (presumably class-level type/feature
# attributes) are missing from this extraction.
1906 class _SimplePostProcess(_PostProcess):
1907 """Simple ad hoc clustering"""
1911 def __init__(self, protocol, num_models_begin, num_models_end):
1912 self.protocol = protocol
1913 self.num_models_begin = num_models_begin
1914 self.num_models_end = num_models_end
# Dumper for the _ihm_modeling_post_process category.
# NOTE(review): mangled extraction; original lines 1919-1920 (the postprocs
# list initialization), 1925-1928 (finalize header), 1934, 1936 and 1942
# are missing.
1916 class _PostProcessDumper(_Dumper):
1917 def __init__(self, simo):
1918 super(_PostProcessDumper, self).__init__(simo)
1921 def add(self, postproc):
1922 protocol = postproc.protocol
1923 self.postprocs.append(postproc)
# IDs are 1-based, global across all protocols.
1924 postproc.id = len(self.postprocs)
# Step IDs restart from 1 within each protocol; group by protocol identity.
1929 for p
in self.postprocs:
1930 protocol = p.protocol
1931 if id(protocol)
not in pp_by_protocol:
1932 pp_by_protocol[id(protocol)] = []
1933 by_prot = pp_by_protocol[id(protocol)]
1935 p.step_id = len(by_prot)
1937 def dump(self, writer):
1938 with writer.loop(
"_ihm_modeling_post_process",
1939 [
"id",
"protocol_id",
"analysis_id",
"step_id",
1940 "type",
"feature",
"num_models_begin",
1941 "num_models_end"])
as l:
1943 for p
in self.postprocs:
# Only a single analysis per protocol is modeled (analysis_id=1).
1944 l.write(id=p.id, protocol_id=p.protocol.id, analysis_id=1,
1945 step_id=p.step_id, type=p.type, feature=p.feature,
1946 num_models_begin=p.num_models_begin,
1947 num_models_end=p.num_models_end)
1949 class _Ensemble(object):
1950 """Base class for any ensemble"""
# Ensemble reconstructed from AnalysisReplicaExchange0 cluster output:
# per-cluster stat file, RMSF files, localization densities, RMF models
# and a precision file.
# NOTE(review): mangled extraction; original lines 1956, 1958, 1961, 1966,
# 1971, 1976, 1978-1980, 1984, 1990, 1995, 1998, 2001, 2005, 2008-2010,
# 2012-2014, 2018, 2021, 2023, 2025-2027 and 2029 are missing.
1954 class _ReplicaExchangeAnalysisEnsemble(_Ensemble):
1955 """Ensemble generated using AnalysisReplicaExchange0 macro"""
1957 def __init__(self, pp, cluster_num, model_group, num_deposit):
1959 self.model_group = model_group
1960 self.cluster_num = cluster_num
1962 self.num_deposit = num_deposit
1963 self.localization_density = {}
# Ensemble size = number of lines in the cluster's stat file.
1964 with open(pp.get_stat_file(cluster_num))
as fh:
1965 self.num_models = len(fh.readlines())
1967 def get_rmsf_file(self, component):
1968 return os.path.join(self.postproc.rex._outputdir,
1969 'cluster.%d' % self.cluster_num,
1970 'rmsf.%s.dat' % component)
1972 def load_rmsf(self, model, component):
# RMSF files are optional; silently skip when absent.
1973 fname = self.get_rmsf_file(component)
1974 if os.path.exists(fname):
1975 model.parse_rmsf_file(fname, component)
1977 def get_localization_density_file(self, component):
1981 return os.path.join(self.postproc.rex._outputdir,
1982 'cluster.%d' % self.cluster_num,
1983 '%s.mrc' % component)
1985 def load_localization_density(self, state, component, extref_dump):
# Density (.mrc) files are optional; register as external output files.
1986 fname = self.get_localization_density_file(component)
1987 if os.path.exists(fname):
1988 details =
"Localization density for %s %s" \
1989 % (component, self.model_group.name)
1991 details=state.get_postfixed_name(details))
1992 self.localization_density[component] = local_file
1993 extref_dump.add(local_file,
1994 _ExternalReferenceDumper.MODELING_OUTPUT)
1996 def load_all_models(self, simo, state):
1997 stat_fname = self.postproc.get_stat_file(self.cluster_num)
1999 with open(stat_fname)
as fh:
# Each stat line is a Python-literal dict of per-model statistics.
2000 stats = ast.literal_eval(fh.readline())
2002 rmf_file = os.path.join(os.path.dirname(stat_fname),
2003 "%d.rmf3" % model_num)
2004 for c
in state.all_modeled_components:
2006 state._pmi_object.set_coordinates_from_rmf(c, rmf_file, 0,
2007 force_rigid_update=
True)
# Only the first num_deposit models are deposited.
2011 if model_num >= self.num_deposit:
2015 def _get_precision(self):
2016 precfile = os.path.join(self.postproc.rex._outputdir,
2017 "precision.%d.%d.out" % (self.cluster_num,
2019 if not os.path.exists(precfile):
2020 return _CifWriter.unknown
# Precision = average centroid distance parsed out of the text report.
2022 r = re.compile(
'All .*/cluster.%d/ average centroid distance ([\d\.]+)'
2024 with open(precfile)
as fh:
2028 return float(m.group(1))
2030 feature = property(
lambda self: self.postproc.feature)
# Clusters are numbered from 1 in human-readable names.
2031 name = property(
lambda self:
"cluster %d" % (self.cluster_num + 1))
2032 precision = property(
lambda self: self._get_precision())
# Manually-specified ensemble; precision is the caller-provided dRMSD.
# NOTE(review): mangled extraction; original lines 2036-2038, 2042, 2048,
# 2050 and 2054 are missing.
2034 class _SimpleEnsemble(_Ensemble):
2035 """Simple manually-created ensemble"""
2039 def __init__(self, pp, model_group, num_models, drmsd,
2040 num_models_deposited, ensemble_file):
2041 self.file = ensemble_file
2043 self.model_group = model_group
2044 self.num_deposit = num_models_deposited
2045 self.localization_density = {}
2046 self.num_models = num_models
2047 self.precision = drmsd
2049 def load_localization_density(self, state, component, local_file,
# Register the caller-supplied density file as a modeling output.
2051 self.localization_density[component] = local_file
2052 extref_dump.add(local_file,
2053 _ExternalReferenceDumper.MODELING_OUTPUT)
2055 name = property(
lambda self: self.model_group.name)
# Dumper for the _ihm_ensemble_info category.
# NOTE(review): mangled extraction; original lines 2061-2062 (presumably
# initializing self.ensembles) and 2066 are missing.
2058 class _EnsembleDumper(_Dumper):
2059 def __init__(self, simo):
2060 super(_EnsembleDumper, self).__init__(simo)
2063 def add(self, ensemble):
2064 self.ensembles.append(ensemble)
# IDs are 1-based, in insertion order.
2065 ensemble.id = len(self.ensembles)
2067 def dump(self, writer):
2068 with writer.loop(
"_ihm_ensemble_info",
2069 [
"ensemble_id",
"ensemble_name",
"post_process_id",
2070 "model_group_id",
"ensemble_clustering_method",
2071 "ensemble_clustering_feature",
2072 "num_ensemble_models",
2073 "num_ensemble_models_deposited",
2074 "ensemble_precision_value",
2075 "ensemble_file_id"])
as l:
2076 for e
in self.ensembles:
2077 state = e.model_group.state
2078 l.write(ensemble_id=e.id,
2079 ensemble_name=state.get_prefixed_name(e.name),
2080 post_process_id=e.postproc.id,
2081 model_group_id=e.model_group.id,
2082 ensemble_clustering_feature=e.feature,
2083 num_ensemble_models=e.num_models,
2084 num_ensemble_models_deposited=e.num_deposit,
2085 ensemble_precision_value=e.precision,
# No ensemble file -> record the field as omitted.
2086 ensemble_file_id=e.file.id
if e.file
2087 else _CifWriter.omitted)
# Dumper for the _ihm_localization_density_files category.
# NOTE(review): mangled extraction; original lines 2091, 2094-2096, 2099,
# 2102, 2107, 2111-2112, 2116, 2119 and 2121-2122 are missing.
2089 class _DensityDumper(_Dumper):
2090 """Output localization densities for ensembles"""
2092 def __init__(self, simo):
2093 super(_DensityDumper, self).__init__(simo)
2097 def add(self, ensemble):
2098 self.ensembles.append(ensemble)
2100 def get_density(self, ensemble, component):
# Returns None when no density was computed for this component.
2101 return ensemble.localization_density.get(component,
None)
2103 def dump(self, writer):
2104 with writer.loop(
"_ihm_localization_density_files",
2105 [
"id",
"file_id",
"ensemble_id",
"entity_id",
2106 "asym_id",
"seq_id_begin",
"seq_id_end"])
as l:
2108 for ensemble
in self.ensembles:
2109 for comp
in self.simo.all_modeled_components:
2110 density = self.get_density(ensemble, comp)
2113 entity = self.simo.entities[comp]
2114 lenseq = len(entity.sequence)
2115 chain_id = self.simo._get_chain_for_component(comp,
# Densities always cover the entire component sequence.
2117 l.write(id=ordinal, ensemble_id=ensemble.id,
2118 entity_id=entity.id,
2120 seq_id_begin=1, seq_id_end=lenseq,
# Dumper for the _ihm_multi_state_modeling category; skipped entirely for
# single-state systems.
# NOTE(review): mangled extraction; original lines 2127, 2131, 2133-2134,
# 2142 and 2148 are missing.
2125 class _MultiStateDumper(_Dumper):
2126 """Output information on multiple states"""
2128 def dump(self, writer):
2129 states = sorted(self.simo._states.keys(),
2130 key=operator.attrgetter(
'id'))
# Nothing to report unless there is more than one state.
2132 if len(states) <= 1:
# Stable ordering: by state ID, then group ID.
2135 groups = sorted(self.simo.model_groups,
2136 key=
lambda g: (g.state.id, g.id))
2137 with writer.loop(
"_ihm_multi_state_modeling",
2138 [
"ordinal_id",
"state_id",
"state_group_id",
2139 "population_fraction",
"state_type",
"state_name",
2140 "model_group_id",
"experiment_type",
"details"])
as l:
2141 for n, group
in enumerate(groups):
2143 l.write(ordinal_id=n+1, state_id=state.id,
2144 state_group_id=state.id,
2145 model_group_id=group.id,
2146 state_name=state.long_name
if state.long_name
2147 else _CifWriter.omitted,
2149 experiment_type=
'Fraction of bulk',
2150 details=state.get_prefixed_name(group.name))
# NOTE(review): mangled extraction; original lines 2156 (presumably storing
# the sequence) and 2158 are missing.
2153 class _Entity(object):
2154 """Represent a CIF entity (a chain with a unique sequence)"""
2155 def __init__(self, seq, first_component):
2157 self.first_component = first_component
# The first component's name doubles as the entity description.
2159 self.description = first_component
# NOTE(review): mangled extraction; original lines 2164 (__init__ def line),
# 2167-2168 (presumably initializing self._entities) and 2176-2177
# (the def line for the method returning self._entities) are missing.
2161 class _EntityMapper(dict):
2162 """Handle mapping from IMP components to CIF entities.
2163 Multiple components may map to the same entity if they share sequence."""
2165 super(_EntityMapper, self).__init__()
# Maps sequence string -> existing _Entity, for dedup by sequence.
2166 self._sequence_dict = {}
2169 def add(self, component_name, sequence):
2170 if sequence
not in self._sequence_dict:
# First time this sequence is seen: create a new entity with a 1-based ID.
2171 entity = _Entity(sequence, component_name)
2172 self._entities.append(entity)
2173 entity.id = len(self._entities)
2174 self._sequence_dict[sequence] = entity
2175 self[component_name] = self._sequence_dict[sequence]
2178 """Yield all entities"""
2179 return self._entities
# NOTE(review): mangled extraction; original lines 2192 (cached-value guard),
# 2196 ('else:') and 2200-2202 (caching and returning the copy) are missing.
2182 class _RestraintDataset(object):
2183 """Wrapper around a dataset associated with a restraint.
2184 This is needed because we need to delay access to the dataset
2185 in case the writer of the PMI script overrides or changes it
2186 after creating the restraint."""
2187 def __init__(self, restraint, num, allow_duplicates):
2188 self.restraint = restraint
2189 self.num, self.allow_duplicates = num, allow_duplicates
# Lazily-populated cache of the deep-copied dataset.
2190 self.__dataset =
None
2191 def __get_dataset(self):
2193 return self.__dataset
# num selects one of several datasets; None means the single dataset attr.
2194 if self.num
is not None:
2195 d = copy.deepcopy(self.restraint.datasets[self.num])
2197 d = copy.deepcopy(self.restraint.dataset)
2198 if self.allow_duplicates:
2199 d.location._allow_duplicates =
True
2203 dataset = property(__get_dataset)
# NOTE(review): mangled extraction; original lines 2208, 2210-2213,
# 2216-2218, 2221, 2223, 2226 (the conditional guarding the prefixed name),
# 2228, 2230, 2233, 2235-2237 are missing.
2206 class _State(object):
2207 """Representation of a single state in the system."""
2209 def __init__(self, pmi_object, po):
# Weak proxy avoids a reference cycle with the owning PMI object.
2214 self._pmi_object = weakref.proxy(pmi_object)
2215 self._pmi_state = pmi_object.state
# Each state gets its own modeled assembly, registered with the dumper.
2219 self.modeled_assembly = _Assembly()
2220 po.assembly_dump.add(self.modeled_assembly)
2222 self.all_modeled_components = []
2224 def get_prefixed_name(self, name):
2225 """Prefix the given name with the state name, if available."""
2227 return self.short_name +
' ' + name
# No state name available: fall back to capitalizing the given name.
2229 return name.capitalize()
2231 def get_postfixed_name(self, name):
2232 """Postfix the given name with the state name, if available."""
2234 return "%s in state %s" % (name, self.short_name)
# Names are delegated to the underlying PMI state object.
2238 short_name = property(
lambda self: self._pmi_state.short_name)
2239 long_name = property(
lambda self: self._pmi_state.long_name)
2243 """Class to encode a modeling protocol as mmCIF.
2245 IMP has basic support for writing out files in mmCIF format, for
2246 deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
2247 After creating an instance of this class, attach it to an
2248 IMP.pmi.representation.Representation object. After this, any
2249 generated models and metadata are automatically collected and
def __init__(self, fh):
    """Set up the protocol output, writing mmCIF to the file handle `fh`."""
    # Offset into the ensemble list where the current state's ensembles
    # begin; used to map per-state ensemble indices to the global list.
    self._state_ensemble_offset = 0
    self._each_metadata = []    # one metadata list per attached representation
    self._file_datasets = []    # dicts mapping absolute path -> dataset
    self._main_script = os.path.abspath(sys.argv[0])
    # NOTE(review): reconstructed line — _add_state() reads and updates
    # self._states, so it must be initialized here; confirm against upstream.
    self._states = {}
    self._working_directory = os.getcwd()
    self._cif_writer = _CifWriter(fh)
    self.entities = _EntityMapper()
    # NOTE(review): reconstructed line — create_component() assigns chain
    # indices into self.chains; confirm against upstream.
    self.chains = {}
    self._all_components = {}
    self.all_modeled_components = []
    self.model_groups = []
    self.default_model_group = None
    self.sequence_dict = {}

    # Dumpers, one per mmCIF category (or group of related categories)
    self.model_repr_dump = _ModelRepresentationDumper(self)
    self.cross_link_dump = _CrossLinkDumper(self)
    self.em2d_dump = _EM2DDumper(self)
    self.em3d_dump = _EM3DDumper(self)
    self.model_prot_dump = _ModelProtocolDumper(self)
    self.extref_dump = _ExternalReferenceDumper(self)
    self.dataset_dump = _DatasetDumper(self)
    self.starting_model_dump = _StartingModelDumper(self)
    self.assembly_dump = _AssemblyDumper(self)

    # The assembly of all known components
    self.complete_assembly = _Assembly()
    self.assembly_dump.add(self.complete_assembly)

    self.model_dump = _ModelDumper(self)
    # Share the starting-model mapping between the representation dumper
    # and the starting-model dumper
    self.model_repr_dump.starting_model \
        = self.starting_model_dump.starting_model
    self.software_dump = _SoftwareDumper(self)
    self.post_process_dump = _PostProcessDumper(self)
    self.ensemble_dump = _EnsembleDumper(self)
    self.density_dump = _DensityDumper(self)

    # These dumpers need to see models as they are deposited
    self.cross_link_dump.models = self.model_dump.models
    self.em3d_dump.models = self.model_dump.models
    self.em2d_dump.models = self.model_dump.models

    # Order matters here: mmCIF categories are written in list order.
    # NOTE(review): self.assembly_dump and self.dataset_dump entries were
    # reconstructed from gaps in the extracted text — confirm ordering.
    self._dumpers = [_EntryDumper(self),  # must be first
                     _AuditAuthorDumper(self),
                     self.software_dump, _CitationDumper(self),
                     _ChemCompDumper(self),
                     _EntityDumper(self),
                     _EntityPolyDumper(self), _EntityPolySeqDumper(self),
                     _StructAsymDumper(self),
                     self.assembly_dump,
                     self.model_repr_dump, self.extref_dump,
                     self.dataset_dump,
                     self.cross_link_dump,
                     self.em2d_dump, self.em3d_dump,
                     self.starting_model_dump,
                     self.model_prot_dump, self.post_process_dump,
                     self.ensemble_dump, self.density_dump, self.model_dump,
                     _MultiStateDumper(self)]
def _add_state(self, state):
    """Create a new state and return a pointer to it."""
    # Ensembles added from here on belong to this state
    self._state_ensemble_offset = len(self.ensemble_dump.ensembles)
    s = _State(state, self)
    if not self._states:
        self._first_state = s
    self._states[s] = None
    s.id = len(self._states)
    self._last_state = s
    # NOTE(review): reconstructed — callers need the new state object back
    return s
def get_file_dataset(self, fname):
    """Return the dataset previously registered for file `fname`, if any.

       Looks up the absolute path of `fname` in each per-representation
       dataset dict; returns None if no match is found."""
    abspath = os.path.abspath(fname)
    for d in self._file_datasets:
        fd = d.get(abspath, None)
        # NOTE(review): reconstructed — the original's return of the found
        # dataset was lost in extraction
        if fd:
            return fd
def _get_chain_for_component(self, name, output):
    """Get the chain ID for a component, if any.

       Returns the mmCIF 'omitted' placeholder for components that were
       never assigned a chain (i.e. not modeled)."""
    if name in self.chains:
        chain = self.chains[name]
        return output.chainids[chain]
    else:
        # A non-modeled component doesn't have a chain
        return _CifWriter.omitted
def create_component(self, state, name, modeled):
    """Register a component `name` for `state`.

       `modeled` is True if the component is part of the modeled system
       (gets a chain ID and joins the modeled assemblies)."""
    new_comp = name not in self._all_components
    self._all_components[name] = None
    # NOTE(review): the `if modeled:` / `if new_comp:` guards below were
    # reconstructed from indentation gaps in the extracted text — confirm
    # against upstream IMP.pmi.mmcif.
    if modeled:
        state.all_modeled_components.append(name)
        if new_comp:
            # Assign the next free chain index globally
            self.chains[name] = len(self.chains)
            self.all_modeled_components.append(name)
        state.modeled_assembly.append(name)
    if new_comp:
        self.complete_assembly.append(name)
def add_component_sequence(self, name, seq):
    """Record the primary sequence `seq` for component `name`.

       Raises ValueError if the component was already registered with a
       different sequence."""
    if name in self.sequence_dict:
        if self.sequence_dict[name] != seq:
            raise ValueError(
                "Sequence mismatch for component %s" % name)
    else:
        self.sequence_dict[name] = seq
        self.entities.add(name, seq)
def flush(self):
    """Write out the complete mmCIF file.

       Runs three passes over all dumpers: metadata finalization,
       finalization, then category output. The def line and the
       dumper.finalize() call were reconstructed from gaps in the
       extracted text."""
    for dumper in self._dumpers:
        dumper.finalize_metadata()
    for dumper in self._dumpers:
        dumper.finalize()
    for dumper in self._dumpers:
        dumper.dump(self._cif_writer)
def add_pdb_element(self, state, name, start, end, offset, pdbname,
                    chain, hier):
    """Add a PDB-derived fragment of component `name` covering residues
       `start`-`end` (with numbering `offset`) from file `pdbname`.

       NOTE(review): the trailing `chain, hier` parameters were
       reconstructed from the truncated signature and the _PDBFragment
       call — confirm against upstream."""
    p = _PDBFragment(state, name, start, end, offset, pdbname, chain,
                     hier)
    self.model_repr_dump.add_fragment(state, p)
    self.starting_model_dump.add_pdb_fragment(p)
def add_bead_element(self, state, name, start, end, num, hier):
    """Add a coarse-grained bead fragment of component `name` covering
       residues `start`-`end`, represented by `num` beads."""
    b = _BeadsFragment(state, name, start, end, num, hier)
    self.model_repr_dump.add_fragment(state, b)
def _get_restraint_dataset(self, r, num=None, allow_duplicates=False):
    """Get a wrapper object for the dataset used by this restraint.
       This is needed because the restraint's dataset may be changed
       in the PMI script before we get to use it."""
    rs = _RestraintDataset(r, num, allow_duplicates)
    self._add_dataset(rs)
    # NOTE(review): reconstructed — callers (e.g. get_cross_link_group)
    # use the returned wrapper
    return rs
def get_cross_link_group(self, r):
    """Return a cross-link group wrapping restraint `r` and its dataset."""
    return _CrossLinkGroup(r, self._get_restraint_dataset(r))
def add_experimental_cross_link(self, r1, c1, r2, c2, length, group):
    """Record an experimentally-determined cross-link between residue
       `r1` of component `c1` and residue `r2` of component `c2`."""
    if c1 not in self._all_components or c2 not in self._all_components:
        # NOTE(review): reconstructed branch — the extracted text shows
        # the membership test but its body was lost; presumably
        # cross-links to unmodeled components are skipped. Confirm
        # against upstream.
        return None
    xl = _ExperimentalCrossLink(r1, c1, r2, c2, length, group)
    self.cross_link_dump.add_experimental(xl)
    return xl
def add_cross_link(self, state, ex_xl, p1, p2, sigma1, sigma2, psi):
    """Record a modeled cross-link between particles p1 and p2,
       corresponding to experimental cross-link `ex_xl`.

       NOTE(review): the call's truncated tail was reconstructed as
       `sigma2, psi))` from the parameter list — confirm upstream."""
    self.cross_link_dump.add(_CrossLink(state, ex_xl, p1, p2, sigma1,
                                        sigma2, psi))
def add_replica_exchange(self, state, rex):
    """Record a replica-exchange sampling step in the modeling protocol."""
    self.model_prot_dump.add(_ReplicaExchangeProtocolStep(state, rex))
def _add_dataset(self, dataset):
    """Register `dataset` with the dataset dumper for the current state."""
    return self.dataset_dump.add(self._last_state, dataset)
def add_model_group(self, group):
    """Register a model group, assign it the next 1-based ID, and
       return it."""
    self.model_groups.append(group)
    group.id = len(self.model_groups)
    # NOTE(review): reconstructed — callers do
    # `group = self.add_model_group(...)`
    return group
def _add_simple_postprocessing(self, num_models_begin, num_models_end):
    """Add a simple filtering step to the protocol for the last state,
       going from `num_models_begin` input models to `num_models_end`
       retained models. Returns the new postprocess object."""
    state = self._last_state
    protocol = self.model_prot_dump.get_last_protocol(state)
    pp = _SimplePostProcess(protocol, num_models_begin, num_models_end)
    self.post_process_dump.add(pp)
    # NOTE(review): reconstructed — callers pass the result to
    # _add_simple_ensemble
    return pp
def _add_simple_ensemble(self, pp, name, num_models, drmsd,
                         num_models_deposited, localization_densities,
                         ensemble_file):
    """Add an ensemble generated by ad hoc methods (not using PMI).
       This is currently only used by the Nup84 system.

       NOTE(review): the final `ensemble_file` parameter, the `if den:`
       guard, and the return were reconstructed from truncated text —
       confirm against upstream."""
    state = self._last_state
    group = self.add_model_group(_ModelGroup(state, name))
    self.extref_dump.add(ensemble_file,
                         _ExternalReferenceDumper.MODELING_OUTPUT)
    e = _SimpleEnsemble(pp, group, num_models, drmsd, num_models_deposited,
                        ensemble_file)
    self.ensemble_dump.add(e)
    self.density_dump.add(e)
    # Attach any provided localization density to each modeled component
    for c in state.all_modeled_components:
        den = localization_densities.get(c, None)
        if den:
            e.load_localization_density(state, c, den, self.extref_dump)
    return e
def set_ensemble_file(self, i, location):
    """Point a previously-created ensemble to an 'all-models' file.
       This could be a trajectory such as DCD, an RMF, or a multimodel
       file.

       NOTE(review): the def line was lost in extraction; the name
       `set_ensemble_file(i, location)` is reconstructed from the body
       (uses `i` and `location`) — confirm against upstream."""
    self.extref_dump.add(location,
                         _ExternalReferenceDumper.MODELING_OUTPUT)
    # Index `i` is relative to the current state's ensembles
    ind = i + self._state_ensemble_offset
    self.ensemble_dump.ensembles[ind].file = location
def add_replica_exchange_analysis(self, state, rex):
    """Record a replica-exchange analysis (clustering) step.

       Adds a postprocess step for the clustering, then for each cluster
       creates a model group, an ensemble, its localization densities,
       and deposits the models.

       NOTE(review): several body lines (model stats assignment, per-
       component per-model loop body) were lost in extraction and are
       reconstructed here — confirm against upstream IMP.pmi.mmcif."""
    protocol = self.model_prot_dump.get_last_protocol(state)
    num_models = protocol[-1].num_models_end
    pp = _ReplicaExchangeAnalysisPostProcess(protocol, rex, num_models)
    self.post_process_dump.add(pp)
    for i in range(rex._number_of_clusters):
        group = self.add_model_group(_ModelGroup(state,
                                     'cluster %d' % (i + 1)))
        e = _ReplicaExchangeAnalysisEnsemble(pp, i, group, 1)
        self.ensemble_dump.add(e)
        self.density_dump.add(e)
        # Add localization density info if available
        for c in state.all_modeled_components:
            e.load_localization_density(state, c, self.extref_dump)
        for stats in e.load_all_models(self, state):
            m = self.add_model(group)
            # Since only one model is deposited per cluster, it is
            # the best scoring one
            m.name = 'Best scoring model'
            m.stats = stats
            # Add RMSF info if available
            for c in state.all_modeled_components:
                e.load_rmsf(m, c)
def add_em2d_restraint(self, state, r, i, resolution, pixel_size,
                       image_resolution, projection_number):
    """Record a 2D electron microscopy class-average restraint for
       image index `i` of restraint `r`."""
    d = self._get_restraint_dataset(r, i)
    self.em2d_dump.add(_EM2DRestraint(state, d, r, i, resolution,
                                      pixel_size, image_resolution,
                                      projection_number))
def add_em3d_restraint(self, state, target_ps, densities, r):
    """Record a 3D electron microscopy density restraint.

       Duplicate datasets are allowed since the same map may restrain
       several states (allow_duplicates=True below)."""
    d = self._get_restraint_dataset(r, allow_duplicates=True)
    self.em3d_dump.add(_EM3DRestraint(self, state, d, r, target_ps,
                                      densities))
def add_model(self, group):
    """Deposit a model of the last state into `group` and return it.

       NOTE(review): the `state = self._last_state` line was lost in
       extraction and is reconstructed (the body uses `state`)."""
    state = self._last_state
    return self.model_dump.add(state.prot,
                               self.model_prot_dump.get_last_protocol(state),
                               state.modeled_assembly, group)
def _update_location(self, fileloc):
    """Update FileLocation to point to a parent repository, if any

       NOTE(review): the filter predicate and the final update call were
       lost in extraction; reconstructed as filtering self._metadata for
       Repository objects and delegating to the metadata module — confirm
       against upstream IMP.pmi.metadata."""
    all_repos = [m for m in self._metadata
                 if isinstance(m, IMP.pmi.metadata.Repository)]
    IMP.pmi.metadata.Repository.update_in_repos(fileloc, all_repos)
# Flattened view over the per-representation metadata lists
_metadata = property(lambda self:
                     itertools.chain.from_iterable(self._each_metadata))
Select non water and non hydrogen atoms.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Class to encode a modeling protocol as mmCIF.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
ElementTable & get_element_table()
static bool get_is_setup(const IMP::ParticleAdaptor &p)
void read_pdb(TextInput input, int model, Hierarchy h)
Representation of the system.
def set_ensemble_file
Point a previously-created ensemble to an 'all-models' file.
A decorator for a particle representing an atom.
Base class for capturing a modeling protocol.
std::string get_relative_path(std::string base, std::string relative)
Return a path to a file relative to another file.
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
Class for easy writing of PDBs, RMFs, and stat files.
Transformation3D get_identity_transformation_3d()
Return a transformation that does not do anything.
Classes for writing output files and processing them.
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Functionality for loading, creating, manipulating and scoring atomic structures.
Hierarchies get_leaves(const Selection &h)
Select hierarchy particles identified by the biological name.
Select all ATOM and HETATM records with the given chain ids.
ResidueType get_residue_type(char c)
Get the residue type from the 1-letter amino acid code.
Inferential scoring building on methods developed as part of the Inferential Structure Determination ...