"""@namespace IMP.pmi.mmcif
   @brief Support for the mmCIF file format.

   IMP has basic support for writing out files in mmCIF format, for
   deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
   mmCIF files are currently generated by creating an
   IMP.pmi.mmcif.ProtocolOutput class, and attaching it to an
   IMP.pmi.representation.Representation object, after which any
   generated models and metadata are collected and output as mmCIF.
"""
from __future__ import print_function

import textwrap
30 def _assign_id(obj, seen_objs, obj_by_id):
31 """Assign a unique ID to obj, and track all ids in obj_by_id."""
32 if obj
not in seen_objs:
33 if not hasattr(obj,
'id'):
35 obj.id = len(obj_by_id)
36 seen_objs[obj] = obj.id
38 obj.id = seen_objs[obj]
40 class _LineWriter(object):
41 def __init__(self, writer, line_len=80):
43 self.line_len = line_len
46 if isinstance(val, str)
and '\n' in val:
47 self.writer.fh.write(
"\n;")
48 self.writer.fh.write(val)
49 if not val.endswith(
'\n'):
50 self.writer.fh.write(
"\n")
51 self.writer.fh.write(
";\n")
54 val = self.writer._repr(val)
56 if self.column + len(val) + 1 > self.line_len:
57 self.writer.fh.write(
"\n")
60 self.writer.fh.write(
" ")
62 self.writer.fh.write(val)
63 self.column += len(val)
66 class _CifCategoryWriter(object):
67 def __init__(self, writer, category):
69 self.category = category
70 def write(self, **kwargs):
71 self.writer._write(self.category, kwargs)
74 def __exit__(self, exc_type, exc_value, traceback):
class _CifLoopWriter(object):
    """Context manager to write a CIF loop_ construct for a category.

       The loop header is written lazily on the first write() call, so an
       unused loop produces no output at all.
    """
    def __init__(self, writer, category, keys):
        self.writer = writer
        self.category = category
        self.keys = keys
        # Remove characters from keys that aren't valid Python identifiers,
        # so values can be passed to write() as keyword arguments
        self.python_keys = [k.replace('[', '').replace(']', '') for k in keys]
        self._empty_loop = True

    def write(self, **kwargs):
        if self._empty_loop:
            f = self.writer.fh
            f.write("loop_\n")
            for k in self.keys:
                f.write("%s.%s\n" % (self.category, k))
            self._empty_loop = False
        l = _LineWriter(self.writer)
        for k in self.python_keys:
            # Missing values are written as the "omitted" CIF placeholder
            l.write(kwargs.get(k, self.writer.omitted))
        self.writer.fh.write("\n")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if not self._empty_loop:
            self.writer.fh.write("#\n")
104 class _CifWriter(object):
107 _boolmap = {
False:
'NO',
True:
'YES'}
109 def __init__(self, fh):
111 def category(self, category):
112 return _CifCategoryWriter(self, category)
113 def loop(self, category, keys):
114 return _CifLoopWriter(self, category, keys)
115 def write_comment(self, comment):
116 for line
in textwrap.wrap(comment, 78):
117 self.fh.write(
'# ' + line +
'\n')
118 def _write(self, category, kwargs):
120 self.fh.write(
"%s.%s %s\n" % (category, key,
121 self._repr(kwargs[key])))
122 def _repr(self, obj):
123 if isinstance(obj, str)
and '"' not in obj \
124 and "'" not in obj
and " " not in obj:
126 elif isinstance(obj, float):
128 elif isinstance(obj, bool):
129 return self._boolmap[obj]
133 def _get_by_residue(p):
134 """Determine whether the given particle represents a specific residue
135 or a more coarse-grained object."""
138 class _AsymIDMapper(object):
139 """Map a Particle to an asym_id (chain ID)"""
140 def __init__(self, prot):
143 self.name =
'cif-output'
144 self.o.dictionary_pdbs[self.name] = self.prot
145 self.o._init_dictchain(self.name, self.prot)
147 def __getitem__(self, p):
148 protname, is_a_bead = self.o.get_prot_name_from_particle(self.name, p)
149 return self.o.dictchain[self.name][protname]
151 class _ComponentMapper(object):
152 """Map a Particle to a component name"""
153 def __init__(self, prot):
156 self.name =
'cif-output'
157 self.o.dictionary_pdbs[self.name] = self.prot
158 self.o._init_dictchain(self.name, self.prot)
160 def __getitem__(self, p):
161 protname, is_a_bead = self.o.get_prot_name_from_particle(self.name, p)
164 class _Dumper(object):
165 """Base class for helpers to dump output to mmCIF"""
166 def __init__(self, simo):
167 self.simo = weakref.proxy(simo)
172 def finalize_metadata(self):
176 class _EntryDumper(_Dumper):
177 def dump(self, writer):
178 entry_id =
'imp_model'
180 writer.fh.write(
"data_%s\n" % entry_id)
181 with writer.category(
"_entry")
as l:
185 class _SoftwareDumper(_Dumper):
188 name=
"Integrative Modeling Platform (IMP)",
189 version=IMP.__version__,
190 classification=
"integrative model building",
191 description=
"integrative model building",
192 url=
'https://integrativemodeling.org'),
194 name=
"IMP PMI module",
195 version=IMP.pmi.__version__,
196 classification=
"integrative model building",
197 description=
"integrative model building",
198 url=
'https://integrativemodeling.org')
201 def __init__(self, simo):
202 super(_SoftwareDumper, self).__init__(simo)
203 self.modeller_used =
False
205 def set_modeller_used(self, version, date):
206 if self.modeller_used:
208 self.modeller_used =
True
210 name=
'MODELLER', classification=
'comparative modeling',
211 description=
'Comparative modeling by satisfaction '
212 'of spatial restraints, build ' + date,
213 url=
'https://salilab.org/modeller/',
216 def dump(self, writer):
218 with writer.loop(
"_software",
219 [
"pdbx_ordinal",
"name",
"classification",
"version",
220 "type",
"location"])
as l:
221 for m
in self.software + self.simo._metadata:
223 l.write(pdbx_ordinal=ordinal, name=m.name,
224 classification=m.classification, version=m.version,
225 type=m.type, location=m.url)
228 class _AuditAuthorDumper(_Dumper):
229 """Populate the mmCIF audit_author category (a list of the people that
230 authored this mmCIF file; here we assume that's just the authors of
231 any associated publications)"""
232 def dump(self, writer):
233 citations = [m
for m
in self.simo._metadata
236 with writer.loop(
"_audit_author",
237 [
"name",
"pdbx_ordinal"])
as l:
239 for n, c
in enumerate(citations):
241 if a
not in seen_authors:
242 seen_authors[a] =
None
243 l.write(name=a, pdbx_ordinal=ordinal)
247 class _CitationDumper(_Dumper):
248 def dump(self, writer):
249 citations = [m
for m
in self.simo._metadata
251 with writer.loop(
"_citation",
252 [
"id",
"title",
"journal_abbrev",
"journal_volume",
253 "page_first",
"page_last",
"year",
254 "pdbx_database_id_PubMed",
255 "pdbx_database_id_DOI"])
as l:
256 for n, c
in enumerate(citations):
257 if isinstance(c.page_range, (tuple, list)):
258 page_first, page_last = c.page_range
260 page_first = c.page_range
261 page_last = _CifWriter.omitted
262 l.write(id=n+1, title=c.title, journal_abbrev=c.journal,
263 journal_volume=c.volume, page_first=page_first,
264 page_last=page_last, year=c.year,
265 pdbx_database_id_PubMed=c.pmid,
266 pdbx_database_id_DOI=c.doi)
268 with writer.loop(
"_citation_author",
269 [
"citation_id",
"name",
"ordinal"])
as l:
271 for n, c
in enumerate(citations):
273 l.write(citation_id=n+1, name=a, ordinal=ordinal)
276 class _EntityDumper(_Dumper):
279 def dump(self, writer):
280 with writer.loop(
"_entity",
281 [
"id",
"type",
"src_method",
"pdbx_description",
282 "formula_weight",
"pdbx_number_of_molecules",
284 for entity
in self.simo.entities.get_all():
285 l.write(id=entity.id, type=
'polymer', src_method=
'man',
286 pdbx_description=entity.first_component,
287 formula_weight=writer.unknown,
288 pdbx_number_of_molecules=1, details=writer.unknown)
291 class _EntityPolyDumper(_Dumper):
293 def __init__(self, simo):
294 super(_EntityPolyDumper, self).__init__(simo)
297 def dump(self, writer):
298 with writer.loop(
"_entity_poly",
299 [
"entity_id",
"type",
"nstd_linkage",
300 "nstd_monomer",
"pdbx_strand_id",
301 "pdbx_seq_one_letter_code",
302 "pdbx_seq_one_letter_code_can"])
as l:
303 for entity
in self.simo.entities.get_all():
304 seq = entity.sequence
306 seq =
"\n".join(seq[i:i+70]
for i
in range(0, len(seq), 70))
307 name = entity.first_component
308 chain_id = self.simo._get_chain_for_component(name, self.output)
309 l.write(entity_id=entity.id, type=
'polypeptide(L)',
310 nstd_linkage=
'no', nstd_monomer=
'no',
311 pdbx_strand_id=chain_id,
312 pdbx_seq_one_letter_code=seq,
313 pdbx_seq_one_letter_code_can=seq)
316 class _ChemCompDumper(_Dumper):
317 def dump(self, writer):
319 std = dict.fromkeys((
'ALA',
'CYS',
'ASP',
'GLU',
'PHE',
'GLY',
'HIS',
320 'ILE',
'LYS',
'LEU',
'MET',
'ASN',
'PRO',
'GLN',
'ARG',
'SER',
321 'THR',
'VAL',
'TRP',
'TYR'))
322 with writer.loop(
"_chem_comp", [
"id",
"type"])
as l:
323 for entity
in self.simo.entities.get_all():
324 seq = entity.sequence
325 for num, one_letter_code
in enumerate(seq):
327 resid = restyp.get_string()
328 if resid
not in seen:
331 type=
'L-peptide linking' if resid
in std \
334 class _EntityPolySeqDumper(_Dumper):
335 def dump(self, writer):
336 with writer.loop(
"_entity_poly_seq",
337 [
"entity_id",
"num",
"mon_id",
"hetero"])
as l:
338 for entity
in self.simo.entities.get_all():
339 seq = entity.sequence
340 for num, one_letter_code
in enumerate(seq):
342 l.write(entity_id=entity.id, num=num + 1,
343 mon_id=restyp.get_string(),
344 hetero=_CifWriter.omitted)
346 class _StructAsymDumper(_Dumper):
347 def __init__(self, simo):
348 super(_StructAsymDumper, self).__init__(simo)
351 def dump(self, writer):
352 with writer.loop(
"_struct_asym",
353 [
"id",
"entity_id",
"details"])
as l:
354 for comp
in self.simo.all_modeled_components:
355 entity = self.simo.entities[comp]
356 chain_id = self.simo._get_chain_for_component(comp, self.output)
361 class _PDBFragment(object):
362 """Record details about part of a PDB file used as input
365 granularity =
'by-residue'
366 num = _CifWriter.omitted
367 def __init__(self, m, component, start, end, offset, pdbname, chain, hier):
368 self.component, self.start, self.end, self.offset, self.pdbname \
369 = component, start, end, offset, pdbname
370 self.chain, self.hier = chain, hier
375 def combine(self, other):
378 class _BeadsFragment(object):
379 """Record details about beads used to represent part of a component."""
381 granularity =
'by-feature'
383 def __init__(self, m, component, start, end, num, hier):
384 self.component, self.start, self.end, self.num, self.hier \
385 = component, start, end, num, hier
387 def combine(self, other):
389 if type(other) == type(self)
and other.start == self.end + 1:
391 self.num += other.num
class _StartingModel(object):
    """Record details about an input model (e.g. comparative modeling
       template) used for a component."""

    # Defaults; subclasses or sources fill these in where known
    source = _CifWriter.unknown
    db_name = _CifWriter.unknown
    db_code = _CifWriter.unknown
    sequence_identity = _CifWriter.unknown

    def __init__(self, fragment):
        self.fragments = [fragment]
406 class _ModelRepresentationDumper(_Dumper):
407 def __init__(self, simo):
408 super(_ModelRepresentationDumper, self).__init__(simo)
410 self.fragments = OrderedDict()
413 def add_fragment(self, fragment):
414 """Add a model fragment."""
415 comp = fragment.component
416 if comp
not in self.fragments:
417 self.fragments[comp] = []
418 fragments = self.fragments[comp]
419 if len(fragments) == 0
or not fragments[-1].combine(fragment):
420 fragments.append(fragment)
422 def get_model_mode(self, fragment):
423 """Determine the model_mode for a given fragment ('rigid' or
432 def dump(self, writer):
435 with writer.loop(
"_ihm_model_representation",
436 [
"ordinal_id",
"representation_id",
437 "segment_id",
"entity_id",
"entity_description",
439 "seq_id_begin",
"seq_id_end",
440 "model_object_primitive",
"starting_model_id",
441 "model_mode",
"model_granularity",
442 "model_object_count"])
as l:
443 for comp, fragments
in self.fragments.items():
444 chain_id = self.simo._get_chain_for_component(comp, self.output)
446 entity = self.simo.entities[f.component]
447 starting_model_id = _CifWriter.omitted
448 if hasattr(f,
'pdbname'):
450 = self.starting_model[comp, f.pdbname].name
452 l.write(ordinal_id=ordinal_id,
454 segment_id=segment_id,
456 entity_description=entity.description,
457 entity_asym_id=chain_id,
458 seq_id_begin=f.start,
460 model_object_primitive=f.primitive,
461 starting_model_id=starting_model_id,
462 model_mode=self.get_model_mode(f),
463 model_granularity=f.granularity,
464 model_object_count=f.num)
468 class _PDBSource(object):
469 """An experimental PDB file used as part of a starting model"""
470 source =
'experimental model'
472 sequence_identity = 100.0
474 def __init__(self, model, db_code, chain_id, metadata):
475 self.db_code = db_code
476 self.chain_id = chain_id
477 self.metadata = metadata
479 def get_seq_id_range(self, model):
481 return (model.seq_id_begin, model.seq_id_end)
class _TemplateSource(object):
    """A PDB file used as a template for a comparative starting model"""
    source = 'comparative model'
    db_name = db_code = _CifWriter.omitted
    sequence_identity_denominator = 1

    def __init__(self, tm_code, tm_seq_id_begin, tm_seq_id_end, seq_id_begin,
                 chain_id, seq_id_end, seq_id, model):
        # A 5-character code like "1abcA" is PDB code + chain ID
        if len(tm_code) == 5:
            self.tm_db_code = tm_code[:4].upper()
            self.tm_chain_id = tm_code[4]
        else:
            self.tm_db_code = self.tm_chain_id = _CifWriter.unknown
        self.sequence_identity = seq_id
        self.tm_seq_id_begin = tm_seq_id_begin
        self.tm_seq_id_end = tm_seq_id_end
        self.chain_id = chain_id
        self._seq_id_begin, self._seq_id_end = seq_id_begin, seq_id_end

    def get_seq_id_range(self, model):
        # The template may cover more or less than the current model,
        # so clamp the range to the model's own extent
        seq_id_begin = max(model.seq_id_begin, self._seq_id_begin)
        seq_id_end = min(model.seq_id_end, self._seq_id_end)
        return (seq_id_begin, seq_id_end)
class _UnknownSource(object):
    """Part of a starting model from an unknown source"""
    db_code = _CifWriter.unknown
    db_name = _CifWriter.unknown
    chain_id = _CifWriter.unknown
    sequence_identity = _CifWriter.unknown
    # Map the dataset's data type to the mmCIF source string
    _source_map = {'Comparative model': 'comparative model',
                   'Experimental model': 'experimental model'}

    def __init__(self, model, chain):
        self.source = self._source_map[model.dataset._data_type]
        self.chain_id = chain

    def get_seq_id_range(self, model):
        return (model.seq_id_begin, model.seq_id_end)
530 class _DatasetGroup(object):
531 """A group of datasets"""
532 def __init__(self, datasets, include_primaries):
533 self._datasets = list(datasets)
534 self.include_primaries = include_primaries
537 """Get final datasets for each restraint and remove duplicates"""
538 final_datasets = OrderedDict()
540 for p
in d._parents.keys():
541 final_datasets[p] =
None
543 for ds
in self._datasets:
544 if isinstance(ds, _RestraintDataset):
548 final_datasets[d] =
None
549 if self.include_primaries:
551 self._datasets = final_datasets.keys()
554 class _ExternalReference(object):
555 """A single externally-referenced file"""
556 def __init__(self, location, content_type):
557 self.location, self.content_type = location, content_type
560 def __set_id(self, i):
562 id = property(
lambda x: x.location.id, __set_id)
564 def __eq__(self, other):
565 return self.location == other.location
567 return hash(self.location)
570 class _ExternalReferenceDumper(_Dumper):
571 """Output information on externally referenced files
572 (i.e. anything that refers to a Location that isn't
573 a DatabaseLocation)."""
575 INPUT_DATA =
"Input data or restraints"
576 MODELING_OUTPUT =
"Modeling or post-processing output"
577 WORKFLOW =
"Modeling workflow or script"
579 class _LocalFiles(object):
580 reference_provider = _CifWriter.omitted
581 reference_type =
'Supplementary Files'
582 reference = _CifWriter.omitted
584 associated_url = _CifWriter.omitted
586 def __init__(self, top_directory):
587 self.top_directory = top_directory
589 def _get_full_path(self, path):
590 return os.path.relpath(path, start=self.top_directory)
592 class _Repository(object):
593 reference_provider = _CifWriter.omitted
594 reference_type =
'DOI'
596 associated_url = _CifWriter.omitted
598 def __init__(self, repo):
600 self.reference = repo.doi
601 if 'zenodo' in self.reference:
602 self.reference_provider =
'Zenodo'
604 self.associated_url = repo.url
605 if repo.url.endswith(
".zip"):
606 self.refers_to =
'Archive'
608 self.refers_to =
'File'
610 def __init__(self, simo):
611 super(_ExternalReferenceDumper, self).__init__(simo)
614 def add(self, location, content_type):
615 """Add a new location.
616 Note that ids are assigned later."""
617 self._refs.append(_ExternalReference(location, content_type))
620 def finalize_metadata(self):
621 """Register locations for any metadata and add the main script"""
623 details=
"The main integrative modeling script")
625 self._workflow = [main_script] \
626 + [m
for m
in self.simo._metadata
628 for w
in self._workflow:
629 self.add(w.location, self.WORKFLOW)
631 def finalize_after_datasets(self):
632 """Note that this must happen *after* DatasetDumper.finalize()"""
635 self._refs = [x
for x
in self._refs
636 if not isinstance(x.location,
643 self._repo_by_id = []
645 self._local_files = self._LocalFiles(self.simo._working_directory)
648 self.simo._update_location(r.location)
650 _assign_id(r, seen_refs, self._ref_by_id)
652 _assign_id(r.location.repo
or self._local_files, seen_repos,
655 def dump(self, writer):
656 self.dump_repos(writer)
657 self.dump_refs(writer)
659 def dump_repos(self, writer):
661 return r
if isinstance(r, self._LocalFiles)
else self._Repository(r)
662 with writer.loop(
"_ihm_external_reference_info",
663 [
"reference_id",
"reference_provider",
664 "reference_type",
"reference",
"refers_to",
665 "associated_url"])
as l:
666 for repo
in [map_repo(r)
for r
in self._repo_by_id]:
667 l.write(reference_id=repo.id,
668 reference_provider=repo.reference_provider,
669 reference_type=repo.reference_type,
670 reference=repo.reference, refers_to=repo.refers_to,
671 associated_url=repo.associated_url)
673 def dump_refs(self, writer):
674 with writer.loop(
"_ihm_external_files",
675 [
"id",
"reference_id",
"file_path",
"content_type",
677 for r
in self._ref_by_id:
679 repo = loc.repo
or self._local_files
680 file_path=self._posix_path(repo._get_full_path(loc.path))
681 l.write(id=loc.id, reference_id=repo.id,
683 content_type=r.content_type,
684 details=loc.details
or _CifWriter.omitted)
688 def _posix_path(self, path):
691 def _posix_path(self, path):
692 return path.replace(os.sep,
'/')
695 class _DatasetDumper(_Dumper):
696 def __init__(self, simo):
697 super(_DatasetDumper, self).__init__(simo)
699 self._dataset_groups = []
701 def get_all_group(self, include_primaries=False):
702 """Get a _DatasetGroup encompassing all datasets so far"""
703 g = _DatasetGroup(self._datasets, include_primaries)
704 self._dataset_groups.append(g)
707 def add(self, dataset):
708 """Add a new dataset.
709 Note that ids are assigned later."""
710 self._datasets.append(dataset)
716 self._dataset_by_id = []
717 for d
in self._flatten_dataset(self._datasets):
720 self.simo.extref_dump.add(d.location,
721 _ExternalReferenceDumper.INPUT_DATA)
722 _assign_id(d, seen_datasets, self._dataset_by_id)
725 all_group = self.get_all_group(
True)
728 self._dataset_group_by_id = []
729 for g
in self._dataset_groups:
731 ids = tuple(sorted(d.id
for d
in g._datasets))
732 if ids
not in seen_group_ids:
733 self._dataset_group_by_id.append(g)
734 g.id = len(self._dataset_group_by_id)
735 seen_group_ids[ids] = g
737 g.id = seen_group_ids[ids].id
739 self.simo.extref_dump.finalize_after_datasets()
741 def _flatten_dataset(self, d):
742 if isinstance(d, list):
744 for x
in self._flatten_dataset(p):
746 elif isinstance(d, _RestraintDataset):
747 for x
in self._flatten_dataset(d.dataset):
750 for p
in d._parents.keys():
751 for x
in self._flatten_dataset(p):
755 def dump(self, writer):
757 with writer.loop(
"_ihm_dataset_list",
758 [
"ordinal_id",
"id",
"group_id",
"data_type",
759 "database_hosted"])
as l:
760 for g
in self._dataset_group_by_id:
761 for d
in g._datasets:
762 l.write(ordinal_id=ordinal, id=d.id, group_id=g.id,
763 data_type=d._data_type,
764 database_hosted=isinstance(d.location,
767 self.dump_other((d
for d
in self._dataset_by_id
768 if not isinstance(d.location,
771 self.dump_rel_dbs((d
for d
in self._dataset_by_id
772 if isinstance(d.location,
775 self.dump_related(writer)
777 def dump_related(self, writer):
779 with writer.loop(
"_ihm_related_datasets",
780 [
"ordinal_id",
"dataset_list_id_derived",
781 "dataset_list_id_primary"])
as l:
782 for derived
in self._dataset_by_id:
783 for parent
in sorted(derived._parents.keys(),
785 l.write(ordinal_id=ordinal,
786 dataset_list_id_derived=derived.id,
787 dataset_list_id_primary=parent.id)
790 def dump_rel_dbs(self, datasets, writer):
792 with writer.loop(
"_ihm_dataset_related_db_reference",
793 [
"id",
"dataset_list_id",
"db_name",
794 "accession_code",
"version",
"details"])
as l:
796 l.write(id=ordinal, dataset_list_id=d.id,
797 db_name=d.location.db_name,
798 accession_code=d.location.access_code,
799 version=d.location.version
if d.location.version
800 else _CifWriter.omitted,
801 details=d.location.details
if d.location.details
802 else _CifWriter.omitted)
805 def dump_other(self, datasets, writer):
807 with writer.loop(
"_ihm_dataset_external_reference",
808 [
"id",
"dataset_list_id",
"file_id"])
as l:
810 l.write(id=ordinal, dataset_list_id=d.id, file_id=d.location.id)
814 class _CrossLinkGroup(object):
815 """Group common information for a set of cross links"""
816 def __init__(self, pmi_restraint, rdataset):
817 self.pmi_restraint, self.rdataset = pmi_restraint, rdataset
818 self.label = self.pmi_restraint.label
821 class _ExperimentalCrossLink(object):
822 def __init__(self, r1, c1, r2, c2, length, group):
823 self.r1, self.c1, self.r2, self.c2 = r1, c1, r2, c2
824 self.length, self.group = length, group
826 class _CrossLink(object):
827 def __init__(self, ex_xl, p1, p2, sigma1, sigma2, psi):
828 self.ex_xl, self.sigma1, self.sigma2 = ex_xl, sigma1, sigma2
829 self.p1, self.p2 = p1, p2
832 class _CrossLinkDumper(_Dumper):
833 def __init__(self, simo):
834 super(_CrossLinkDumper, self).__init__(simo)
835 self.cross_links = []
836 self.exp_cross_links = []
838 def add_experimental(self, cross_link):
839 self.exp_cross_links.append(cross_link)
840 cross_link.id = len(self.exp_cross_links)
842 def add(self, cross_link):
843 self.cross_links.append(cross_link)
844 cross_link.id = len(self.cross_links)
846 def dump(self, writer):
847 self.dump_list(writer)
848 self.dump_restraint(writer)
849 self.dump_results(writer)
851 def dump_list(self, writer):
852 with writer.loop(
"_ihm_cross_link_list",
853 [
"id",
"group_id",
"entity_description_1",
854 "entity_id_1",
"seq_id_1",
"comp_id_1",
855 "entity_description_2",
856 "entity_id_2",
"seq_id_2",
"comp_id_2",
"type",
857 "dataset_list_id"])
as l:
858 for xl
in self.exp_cross_links:
859 entity1 = self.simo.entities[xl.c1]
860 entity2 = self.simo.entities[xl.c2]
861 seq1 = entity1.sequence
862 seq2 = entity2.sequence
866 l.write(id=xl.id, group_id=xl.id,
867 entity_description_1=entity1.description,
868 entity_id_1=entity1.id,
870 comp_id_1=rt1.get_string(),
871 entity_description_2=entity2.description,
872 entity_id_2=entity2.id,
874 comp_id_2=rt2.get_string(),
876 dataset_list_id=xl.group.rdataset.dataset.id)
878 def _granularity(self, xl):
879 """Determine the granularity of a cross link"""
880 if _get_by_residue(xl.p1)
and _get_by_residue(xl.p2):
885 def dump_restraint(self, writer):
886 asym = _AsymIDMapper(self.simo.prot)
887 with writer.loop(
"_ihm_cross_link_restraint",
888 [
"id",
"group_id",
"entity_id_1",
"asym_id_1",
889 "seq_id_1",
"comp_id_1",
890 "entity_id_2",
"asym_id_2",
"seq_id_2",
"comp_id_2",
891 "type",
"conditional_crosslink_flag",
892 "model_granularity",
"distance_threshold",
893 "psi",
"sigma_1",
"sigma_2"])
as l:
894 for xl
in self.cross_links:
895 entity1 = self.simo.entities[xl.ex_xl.c1]
896 entity2 = self.simo.entities[xl.ex_xl.c2]
897 seq1 = entity1.sequence
898 seq2 = entity2.sequence
902 group_id=xl.ex_xl.id,
903 entity_id_1=entity1.id,
904 asym_id_1=asym[xl.p1],
905 seq_id_1=xl.ex_xl.r1,
906 comp_id_1=rt1.get_string(),
907 entity_id_2=entity2.id,
908 asym_id_2=asym[xl.p2],
909 seq_id_2=xl.ex_xl.r2,
910 comp_id_2=rt2.get_string(),
911 type=xl.ex_xl.group.label,
913 conditional_crosslink_flag=
"ALL",
914 model_granularity=self._granularity(xl),
915 distance_threshold=xl.ex_xl.length,
916 psi=xl.psi.get_scale(),
917 sigma_1=xl.sigma1.get_scale(),
918 sigma_2=xl.sigma2.get_scale())
920 def _set_psi_sigma(self, model, g):
921 for resolution
in g.pmi_restraint.sigma_dictionary:
922 statname =
'ISDCrossLinkMS_Sigma_%s_%s' % (resolution, g.label)
923 if model.stats
and statname
in model.stats:
924 sigma = float(model.stats[statname])
925 p = g.pmi_restraint.sigma_dictionary[resolution][0]
927 for psiindex
in g.pmi_restraint.psi_dictionary:
928 statname =
'ISDCrossLinkMS_Psi_%s_%s' % (psiindex, g.label)
929 if model.stats
and statname
in model.stats:
930 psi = float(model.stats[statname])
931 p = g.pmi_restraint.psi_dictionary[psiindex][0]
934 def dump_results(self, writer):
936 for xl
in self.cross_links:
937 all_groups[xl.ex_xl.group] =
None
939 with writer.loop(
"_ihm_cross_link_result_parameters",
940 [
"ordinal_id",
"restraint_id",
"model_id",
941 "psi",
"sigma_1",
"sigma_2"])
as l:
942 for model
in self.models:
943 for g
in all_groups.keys():
944 self._set_psi_sigma(model, g)
945 for xl
in self.cross_links:
947 l.write(ordinal_id=ordinal, restraint_id=xl.id,
948 model_id=model.id, psi=xl.psi.get_scale(),
949 sigma_1=xl.sigma1.get_scale(),
950 sigma_2=xl.sigma2.get_scale())
953 class _EM2DRestraint(object):
954 def __init__(self, rdataset, resolution, pixel_size,
955 image_resolution, projection_number):
956 self.rdataset, self.resolution = rdataset, resolution
957 self.pixel_size, self.image_resolution = pixel_size, image_resolution
958 self.projection_number = projection_number
960 def get_num_raw_micrographs(self):
961 """Return the number of raw micrographs used, if known.
962 This is extracted from the EMMicrographsDataset if any."""
963 for d
in self.rdataset.dataset._parents.keys():
967 class _EM2DDumper(_Dumper):
968 def __init__(self, simo):
969 super(_EM2DDumper, self).__init__(simo)
973 self.restraints.append(rsr)
974 rsr.id = len(self.restraints)
976 def dump(self, writer):
977 with writer.loop(
"_ihm_2dem_class_average_restraint",
978 [
"id",
"dataset_list_id",
"number_raw_micrographs",
979 "pixel_size_width",
"pixel_size_height",
980 "image_resolution",
"image_segment_flag",
981 "number_of_projections",
"struct_assembly_id",
983 for r
in self.restraints:
984 unk = _CifWriter.unknown
985 num_raw = r.get_num_raw_micrographs()
986 l.write(id=r.id, dataset_list_id=r.rdataset.dataset.id,
987 number_raw_micrographs=num_raw
if num_raw
else unk,
988 pixel_size_width=r.pixel_size,
989 pixel_size_height=r.pixel_size,
990 image_resolution=r.image_resolution,
991 number_of_projections=r.projection_number,
992 struct_assembly_id=self.simo.modeled_assembly.id,
993 image_segment_flag=
False)
995 class _EM3DRestraint(object):
996 fitting_method =
'Gaussian mixture models'
998 def __init__(self, simo, rdataset, target_ps, densities):
999 self.rdataset = rdataset
1000 self.number_of_gaussians = len(target_ps)
1001 self.assembly = self.get_assembly(densities, simo)
1003 def get_assembly(self, densities, simo):
1004 """Get the Assembly that this restraint acts on"""
1005 cm = _ComponentMapper(simo.prot)
1008 components[cm[d]] =
None
1009 return simo.assembly_dump.get_subassembly(components)
1012 class _EM3DDumper(_Dumper):
1013 def __init__(self, simo):
1014 super(_EM3DDumper, self).__init__(simo)
1015 self.restraints = []
1018 self.restraints.append(rsr)
1020 def dump(self, writer):
1023 with writer.loop(
"_ihm_3dem_restraint",
1024 [
"ordinal_id",
"dataset_list_id",
"fitting_method",
1025 "struct_assembly_id",
1026 "number_of_gaussians",
"model_id",
1027 "cross_correlation_coefficient"])
as l:
1028 for model
in self.models:
1029 for r
in self.restraints:
1031 l.write(ordinal_id=ordinal,
1032 dataset_list_id=r.rdataset.dataset.id,
1033 fitting_method=r.fitting_method,
1034 struct_assembly_id=r.assembly.id,
1035 number_of_gaussians=r.number_of_gaussians,
1039 class _Assembly(list):
1040 """A collection of components. Currently simply implemented as a list of
1041 the component names. These must be in creation order."""
1044 class _AssemblyDumper(_Dumper):
1045 def __init__(self, simo):
1046 super(_AssemblyDumper, self).__init__(simo)
1047 self.assemblies = []
1051 """Add a new assembly. The first such assembly is assumed to contain
1053 self.assemblies.append(a)
1054 a.id = len(self.assemblies)
1057 def get_subassembly(self, compdict):
1058 """Get an _Assembly consisting of the given components. Should only
1059 be called after all components are created."""
1061 newa = _Assembly(c
for c
in self.assemblies[0]
if c
in compdict)
1062 for a
in self.assemblies:
1066 return self.add(newa)
1068 def dump(self, writer):
1070 with writer.loop(
"_ihm_struct_assembly",
1071 [
"ordinal_id",
"assembly_id",
"entity_description",
1072 "entity_id",
"asym_id",
"seq_id_begin",
1073 "seq_id_end"])
as l:
1074 for a
in self.assemblies:
1076 entity = self.simo.entities[comp]
1077 seq = self.simo.sequence_dict[comp]
1078 chain_id = self.simo._get_chain_for_component(comp,
1080 l.write(ordinal_id=ordinal, assembly_id=a.id,
1081 entity_description=entity.description,
1082 entity_id=entity.id,
1085 seq_id_end=len(seq))
1089 class _Protocol(object):
class _ReplicaExchangeProtocol(_Protocol):
    """Describe a PMI replica exchange sampling run for mmCIF output."""
    def __init__(self, rex):
        self.step_name = 'Sampling'
        # Monte Carlo movers present => MC flavor; otherwise MD
        if rex.monte_carlo_sample_objects is not None:
            self.step_method = 'Replica exchange monte carlo'
        else:
            self.step_method = 'Replica exchange molecular dynamics'
        self.num_models_end = rex.vars["number_of_frames"]
1101 class _ModelGroup(object):
1102 """Group sets of models"""
1103 def __init__(self, name):
1106 class _Model(object):
1107 def __init__(self, prot, simo, protocol, assembly, group):
1110 self.protocol = protocol
1111 self.assembly = assembly
1115 o.dictionary_pdbs[name] = prot
1116 o._init_dictchain(name, prot)
1117 (particle_infos_for_pdb,
1118 self.geometric_center) = o.get_particle_infos_for_pdb_writing(name)
1119 self.entity_for_chain = {}
1120 self.comp_for_chain = {}
1121 for protname, chain_id
in o.dictchain[name].items():
1122 self.entity_for_chain[chain_id] = simo.entities[protname]
1123 self.comp_for_chain[chain_id] = protname
1124 self.spheres = [t
for t
in particle_infos_for_pdb
if t[1]
is None]
1125 self.atoms = [t
for t
in particle_infos_for_pdb
if t[1]
is not None]
1127 self.name = _CifWriter.omitted
1129 def parse_rmsf_file(self, fname, component):
1130 self.rmsf[component] = rmsf = {}
1131 with open(fname)
as fh:
1133 resnum, blocknum, val = line.split()
1134 rmsf[int(resnum)] = (int(blocknum), float(val))
1136 def get_rmsf(self, component, indexes):
1137 """Get the RMSF value for the given residue indexes."""
1139 return _CifWriter.omitted
1140 rmsf = self.rmsf[component]
1141 blocknums = dict.fromkeys(rmsf[ind][0]
for ind
in indexes)
1142 if len(blocknums) != 1:
1143 raise ValueError(
"Residue indexes %s aren't all in the same block"
1145 return rmsf[indexes[0]][1]
# Dump deposited models: the _ihm_model_list table plus per-model atom
# (_atom_site) and coarse-grained sphere (_ihm_sphere_obj_site) records.
# NOTE(review): garbled extraction — several original lines are missing
# (e.g. "self.models = []" in __init__, the "ordinal" counters used by the
# dump methods, trailing loop keys and "as l:" in dump_atoms/dump_spheres).
# Code below is byte-preserved as found.
1147 class _ModelDumper(_Dumper):
1148 def __init__(self, simo):
1149 super(_ModelDumper, self).__init__(simo)
# Wrap the hierarchy in a _Model and assign it the next sequential ID.
1152 def add(self, prot, protocol, assembly, group):
1153 m = _Model(prot, self.simo, protocol, assembly, group)
1154 self.models.append(m)
1155 m.id = len(self.models)
1158 def dump(self, writer):
1159 self.dump_model_list(writer)
# Totals over all models; presumably consumed by lines lost in
# extraction — TODO confirm against the full source.
1160 num_atoms = sum(len(m.atoms)
for m
in self.models)
1161 num_spheres = sum(len(m.spheres)
for m
in self.models)
1162 self.dump_atoms(writer)
1163 self.dump_spheres(writer)
# One _ihm_model_list row per deposited model.
1165 def dump_model_list(self, writer):
1167 with writer.loop(
"_ihm_model_list",
1168 [
"ordinal_id",
"model_id",
"model_group_id",
1169 "model_name",
"model_group_name",
"assembly_id",
1170 "protocol_id"])
as l:
1171 for model
in self.models:
1172 l.write(ordinal_id=ordinal, model_id=model.id,
1173 model_group_id=model.group.id,
1174 model_name=model.name,
1175 model_group_name=model.group.name,
1176 assembly_id=model.assembly.id,
1177 protocol_id=model.protocol.id)
# One _atom_site row per atomic particle; coordinates are recentered
# on the model's geometric center.
1180 def dump_atoms(self, writer):
1182 with writer.loop(
"_atom_site",
1183 [
"id",
"label_atom_id",
"label_comp_id",
1185 "label_asym_id",
"Cartn_x",
1186 "Cartn_y",
"Cartn_z",
"label_entity_id",
1188 for model
in self.models:
1189 for atom
in model.atoms:
1190 (xyz, atom_type, residue_type, chain_id, residue_index,
1191 all_indexes, radius) = atom
1192 l.write(id=ordinal, label_atom_id=atom_type.get_string(),
1193 label_comp_id=residue_type.get_string(),
1194 label_asym_id=chain_id,
1195 label_entity_id=model.entity_for_chain[chain_id].id,
1196 label_seq_id=residue_index,
1197 Cartn_x=xyz[0] - model.geometric_center[0],
1198 Cartn_y=xyz[1] - model.geometric_center[1],
1199 Cartn_z=xyz[2] - model.geometric_center[2],
# One _ihm_sphere_obj_site row per coarse-grained bead, with a
# per-bead RMSF looked up via the model's parsed RMSF tables.
1203 def dump_spheres(self, writer):
1205 with writer.loop(
"_ihm_sphere_obj_site",
1206 [
"ordinal_id",
"entity_id",
"seq_id_begin",
1207 "seq_id_end",
"asym_id",
"Cartn_x",
1208 "Cartn_y",
"Cartn_z",
"object_radius",
"rmsf",
1210 for model
in self.models:
1211 for sphere
in model.spheres:
1212 (xyz, atom_type, residue_type, chain_id, residue_index,
1213 all_indexes, radius) = sphere
# A bead covering a single residue has no index range.
1214 if all_indexes
is None:
1215 all_indexes = (residue_index,)
1216 l.write(ordinal_id=ordinal,
1217 entity_id=model.entity_for_chain[chain_id].id,
1218 seq_id_begin = all_indexes[0],
1219 seq_id_end = all_indexes[-1],
1221 Cartn_x=xyz[0] - model.geometric_center[0],
1222 Cartn_y=xyz[1] - model.geometric_center[1],
1223 Cartn_z=xyz[2] - model.geometric_center[2],
1224 object_radius=radius,
1225 rmsf=model.get_rmsf(model.comp_for_chain[chain_id],
# Dump the modeling protocol steps as _ihm_modeling_protocol rows.
# NOTE(review): garbled extraction — missing lines include the protocols
# list init in __init__ and the protocol.id assignment in add().
# Code below is byte-preserved as found.
1231 class _ModelProtocolDumper(_Dumper):
1232 def __init__(self, simo):
1233 super(_ModelProtocolDumper, self).__init__(simo)
1236 def add(self, protocol):
1237 self.protocols.append(protocol)
1240 protocol.step_id = len(self.protocols)
# Each protocol step uses all datasets known at the time it was added.
1242 protocol.dataset_group = self.simo.dataset_dump.get_all_group()
1244 def get_last_protocol(self):
1245 """Return the most recently-added _Protocol"""
1246 return self.protocols[-1]
1248 def dump(self, writer):
1250 with writer.loop(
"_ihm_modeling_protocol",
1251 [
"ordinal_id",
"protocol_id",
"step_id",
1252 "struct_assembly_id",
"dataset_group_id",
1253 "struct_assembly_description",
"protocol_name",
1254 "step_name",
"step_method",
"num_models_begin",
1255 "num_models_end",
"multi_scale_flag",
1256 "multi_state_flag",
"time_ordered_flag"])
as l:
# Steps are chained: each starts with the model count the
# previous step ended with.
1257 num_models_begin = 0
1258 for p
in self.protocols:
1259 l.write(ordinal_id=ordinal, protocol_id=p.id,
1260 step_id=p.step_id, step_method=p.step_method,
1261 step_name=p.step_name,
1262 struct_assembly_id=self.simo.modeled_assembly.id,
1263 dataset_group_id=p.dataset_group.id,
1264 num_models_begin=num_models_begin,
1265 num_models_end=p.num_models_end,
1267 multi_state_flag=
False, time_ordered_flag=
False,
1269 multi_scale_flag=
True)
1270 num_models_begin = p.num_models_end
1274 class _PDBHelix(object):
1275 """Represent a HELIX record from a PDB file."""
1276 def __init__(self, line):
1277 self.helix_id = line[11:14].strip()
1278 self.start_resnam = line[14:18].strip()
1279 self.start_asym = line[19]
1280 self.start_resnum = int(line[21:25])
1281 self.end_resnam = line[27:30].strip()
1282 self.end_asym = line[31]
1283 self.end_resnum = int(line[33:37])
1284 self.helix_class = int(line[38:40])
1285 self.length = int(line[71:76])
1288 class _MSESeqDif(object):
1289 """Track an MSE -> MET mutation in the starting model sequence"""
1292 details =
'Conversion of modified residue MSE to MET'
1293 def __init__(self, res, component, source, offset):
1294 self.res, self.component, self.source = res, component, source
1295 self.offset = offset
# Dump information about starting models (PDB structures / comparative
# models used as modeling input): details, comparative-model templates,
# coordinates, and MSE->MET sequence differences.
# NOTE(review): garbled extraction — many original lines are missing
# (loop headers, ordinal counters, some arguments and return statements).
# Code below is byte-preserved as found.
1298 class _StartingModelDumper(_Dumper):
1299 def __init__(self, simo):
1300 super(_StartingModelDumper, self).__init__(simo)
# fragments/models keyed by component name; starting_model keyed by
# (component, pdbname).
1302 self.fragments = OrderedDict()
1304 self.models = OrderedDict()
1306 self.starting_model = {}
1309 def add_pdb_fragment(self, fragment):
1310 """Add a starting model PDB fragment."""
1311 comp = fragment.component
1312 if comp
not in self.fragments:
1313 self.fragments[comp] = []
1314 self.models[comp] = []
1315 self.fragments[comp].append(fragment)
1316 models = self.models[comp]
# Start a new _StartingModel when the fragment comes from a
# different PDB file than the previous one; otherwise extend it.
1317 if len(models) == 0 \
1318 or models[-1].fragments[0].pdbname != fragment.pdbname:
1319 model = _StartingModel(fragment)
1320 models.append(model)
1321 model.sources = self.get_sources(model, fragment.pdbname,
1324 models[-1].fragments.append(fragment)
# Extract MODELLER template records from REMARK 6 lines in the PDB.
1326 def get_templates(self, pdbname, model):
1328 tmpre = re.compile(
'REMARK 6 TEMPLATE: '
1329 '(\S+) (\S+):\S+ \- (\S+):\S+ '
1330 'MODELS (\S+):(\S+) \- (\S+):\S+ AT (\S+)%')
1332 with open(pdbname)
as fh:
# REMARK records precede coordinates; stop at the first ATOM.
1334 if line.startswith(
'ATOM'):
1336 m = tmpre.match(line)
1338 templates.append(_TemplateSource(m.group(1),
1351 d = self.simo.dataset_dump.add(d)
# The template structure is a primary input to the comparative model.
1353 model.dataset.add_primary(d)
1356 return sorted(templates, key=
lambda x: (x._seq_id_begin, x._seq_id_end))
1358 def _parse_pdb(self, fh, first_line):
1359 """Extract information from an official PDB"""
1363 if line.startswith(
'TITLE'):
1364 details += line[10:].rstrip()
1365 elif line.startswith(
'HELIX'):
1366 metadata.append(_PDBHelix(line))
# Returns (deposition date from HEADER columns 50-59, title, metadata).
1367 return (first_line[50:59].strip(),
1368 details
if details
else _CifWriter.unknown, metadata)
# Classify the starting-model file: official PDB (HEADER record),
# MODELLER output (EXPDTA record), or unknown.
1370 def get_sources(self, model, pdbname, chain):
1373 first_line = fh.readline()
1375 file_dataset = self.simo.get_file_dataset(pdbname)
1376 if first_line.startswith(
'HEADER'):
1377 version, details, metadata = self._parse_pdb(fh, first_line)
1378 source = _PDBSource(model, first_line[62:66].strip(), chain,
# A user-provided file dataset overrides the auto-detected one.
1382 model.dataset = self.simo.dataset_dump.add(file_dataset
or d)
1384 elif first_line.startswith(
'EXPDTA THEORETICAL MODEL, MODELLER'):
# Record the MODELLER version/date parsed from the EXPDTA line.
1385 self.simo.software_dump.set_modeller_used(
1386 *first_line[38:].split(
' ', 1))
1388 model.dataset = self.simo.dataset_dump.add(file_dataset
or d)
1389 templates = self.get_templates(pdbname, model)
1394 return [_UnknownSource(model, chain)]
1400 model.dataset = self.simo.dataset_dump.add(file_dataset
or d)
1401 return [_UnknownSource(model, chain)]
# Assign names ("<component>-m<N>") and overall seq_id ranges to models.
1403 def assign_model_details(self):
1404 for comp, models
in self.models.items():
1405 for i, model
in enumerate(models):
1406 model.name =
"%s-m%d" % (comp, i+1)
1407 self.starting_model[comp, model.fragments[0].pdbname] = model
1408 model.seq_id_begin = min(x.start + x.offset
1409 for x
in model.fragments)
1410 model.seq_id_end = max(x.end + x.offset
1411 for x
in model.fragments)
1413 def all_models(self):
1414 for comp, models
in self.models.items():
1415 for model
in models:
1419 self.assign_model_details()
1421 def dump(self, writer):
1422 self.dump_details(writer)
1423 self.dump_comparative(writer)
1424 seq_dif = self.dump_coords(writer)
1425 self.dump_seq_dif(writer, seq_dif)
# One row per MSE->MET difference collected by dump_coords().
1427 def dump_seq_dif(self, writer, seq_dif):
1429 with writer.loop(
"_ihm_starting_model_seq_dif",
1430 [
"ordinal_id",
"entity_id",
"asym_id",
1431 "seq_id",
"comp_id",
"starting_model_ordinal_id",
1432 "db_asym_id",
"db_seq_id",
"db_comp_id",
1435 chain_id = self.simo._get_chain_for_component(
1436 sd.component, self.output)
1437 entity = self.simo.entities[sd.component]
1438 l.write(ordinal_id=ordinal, entity_id=entity.id,
1439 asym_id=chain_id, seq_id=sd.res.get_index(),
1441 db_asym_id=sd.source.chain_id,
1442 db_seq_id=sd.res.get_index() - sd.offset,
1443 db_comp_id=sd.db_comp_id,
1444 starting_model_ordinal_id=sd.source.id,
1448 def dump_comparative(self, writer):
1449 """Dump details on comparative models. Must be called after
1450 dump_details() since it uses IDs assigned there."""
1451 with writer.loop(
"_ihm_starting_comparative_models",
1452 [
"starting_model_ordinal_id",
"starting_model_id",
1453 "template_auth_asym_id",
"template_seq_begin",
1454 "template_seq_end",
"template_sequence_identity",
1455 "template_sequence_identity_denominator",
1456 "template_dataset_list_id",
1457 "alignment_file_id"])
as l:
1459 for model
in self.all_models():
# Only template sources describe comparative modeling.
1460 for template
in [s
for s
in model.sources
1461 if isinstance(s, _TemplateSource)]:
1462 denom = template.sequence_identity_denominator
1463 l.write(starting_model_ordinal_id=template.id,
1464 starting_model_id=model.name,
1465 template_auth_asym_id=template.tm_chain_id,
1466 template_seq_begin=template.tm_seq_id_begin,
1467 template_seq_end=template.tm_seq_id_end,
1468 template_sequence_identity=template.sequence_identity,
1469 template_sequence_identity_denominator=denom,
1470 template_dataset_list_id=template.tm_dataset.id
1471 if template.tm_dataset
1472 else _CifWriter.unknown)
1474 def dump_details(self, writer):
1475 writer.write_comment(
"""IMP will attempt to identify which input models
1476 are crystal structures and which are comparative models, but does not always
1477 have sufficient information to deduce all of the templates used for comparative
1478 modeling. These may need to be added manually below.""")
1479 with writer.loop(
"_ihm_starting_model_details",
1480 [
"ordinal_id",
"entity_id",
"entity_description",
1481 "asym_id",
"seq_id_begin",
1482 "seq_id_end",
"starting_model_source",
1483 "starting_model_auth_asym_id",
1484 "starting_model_sequence_offset",
1485 "starting_model_id",
1486 "dataset_list_id"])
as l:
1488 for model
in self.all_models():
1489 f = model.fragments[0]
1490 entity = self.simo.entities[f.component]
1491 chain_id = self.simo._get_chain_for_component(f.component,
1493 for source
in model.sources:
1495 source0 = model.sources[0]
# Union of seq_id ranges over all sources of this model.
1499 seq_id_begin, seq_id_end = source0.get_seq_id_range(model)
1500 for source
in model.sources[1:]:
1501 this_begin, this_end = source.get_seq_id_range(model)
1502 seq_id_begin = min(seq_id_begin, this_begin)
1503 seq_id_end = max(seq_id_end, this_end)
1504 l.write(ordinal_id=ordinal,
1505 entity_id=entity.id,
1506 entity_description=entity.description,
1508 seq_id_begin=seq_id_begin,
1509 seq_id_end=seq_id_end,
1510 starting_model_auth_asym_id=source0.chain_id,
1511 starting_model_id=model.name,
1512 starting_model_source=source0.source,
1513 starting_model_sequence_offset=f.offset,
1514 dataset_list_id=model.dataset.id)
# Dump starting-model coordinates; returns the list of MSE->MET
# sequence differences encountered, for dump_seq_dif().
1517 def dump_coords(self, writer):
1520 with writer.loop(
"_ihm_starting_model_coord",
1521 [
"starting_model_id",
"group_PDB",
"id",
"type_symbol",
1522 "atom_id",
"comp_id",
"entity_id",
"asym_id",
1523 "seq_id",
"Cartn_x",
1524 "Cartn_y",
"Cartn_z",
"B_iso_or_equiv",
1525 "ordinal_id"])
as l:
1526 for model
in self.all_models():
1527 for f
in model.fragments:
1529 residue_indexes=list(range(f.start, f.end + 1)))
1530 last_res_index =
None
1531 for a
in sel.get_selected_particles():
1534 element = atom.get_element()
1536 atom_name = atom.get_atom_type().get_string()
# Heteroatoms are marked with a 'HET:' prefix by IMP.
1538 if atom_name.startswith(
'HET:'):
1539 group_pdb =
'HETATM'
1540 atom_name = atom_name[4:]
1542 res_name = res.get_residue_type().get_string()
# Record each MSE residue once (per residue, not per atom).
1546 if res_name ==
'MSE':
1549 ind = res.get_index()
1550 if ind != last_res_index:
1551 last_res_index = ind
1556 assert(len(model.sources) == 1)
1557 seq_dif.append(_MSESeqDif(res, f.component,
1560 chain_id = self.simo._get_chain_for_component(
1561 f.component, self.output)
1562 entity = self.simo.entities[f.component]
1563 l.write(starting_model_id=model.name,
1564 group_PDB=group_pdb,
1565 id=atom.get_input_index(), type_symbol=element,
1566 atom_id=atom_name, comp_id=res_name,
1567 entity_id=entity.id,
1569 seq_id=res.get_index(), Cartn_x=coord[0],
1570 Cartn_y=coord[1], Cartn_z=coord[2],
1571 B_iso_or_equiv=atom.get_temperature_factor(),
# Dump alpha-helix assignments (_struct_conf) for rigid starting-model
# fragments, taken from HELIX records of the source PDBs.
# NOTE(review): garbled extraction — e.g. the inner fragment loop header
# and the ordinal counter are missing. Code is byte-preserved as found.
1576 class _StructConfDumper(_Dumper):
1577 def all_rigid_fragments(self):
1578 """Yield all rigid model representation fragments"""
1579 asym = _AsymIDMapper(self.simo.prot)
1580 model_repr = self.simo.model_repr_dump
1581 for comp, fragments
in model_repr.fragments.items():
# Only PDB-derived fragments modeled as rigid bodies qualify.
1583 if hasattr(f,
'pdbname') \
1584 and model_repr.get_model_mode(f) ==
'rigid':
1585 yield (f, model_repr.starting_model[comp, f.pdbname],
1588 def all_helices(self):
1589 """Yield all helices that overlap with rigid model fragments"""
1590 for f, starting_model, asym_id
in self.all_rigid_fragments():
# Only trust HELIX metadata from a single official-PDB source
# on the same chain, clipped to the fragment's residue range.
1591 if len(starting_model.sources) == 1 \
1592 and isinstance(starting_model.sources[0], _PDBSource):
1593 pdb = starting_model.sources[0]
1594 for m
in pdb.metadata:
1595 if isinstance(m, _PDBHelix) \
1596 and m.start_asym == pdb.chain_id \
1597 and m.end_asym == pdb.chain_id \
1598 and m.start_resnum >= f.start
and m.end_resnum <= f.end:
1599 yield (m, max(f.start, m.start_resnum),
1600 min(f.end, m.end_resnum), asym_id)
1602 def dump(self, writer):
# Declare the HELX_P conformation type, then one row per helix.
1603 with writer.category(
"_struct_conf_type")
as l:
1604 l.write(id=
'HELX_P', criteria=_CifWriter.unknown,
1605 reference=_CifWriter.unknown)
1613 with writer.loop(
"_struct_conf",
1614 [
"id",
"conf_type_id",
"beg_label_comp_id",
1615 "beg_label_asym_id",
"beg_label_seq_id",
1616 "end_label_comp_id",
"end_label_asym_id",
1617 "end_label_seq_id",])
as l:
1618 for h, begin, end, asym_id
in self.all_helices():
1620 l.write(id=
'HELX_P%d' % ordinal, conf_type_id=
'HELX_P',
1621 beg_label_comp_id=h.start_resnam,
1622 beg_label_asym_id=asym_id,
1623 beg_label_seq_id=begin,
1624 end_label_comp_id=h.end_resnam,
1625 end_label_asym_id=asym_id,
1626 end_label_seq_id=end)
# Marker base class; subclasses are expected to provide
# num_models_begin/num_models_end and type/feature attributes
# (read by _PostProcessDumper.dump).
1629 class _PostProcess(object):
1630 """Base class for any post processing"""
# NOTE(review): garbled extraction — e.g. "self.rex = rex" in __init__ and
# the stat-file basename argument in get_stat_file are missing from this
# view. Code below is byte-preserved as found.
1634 class _ReplicaExchangeAnalysisPostProcess(_PostProcess):
1635 """Post processing using AnalysisReplicaExchange0 macro"""
1639 def __init__(self, rex, num_models_begin):
1641 self.num_models_begin = num_models_begin
# Count the models that survived clustering by summing the lines
# of every per-cluster stat file.
1642 self.num_models_end = 0
1643 for fname
in self.get_all_stat_files():
1644 with open(fname)
as fh:
1645 self.num_models_end += len(fh.readlines())
# Path to the stat file for one cluster under the macro's output dir.
1647 def get_stat_file(self, cluster_num):
1648 return os.path.join(self.rex._outputdir,
"cluster.%d" % cluster_num,
1651 def get_all_stat_files(self):
1652 for i
in range(self.rex._number_of_clusters):
1653 yield self.get_stat_file(i)
class _SimplePostProcess(_PostProcess):
    """Simple ad hoc clustering"""
    def __init__(self, num_models_begin, num_models_end):
        """Record the model counts before and after the clustering step."""
        self.num_models_begin = num_models_begin
        self.num_models_end = num_models_end
# Dump all post-processing steps as _ihm_modeling_post_process rows.
# NOTE(review): garbled extraction — the postprocs list initialization in
# __init__ is missing from this view. Code is byte-preserved as found.
1664 class _PostProcessDumper(_Dumper):
1665 def __init__(self, simo):
1666 super(_PostProcessDumper, self).__init__(simo)
1669 def add(self, postproc):
1670 self.postprocs.append(postproc)
1671 postproc.id = len(self.postprocs)
1673 def dump(self, writer):
1674 with writer.loop(
"_ihm_modeling_post_process",
1675 [
"id",
"protocol_id",
"analysis_id",
"step_id",
1676 "type",
"feature",
"num_models_begin",
1677 "num_models_end"])
as l:
# protocol_id/analysis_id are hardcoded to 1: a single protocol
# with a single analysis is assumed here.
1680 for p
in self.postprocs:
1681 l.write(id=p.id, protocol_id=1, analysis_id=1, step_id=p.id,
1682 type=p.type, feature=p.feature,
1683 num_models_begin=p.num_models_begin,
1684 num_models_end=p.num_models_end)
# Marker base class; subclasses provide the attributes read by
# _EnsembleDumper.dump (name, postproc, model_group, feature,
# num_models, num_deposit, precision, file).
1686 class _Ensemble(object):
1687 """Base class for any ensemble"""
# Ensemble produced by the replica-exchange analysis macro: locates
# per-cluster stat/RMSF/density files and loads deposited models.
# NOTE(review): garbled extraction — missing lines include
# "self.postproc = pp" in __init__, several loop headers in
# load_all_models, and parts of _get_precision. Byte-preserved as found.
1691 class _ReplicaExchangeAnalysisEnsemble(_Ensemble):
1692 """Ensemble generated using AnalysisReplicaExchange0 macro"""
1694 def __init__(self, pp, cluster_num, model_group, num_deposit):
1696 self.model_group = model_group
1697 self.cluster_num = cluster_num
1699 self.num_deposit = num_deposit
1700 self.localization_density = {}
# Ensemble size = number of lines in the cluster's stat file.
1701 with open(pp.get_stat_file(cluster_num))
as fh:
1702 self.num_models = len(fh.readlines())
1704 def get_rmsf_file(self, component):
1705 return os.path.join(self.postproc.rex._outputdir,
1706 'cluster.%d' % self.cluster_num,
1707 'rmsf.%s.dat' % component)
# RMSF files are optional; only parsed when present on disk.
1709 def load_rmsf(self, model, component):
1710 fname = self.get_rmsf_file(component)
1711 if os.path.exists(fname):
1712 model.parse_rmsf_file(fname, component)
1714 def get_localization_density_file(self, component):
1718 return os.path.join(self.postproc.rex._outputdir,
1719 'cluster.%d' % self.cluster_num,
1720 '%s.mrc' % component)
# Localization densities are optional; registered as external files.
1722 def load_localization_density(self, mdl, component, extref_dump):
1723 fname = self.get_localization_density_file(component)
1724 if os.path.exists(fname):
1726 self.localization_density[component] = local_file
1727 extref_dump.add(local_file,
1728 _ExternalReferenceDumper.MODELING_OUTPUT)
1730 def load_all_models(self, simo):
1731 stat_fname = self.postproc.get_stat_file(self.cluster_num)
1733 with open(stat_fname)
as fh:
# Stat lines are Python literals; eval of trusted macro output.
1734 stats = eval(fh.readline())
1736 rmf_file = os.path.join(os.path.dirname(stat_fname),
1737 "%d.rmf3" % model_num)
1738 for c
in simo.all_modeled_components:
1740 simo._representation.set_coordinates_from_rmf(c, rmf_file, 0,
1741 force_rigid_update=
True)
# Only the first num_deposit models are deposited.
1745 if model_num >= self.num_deposit:
1749 def _get_precision(self):
1750 precfile = os.path.join(self.postproc.rex._outputdir,
1751 "precision.%d.%d.out" % (self.cluster_num,
1753 if not os.path.exists(precfile):
1754 return _CifWriter.unknown
# Precision = average centroid distance reported for this cluster.
1756 r = re.compile(
'All .*/cluster.%d/ average centroid distance ([\d\.]+)'
1758 with open(precfile)
as fh:
1762 return float(m.group(1))
1764 feature = property(
lambda self: self.postproc.feature)
1765 name = property(
lambda self:
"Cluster %d" % (self.cluster_num + 1))
1766 precision = property(
lambda self: self._get_precision())
# NOTE(review): garbled extraction — missing lines include
# "self.postproc = pp" in __init__ and the extref_dump parameter line of
# load_localization_density. Code is byte-preserved as found.
1768 class _SimpleEnsemble(_Ensemble):
1769 """Simple manually-created ensemble"""
1773 def __init__(self, pp, model_group, num_models, drmsd,
1774 num_models_deposited, ensemble_file):
1775 self.file = ensemble_file
1777 self.model_group = model_group
1778 self.num_deposit = num_models_deposited
1779 self.localization_density = {}
1780 self.num_models = num_models
# Precision of a simple ensemble is the supplied dRMSD value.
1781 self.precision = drmsd
1783 def load_localization_density(self, mdl, component, local_file,
1785 self.localization_density[component] = local_file
1786 extref_dump.add(local_file,
1787 _ExternalReferenceDumper.MODELING_OUTPUT)
1789 name = property(
lambda self: self.model_group.name)
# Dump all ensembles as _ihm_ensemble_info rows.
# NOTE(review): garbled extraction — the ensembles list initialization in
# __init__ is missing from this view. Code is byte-preserved as found.
1792 class _EnsembleDumper(_Dumper):
1793 def __init__(self, simo):
1794 super(_EnsembleDumper, self).__init__(simo)
1797 def add(self, ensemble):
1798 self.ensembles.append(ensemble)
1799 ensemble.id = len(self.ensembles)
1801 def dump(self, writer):
1802 with writer.loop(
"_ihm_ensemble_info",
1803 [
"ensemble_id",
"ensemble_name",
"post_process_id",
1804 "model_group_id",
"ensemble_clustering_method",
1805 "ensemble_clustering_feature",
1806 "num_ensemble_models",
1807 "num_ensemble_models_deposited",
1808 "ensemble_precision_value",
1809 "ensemble_file_id"])
as l:
1810 for e
in self.ensembles:
1811 l.write(ensemble_id=e.id, ensemble_name=e.name,
1812 post_process_id=e.postproc.id,
1813 model_group_id=e.model_group.id,
1814 ensemble_clustering_feature=e.feature,
1815 num_ensemble_models=e.num_models,
1816 num_ensemble_models_deposited=e.num_deposit,
1817 ensemble_precision_value=e.precision,
# File ID is omitted for ensembles with no deposited file.
1818 ensemble_file_id=e.file.id
if e.file
1819 else _CifWriter.omitted)
# NOTE(review): garbled extraction — missing lines include the ensembles
# list init, the skip-if-no-density guard, the ordinal counter, and the
# trailing l.write() arguments. Code is byte-preserved as found.
1821 class _DensityDumper(_Dumper):
1822 """Output localization densities for ensembles"""
1824 def __init__(self, simo):
1825 super(_DensityDumper, self).__init__(simo)
1829 def add(self, ensemble):
1830 self.ensembles.append(ensemble)
1832 def get_density(self, ensemble, component):
1833 return ensemble.localization_density.get(component,
None)
1835 def dump(self, writer):
1836 with writer.loop(
"_ihm_localization_density_files",
1837 [
"id",
"file_id",
"ensemble_id",
"entity_id",
1838 "asym_id",
"seq_id_begin",
"seq_id_end"])
as l:
# One row per (ensemble, component) that has a density file.
1840 for ensemble
in self.ensembles:
1841 for comp
in self.simo.all_modeled_components:
1842 density = self.get_density(ensemble, comp)
1845 entity = self.simo.entities[comp]
# Density spans the whole entity sequence.
1846 lenseq = len(entity.sequence)
1847 chain_id = self.simo._get_chain_for_component(comp,
1849 l.write(id=ordinal, ensemble_id=ensemble.id,
1850 entity_id=entity.id,
1852 seq_id_begin=1, seq_id_end=lenseq,
1856 class _Entity(object):
1857 """Represent a CIF entity (a chain with a unique sequence)"""
1858 def __init__(self, seq, first_component):
1860 self.first_component = first_component
1862 self.description = first_component
# NOTE(review): garbled extraction — the "def __init__(self):" header, the
# "self._entities = []" init, and the "def get_all(self):" header are
# missing from this view. Code is byte-preserved as found.
1864 class _EntityMapper(dict):
1865 """Handle mapping from IMP components to CIF entities.
1866 Multiple components may map to the same entity if they share sequence."""
1868 super(_EntityMapper, self).__init__()
1869 self._sequence_dict = {}
# Map a component to an entity, creating a new entity (with the next
# sequential ID) only for a previously-unseen sequence.
1872 def add(self, component_name, sequence):
1873 if sequence
not in self._sequence_dict:
1874 entity = _Entity(sequence, component_name)
1875 self._entities.append(entity)
1876 entity.id = len(self._entities)
1877 self._sequence_dict[sequence] = entity
1878 self[component_name] = self._sequence_dict[sequence]
1881 """Yield all entities"""
1882 return self._entities
# NOTE(review): garbled extraction — the cache guard ("if self.__dataset:"),
# the else branch header, and the final cache assignment/return lines of
# __get_dataset are missing from this view. Byte-preserved as found.
1885 class _RestraintDataset(object):
1886 """Wrapper around a dataset associated with a restraint.
1887 This is needed because we need to delay access to the dataset
1888 in case the writer of the PMI script overrides or changes it
1889 after creating the restraint."""
1890 def __init__(self, restraint, num, allow_duplicates):
1891 self.restraint = restraint
# num selects one of restraint.datasets when not None; otherwise the
# single restraint.dataset is used.
1892 self.num, self.allow_duplicates = num, allow_duplicates
1893 self.__dataset =
None
# Resolve (and deep-copy) the restraint's dataset lazily, so script
# changes made after restraint creation are still picked up.
1894 def __get_dataset(self):
1896 return self.__dataset
1897 if self.num
is not None:
1898 d = copy.deepcopy(self.restraint.datasets[self.num])
1900 d = copy.deepcopy(self.restraint.dataset)
1901 if self.allow_duplicates:
1902 d.location._allow_duplicates =
True
1906 dataset = property(__get_dataset)
1910 """Class to encode a modeling protocol as mmCIF.
1912 IMP has basic support for writing out files in mmCIF format, for
1913 deposition in [PDB-dev](https://pdb-dev.rcsb.rutgers.edu/).
1914 After creating an instance of this class, attach it to an
1915 IMP.pmi.representation.Representation object. After this, any
1916 generated models and metadata are automatically collected and
# Set up all per-category dumpers and shared bookkeeping for mmCIF output.
# NOTE(review): garbled extraction — several init lines (e.g. the
# _file_dataset/chains dicts and some dumpers in the _dumpers list) are
# missing from this view. Code is byte-preserved as found.
1919 def __init__(self, fh):
1920 self._main_script = os.path.abspath(sys.argv[0])
1921 self._working_directory = os.getcwd()
1922 self._cif_writer = _CifWriter(fh)
1923 self.entities = _EntityMapper()
1925 self._all_components = {}
1926 self.model_groups = []
1927 self.default_model_group =
None
1928 self.sequence_dict = {}
1929 self.all_modeled_components = []
1930 self.model_repr_dump = _ModelRepresentationDumper(self)
1931 self.cross_link_dump = _CrossLinkDumper(self)
1932 self.em2d_dump = _EM2DDumper(self)
1933 self.em3d_dump = _EM3DDumper(self)
1934 self.model_prot_dump = _ModelProtocolDumper(self)
1935 self.extref_dump = _ExternalReferenceDumper(self)
1936 self.dataset_dump = _DatasetDumper(self)
1937 self.starting_model_dump = _StartingModelDumper(self)
1938 self.assembly_dump = _AssemblyDumper(self)
# Until told otherwise, the modeled assembly is the complete one.
1941 self.complete_assembly = _Assembly()
1942 self.assembly_dump.add(self.complete_assembly)
1946 self.modeled_assembly = self.complete_assembly
1948 self.model_dump = _ModelDumper(self)
# Share the starting-model map so the representation dumper can
# reference starting models by (component, pdbname).
1949 self.model_repr_dump.starting_model \
1950 = self.starting_model_dump.starting_model
1951 self.software_dump = _SoftwareDumper(self)
1952 self.post_process_dump = _PostProcessDumper(self)
1953 self.ensemble_dump = _EnsembleDumper(self)
1954 self.density_dump = _DensityDumper(self)
# These dumpers need direct access to the list of deposited models.
1958 self.cross_link_dump.models = self.model_dump.models
1959 self.em3d_dump.models = self.model_dump.models
# Dumpers run in this order when the file is flushed.
1961 self._dumpers = [_EntryDumper(self),
1962 _AuditAuthorDumper(self),
1963 self.software_dump, _CitationDumper(self),
1964 _ChemCompDumper(self),
1965 _EntityDumper(self),
1966 _EntityPolyDumper(self), _EntityPolySeqDumper(self),
1967 _StructAsymDumper(self),
1969 self.model_repr_dump, self.extref_dump,
1971 self.cross_link_dump,
1972 self.em2d_dump, self.em3d_dump,
1973 self.starting_model_dump,
1976 self.model_prot_dump, self.post_process_dump,
1977 self.ensemble_dump, self.density_dump, self.model_dump]
def get_file_dataset(self, fname):
    """Return the dataset registered for *fname* (keyed by absolute
    path), or None if no dataset was associated with that file."""
    key = os.path.abspath(fname)
    return self._file_dataset.get(key, None)
1982 def _get_chain_for_component(self, name, output):
1983 """Get the chain ID for a component, if any."""
# NOTE(review): garbled extraction — the else branch for components
# without a chain (orig 1988-1989) is missing from this view.
1985 if name
in self.chains:
1986 chain = self.chains[name]
# Translate the internal chain index to the output chain ID.
1987 return output.chainids[chain]
1990 return _CifWriter.omitted
# Register a new component; modeled components also get a chain and are
# added to the modeled assembly.
# NOTE(review): garbled extraction — the "if modeled:" guard and related
# lines (orig 1994, 2000-2001) are missing from this view.
1992 def create_component(self, name, modeled):
1993 self._all_components[name] =
None
1995 self.all_modeled_components.append(name)
1996 self.chains[name] = len(self.chains)
1997 if self.modeled_assembly
is not self.complete_assembly:
1998 self.modeled_assembly.append(name)
# First non-modeled component seen: fork the modeled assembly off
# the complete assembly so the two can diverge.
1999 elif self.complete_assembly
is self.modeled_assembly:
2002 self.modeled_assembly = _Assembly(self.complete_assembly)
2003 self.assembly_dump.add(self.modeled_assembly)
2004 self.complete_assembly.append(name)
def add_component_sequence(self, name, seq):
    """Register the primary sequence of component *name* and map the
    component to a CIF entity (entities are shared between components
    with identical sequences)."""
    self.entities.add(name, seq)
    self.sequence_dict[name] = seq
# NOTE(review): orphaned body of the flush/output driver — its "def" line
# (orig 2010) and the "dumper.finalize()" call (orig 2014) were lost in
# extraction. Three passes over the dumpers: finalize_metadata, finalize,
# then dump to the CIF writer. Byte-preserved as found.
2011 for dumper
in self._dumpers:
2012 dumper.finalize_metadata()
2013 for dumper
in self._dumpers:
2015 for dumper
in self._dumpers:
2016 dumper.dump(self._cif_writer)
def add_pdb_element(self, name, start, end, offset, pdbname, chain, hier):
    """Register a PDB-derived fragment with both the model-representation
    dumper and the starting-model dumper."""
    fragment = _PDBFragment(self.m, name, start, end, offset, pdbname,
                            chain, hier)
    self.model_repr_dump.add_fragment(fragment)
    self.starting_model_dump.add_pdb_fragment(fragment)
def add_bead_element(self, name, start, end, num, hier):
    """Register a coarse-grained beads fragment with the
    model-representation dumper."""
    fragment = _BeadsFragment(self.m, name, start, end, num, hier)
    self.model_repr_dump.add_fragment(fragment)
def _get_restraint_dataset(self, r, num=None, allow_duplicates=False):
    """Get a wrapper object for the dataset used by this restraint.
    This is needed because the restraint's dataset may be changed
    in the PMI script before we get to use it.

    Returns the _RestraintDataset wrapper; callers such as
    get_cross_link_group, add_em2d_restraint and add_em3d_restraint
    rely on the return value.
    """
    rs = _RestraintDataset(r, num, allow_duplicates)
    self.dataset_dump.add(rs)
    return rs
def get_cross_link_group(self, r):
    """Wrap restraint *r* together with its (lazily-resolved) dataset in
    a _CrossLinkGroup."""
    dataset = self._get_restraint_dataset(r)
    return _CrossLinkGroup(r, dataset)
# Record an experimental cross-link between residues r1/r2 of components
# c1/c2, with the given linker length, in the given cross-link group.
# NOTE(review): garbled extraction — the early-return branch for unknown
# components (orig 2040-2044) and the final "return xl" (orig 2047) are
# missing from this view. Code is byte-preserved as found.
2038 def add_experimental_cross_link(self, r1, c1, r2, c2, length, group):
2039 if c1
not in self._all_components
or c2
not in self._all_components:
2045 xl = _ExperimentalCrossLink(r1, c1, r2, c2, length, group)
2046 self.cross_link_dump.add_experimental(xl)
def add_cross_link(self, ex_xl, p1, p2, sigma1, sigma2, psi):
    """Record a modeled cross-link (particles p1/p2 plus nuisance
    parameters) that realizes the experimental cross-link *ex_xl*."""
    modeled = _CrossLink(ex_xl, p1, p2, sigma1, sigma2, psi)
    self.cross_link_dump.add(modeled)
def add_replica_exchange(self, rex):
    """Record a replica-exchange sampling run as a modeling protocol
    step."""
    protocol = _ReplicaExchangeProtocol(rex)
    self.model_prot_dump.add(protocol)
def add_model_group(self, group):
    """Register *group*, assign it the next sequential ID, and return it.

    Callers (_add_simple_ensemble, add_replica_exchange_analysis,
    add_model) rely on getting the group back.
    """
    self.model_groups.append(group)
    group.id = len(self.model_groups)
    return group
def _add_simple_postprocessing(self, num_models_begin, num_models_end):
    """Record an ad hoc clustering step and return the new post-process
    object (callers pass it on to _add_simple_ensemble)."""
    pp = _SimplePostProcess(num_models_begin, num_models_end)
    self.post_process_dump.add(pp)
    return pp
# NOTE(review): garbled extraction — missing lines include the
# ensemble_file parameter line, a guard on "den" before loading, and the
# final "return e". Code is byte-preserved as found.
2069 def _add_simple_ensemble(self, pp, name, num_models, drmsd,
2070 num_models_deposited, localization_densities,
2072 """Add an ensemble generated by ad hoc methods (not using PMI).
2073 This is currently only used by the Nup84 system."""
2074 group = self.add_model_group(_ModelGroup(name))
# The ensemble file itself is deposited as an external modeling output.
2075 self.extref_dump.add(ensemble_file,
2076 _ExternalReferenceDumper.MODELING_OUTPUT)
2077 e = _SimpleEnsemble(pp, group, num_models, drmsd, num_models_deposited,
2079 self.ensemble_dump.add(e)
2080 self.density_dump.add(e)
# Attach any provided per-component localization densities.
2081 for c
in self.all_modeled_components:
2082 den = localization_densities.get(c,
None)
2084 e.load_localization_density(self.m, c, den, self.extref_dump)
# Record a replica-exchange analysis (clustering) run: one post-process
# step plus one ensemble/model-group per cluster, loading densities,
# deposited models, and RMSF data for each.
# NOTE(review): garbled extraction — missing lines include the
# post-processing docstring, the stats/model handling inside the
# load_all_models loop, and the RMSF-loading call. Byte-preserved as found.
2093 num_models = self.model_prot_dump.get_last_protocol().num_models_end
2094 pp = _ReplicaExchangeAnalysisPostProcess(rex, num_models)
2095 self.post_process_dump.add(pp)
2096 for i
in range(rex._number_of_clusters):
2097 group = self.add_model_group(_ModelGroup(
'Cluster %d' % (i + 1)))
# One deposited model per cluster (num_deposit=1).
2099 e = _ReplicaExchangeAnalysisEnsemble(pp, i, group, 1)
2100 self.ensemble_dump.add(e)
2101 self.density_dump.add(e)
2103 for c
in self.all_modeled_components:
2104 e.load_localization_density(self.m, c, self.extref_dump)
2105 for stats
in e.load_all_models(self):
2106 m = self.add_model(group)
2109 m.name =
'Best scoring model'
# Don't recenter cluster models; keep original coordinates.
2112 m.geometric_center = [0,0,0]
2114 for c
in self.all_modeled_components:
def add_em2d_restraint(self, r, i, resolution, pixel_size,
                       image_resolution, projection_number):
    """Record a 2D-EM class-average restraint for later mmCIF output.

    The dataset is wrapped via _get_restraint_dataset so that late
    changes to the restraint's dataset are picked up at dump time.
    """
    dataset = self._get_restraint_dataset(r, i)
    restraint = _EM2DRestraint(dataset, resolution, pixel_size,
                               image_resolution, projection_number)
    self.em2d_dump.add(restraint)
def add_em3d_restraint(self, target_ps, densities, r):
    """Record a 3D-EM density restraint for later mmCIF output."""
    # Duplicates allowed — presumably the same map file may back several
    # restraints (original explanatory comment lost; TODO confirm).
    dataset = self._get_restraint_dataset(r, allow_duplicates=True)
    self.em3d_dump.add(_EM3DRestraint(self, dataset, target_ps, densities))
# Add a newly-generated model to *group* (creating a default "All models"
# group on first use) and return the _Model from the model dumper.
# NOTE(review): garbled extraction — the outer "if group is None:" line
# (orig 2130) is missing from this view. Code is byte-preserved as found.
2129 def add_model(self, group=None):
2131 if self.default_model_group
is None:
2132 self.default_model_group \
2133 = self.add_model_group(_ModelGroup(
"All models"))
2134 group = self.default_model_group
2135 return self.model_dump.add(self.prot,
2136 self.model_prot_dump.get_last_protocol(),
2137 self.modeled_assembly, group)
2139 def _update_location(self, fileloc):
2140 """Update FileLocation to point to a parent repository, if any"""
2141 all_repos = [m
for m
in self._metadata
Select non water and non hydrogen atoms.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Class to encode a modeling protocol as mmCIF.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
ElementTable & get_element_table()
static bool get_is_setup(const IMP::ParticleAdaptor &p)
void read_pdb(TextInput input, int model, Hierarchy h)
Representation of the system.
A decorator for a particle representing an atom.
Base class for capturing a modeling protocol.
Basic utilities for handling cryo-electron microscopy 3D density maps.
A decorator for a particle with x,y,z coordinates.
Class for easy writing of PDBs, RMFs, and stat files.
Classes for writing output files and processing them.
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Functionality for loading, creating, manipulating and scoring atomic structures.
Hierarchies get_leaves(const Selection &h)
Select hierarchy particles identified by the biological name.
Select all ATOM and HETATM records with the given chain ids.
ResidueType get_residue_type(char c)
Get the residue type from the 1-letter amino acid code.
Inferential scoring building on methods developed as part of the Inferential Structure Determination ...