IMP logo
IMP Reference Guide  develop.330bebda01,2025/01/21
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 import IMP.atom
6 import ihm.location
7 import ihm.metadata
8 import ihm.startmodel
9 import ihm.analysis
10 import ihm.protocol
11 import ihm.model
12 import ihm.citations
13 import ihm.reference
14 import operator
15 import inspect
16 import warnings
17 
18 
19 # Map from IMP ResidueType to ihm ChemComp
20 _imp_to_ihm = {}
21 
22 
23 def _fill_imp_to_ihm():
24  d = dict(x for x in inspect.getmembers(IMP.atom)
25  if isinstance(x[1], IMP.atom.ResidueType))
26  # Handle standard amino acids plus some extras like MSE, UNK
27  for comp in ihm.LPeptideAlphabet._comps.values():
28  if comp.id in d:
29  _imp_to_ihm[d[comp.id]] = comp
30  # Handle RNA and DNA
31  alpha = ihm.RNAAlphabet()
32  for code in ['ADE', 'CYT', 'GUA', 'URA']:
33  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
34  alpha = ihm.DNAAlphabet()
35  for code in ['DADE', 'DCYT', 'DGUA', 'DTHY']:
36  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
37  # Pass through missing IMP residue
38  _imp_to_ihm[None] = None
39 
40 
41 _fill_imp_to_ihm()
42 
43 
44 def get_molecule(h):
45  """Given a Hierarchy, walk up and find the parent Molecule"""
46  while h:
48  return IMP.atom.Molecule(h)
49  h = h.get_parent()
50  return None
51 
52 
53 def _check_sequential(fragment, resinds):
54  for i in range(1, len(resinds)):
55  if resinds[i - 1] + 1 != resinds[i]:
56  warnings.warn(
57  "%s: non-sequential residue indices; mmCIF bead will cover "
58  "indices [%d-%d]"
59  % (str(fragment), resinds[0], resinds[-1]))
60 
61 
62 def _get_all_state_provenance(state_h, top_h, types):
63  """Yield all provenance information for the given state.
64  If the given State Hierarchy node contains no provenance information,
65  fall back to any provenance information for the top-level node
66  (if provided)."""
67  count = 0
68  for p in IMP.core.get_all_provenance(state_h, types=types):
69  count += 1
70  yield p
71  if count == 0 and top_h is not None:
72  for p in IMP.core.get_all_provenance(top_h, types=types):
73  yield p
74 
75 
76 class _CustomDNAAlphabet(ihm.Alphabet):
77  """Custom DNA alphabet that maps A,C,G,T (rather than DA,DC,DG,DT
78  as in python-ihm)"""
79  _comps = dict([cc.code_canonical, cc]
80  for cc in ihm.DNAAlphabet._comps.values())
81 
82 
83 class _EntityMapper(dict):
84  """Handle mapping from IMP chains to CIF entities.
85  Multiple components may map to the same entity if they
86  share sequence."""
87  def __init__(self, system):
88  self.system = system
89  super().__init__()
90  self._sequence_dict = {}
91  self._entities = []
92  self._alphabet_map = {
93  IMP.atom.UnknownChainType: ihm.LPeptideAlphabet,
94  IMP.atom.Protein: ihm.LPeptideAlphabet,
95  IMP.atom.RNA: ihm.RNAAlphabet,
96  IMP.atom.DNA: _CustomDNAAlphabet}
97 
98  def _get_sequence_from_residues(self, chain, seq_from_res):
99  seq_id_begin, seq = seq_from_res
100  if not seq:
101  raise ValueError("Chain %s has no sequence and no residues"
102  % chain)
103  missing_seq = [ind + seq_id_begin
104  for (ind, res) in enumerate(seq) if res is None]
105  if missing_seq:
106  raise ValueError(
107  "Chain %s has no declared sequence; tried to determine the "
108  "sequence from Residues, but the following residue indices "
109  "have no residue type (perhaps covered only by Fragments): %s"
110  % (chain, str(missing_seq)))
111  return seq_id_begin - 1, tuple(seq)
112 
113  def add(self, chain, seq_from_res=None):
114  sequence = chain.get_sequence()
115  offset = chain.get_sequence_offset()
116  if sequence == '':
117  if seq_from_res is not None:
118  offset, sequence = self._get_sequence_from_residues(
119  chain, seq_from_res)
120  else:
121  raise ValueError("Chain %s has no sequence" % chain)
122  else:
123  # Map one-letter codes to ihm.ChemComp
124  alphabet = self._alphabet_map[chain.get_chain_type()]()
125  sequence = tuple(alphabet[x] for x in sequence)
126  if sequence not in self._sequence_dict:
127  entity = ihm.Entity(sequence)
128  self.system.entities.append(entity)
129  self._entities.append(entity)
130  self._sequence_dict[sequence] = entity
131  uniprot = chain.get_uniprot_accession()
132  if uniprot:
133  up = ihm.reference.UniProtSequence.from_accession(uniprot)
134  entity.references.append(up)
135  self[chain] = self._sequence_dict[sequence]
136  return self[chain], offset
137 
138  def get_all(self):
139  """Yield all entities"""
140  return self._entities
141 
142 
143 def _assign_id(obj, seen_objs, obj_by_id):
144  """Assign a unique ID to obj, and track all ids in obj_by_id."""
145  if obj not in seen_objs:
146  if not hasattr(obj, 'id'):
147  obj_by_id.append(obj)
148  obj.id = len(obj_by_id)
149  seen_objs[obj] = obj.id
150  else:
151  obj.id = seen_objs[obj]
152 
153 
154 class _Component:
155  """An mmCIF component. This is an instance of an _Entity. Multiple
156  _Components may map to the same _Entity but must have unique
157  asym_ids. A _Component is similar to an IMP Chain but multiple
158  Chains may map to the same _Component (the Chains represent the
159  same structure, just in different states, and potentially in
160  different IMP Models). A _Component may also represent something
161  that is described by an experiment but was not modeled by IMP, and
162  so no Chains map to it but a string name does."""
163  def __init__(self, entity, asym_id, name):
164  self.entity, self.asym_id, self.name = entity, asym_id, name
165 
166 
167 class _ComponentMapper:
168  """Handle mapping from IMP Chains to CIF AsymUnits."""
169  def __init__(self, system):
170  super().__init__()
171  self.system = system
172  self._used_entities = set()
173  self._all_components = []
174  self._map = {}
175 
176  def __getitem__(self, chain):
177  asym_id, map_key, name = self._handle_chain(chain)
178  return self._map[map_key]
179 
180  def _handle_chain(self, chain):
181  mol = get_molecule(chain)
182  asym_id = chain.get_id()
183  name = mol.get_name() if mol else None
184  # Avoid conflict between name="A" and asym_id="A"
185  if name:
186  map_key = "name", name
187  else:
188  map_key = "asym_id", asym_id
189  return asym_id, map_key, name
190 
191  def add(self, chain, entity, offset):
192  """Add a chain (an IMP Chain object)"""
193  asym_id, map_key, name = self._handle_chain(chain)
194  if map_key not in self._map:
195  component = _Component(entity, asym_id, name)
196  if entity not in self._used_entities:
197  self._used_entities.add(entity)
198  # Assign entity name from the component; strip out anything
199  # after a @ or .
200  if component.name:
201  entity.description = \
202  component.name.split("@")[0].split(".")[0]
203  self._all_components.append(component)
204  asym = ihm.AsymUnit(entity, name, id=asym_id,
205  auth_seq_id_map=offset)
206  self.system.asym_units.append(asym)
207  component.asym_unit = asym
208  self._map[map_key] = component
209  else:
210  component = self._map[map_key]
211  if component.entity != entity:
212  raise ValueError("Two chains have the same ID (%s) but "
213  "different sequences - rename one of the "
214  "chains" % component.asym_unit.id)
215  if component.asym_unit.auth_seq_id_map != offset:
216  raise ValueError(
217  "Two chains have the same ID (%s) but different offsets "
218  "(%d, %d) - this is not supported"
219  % (component.asym_unit.id,
220  component.asym_unit.auth_seq_id_map, offset))
221  return component
222 
223  def get_all(self):
224  """Get all components"""
225  return self._all_components
226 
227 
228 class _RepSegmentFactory:
229  """Make ihm.representation.Segment objects for each set of contiguous
230  particles with the same representation"""
231  def __init__(self, asym):
232  self.asym = asym
233  # Offset from IHM to IMP numbering
234  self.offset = asym.auth_seq_id_map
235  self.particles = []
236  self.imp_residue_range = () # inclusive range, using IMP numbering
237 
238  def add(self, particle, starting_model):
239  """Add a new particle to the last segment (and return None).
240  Iff the particle could not be added, return the segment and start
241  a new one."""
242  (resrange, rigid_body,
243  is_res, is_atom) = self._get_particle_info(particle)
244 
245  def start_new_segment():
246  self.particles = [particle]
247  self.imp_residue_range = resrange
248  self.rigid_body = rigid_body
249  self.is_res = is_res
250  self.is_atom = is_atom
251  self.starting_model = starting_model
252  if not self.particles:
253  # First particle in a segment
254  start_new_segment()
255  elif (type(particle) == type(self.particles[0]) # noqa: E721
256  and is_res == self.is_res
257  and is_atom == self.is_atom
258  and resrange[0] <= self.imp_residue_range[1] + 1
259  and starting_model == self.starting_model
260  and self._same_rigid_body(rigid_body)):
261  # Continue an existing segment
262  self.particles.append(particle)
263  self.imp_residue_range = (self.imp_residue_range[0], resrange[1])
264  else:
265  # Make a new segment
266  seg = self.get_last()
267  start_new_segment()
268  return seg
269 
270  def get_last(self):
271  """Return the last segment, or None"""
272  if self.particles:
273  # Convert residue_range from IMP to IHM
274  asym = self.asym(self.imp_residue_range[0] - self.offset,
275  self.imp_residue_range[1] - self.offset)
276  if self.is_atom:
277  return ihm.representation.AtomicSegment(
278  asym_unit=asym, rigid=self.rigid_body is not None,
279  starting_model=self.starting_model)
280  elif self.is_res:
281  return ihm.representation.ResidueSegment(
282  asym_unit=asym,
283  rigid=self.rigid_body is not None, primitive='sphere',
284  starting_model=self.starting_model)
285  else:
286  return ihm.representation.FeatureSegment(
287  asym_unit=asym,
288  rigid=self.rigid_body is not None, primitive='sphere',
289  count=len(self.particles),
290  starting_model=self.starting_model)
291 
292  def _same_rigid_body(self, rigid_body):
293  # Note: can't just use self.rigid_body == rigid_body as IMP may
294  # crash when comparing a RigidBody object against None
295  if self.rigid_body is None and rigid_body is None:
296  return True
297  elif self.rigid_body is None or rigid_body is None:
298  return False
299  else:
300  return self.rigid_body == rigid_body
301 
302  def _get_particle_info(self, p):
303  # Note that we consider nonrigid members to not be rigid here
305  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
306  else:
307  rigid_body = None
308  if isinstance(p, IMP.atom.Residue):
309  return (p.get_index(), p.get_index()), rigid_body, True, False
310  elif isinstance(p, IMP.atom.Atom):
311  res = IMP.atom.get_residue(p)
312  return (res.get_index(), res.get_index()), rigid_body, False, True
313  elif isinstance(p, IMP.atom.Fragment):
314  resinds = p.get_residue_indexes()
315  return (resinds[0], resinds[-1]), rigid_body, False, False
316  raise TypeError("Unknown particle ", p)
317 
318 
319 def _get_all_structure_provenance(p):
320  """Yield all StructureProvenance decorators for the given particle."""
322 
323 
324 class _StartingModelAtomHandler:
325  def __init__(self, templates, asym):
326  self._seq_dif = []
327  self._last_res_index = None
328  self.templates = templates
329  self.asym = asym
330 
331  def _residue_first_atom(self, res):
332  """Return True iff we're looking at the first atom in this residue"""
333  # Only add one seq_dif record per residue
334  ind = res.get_index()
335  if ind != self._last_res_index:
336  self._last_res_index = ind
337  return True
338 
339  def handle_residue(self, res, comp_id, seq_id, offset):
340  res_name = res.get_residue_type().get_string()
341  # MSE in the original PDB is automatically mutated
342  # by IMP to MET, so reflect that in the output,
343  # and pass back to populate the seq_dif category.
344  if res_name == 'MSE' and comp_id == 'MET':
345  if self._residue_first_atom(res):
346  # This should only happen when we're using
347  # a crystal structure as the source (a
348  # comparative model would use MET in
349  # the sequence)
350  assert len(self.templates) == 0
351  self._seq_dif.append(ihm.startmodel.MSESeqDif(
352  res.get_index(), seq_id))
353  elif res_name != comp_id:
354  if self._residue_first_atom(res):
355  print("WARNING: Starting model residue %s does not match "
356  "that in the output model (%s) for chain %s residue %d. "
357  "Check offset (currently %d)."
358  % (res_name, comp_id, self.asym._id, seq_id, offset))
359  self._seq_dif.append(ihm.startmodel.SeqDif(
360  db_seq_id=res.get_index(), seq_id=seq_id,
361  db_comp_id=res_name,
362  details="Mutation of %s to %s" % (res_name, comp_id)))
363 
364  def get_ihm_atoms(self, particles, offset):
365  for a in particles:
366  coord = IMP.core.XYZ(a).get_coordinates()
367  atom = IMP.atom.Atom(a)
368  element = atom.get_element()
369  element = IMP.atom.get_element_table().get_name(element)
370  atom_name = atom.get_atom_type().get_string()
371  het = atom_name.startswith('HET:')
372  if het:
373  atom_name = atom_name[4:]
374  res = IMP.atom.get_residue(atom)
375 
376  seq_id = res.get_index() + offset
377  comp_id = self.asym.entity.sequence[seq_id-1].id
378  self.handle_residue(res, comp_id, seq_id, offset)
379  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
380  atom_id=atom_name, type_symbol=element,
381  x=coord[0], y=coord[1], z=coord[2],
382  het=het, biso=atom.get_temperature_factor())
383 
384 
385 class _StartingModel(ihm.startmodel.StartingModel):
386  _eq_keys = ['filename', 'asym_id', 'offset']
387 
388  def __init__(self, asym_unit, struc_prov):
389  self.filename = struc_prov[0].get_filename()
390  super().__init__(
391  asym_unit=asym_unit(1, 1), # will update in _add_residue()
392  # will fill in later with _set_sources_datasets()
393  dataset=None,
394  asym_id=struc_prov[0].get_chain_id(),
395  offset=struc_prov[0].get_residue_offset())
396 
397  def _add_residue(self, resind):
398  # Update seq_id_range to accommodate this residue
399  seq_id_end = resind
400  seq_id_begin = self.asym_unit.seq_id_range[0]
401  if seq_id_begin == 0:
402  seq_id_begin = seq_id_end
403  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
404 
405  # Two starting models with same filename, chain ID, and offset
406  # compare identical
407  # note: this results in separate starting models if only the
408  # offset differs; maybe consolidate into one?
409  def _eq_vals(self):
410  return tuple([self.__class__]
411  + [getattr(self, x) for x in self._eq_keys])
412 
413  def __eq__(self, other):
414  return other is not None and self._eq_vals() == other._eq_vals()
415 
416  def __hash__(self):
417  return hash(self._eq_vals())
418 
419  def _set_sources_datasets(self, system, datasets):
420  # Attempt to identify PDB file vs. comparative model
421  if (hasattr(ihm.metadata, 'CIFParser')
422  and self.filename.endswith('.cif')):
423  p = ihm.metadata.CIFParser()
424  else:
425  p = ihm.metadata.PDBParser()
426  r = p.parse_file(self.filename)
427  system.software.extend(r.get('software', []))
428  dataset = datasets.add(r['dataset'])
429  # We only want the templates that model the starting model chain
430  templates = r.get('templates', {}).get(self.asym_id, [])
431  for t in templates:
432  if t.alignment_file:
433  system.locations.append(t.alignment_file)
434  if t.dataset:
435  datasets.add(t.dataset)
436  self.dataset = dataset
437  self.templates = templates
438  self.metadata = r.get('metadata', [])
439 
440  def _read_coords(self):
441  """Read the coordinates for this starting model"""
442  m = IMP.Model()
443  # todo: support reading other subsets of the atoms (e.g. CA/CB)
444  slt = IMP.atom.ChainPDBSelector([self.asym_id]) \
446  hier = IMP.atom.read_pdb_or_mmcif(self.filename, m, slt)
447  rng = self.asym_unit.seq_id_range
448  sel = IMP.atom.Selection(
449  hier, residue_indexes=list(range(rng[0] - self.offset,
450  rng[1] + 1 - self.offset)))
451  return m, sel
452 
453  def get_seq_dif(self):
454  return self._seq_dif # filled in by get_atoms()
455 
456  def get_atoms(self):
457  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
458  m, sel = self._read_coords()
459  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
460  yield a
461  self._seq_dif = mh._seq_dif
462 
463 
464 class _StartingModelFinder:
465  """Map IMP particles to starting model objects"""
466  def __init__(self, asym, existing_starting_models, system, datasets):
467  self._seen_particles = {}
468  self._asym = asym
469  self._seen_starting_models = {}
470  for sm in existing_starting_models:
471  self._seen_starting_models[sm] = sm
472  self._system = system
473  self._datasets = datasets
474 
475  def find(self, particle):
476  """Return a StartingModel object, or None, for this particle"""
477  def _get_starting_model(sp, resind):
478  s = _StartingModel(self._asym, sp)
479  if s not in self._seen_starting_models:
480  self._seen_starting_models[s] = s
481  s._set_sources_datasets(self._system, self._datasets)
482  self._system.orphan_starting_models.append(s)
483  s = self._seen_starting_models[s]
484  if s:
485  s._add_residue(resind)
486  return s
487  resind = None
488  if IMP.atom.Residue.get_is_setup(particle):
489  resind = IMP.atom.Residue(particle).get_index()
490  sp = list(_get_all_structure_provenance(particle))
491  if sp:
492  return _get_starting_model(sp, resind)
493  elif IMP.atom.Hierarchy.get_is_setup(particle):
494  h = IMP.atom.Hierarchy(particle).get_parent()
495  # Remember all nodes we inspect
496  seen_parents = []
497  while h:
499  resind = IMP.atom.Residue(h).get_index()
500  pi = h.get_particle_index()
501  seen_parents.append(pi)
502  # If we inspected this node before, return the cached result
503  if pi in self._seen_particles:
504  sp = self._seen_particles[pi]
505  if sp and sp[0] and resind is not None:
506  sp[0]._add_residue(resind)
507  return sp[0] if sp else None
508  else:
509  sp = list(_get_all_structure_provenance(h))
510  self._seen_particles[pi] = []
511  if sp:
512  s = _get_starting_model(sp, resind)
513  # Set cache for this node and all the children we
514  # inspected on the way up
515  for spi in seen_parents:
516  self._seen_particles[spi].append(s)
517  return s
518  h = h.get_parent()
519 
520 
521 class _Datasets:
522  """Store all datasets used."""
523  def __init__(self, system):
524  super().__init__()
525  self._datasets = {}
526  self._groups = {}
527  self.system = system
528 
529  def add(self, d):
530  """Add and return a new dataset."""
531  if d not in self._datasets:
532  self._datasets[d] = d
533  self.system.orphan_datasets.append(d)
534  return self._datasets[d]
535 
536  def add_group(self, datasets, name):
537  """Add and return a new group of datasets"""
538  seen = set()
539  # Remove duplicates
540  d = []
541  for dataset in datasets:
542  if dataset not in seen:
543  d.append(dataset)
544  seen.add(dataset)
545  d = tuple(d)
546  if d not in self._groups:
547  g = ihm.dataset.DatasetGroup(d, name=name)
548  self._groups[d] = g
549  self.system.orphan_dataset_groups.append(g)
550  return self._groups[d]
551 
552  def get_all(self):
553  """Yield all datasets"""
554  return self._datasets.keys()
555 
556 
557 class _AllSoftware:
558  """Keep track of all Software objects."""
559 
560  # IMP/RMF doesn't store citation info for software, so provide it
561  # for known software packages
562  cites = {'Integrative Modeling Platform (IMP)': ihm.citations.imp,
563  'IMP PMI module': ihm.citations.pmi}
564 
565  def __init__(self, system):
566  self.system = system
567  self._by_namever = {}
568  super().__init__()
569 
570  def add_hierarchy(self, h, top_h=None):
571  # todo: if no SoftwareProvenance available, use RMF producer field
572  for p in _get_all_state_provenance(
573  h, top_h, types=[IMP.core.SoftwareProvenance]):
574  self._add_provenance(p)
575 
576  def _add_provenance(self, p):
577  """Add Software from SoftwareProvenance"""
578  # Only reference the same version of a given software package once
579  name = p.get_software_name()
580  version = p.get_version()
581  if (name, version) not in self._by_namever:
582  s = ihm.Software(name=name,
583  classification='integrative model building',
584  description=None, version=version,
585  location=p.get_location(),
586  citation=self.cites.get(name))
587  self.system.software.append(s)
588  self._by_namever[name, version] = s
589  return self._by_namever[name, version]
590 
591  def _add_previous_provenance(self, prov):
592  """Add Software from a previous SoftwareProvenance, if any"""
593  while prov:
595  return self._add_provenance(IMP.core.SoftwareProvenance(prov))
596  prov = prov.get_previous()
597 
598 
599 class _ExternalFiles:
600  """Track all externally-referenced files
601  (i.e. anything that refers to a Location that isn't
602  a DatabaseLocation)."""
603  def __init__(self, system):
604  self.system = system
605  self._by_path = {}
606 
607  def add_hierarchy(self, h, top_h=None):
608  # Add all Python scripts that were used in the modeling
609  for p in _get_all_state_provenance(
610  h, top_h, types=[IMP.core.ScriptProvenance]):
611  self._add_provenance(p)
612 
613  def _add_provenance(self, p):
614  """Add external file from ScriptProvenance"""
615  # Only reference the same path once
616  path = p.get_filename()
617  if path not in self._by_path:
618  loc = ihm.location.WorkflowFileLocation(
619  path=p.get_filename(),
620  details='Integrative modeling Python script')
621  self.system.locations.append(loc)
622  self._by_path[path] = loc
623  return self._by_path[path]
624 
625 
626 class _ProtocolStep(ihm.protocol.Step):
627  """A single step (e.g. sampling, refinement) in a protocol."""
628  def __init__(self, prov, num_models_begin, assembly, all_software):
629  method = prov.get_method()
630  if prov.get_number_of_replicas() > 1:
631  method = "Replica exchange " + method
632  super().__init__(
633  assembly=assembly,
634  # todo: fill in useful value for dataset_group
635  dataset_group=None,
636  method=method, name='Sampling',
637  num_models_begin=num_models_begin,
638  num_models_end=prov.get_number_of_frames(),
639  # todo: support multiple states, time ordered
640  multi_state=False, ordered=False,
641  # todo: revisit assumption all models are multiscale
642  multi_scale=True,
643  software=all_software._add_previous_provenance(prov))
644 
645  def add_combine(self, prov):
646  self.num_models_end = prov.get_number_of_frames()
647  return self.num_models_end
648 
649 
650 class _Protocol(ihm.protocol.Protocol):
651  """A modeling protocol.
652  Each protocol consists of a number of protocol steps (e.g. sampling,
653  refinement) followed by a number of postprocessing steps (e.g.
654  filtering, rescoring, clustering)"""
655 
656  def add_step(self, prov, num_models, assembly, all_software):
657  if isinstance(prov, IMP.core.CombineProvenance):
658  # Fold CombineProvenance into a previous sampling step
659  if len(self.steps) == 0:
660  raise ValueError("CombineProvenance with no previous sampling")
661  return self.steps[-1].add_combine(prov)
662  else:
663  ps = _ProtocolStep(prov, num_models, assembly, all_software)
664  self.steps.append(ps)
665  return ps.num_models_end
666 
667  def add_postproc(self, prov, num_models, assembly):
668  if not self.analyses:
669  self.analyses.append(ihm.analysis.Analysis())
670  if isinstance(prov, IMP.core.FilterProvenance):
671  pp = ihm.analysis.FilterStep(
672  feature='energy/score', assembly=assembly,
673  num_models_begin=num_models,
674  num_models_end=prov.get_number_of_frames())
675  elif isinstance(prov, IMP.core.ClusterProvenance):
676  # Assume clustering uses all models
677  pp = ihm.analysis.ClusterStep(
678  feature='RMSD', assembly=assembly, num_models_begin=num_models,
679  num_models_end=num_models)
680  else:
681  raise ValueError("Unhandled provenance", prov)
682  self.analyses[-1].steps.append(pp)
683  return pp.num_models_end
684 
685 
686 class _Protocols:
687  """Track all modeling protocols used."""
688  def __init__(self, system):
689  self.system = system
690 
691  def _add_protocol(self, prot):
692  # Protocol isn't hashable or sortable, so just compare dicts
693  # with existing protocols. This should still be performant as
694  # we generally don't have more than one or two protocols.
695  # We exclude dataset_group from the comparison as this is typically
696  # filled in later.
697  def step_equal(x, y):
698  def get_dict(d):
699  return {x: y for x, y in d.__dict__.items()
700  if x != 'dataset_group'}
701 
702  return (type(x) == type(y) # noqa: E721
703  and get_dict(x) == get_dict(y))
704 
705  def analysis_equal(x, y):
706  return (len(x.steps) == len(y.steps)
707  and all(step_equal(a, b)
708  for (a, b) in zip(x.steps, y.steps)))
709 
710  for existing in self.system.orphan_protocols:
711  if (len(existing.steps) == len(prot.steps)
712  and len(existing.analyses) == len(prot.analyses)
713  and all(step_equal(x, y)
714  for (x, y) in zip(existing.steps, prot.steps))
715  and all(analysis_equal(x, y)
716  for (x, y) in zip(existing.analyses, prot.analyses))):
717  return existing
718  self.system.orphan_protocols.append(prot)
719  return prot
720 
721  def _add_hierarchy(self, h, top_h, modeled_assembly, all_software):
722  num_models = 0 # assume we always start with no models
725  in_postproc = False
726  prot = _Protocol()
727  for p in reversed(list(_get_all_state_provenance(
728  h, top_h, types=prot_types + pp_types))):
729  if isinstance(p, pp_types):
730  num_models = prot.add_postproc(p, num_models, modeled_assembly)
731  in_postproc = True
732  else:
733  if in_postproc:
734  # Start a new protocol
735  self._add_protocol(prot)
736  prot = _Protocol()
737  num_models = prot.add_step(p, num_models, modeled_assembly,
738  all_software)
739  in_postproc = False
740  if len(prot.steps) > 0:
741  return self._add_protocol(prot)
742 
743 
744 class _CoordinateHandler:
745  def __init__(self, system, datasets):
746  self._system = system
747  self._datasets = datasets
748  self._representation = ihm.representation.Representation()
749  # IHM atoms/spheres corresponding to IMP beads/residues/atoms
750  # We build them up front (rather than on the fly) as the original
751  # IMP objects may have been destroyed or changed (e.g. if we read
752  # multiple frames from an RMF file) by the time we write the mmCIF.
753  self._atoms = []
754  self._spheres = []
755 
756  def get_residue_sequence(self, ps):
757  """Determine the primary sequence based on Residue particles.
758  Return the index of the first residue and the sequence, as a list
759  of ihm.ChemComp objects (or None)"""
760  restyp = {}
761  for p in ps:
762  if isinstance(p, IMP.atom.Atom):
763  residue = IMP.atom.get_residue(p)
764  restyp[residue.get_index()] = residue.get_residue_type()
765  elif isinstance(p, IMP.atom.Residue):
766  restyp[p.get_index()] = p.get_residue_type()
767  else: # fragment
768  resinds = p.get_residue_indexes()
769  for ri in resinds:
770  if ri not in restyp: # don't overwrite residue/atom info
771  restyp[ri] = None
772  if not restyp:
773  return 1, []
774  seq_id_begin = min(restyp.keys())
775  seq_id_end = max(restyp.keys())
776  return (seq_id_begin,
777  [_imp_to_ihm[restyp.get(x)]
778  for x in range(seq_id_begin, seq_id_end + 1)])
779 
780  def add_chain(self, ps, asym):
781  def matches_asym(s):
782  # Match AsymUnit or AsymUnitRange
783  return s == asym or hasattr(s, 'asym') and s.asym == asym
784 
785  # Consolidate starting models if the same model was used for this
786  # asym in a different state or for a different model_id
787  smf = _StartingModelFinder(
788  asym, [s for s in self._system.orphan_starting_models
789  if matches_asym(s.asym_unit)],
790  self._system, self._datasets)
791  segfactory = _RepSegmentFactory(asym)
792  offset = asym.auth_seq_id_map
793  for p in ps:
794  starting_model = smf.find(p)
795  seg = segfactory.add(p, starting_model)
796  if seg:
797  self._representation.append(seg)
798  self._add_atom_or_sphere(p, asym, offset)
799  last = segfactory.get_last()
800  if last:
801  self._representation.append(last)
802 
803  def _add_atom_or_sphere(self, p, asym, offset):
804  if isinstance(p, IMP.atom.Atom):
805  residue = IMP.atom.get_residue(p)
806  xyz = IMP.core.XYZ(p).get_coordinates()
807  element = p.get_element()
808  element = IMP.atom.get_element_table().get_name(element)
809  atom_name = p.get_atom_type().get_string()
810  het = atom_name.startswith('HET:')
811  if het:
812  atom_name = atom_name[4:]
813  self._atoms.append(ihm.model.Atom(
814  asym_unit=asym, seq_id=residue.get_index() - offset,
815  atom_id=atom_name, type_symbol=element,
816  x=xyz[0], y=xyz[1], z=xyz[2], het=het,
817  biso=p.get_temperature_factor(),
818  occupancy=p.get_occupancy()))
819  else:
820  if isinstance(p, IMP.atom.Fragment):
821  resinds = p.get_residue_indexes()
822  sbegin = resinds[0]
823  send = resinds[-1]
824  else: # residue
825  sbegin = send = p.get_index()
826  xyzr = IMP.core.XYZR(p)
827  xyz = xyzr.get_coordinates()
828  self._spheres.append(ihm.model.Sphere(
829  asym_unit=asym, seq_id_range=(sbegin - offset, send - offset),
830  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius()))
831 
832  def get_structure_particles(self, h):
833  """Return particles sorted by residue index"""
834  ps = []
835  if h.get_number_of_children() == 0:
836  return []
837  if not h.get_is_valid():
838  raise ValueError("Invalid hierarchy as input")
839  for p in IMP.atom.Selection(
840  hierarchy=h, resolution=0.).get_selected_particles():
842  residue = IMP.atom.Residue(p)
843  ps.append((residue.get_index(), residue))
845  fragment = IMP.atom.Fragment(p)
846  resinds = fragment.get_residue_indexes()
847  _check_sequential(fragment, resinds)
848  resind = resinds[len(resinds) // 2]
849  ps.append((resind, fragment))
851  atom = IMP.atom.Atom(p)
852  residue = IMP.atom.get_residue(atom)
853  ps.append((residue.get_index(), atom))
854  return [p[1] for p in sorted(ps, key=operator.itemgetter(0))]
855 
856 
857 class _ModelAssemblies:
858  def __init__(self, system):
859  self.system = system
860  self._seen_assemblies = {}
861 
862  def add(self, asyms):
863  # list isn't hashable but tuple is
864  asyms = tuple(asyms)
865  if asyms not in self._seen_assemblies:
866  assembly = ihm.Assembly(
867  asyms, name="Modeled assembly",
868  description="All components modeled by IMP")
869  self.system.orphan_assemblies.append(assembly)
870  self._seen_assemblies[asyms] = assembly
871  return self._seen_assemblies[asyms]
872 
873 
874 class _Representations:
875  def __init__(self, system):
876  self.system = system
877 
878  def add(self, rep):
879  # Representation isn't hashable or sortable, so just compare dicts
880  # with existing representations. This should still be performant as
881  # we generally don't have more than one or two representations.
882  for existing in self.system.orphan_representations:
883  if (len(existing) == len(rep)
884  and all(type(x) == type(y) # noqa: E721
885  and x.__dict__ == y.__dict__
886  for (x, y) in zip(existing, rep))):
887  return existing
888  self.system.orphan_representations.append(rep)
889  return rep
Select non water and non hydrogen atoms.
Definition: pdb.h:314
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
Hierarchy read_pdb_or_mmcif(TextInput input, Model *model, PDBSelector *selector=get_default_pdb_selector(), bool select_first_model=true)
Read all the molecules in the first model of the PDB or mmCIF file.
Definition: mmcif.h:40
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:158
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:562
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: atom/Atom.h:245
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:44
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
Track creation of a system fragment by combination.
Definition: provenance.h:280
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:238
Track creation of a system fragment by filtering.
Definition: provenance.h:340
The type for a residue.
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:137
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: provenance.h:584
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:256
Track creation of a system fragment from running a script.
Definition: provenance.h:511
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27