IMP logo
IMP Reference Guide  develop.298718d020,2025/02/22
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 import IMP.atom
6 import ihm.location
7 import ihm.metadata
8 import ihm.startmodel
9 import ihm.analysis
10 import ihm.protocol
11 import ihm.model
12 import ihm.citations
13 import ihm.reference
14 import operator
15 import inspect
16 import warnings
17 
18 
19 # Map from IMP ResidueType to ihm ChemComp
20 _imp_to_ihm = {}
21 
22 
23 def _fill_imp_to_ihm():
24  d = dict(x for x in inspect.getmembers(IMP.atom)
25  if isinstance(x[1], IMP.atom.ResidueType))
26  # Handle standard amino acids plus some extras like MSE, UNK
27  for comp in ihm.LPeptideAlphabet._comps.values():
28  if comp.id in d:
29  _imp_to_ihm[d[comp.id]] = comp
30  # Handle RNA and DNA
31  alpha = ihm.RNAAlphabet()
32  for code in ['ADE', 'CYT', 'GUA', 'URA']:
33  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
34  alpha = ihm.DNAAlphabet()
35  for code in ['DADE', 'DCYT', 'DGUA', 'DTHY']:
36  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
37  # Pass through missing IMP residue
38  _imp_to_ihm[None] = None
39 
40 
41 _fill_imp_to_ihm()
42 
43 
44 def get_molecule(h):
45  """Given a Hierarchy, walk up and find the parent Molecule"""
46  while h:
48  return IMP.atom.Molecule(h)
49  h = h.get_parent()
50  return None
51 
52 
53 def _check_sequential(fragment, resinds):
54  for i in range(1, len(resinds)):
55  if resinds[i - 1] + 1 != resinds[i]:
56  warnings.warn(
57  "%s: non-sequential residue indices; mmCIF bead will cover "
58  "indices [%d-%d]"
59  % (str(fragment), resinds[0], resinds[-1]))
60 
61 
62 def _get_all_state_provenance(state_h, top_h, types):
63  """Yield all provenance information for the given state.
64  If the given State Hierarchy node contains no provenance information,
65  fall back to any provenance information for the top-level node
66  (if provided)."""
67  count = 0
68  for p in IMP.core.get_all_provenance(state_h, types=types):
69  count += 1
70  yield p
71  if count == 0 and top_h is not None:
72  for p in IMP.core.get_all_provenance(top_h, types=types):
73  yield p
74 
75 
76 class _CustomDNAAlphabet(ihm.Alphabet):
77  """Custom DNA alphabet that maps A,C,G,T (rather than DA,DC,DG,DT
78  as in python-ihm)"""
79  _comps = dict([cc.code_canonical, cc]
80  for cc in ihm.DNAAlphabet._comps.values())
81 
82 
83 class _EntityMapper(dict):
84  """Handle mapping from IMP chains to CIF entities.
85  Multiple components may map to the same entity if they
86  share sequence."""
87  def __init__(self, system):
88  self.system = system
89  super().__init__()
90  self._sequence_dict = {}
91  self._entities = []
92  self._alphabet_map = {
93  IMP.atom.UnknownChainType: ihm.LPeptideAlphabet,
94  IMP.atom.Protein: ihm.LPeptideAlphabet,
95  IMP.atom.RNA: ihm.RNAAlphabet,
96  IMP.atom.DNA: _CustomDNAAlphabet}
97 
98  def _get_sequence_from_residues(self, chain, seq_from_res):
99  seq_id_begin, seq = seq_from_res
100  if not seq:
101  raise ValueError("Chain %s has no sequence and no residues"
102  % chain)
103  missing_seq = [ind + seq_id_begin
104  for (ind, res) in enumerate(seq) if res is None]
105  if missing_seq:
106  raise ValueError(
107  "Chain %s has no declared sequence; tried to determine the "
108  "sequence from Residues, but the following residue indices "
109  "have no residue type (perhaps covered only by Fragments): %s"
110  % (chain, str(missing_seq)))
111  return seq_id_begin - 1, tuple(seq)
112 
113  def add(self, chain, seq_from_res=None):
114  sequence = chain.get_sequence()
115  offset = chain.get_sequence_offset()
116  if sequence == '':
117  if seq_from_res is not None:
118  offset, sequence = self._get_sequence_from_residues(
119  chain, seq_from_res)
120  else:
121  raise ValueError("Chain %s has no sequence" % chain)
122  else:
123  # Map one-letter codes to ihm.ChemComp
124  alphabet = self._alphabet_map[chain.get_chain_type()]()
125  sequence = tuple(alphabet[x] for x in sequence)
126  if sequence not in self._sequence_dict:
127  entity = ihm.Entity(sequence)
128  self.system.entities.append(entity)
129  self._entities.append(entity)
130  self._sequence_dict[sequence] = entity
131  uniprot = chain.get_uniprot_accession()
132  if uniprot:
133  up = ihm.reference.UniProtSequence.from_accession(uniprot)
134  entity.references.append(up)
135  self[chain] = self._sequence_dict[sequence]
136  return self[chain], offset
137 
138  def get_all(self):
139  """Yield all entities"""
140  return self._entities
141 
142 
143 def _assign_id(obj, seen_objs, obj_by_id):
144  """Assign a unique ID to obj, and track all ids in obj_by_id."""
145  if obj not in seen_objs:
146  if not hasattr(obj, 'id'):
147  obj_by_id.append(obj)
148  obj.id = len(obj_by_id)
149  seen_objs[obj] = obj.id
150  else:
151  obj.id = seen_objs[obj]
152 
153 
154 class _Component:
155  """An mmCIF component. This is an instance of an _Entity. Multiple
156  _Components may map to the same _Entity but must have unique
157  asym_ids. A _Component is similar to an IMP Chain but multiple
158  Chains may map to the same _Component (the Chains represent the
159  same structure, just in different states, and potentially in
160  different IMP Models). A _Component may also represent something
161  that is described by an experiment but was not modeled by IMP, and
162  so no Chains map to it but a string name does."""
163  def __init__(self, entity, asym_id, name):
164  self.entity, self.asym_id, self.name = entity, asym_id, name
165 
166 
167 class _ComponentMapper:
168  """Handle mapping from IMP Chains to CIF AsymUnits."""
169  def __init__(self, system):
170  super().__init__()
171  self.system = system
172  self._used_entities = set()
173  self._all_components = []
174  self._map = {}
175 
176  def __getitem__(self, chain):
177  asym_id, map_key, name = self._handle_chain(chain)
178  return self._map[map_key]
179 
180  def _handle_chain(self, chain):
181  mol = get_molecule(chain)
182  asym_id = chain.get_id()
183  name = mol.get_name() if mol else None
184  # Avoid conflict between name="A" and asym_id="A"
185  if name:
186  map_key = "name", name
187  else:
188  map_key = "asym_id", asym_id
189  return asym_id, map_key, name
190 
191  def add(self, chain, entity, offset):
192  """Add a chain (an IMP Chain object)"""
193  asym_id, map_key, name = self._handle_chain(chain)
194  if map_key not in self._map:
195  component = _Component(entity, asym_id, name)
196  if entity not in self._used_entities:
197  self._used_entities.add(entity)
198  # Assign entity name from the component; strip out anything
199  # after a @ or .
200  if component.name:
201  entity.description = \
202  component.name.split("@")[0].split(".")[0]
203  self._all_components.append(component)
204  asym = ihm.AsymUnit(entity, name, id=asym_id,
205  auth_seq_id_map=offset)
206  self.system.asym_units.append(asym)
207  component.asym_unit = asym
208  self._map[map_key] = component
209  else:
210  component = self._map[map_key]
211  if component.entity != entity:
212  raise ValueError("Two chains have the same ID (%s) but "
213  "different sequences - rename one of the "
214  "chains" % component.asym_unit.id)
215  if component.asym_unit.auth_seq_id_map != offset:
216  raise ValueError(
217  "Two chains have the same ID (%s) but different offsets "
218  "(%d, %d) - this is not supported"
219  % (component.asym_unit.id,
220  component.asym_unit.auth_seq_id_map, offset))
221  return component
222 
223  def get_all(self):
224  """Get all components"""
225  return self._all_components
226 
227 
228 class _RepSegmentFactory:
229  """Make ihm.representation.Segment objects for each set of contiguous
230  particles with the same representation"""
231  def __init__(self, asym):
232  self.asym = asym
233  # Offset from IHM to IMP numbering
234  self.offset = asym.auth_seq_id_map
235  self.particles = []
236  self.imp_residue_range = () # inclusive range, using IMP numbering
237 
238  def add(self, particle, starting_model):
239  """Add a new particle to the last segment (and return None).
240  Iff the particle could not be added, return the segment and start
241  a new one."""
242  (resrange, rigid_body,
243  is_res, is_atom) = self._get_particle_info(particle)
244 
245  def start_new_segment():
246  self.particles = [particle]
247  self.imp_residue_range = resrange
248  self.rigid_body = rigid_body
249  self.is_res = is_res
250  self.is_atom = is_atom
251  self.starting_model = starting_model
252  if not self.particles:
253  # First particle in a segment
254  start_new_segment()
255  elif (type(particle) == type(self.particles[0]) # noqa: E721
256  and is_res == self.is_res
257  and is_atom == self.is_atom
258  and resrange[0] <= self.imp_residue_range[1] + 1
259  and starting_model == self.starting_model
260  and self._same_rigid_body(rigid_body)):
261  # Continue an existing segment
262  self.particles.append(particle)
263  self.imp_residue_range = (self.imp_residue_range[0], resrange[1])
264  else:
265  # Make a new segment
266  seg = self.get_last()
267  start_new_segment()
268  return seg
269 
270  def get_last(self):
271  """Return the last segment, or None"""
272  if self.particles:
273  # Convert residue_range from IMP to IHM
274  asym = self.asym(self.imp_residue_range[0] - self.offset,
275  self.imp_residue_range[1] - self.offset)
276  if self.is_atom:
277  return ihm.representation.AtomicSegment(
278  asym_unit=asym, rigid=self.rigid_body is not None,
279  starting_model=self.starting_model)
280  elif self.is_res:
281  return ihm.representation.ResidueSegment(
282  asym_unit=asym,
283  rigid=self.rigid_body is not None, primitive='sphere',
284  starting_model=self.starting_model)
285  else:
286  return ihm.representation.FeatureSegment(
287  asym_unit=asym,
288  rigid=self.rigid_body is not None, primitive='sphere',
289  count=len(self.particles),
290  starting_model=self.starting_model)
291 
292  def _same_rigid_body(self, rigid_body):
293  # Note: can't just use self.rigid_body == rigid_body as IMP may
294  # crash when comparing a RigidBody object against None
295  if self.rigid_body is None and rigid_body is None:
296  return True
297  elif self.rigid_body is None or rigid_body is None:
298  return False
299  else:
300  return self.rigid_body == rigid_body
301 
302  def _get_particle_info(self, p):
303  # Note that we consider nonrigid members to not be rigid here
305  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
306  else:
307  rigid_body = None
308  if isinstance(p, IMP.atom.Residue):
309  return (p.get_index(), p.get_index()), rigid_body, True, False
310  elif isinstance(p, IMP.atom.Atom):
311  res = IMP.atom.get_residue(p)
312  return (res.get_index(), res.get_index()), rigid_body, False, True
313  elif isinstance(p, IMP.atom.Fragment):
314  resinds = p.get_residue_indexes()
315  return (resinds[0], resinds[-1]), rigid_body, False, False
316  raise TypeError("Unknown particle ", p)
317 
318 
319 def _get_all_structure_provenance(p):
320  """Yield all StructureProvenance decorators for the given particle."""
322 
323 
324 class _StartingModelAtomHandler:
325  def __init__(self, templates, asym):
326  self._seq_dif = []
327  self._last_res_index = None
328  self.templates = templates
329  self.asym = asym
330 
331  def _residue_first_atom(self, res):
332  """Return True iff we're looking at the first atom in this residue"""
333  # Only add one seq_dif record per residue
334  ind = res.get_index()
335  if ind != self._last_res_index:
336  self._last_res_index = ind
337  return True
338 
339  def handle_residue(self, res, comp_id, seq_id, offset):
340  res_name = res.get_residue_type().get_string()
341  # MSE in the original PDB is automatically mutated
342  # by IMP to MET, so reflect that in the output,
343  # and pass back to populate the seq_dif category.
344  if res_name == 'MSE' and comp_id == 'MET':
345  if self._residue_first_atom(res):
346  # This should only happen when we're using
347  # a crystal structure as the source (a
348  # comparative model would use MET in
349  # the sequence)
350  assert len(self.templates) == 0
351  self._seq_dif.append(ihm.startmodel.MSESeqDif(
352  res.get_index(), seq_id))
353  elif res_name != comp_id:
354  if self._residue_first_atom(res):
355  print("WARNING: Starting model residue %s does not match "
356  "that in the output model (%s) for chain %s residue %d. "
357  "Check offset (currently %d)."
358  % (res_name, comp_id, self.asym._id, seq_id, offset))
359  self._seq_dif.append(ihm.startmodel.SeqDif(
360  db_seq_id=res.get_index(), seq_id=seq_id,
361  db_comp_id=res_name,
362  details="Mutation of %s to %s" % (res_name, comp_id)))
363 
364  def get_ihm_atoms(self, particles, offset):
365  for a in particles:
366  coord = IMP.core.XYZ(a).get_coordinates()
367  atom = IMP.atom.Atom(a)
368  element = atom.get_element()
369  element = IMP.atom.get_element_table().get_name(element)
370  atom_name = atom.get_atom_type().get_string()
371  het = atom_name.startswith('HET:')
372  if het:
373  atom_name = atom_name[4:]
374  res = IMP.atom.get_residue(atom)
375 
376  seq_id = res.get_index() + offset
377  comp_id = self.asym.entity.sequence[seq_id-1].id
378  self.handle_residue(res, comp_id, seq_id, offset)
379  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
380  atom_id=atom_name, type_symbol=element,
381  x=coord[0], y=coord[1], z=coord[2],
382  het=het, biso=atom.get_temperature_factor())
383 
384 
385 class _StartingModel(ihm.startmodel.StartingModel):
386  _eq_keys = ['filename', 'asym_id', 'offset']
387 
388  def __init__(self, asym_unit, struc_prov):
389  self.filename = struc_prov[0].get_filename()
390  super().__init__(
391  asym_unit=asym_unit(1, 1), # will update in _add_residue()
392  # will fill in later with _set_sources_datasets()
393  dataset=None,
394  asym_id=struc_prov[0].get_chain_id(),
395  offset=struc_prov[0].get_residue_offset())
396 
397  def _add_residue(self, resind):
398  # Update seq_id_range to accommodate this residue
399  seq_id_end = resind
400  seq_id_begin = self.asym_unit.seq_id_range[0]
401  if seq_id_begin == 0:
402  seq_id_begin = seq_id_end
403  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
404 
405  # Two starting models with same filename, chain ID, and offset
406  # compare identical
407  # note: this results in separate starting models if only the
408  # offset differs; maybe consolidate into one?
409  def _eq_vals(self):
410  return tuple([self.__class__]
411  + [getattr(self, x) for x in self._eq_keys])
412 
413  def __eq__(self, other):
414  return other is not None and self._eq_vals() == other._eq_vals()
415 
416  def __hash__(self):
417  return hash(self._eq_vals())
418 
419  def _set_sources_datasets(self, system, datasets):
420  # Attempt to identify PDB file vs. comparative model
421  if (hasattr(ihm.metadata, 'CIFParser')
422  and self.filename.endswith('.cif')):
423  p = ihm.metadata.CIFParser()
424  elif (hasattr(ihm.metadata, 'BinaryCIFParser')
425  and self.filename.endswith('.bcif')):
426  p = ihm.metadata.BinaryCIFParser()
427  else:
428  p = ihm.metadata.PDBParser()
429  r = p.parse_file(self.filename)
430  system.software.extend(r.get('software', []))
431  dataset = datasets.add(r['dataset'])
432  # We only want the templates that model the starting model chain
433  templates = r.get('templates', {}).get(self.asym_id, [])
434  for t in templates:
435  if t.alignment_file:
436  system.locations.append(t.alignment_file)
437  if t.dataset:
438  datasets.add(t.dataset)
439  self.dataset = dataset
440  self.templates = templates
441  self.metadata = r.get('metadata', [])
442 
443  def _read_coords(self):
444  """Read the coordinates for this starting model"""
445  m = IMP.Model()
446  # todo: support reading other subsets of the atoms (e.g. CA/CB)
447  slt = IMP.atom.ChainPDBSelector([self.asym_id]) \
449  hier = IMP.atom.read_pdb_any(self.filename, m, slt)
450  rng = self.asym_unit.seq_id_range
451  sel = IMP.atom.Selection(
452  hier, residue_indexes=list(range(rng[0] - self.offset,
453  rng[1] + 1 - self.offset)))
454  return m, sel
455 
456  def get_seq_dif(self):
457  return self._seq_dif # filled in by get_atoms()
458 
459  def get_atoms(self):
460  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
461  m, sel = self._read_coords()
462  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
463  yield a
464  self._seq_dif = mh._seq_dif
465 
466 
467 class _StartingModelFinder:
468  """Map IMP particles to starting model objects"""
469  def __init__(self, asym, existing_starting_models, system, datasets):
470  self._seen_particles = {}
471  self._asym = asym
472  self._seen_starting_models = {}
473  for sm in existing_starting_models:
474  self._seen_starting_models[sm] = sm
475  self._system = system
476  self._datasets = datasets
477 
478  def find(self, particle):
479  """Return a StartingModel object, or None, for this particle"""
480  def _get_starting_model(sp, resind):
481  s = _StartingModel(self._asym, sp)
482  if s not in self._seen_starting_models:
483  self._seen_starting_models[s] = s
484  s._set_sources_datasets(self._system, self._datasets)
485  self._system.orphan_starting_models.append(s)
486  s = self._seen_starting_models[s]
487  if s:
488  s._add_residue(resind)
489  return s
490  resind = None
491  if IMP.atom.Residue.get_is_setup(particle):
492  resind = IMP.atom.Residue(particle).get_index()
493  sp = list(_get_all_structure_provenance(particle))
494  if sp:
495  return _get_starting_model(sp, resind)
496  elif IMP.atom.Hierarchy.get_is_setup(particle):
497  h = IMP.atom.Hierarchy(particle).get_parent()
498  # Remember all nodes we inspect
499  seen_parents = []
500  while h:
502  resind = IMP.atom.Residue(h).get_index()
503  pi = h.get_particle_index()
504  seen_parents.append(pi)
505  # If we inspected this node before, return the cached result
506  if pi in self._seen_particles:
507  sp = self._seen_particles[pi]
508  if sp and sp[0] and resind is not None:
509  sp[0]._add_residue(resind)
510  return sp[0] if sp else None
511  else:
512  sp = list(_get_all_structure_provenance(h))
513  self._seen_particles[pi] = []
514  if sp:
515  s = _get_starting_model(sp, resind)
516  # Set cache for this node and all the children we
517  # inspected on the way up
518  for spi in seen_parents:
519  self._seen_particles[spi].append(s)
520  return s
521  h = h.get_parent()
522 
523 
524 class _Datasets:
525  """Store all datasets used."""
526  def __init__(self, system):
527  super().__init__()
528  self._datasets = {}
529  self._groups = {}
530  self.system = system
531 
532  def add(self, d):
533  """Add and return a new dataset."""
534  if d not in self._datasets:
535  self._datasets[d] = d
536  self.system.orphan_datasets.append(d)
537  return self._datasets[d]
538 
539  def add_group(self, datasets, name):
540  """Add and return a new group of datasets"""
541  seen = set()
542  # Remove duplicates
543  d = []
544  for dataset in datasets:
545  if dataset not in seen:
546  d.append(dataset)
547  seen.add(dataset)
548  d = tuple(d)
549  if d not in self._groups:
550  g = ihm.dataset.DatasetGroup(d, name=name)
551  self._groups[d] = g
552  self.system.orphan_dataset_groups.append(g)
553  return self._groups[d]
554 
555  def get_all(self):
556  """Yield all datasets"""
557  return self._datasets.keys()
558 
559 
560 class _AllSoftware:
561  """Keep track of all Software objects."""
562 
563  # IMP/RMF doesn't store citation info for software, so provide it
564  # for known software packages
565  cites = {'Integrative Modeling Platform (IMP)': ihm.citations.imp,
566  'IMP PMI module': ihm.citations.pmi}
567 
568  def __init__(self, system):
569  self.system = system
570  self._by_namever = {}
571  super().__init__()
572 
573  def add_hierarchy(self, h, top_h=None):
574  # todo: if no SoftwareProvenance available, use RMF producer field
575  for p in _get_all_state_provenance(
576  h, top_h, types=[IMP.core.SoftwareProvenance]):
577  self._add_provenance(p)
578 
579  def _add_provenance(self, p):
580  """Add Software from SoftwareProvenance"""
581  # Only reference the same version of a given software package once
582  name = p.get_software_name()
583  version = p.get_version()
584  if (name, version) not in self._by_namever:
585  s = ihm.Software(name=name,
586  classification='integrative model building',
587  description=None, version=version,
588  location=p.get_location(),
589  citation=self.cites.get(name))
590  self.system.software.append(s)
591  self._by_namever[name, version] = s
592  return self._by_namever[name, version]
593 
594  def _add_previous_provenance(self, prov):
595  """Add Software from a previous SoftwareProvenance, if any"""
596  while prov:
598  return self._add_provenance(IMP.core.SoftwareProvenance(prov))
599  prov = prov.get_previous()
600 
601 
602 class _ExternalFiles:
603  """Track all externally-referenced files
604  (i.e. anything that refers to a Location that isn't
605  a DatabaseLocation)."""
606  def __init__(self, system):
607  self.system = system
608  self._by_path = {}
609 
610  def add_hierarchy(self, h, top_h=None):
611  # Add all Python scripts that were used in the modeling
612  for p in _get_all_state_provenance(
613  h, top_h, types=[IMP.core.ScriptProvenance]):
614  self._add_provenance(p)
615 
616  def _add_provenance(self, p):
617  """Add external file from ScriptProvenance"""
618  # Only reference the same path once
619  path = p.get_filename()
620  if path not in self._by_path:
621  loc = ihm.location.WorkflowFileLocation(
622  path=p.get_filename(),
623  details='Integrative modeling Python script')
624  self.system.locations.append(loc)
625  self._by_path[path] = loc
626  return self._by_path[path]
627 
628 
629 class _ProtocolStep(ihm.protocol.Step):
630  """A single step (e.g. sampling, refinement) in a protocol."""
631  def __init__(self, prov, num_models_begin, assembly, all_software):
632  method = prov.get_method()
633  if prov.get_number_of_replicas() > 1:
634  method = "Replica exchange " + method
635  super().__init__(
636  assembly=assembly,
637  # todo: fill in useful value for dataset_group
638  dataset_group=None,
639  method=method, name='Sampling',
640  num_models_begin=num_models_begin,
641  num_models_end=prov.get_number_of_frames(),
642  # todo: support multiple states, time ordered
643  multi_state=False, ordered=False,
644  # todo: revisit assumption all models are multiscale
645  multi_scale=True,
646  software=all_software._add_previous_provenance(prov))
647 
648  def add_combine(self, prov):
649  self.num_models_end = prov.get_number_of_frames()
650  return self.num_models_end
651 
652 
653 class _Protocol(ihm.protocol.Protocol):
654  """A modeling protocol.
655  Each protocol consists of a number of protocol steps (e.g. sampling,
656  refinement) followed by a number of postprocessing steps (e.g.
657  filtering, rescoring, clustering)"""
658 
659  def add_step(self, prov, num_models, assembly, all_software):
660  if isinstance(prov, IMP.core.CombineProvenance):
661  # Fold CombineProvenance into a previous sampling step
662  if len(self.steps) == 0:
663  raise ValueError("CombineProvenance with no previous sampling")
664  return self.steps[-1].add_combine(prov)
665  else:
666  ps = _ProtocolStep(prov, num_models, assembly, all_software)
667  self.steps.append(ps)
668  return ps.num_models_end
669 
670  def add_postproc(self, prov, num_models, assembly):
671  if not self.analyses:
672  self.analyses.append(ihm.analysis.Analysis())
673  if isinstance(prov, IMP.core.FilterProvenance):
674  pp = ihm.analysis.FilterStep(
675  feature='energy/score', assembly=assembly,
676  num_models_begin=num_models,
677  num_models_end=prov.get_number_of_frames())
678  elif isinstance(prov, IMP.core.ClusterProvenance):
679  # Assume clustering uses all models
680  pp = ihm.analysis.ClusterStep(
681  feature='RMSD', assembly=assembly, num_models_begin=num_models,
682  num_models_end=num_models)
683  else:
684  raise ValueError("Unhandled provenance", prov)
685  self.analyses[-1].steps.append(pp)
686  return pp.num_models_end
687 
688 
689 class _Protocols:
690  """Track all modeling protocols used."""
691  def __init__(self, system):
692  self.system = system
693 
694  def _add_protocol(self, prot):
695  # Protocol isn't hashable or sortable, so just compare dicts
696  # with existing protocols. This should still be performant as
697  # we generally don't have more than one or two protocols.
698  # We exclude dataset_group from the comparison as this is typically
699  # filled in later.
700  def step_equal(x, y):
701  def get_dict(d):
702  return {x: y for x, y in d.__dict__.items()
703  if x != 'dataset_group'}
704 
705  return (type(x) == type(y) # noqa: E721
706  and get_dict(x) == get_dict(y))
707 
708  def analysis_equal(x, y):
709  return (len(x.steps) == len(y.steps)
710  and all(step_equal(a, b)
711  for (a, b) in zip(x.steps, y.steps)))
712 
713  for existing in self.system.orphan_protocols:
714  if (len(existing.steps) == len(prot.steps)
715  and len(existing.analyses) == len(prot.analyses)
716  and all(step_equal(x, y)
717  for (x, y) in zip(existing.steps, prot.steps))
718  and all(analysis_equal(x, y)
719  for (x, y) in zip(existing.analyses, prot.analyses))):
720  return existing
721  self.system.orphan_protocols.append(prot)
722  return prot
723 
724  def _add_hierarchy(self, h, top_h, modeled_assembly, all_software):
725  num_models = 0 # assume we always start with no models
728  in_postproc = False
729  prot = _Protocol()
730  for p in reversed(list(_get_all_state_provenance(
731  h, top_h, types=prot_types + pp_types))):
732  if isinstance(p, pp_types):
733  num_models = prot.add_postproc(p, num_models, modeled_assembly)
734  in_postproc = True
735  else:
736  if in_postproc:
737  # Start a new protocol
738  self._add_protocol(prot)
739  prot = _Protocol()
740  num_models = prot.add_step(p, num_models, modeled_assembly,
741  all_software)
742  in_postproc = False
743  if len(prot.steps) > 0:
744  return self._add_protocol(prot)
745 
746 
747 class _CoordinateHandler:
748  def __init__(self, system, datasets):
749  self._system = system
750  self._datasets = datasets
751  self._representation = ihm.representation.Representation()
752  # IHM atoms/spheres corresponding to IMP beads/residues/atoms
753  # We build them up front (rather than on the fly) as the original
754  # IMP objects may have been destroyed or changed (e.g. if we read
755  # multiple frames from an RMF file) by the time we write the mmCIF.
756  self._atoms = []
757  self._spheres = []
758 
759  def get_residue_sequence(self, ps):
760  """Determine the primary sequence based on Residue particles.
761  Return the index of the first residue and the sequence, as a list
762  of ihm.ChemComp objects (or None)"""
763  restyp = {}
764  for p in ps:
765  if isinstance(p, IMP.atom.Atom):
766  residue = IMP.atom.get_residue(p)
767  restyp[residue.get_index()] = residue.get_residue_type()
768  elif isinstance(p, IMP.atom.Residue):
769  restyp[p.get_index()] = p.get_residue_type()
770  else: # fragment
771  resinds = p.get_residue_indexes()
772  for ri in resinds:
773  if ri not in restyp: # don't overwrite residue/atom info
774  restyp[ri] = None
775  if not restyp:
776  return 1, []
777  seq_id_begin = min(restyp.keys())
778  seq_id_end = max(restyp.keys())
779  return (seq_id_begin,
780  [_imp_to_ihm[restyp.get(x)]
781  for x in range(seq_id_begin, seq_id_end + 1)])
782 
783  def add_chain(self, ps, asym):
784  def matches_asym(s):
785  # Match AsymUnit or AsymUnitRange
786  return s == asym or hasattr(s, 'asym') and s.asym == asym
787 
788  # Consolidate starting models if the same model was used for this
789  # asym in a different state or for a different model_id
790  smf = _StartingModelFinder(
791  asym, [s for s in self._system.orphan_starting_models
792  if matches_asym(s.asym_unit)],
793  self._system, self._datasets)
794  segfactory = _RepSegmentFactory(asym)
795  offset = asym.auth_seq_id_map
796  for p in ps:
797  starting_model = smf.find(p)
798  seg = segfactory.add(p, starting_model)
799  if seg:
800  self._representation.append(seg)
801  self._add_atom_or_sphere(p, asym, offset)
802  last = segfactory.get_last()
803  if last:
804  self._representation.append(last)
805 
806  def _add_atom_or_sphere(self, p, asym, offset):
807  if isinstance(p, IMP.atom.Atom):
808  residue = IMP.atom.get_residue(p)
809  xyz = IMP.core.XYZ(p).get_coordinates()
810  element = p.get_element()
811  element = IMP.atom.get_element_table().get_name(element)
812  atom_name = p.get_atom_type().get_string()
813  het = atom_name.startswith('HET:')
814  if het:
815  atom_name = atom_name[4:]
816  self._atoms.append(ihm.model.Atom(
817  asym_unit=asym, seq_id=residue.get_index() - offset,
818  atom_id=atom_name, type_symbol=element,
819  x=xyz[0], y=xyz[1], z=xyz[2], het=het,
820  biso=p.get_temperature_factor(),
821  occupancy=p.get_occupancy()))
822  else:
823  if isinstance(p, IMP.atom.Fragment):
824  resinds = p.get_residue_indexes()
825  sbegin = resinds[0]
826  send = resinds[-1]
827  else: # residue
828  sbegin = send = p.get_index()
829  xyzr = IMP.core.XYZR(p)
830  xyz = xyzr.get_coordinates()
831  self._spheres.append(ihm.model.Sphere(
832  asym_unit=asym, seq_id_range=(sbegin - offset, send - offset),
833  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius()))
834 
835  def get_structure_particles(self, h):
836  """Return particles sorted by residue index"""
837  ps = []
838  if h.get_number_of_children() == 0:
839  return []
840  if not h.get_is_valid():
841  raise ValueError("Invalid hierarchy as input")
842  for p in IMP.atom.Selection(
843  hierarchy=h, resolution=0.).get_selected_particles():
845  residue = IMP.atom.Residue(p)
846  ps.append((residue.get_index(), residue))
848  fragment = IMP.atom.Fragment(p)
849  resinds = fragment.get_residue_indexes()
850  _check_sequential(fragment, resinds)
851  resind = resinds[len(resinds) // 2]
852  ps.append((resind, fragment))
854  atom = IMP.atom.Atom(p)
855  residue = IMP.atom.get_residue(atom)
856  ps.append((residue.get_index(), atom))
857  return [p[1] for p in sorted(ps, key=operator.itemgetter(0))]
858 
859 
860 class _ModelAssemblies:
861  def __init__(self, system):
862  self.system = system
863  self._seen_assemblies = {}
864 
865  def add(self, asyms):
866  # list isn't hashable but tuple is
867  asyms = tuple(asyms)
868  if asyms not in self._seen_assemblies:
869  assembly = ihm.Assembly(
870  asyms, name="Modeled assembly",
871  description="All components modeled by IMP")
872  self.system.orphan_assemblies.append(assembly)
873  self._seen_assemblies[asyms] = assembly
874  return self._seen_assemblies[asyms]
875 
876 
877 class _Representations:
878  def __init__(self, system):
879  self.system = system
880 
881  def add(self, rep):
882  # Representation isn't hashable or sortable, so just compare dicts
883  # with existing representations. This should still be performant as
884  # we generally don't have more than one or two representations.
885  for existing in self.system.orphan_representations:
886  if (len(existing) == len(rep)
887  and all(type(x) == type(y) # noqa: E721
888  and x.__dict__ == y.__dict__
889  for (x, y) in zip(existing, rep))):
890  return existing
891  self.system.orphan_representations.append(rep)
892  return rep
Select non water and non hydrogen atoms.
Definition: pdb.h:314
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:158
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:562
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: atom/Atom.h:245
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:44
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
Track creation of a system fragment by combination.
Definition: provenance.h:280
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
Hierarchy read_pdb_any(TextInput input, Model *model, PDBSelector *selector=get_default_pdb_selector(), bool select_first_model=true)
Read all the molecules in the first model of the PDB-like file.
Definition: mmcif.h:61
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:238
Track creation of a system fragment by filtering.
Definition: provenance.h:340
The type for a residue.
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:137
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: provenance.h:584
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:256
Track creation of a system fragment from running a script.
Definition: provenance.h:511
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27