IMP logo
IMP Reference Guide  develop.5651aa123e,2024/07/26
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 import IMP.atom
6 import ihm.location
7 import ihm.metadata
8 import ihm.startmodel
9 import ihm.analysis
10 import ihm.protocol
11 import ihm.model
12 import ihm.citations
13 import ihm.reference
14 import operator
15 import inspect
16 
17 
18 # Map from IMP ResidueType to ihm ChemComp
19 _imp_to_ihm = {}
20 
21 
22 def _fill_imp_to_ihm():
23  d = dict(x for x in inspect.getmembers(IMP.atom)
24  if isinstance(x[1], IMP.atom.ResidueType))
25  # Handle standard amino acids plus some extras like MSE, UNK
26  for comp in ihm.LPeptideAlphabet._comps.values():
27  if comp.id in d:
28  _imp_to_ihm[d[comp.id]] = comp
29  # Handle RNA and DNA
30  alpha = ihm.RNAAlphabet()
31  for code in ['ADE', 'CYT', 'GUA', 'URA']:
32  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
33  alpha = ihm.DNAAlphabet()
34  for code in ['DADE', 'DCYT', 'DGUA', 'DTHY']:
35  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
36  # Pass through missing IMP residue
37  _imp_to_ihm[None] = None
38 
39 
40 _fill_imp_to_ihm()
41 
42 
43 def get_molecule(h):
44  """Given a Hierarchy, walk up and find the parent Molecule"""
45  while h:
47  return IMP.atom.Molecule(h)
48  h = h.get_parent()
49  return None
50 
51 
52 def _check_sequential(fragment, resinds):
53  for i in range(1, len(resinds)):
54  if resinds[i - 1] + 1 != resinds[i]:
55  raise ValueError(
56  "%s: non-sequential residue indices are not supported"
57  % str(fragment))
58 
59 
60 def _get_all_state_provenance(state_h, top_h, types):
61  """Yield all provenance information for the given state.
62  If the given State Hierarchy node contains no provenance information,
63  fall back to any provenance information for the top-level node
64  (if provided)."""
65  count = 0
66  for p in IMP.core.get_all_provenance(state_h, types=types):
67  count += 1
68  yield p
69  if count == 0 and top_h is not None:
70  for p in IMP.core.get_all_provenance(top_h, types=types):
71  yield p
72 
73 
74 class _CustomDNAAlphabet(ihm.Alphabet):
75  """Custom DNA alphabet that maps A,C,G,T (rather than DA,DC,DG,DT
76  as in python-ihm)"""
77  _comps = dict([cc.code_canonical, cc]
78  for cc in ihm.DNAAlphabet._comps.values())
79 
80 
81 class _EntityMapper(dict):
82  """Handle mapping from IMP chains to CIF entities.
83  Multiple components may map to the same entity if they
84  share sequence."""
85  def __init__(self, system):
86  self.system = system
87  super().__init__()
88  self._sequence_dict = {}
89  self._entities = []
90  self._alphabet_map = {
91  IMP.atom.UnknownChainType: ihm.LPeptideAlphabet,
92  IMP.atom.Protein: ihm.LPeptideAlphabet,
93  IMP.atom.RNA: ihm.RNAAlphabet,
94  IMP.atom.DNA: _CustomDNAAlphabet}
95 
96  def _get_sequence_from_residues(self, chain, seq_from_res):
97  seq_id_begin, seq = seq_from_res
98  if not seq:
99  raise ValueError("Chain %s has no sequence and no residues"
100  % chain)
101  missing_seq = [ind + seq_id_begin
102  for (ind, res) in enumerate(seq) if res is None]
103  if missing_seq:
104  raise ValueError(
105  "Chain %s has no declared sequence; tried to determine the "
106  "sequence from Residues, but the following residue indices "
107  "have no residue type (perhaps covered only by Fragments): %s"
108  % (chain, str(missing_seq)))
109  return seq_id_begin - 1, tuple(seq)
110 
111  def add(self, chain, seq_from_res=None):
112  sequence = chain.get_sequence()
113  offset = chain.get_sequence_offset()
114  if sequence == '':
115  if seq_from_res is not None:
116  offset, sequence = self._get_sequence_from_residues(
117  chain, seq_from_res)
118  else:
119  raise ValueError("Chain %s has no sequence" % chain)
120  else:
121  # Map one-letter codes to ihm.ChemComp
122  alphabet = self._alphabet_map[chain.get_chain_type()]()
123  sequence = tuple(alphabet[x] for x in sequence)
124  if sequence not in self._sequence_dict:
125  entity = ihm.Entity(sequence)
126  self.system.entities.append(entity)
127  self._entities.append(entity)
128  self._sequence_dict[sequence] = entity
129  uniprot = chain.get_uniprot_accession()
130  if uniprot:
131  up = ihm.reference.UniProtSequence.from_accession(uniprot)
132  entity.references.append(up)
133  self[chain] = self._sequence_dict[sequence]
134  return self[chain], offset
135 
136  def get_all(self):
137  """Yield all entities"""
138  return self._entities
139 
140 
141 def _assign_id(obj, seen_objs, obj_by_id):
142  """Assign a unique ID to obj, and track all ids in obj_by_id."""
143  if obj not in seen_objs:
144  if not hasattr(obj, 'id'):
145  obj_by_id.append(obj)
146  obj.id = len(obj_by_id)
147  seen_objs[obj] = obj.id
148  else:
149  obj.id = seen_objs[obj]
150 
151 
152 class _Component:
153  """An mmCIF component. This is an instance of an _Entity. Multiple
154  _Components may map to the same _Entity but must have unique
155  asym_ids. A _Component is similar to an IMP Chain but multiple
156  Chains may map to the same _Component (the Chains represent the
157  same structure, just in different states, and potentially in
158  different IMP Models). A _Component may also represent something
159  that is described by an experiment but was not modeled by IMP, and
160  so no Chains map to it but a string name does."""
161  def __init__(self, entity, asym_id, name):
162  self.entity, self.asym_id, self.name = entity, asym_id, name
163 
164 
165 class _ComponentMapper:
166  """Handle mapping from IMP Chains to CIF AsymUnits."""
167  def __init__(self, system):
168  super().__init__()
169  self.system = system
170  self._used_entities = set()
171  self._all_components = []
172  self._map = {}
173 
174  def __getitem__(self, chain):
175  asym_id, map_key, name = self._handle_chain(chain)
176  return self._map[map_key]
177 
178  def _handle_chain(self, chain):
179  mol = get_molecule(chain)
180  asym_id = chain.get_id()
181  name = mol.get_name() if mol else None
182  # Avoid conflict between name="A" and asym_id="A"
183  if name:
184  map_key = "name", name
185  else:
186  map_key = "asym_id", asym_id
187  return asym_id, map_key, name
188 
189  def add(self, chain, entity, offset):
190  """Add a chain (an IMP Chain object)"""
191  asym_id, map_key, name = self._handle_chain(chain)
192  if map_key not in self._map:
193  component = _Component(entity, asym_id, name)
194  if entity not in self._used_entities:
195  self._used_entities.add(entity)
196  # Assign entity name from the component; strip out anything
197  # after a @ or .
198  if component.name:
199  entity.description = \
200  component.name.split("@")[0].split(".")[0]
201  self._all_components.append(component)
202  asym = ihm.AsymUnit(entity, name, id=asym_id,
203  auth_seq_id_map=offset)
204  self.system.asym_units.append(asym)
205  component.asym_unit = asym
206  self._map[map_key] = component
207  else:
208  component = self._map[map_key]
209  if component.entity != entity:
210  raise ValueError("Two chains have the same ID (%s) but "
211  "different sequences - rename one of the "
212  "chains" % component.asym_unit.id)
213  if component.asym_unit.auth_seq_id_map != offset:
214  raise ValueError(
215  "Two chains have the same ID (%s) but different offsets "
216  "(%d, %d) - this is not supported"
217  % (component.asym_unit.id,
218  component.asym_unit.auth_seq_id_map, offset))
219  return component
220 
221  def get_all(self):
222  """Get all components"""
223  return self._all_components
224 
225 
226 class _RepSegmentFactory:
227  """Make ihm.representation.Segment objects for each set of contiguous
228  particles with the same representation"""
229  def __init__(self, asym):
230  self.asym = asym
231  # Offset from IHM to IMP numbering
232  self.offset = asym.auth_seq_id_map
233  self.particles = []
234  self.imp_residue_range = () # inclusive range, using IMP numbering
235 
236  def add(self, particle, starting_model):
237  """Add a new particle to the last segment (and return None).
238  Iff the particle could not be added, return the segment and start
239  a new one."""
240  (resrange, rigid_body,
241  is_res, is_atom) = self._get_particle_info(particle)
242 
243  def start_new_segment():
244  self.particles = [particle]
245  self.imp_residue_range = resrange
246  self.rigid_body = rigid_body
247  self.is_res = is_res
248  self.is_atom = is_atom
249  self.starting_model = starting_model
250  if not self.particles:
251  # First particle in a segment
252  start_new_segment()
253  elif (type(particle) == type(self.particles[0]) # noqa: E721
254  and is_res == self.is_res
255  and is_atom == self.is_atom
256  and resrange[0] <= self.imp_residue_range[1] + 1
257  and starting_model == self.starting_model
258  and self._same_rigid_body(rigid_body)):
259  # Continue an existing segment
260  self.particles.append(particle)
261  self.imp_residue_range = (self.imp_residue_range[0], resrange[1])
262  else:
263  # Make a new segment
264  seg = self.get_last()
265  start_new_segment()
266  return seg
267 
268  def get_last(self):
269  """Return the last segment, or None"""
270  if self.particles:
271  # Convert residue_range from IMP to IHM
272  asym = self.asym(self.imp_residue_range[0] - self.offset,
273  self.imp_residue_range[1] - self.offset)
274  if self.is_atom:
275  return ihm.representation.AtomicSegment(
276  asym_unit=asym, rigid=self.rigid_body is not None,
277  starting_model=self.starting_model)
278  elif self.is_res:
279  return ihm.representation.ResidueSegment(
280  asym_unit=asym,
281  rigid=self.rigid_body is not None, primitive='sphere',
282  starting_model=self.starting_model)
283  else:
284  return ihm.representation.FeatureSegment(
285  asym_unit=asym,
286  rigid=self.rigid_body is not None, primitive='sphere',
287  count=len(self.particles),
288  starting_model=self.starting_model)
289 
290  def _same_rigid_body(self, rigid_body):
291  # Note: can't just use self.rigid_body == rigid_body as IMP may
292  # crash when comparing a RigidBody object against None
293  if self.rigid_body is None and rigid_body is None:
294  return True
295  elif self.rigid_body is None or rigid_body is None:
296  return False
297  else:
298  return self.rigid_body == rigid_body
299 
300  def _get_particle_info(self, p):
301  # Note that we consider nonrigid members to not be rigid here
303  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
304  else:
305  rigid_body = None
306  if isinstance(p, IMP.atom.Residue):
307  return (p.get_index(), p.get_index()), rigid_body, True, False
308  elif isinstance(p, IMP.atom.Atom):
309  res = IMP.atom.get_residue(p)
310  return (res.get_index(), res.get_index()), rigid_body, False, True
311  elif isinstance(p, IMP.atom.Fragment):
312  resinds = p.get_residue_indexes()
313  return (resinds[0], resinds[-1]), rigid_body, False, False
314  raise TypeError("Unknown particle ", p)
315 
316 
317 def _get_all_structure_provenance(p):
318  """Yield all StructureProvenance decorators for the given particle."""
320 
321 
322 class _StartingModelAtomHandler:
323  def __init__(self, templates, asym):
324  self._seq_dif = []
325  self._last_res_index = None
326  self.templates = templates
327  self.asym = asym
328 
329  def _residue_first_atom(self, res):
330  """Return True iff we're looking at the first atom in this residue"""
331  # Only add one seq_dif record per residue
332  ind = res.get_index()
333  if ind != self._last_res_index:
334  self._last_res_index = ind
335  return True
336 
337  def handle_residue(self, res, comp_id, seq_id, offset):
338  res_name = res.get_residue_type().get_string()
339  # MSE in the original PDB is automatically mutated
340  # by IMP to MET, so reflect that in the output,
341  # and pass back to populate the seq_dif category.
342  if res_name == 'MSE' and comp_id == 'MET':
343  if self._residue_first_atom(res):
344  # This should only happen when we're using
345  # a crystal structure as the source (a
346  # comparative model would use MET in
347  # the sequence)
348  assert len(self.templates) == 0
349  self._seq_dif.append(ihm.startmodel.MSESeqDif(
350  res.get_index(), seq_id))
351  elif res_name != comp_id:
352  if self._residue_first_atom(res):
353  print("WARNING: Starting model residue %s does not match "
354  "that in the output model (%s) for chain %s residue %d. "
355  "Check offset (currently %d)."
356  % (res_name, comp_id, self.asym._id, seq_id, offset))
357  self._seq_dif.append(ihm.startmodel.SeqDif(
358  db_seq_id=res.get_index(), seq_id=seq_id,
359  db_comp_id=res_name,
360  details="Mutation of %s to %s" % (res_name, comp_id)))
361 
362  def get_ihm_atoms(self, particles, offset):
363  for a in particles:
364  coord = IMP.core.XYZ(a).get_coordinates()
365  atom = IMP.atom.Atom(a)
366  element = atom.get_element()
367  element = IMP.atom.get_element_table().get_name(element)
368  atom_name = atom.get_atom_type().get_string()
369  het = atom_name.startswith('HET:')
370  if het:
371  atom_name = atom_name[4:]
372  res = IMP.atom.get_residue(atom)
373 
374  seq_id = res.get_index() + offset
375  comp_id = self.asym.entity.sequence[seq_id-1].id
376  self.handle_residue(res, comp_id, seq_id, offset)
377  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
378  atom_id=atom_name, type_symbol=element,
379  x=coord[0], y=coord[1], z=coord[2],
380  het=het, biso=atom.get_temperature_factor())
381 
382 
383 class _StartingModel(ihm.startmodel.StartingModel):
384  _eq_keys = ['filename', 'asym_id', 'offset']
385 
386  def __init__(self, asym_unit, struc_prov):
387  self.filename = struc_prov[0].get_filename()
388  super().__init__(
389  asym_unit=asym_unit(0, 0), # will update in _add_residue()
390  # will fill in later with _set_sources_datasets()
391  dataset=None,
392  asym_id=struc_prov[0].get_chain_id(),
393  offset=struc_prov[0].get_residue_offset())
394 
395  def _add_residue(self, resind):
396  # Update seq_id_range to accommodate this residue
397  seq_id_end = resind
398  seq_id_begin = self.asym_unit.seq_id_range[0]
399  if seq_id_begin == 0:
400  seq_id_begin = seq_id_end
401  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
402 
403  # Two starting models with same filename, chain ID, and offset
404  # compare identical
405  # note: this results in separate starting models if only the
406  # offset differs; maybe consolidate into one?
407  def _eq_vals(self):
408  return tuple([self.__class__]
409  + [getattr(self, x) for x in self._eq_keys])
410 
411  def __eq__(self, other):
412  return other is not None and self._eq_vals() == other._eq_vals()
413 
414  def __hash__(self):
415  return hash(self._eq_vals())
416 
417  def _set_sources_datasets(self, system, datasets):
418  # Attempt to identify PDB file vs. comparative model
419  if (hasattr(ihm.metadata, 'CIFParser')
420  and self.filename.endswith('.cif')):
421  p = ihm.metadata.CIFParser()
422  else:
423  p = ihm.metadata.PDBParser()
424  r = p.parse_file(self.filename)
425  system.software.extend(r.get('software', []))
426  dataset = datasets.add(r['dataset'])
427  # We only want the templates that model the starting model chain
428  templates = r.get('templates', {}).get(self.asym_id, [])
429  for t in templates:
430  if t.alignment_file:
431  system.locations.append(t.alignment_file)
432  if t.dataset:
433  datasets.add(t.dataset)
434  self.dataset = dataset
435  self.templates = templates
436  self.metadata = r.get('metadata', [])
437 
438  def _read_coords(self):
439  """Read the coordinates for this starting model"""
440  m = IMP.Model()
441  # todo: support reading other subsets of the atoms (e.g. CA/CB)
442  slt = IMP.atom.ChainPDBSelector([self.asym_id]) \
444  hier = IMP.atom.read_pdb_or_mmcif(self.filename, m, slt)
445  rng = self.asym_unit.seq_id_range
446  sel = IMP.atom.Selection(
447  hier, residue_indexes=list(range(rng[0] - self.offset,
448  rng[1] + 1 - self.offset)))
449  return m, sel
450 
451  def get_seq_dif(self):
452  return self._seq_dif # filled in by get_atoms()
453 
454  def get_atoms(self):
455  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
456  m, sel = self._read_coords()
457  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
458  yield a
459  self._seq_dif = mh._seq_dif
460 
461 
462 class _StartingModelFinder:
463  """Map IMP particles to starting model objects"""
464  def __init__(self, asym, existing_starting_models, system, datasets):
465  self._seen_particles = {}
466  self._asym = asym
467  self._seen_starting_models = {}
468  for sm in existing_starting_models:
469  self._seen_starting_models[sm] = sm
470  self._system = system
471  self._datasets = datasets
472 
473  def find(self, particle):
474  """Return a StartingModel object, or None, for this particle"""
475  def _get_starting_model(sp, resind):
476  s = _StartingModel(self._asym, sp)
477  if s not in self._seen_starting_models:
478  self._seen_starting_models[s] = s
479  s._set_sources_datasets(self._system, self._datasets)
480  self._system.orphan_starting_models.append(s)
481  s = self._seen_starting_models[s]
482  if s:
483  s._add_residue(resind)
484  return s
485  resind = None
486  if IMP.atom.Residue.get_is_setup(particle):
487  resind = IMP.atom.Residue(particle).get_index()
488  sp = list(_get_all_structure_provenance(particle))
489  if sp:
490  return _get_starting_model(sp, resind)
491  elif IMP.atom.Hierarchy.get_is_setup(particle):
492  h = IMP.atom.Hierarchy(particle).get_parent()
493  # Remember all nodes we inspect
494  seen_parents = []
495  while h:
497  resind = IMP.atom.Residue(h).get_index()
498  pi = h.get_particle_index()
499  seen_parents.append(pi)
500  # If we inspected this node before, return the cached result
501  if pi in self._seen_particles:
502  sp = self._seen_particles[pi]
503  if sp and sp[0] and resind is not None:
504  sp[0]._add_residue(resind)
505  return sp[0] if sp else None
506  else:
507  sp = list(_get_all_structure_provenance(h))
508  self._seen_particles[pi] = []
509  if sp:
510  s = _get_starting_model(sp, resind)
511  # Set cache for this node and all the children we
512  # inspected on the way up
513  for spi in seen_parents:
514  self._seen_particles[spi].append(s)
515  return s
516  h = h.get_parent()
517 
518 
519 class _Datasets:
520  """Store all datasets used."""
521  def __init__(self, system):
522  super().__init__()
523  self._datasets = {}
524  self._groups = {}
525  self.system = system
526 
527  def add(self, d):
528  """Add and return a new dataset."""
529  if d not in self._datasets:
530  self._datasets[d] = d
531  self.system.orphan_datasets.append(d)
532  return self._datasets[d]
533 
534  def add_group(self, datasets, name):
535  """Add and return a new group of datasets"""
536  seen = set()
537  # Remove duplicates
538  d = []
539  for dataset in datasets:
540  if dataset not in seen:
541  d.append(dataset)
542  seen.add(dataset)
543  d = tuple(d)
544  if d not in self._groups:
545  g = ihm.dataset.DatasetGroup(d, name=name)
546  self._groups[d] = g
547  self.system.orphan_dataset_groups.append(g)
548  return self._groups[d]
549 
550  def get_all(self):
551  """Yield all datasets"""
552  return self._datasets.keys()
553 
554 
555 class _AllSoftware:
556  """Keep track of all Software objects."""
557 
558  # IMP/RMF doesn't store citation info for software, so provide it
559  # for known software packages
560  cites = {'Integrative Modeling Platform (IMP)': ihm.citations.imp,
561  'IMP PMI module': ihm.citations.pmi}
562 
563  def __init__(self, system):
564  self.system = system
565  self._by_namever = {}
566  super().__init__()
567 
568  def add_hierarchy(self, h, top_h=None):
569  # todo: if no SoftwareProvenance available, use RMF producer field
570  for p in _get_all_state_provenance(
571  h, top_h, types=[IMP.core.SoftwareProvenance]):
572  self._add_provenance(p)
573 
574  def _add_provenance(self, p):
575  """Add Software from SoftwareProvenance"""
576  # Only reference the same version of a given software package once
577  name = p.get_software_name()
578  version = p.get_version()
579  if (name, version) not in self._by_namever:
580  s = ihm.Software(name=name,
581  classification='integrative model building',
582  description=None, version=version,
583  location=p.get_location(),
584  citation=self.cites.get(name))
585  self.system.software.append(s)
586  self._by_namever[name, version] = s
587  return self._by_namever[name, version]
588 
589  def _add_previous_provenance(self, prov):
590  """Add Software from a previous SoftwareProvenance, if any"""
591  while prov:
593  return self._add_provenance(IMP.core.SoftwareProvenance(prov))
594  prov = prov.get_previous()
595 
596 
597 class _ExternalFiles:
598  """Track all externally-referenced files
599  (i.e. anything that refers to a Location that isn't
600  a DatabaseLocation)."""
601  def __init__(self, system):
602  self.system = system
603  self._by_path = {}
604 
605  def add_hierarchy(self, h, top_h=None):
606  # Add all Python scripts that were used in the modeling
607  for p in _get_all_state_provenance(
608  h, top_h, types=[IMP.core.ScriptProvenance]):
609  self._add_provenance(p)
610 
611  def _add_provenance(self, p):
612  """Add external file from ScriptProvenance"""
613  # Only reference the same path once
614  path = p.get_filename()
615  if path not in self._by_path:
616  loc = ihm.location.WorkflowFileLocation(
617  path=p.get_filename(),
618  details='Integrative modeling Python script')
619  self.system.locations.append(loc)
620  self._by_path[path] = loc
621  return self._by_path[path]
622 
623 
624 class _ProtocolStep(ihm.protocol.Step):
625  """A single step (e.g. sampling, refinement) in a protocol."""
626  def __init__(self, prov, num_models_begin, assembly, all_software):
627  method = prov.get_method()
628  if prov.get_number_of_replicas() > 1:
629  method = "Replica exchange " + method
630  super().__init__(
631  assembly=assembly,
632  # todo: fill in useful value for dataset_group
633  dataset_group=None,
634  method=method, name='Sampling',
635  num_models_begin=num_models_begin,
636  num_models_end=prov.get_number_of_frames(),
637  # todo: support multiple states, time ordered
638  multi_state=False, ordered=False,
639  # todo: revisit assumption all models are multiscale
640  multi_scale=True,
641  software=all_software._add_previous_provenance(prov))
642 
643  def add_combine(self, prov):
644  self.num_models_end = prov.get_number_of_frames()
645  return self.num_models_end
646 
647 
648 class _Protocol(ihm.protocol.Protocol):
649  """A modeling protocol.
650  Each protocol consists of a number of protocol steps (e.g. sampling,
651  refinement) followed by a number of postprocessing steps (e.g.
652  filtering, rescoring, clustering)"""
653 
654  def add_step(self, prov, num_models, assembly, all_software):
655  if isinstance(prov, IMP.core.CombineProvenance):
656  # Fold CombineProvenance into a previous sampling step
657  if len(self.steps) == 0:
658  raise ValueError("CombineProvenance with no previous sampling")
659  return self.steps[-1].add_combine(prov)
660  else:
661  ps = _ProtocolStep(prov, num_models, assembly, all_software)
662  self.steps.append(ps)
663  return ps.num_models_end
664 
665  def add_postproc(self, prov, num_models, assembly):
666  if not self.analyses:
667  self.analyses.append(ihm.analysis.Analysis())
668  if isinstance(prov, IMP.core.FilterProvenance):
669  pp = ihm.analysis.FilterStep(
670  feature='energy/score', assembly=assembly,
671  num_models_begin=num_models,
672  num_models_end=prov.get_number_of_frames())
673  elif isinstance(prov, IMP.core.ClusterProvenance):
674  # Assume clustering uses all models
675  pp = ihm.analysis.ClusterStep(
676  feature='RMSD', assembly=assembly, num_models_begin=num_models,
677  num_models_end=num_models)
678  else:
679  raise ValueError("Unhandled provenance", prov)
680  self.analyses[-1].steps.append(pp)
681  return pp.num_models_end
682 
683 
684 class _Protocols:
685  """Track all modeling protocols used."""
686  def __init__(self, system):
687  self.system = system
688 
689  def _add_protocol(self, prot):
690  # Protocol isn't hashable or sortable, so just compare dicts
691  # with existing protocols. This should still be performant as
692  # we generally don't have more than one or two protocols.
693  # We exclude dataset_group from the comparison as this is typically
694  # filled in later.
695  def step_equal(x, y):
696  def get_dict(d):
697  return {x: y for x, y in d.__dict__.items()
698  if x != 'dataset_group'}
699 
700  return (type(x) == type(y) # noqa: E721
701  and get_dict(x) == get_dict(y))
702 
703  def analysis_equal(x, y):
704  return (len(x.steps) == len(y.steps)
705  and all(step_equal(a, b)
706  for (a, b) in zip(x.steps, y.steps)))
707 
708  for existing in self.system.orphan_protocols:
709  if (len(existing.steps) == len(prot.steps)
710  and len(existing.analyses) == len(prot.analyses)
711  and all(step_equal(x, y)
712  for (x, y) in zip(existing.steps, prot.steps))
713  and all(analysis_equal(x, y)
714  for (x, y) in zip(existing.analyses, prot.analyses))):
715  return existing
716  self.system.orphan_protocols.append(prot)
717  return prot
718 
719  def _add_hierarchy(self, h, top_h, modeled_assembly, all_software):
720  num_models = 0 # assume we always start with no models
723  in_postproc = False
724  prot = _Protocol()
725  for p in reversed(list(_get_all_state_provenance(
726  h, top_h, types=prot_types + pp_types))):
727  if isinstance(p, pp_types):
728  num_models = prot.add_postproc(p, num_models, modeled_assembly)
729  in_postproc = True
730  else:
731  if in_postproc:
732  # Start a new protocol
733  self._add_protocol(prot)
734  prot = _Protocol()
735  num_models = prot.add_step(p, num_models, modeled_assembly,
736  all_software)
737  in_postproc = False
738  if len(prot.steps) > 0:
739  return self._add_protocol(prot)
740 
741 
742 class _CoordinateHandler:
743  def __init__(self, system, datasets):
744  self._system = system
745  self._datasets = datasets
746  self._representation = ihm.representation.Representation()
747  # IHM atoms/spheres corresponding to IMP beads/residues/atoms
748  # We build them up front (rather than on the fly) as the original
749  # IMP objects may have been destroyed or changed (e.g. if we read
750  # multiple frames from an RMF file) by the time we write the mmCIF.
751  self._atoms = []
752  self._spheres = []
753 
754  def get_residue_sequence(self, ps):
755  """Determine the primary sequence based on Residue particles.
756  Return the index of the first residue and the sequence, as a list
757  of ihm.ChemComp objects (or None)"""
758  restyp = {}
759  for p in ps:
760  if isinstance(p, IMP.atom.Atom):
761  residue = IMP.atom.get_residue(p)
762  restyp[residue.get_index()] = residue.get_residue_type()
763  elif isinstance(p, IMP.atom.Residue):
764  restyp[p.get_index()] = p.get_residue_type()
765  else: # fragment
766  resinds = p.get_residue_indexes()
767  for ri in resinds:
768  if ri not in restyp: # don't overwrite residue/atom info
769  restyp[ri] = None
770  if not restyp:
771  return 1, []
772  seq_id_begin = min(restyp.keys())
773  seq_id_end = max(restyp.keys())
774  return (seq_id_begin,
775  [_imp_to_ihm[restyp.get(x)]
776  for x in range(seq_id_begin, seq_id_end + 1)])
777 
778  def add_chain(self, ps, asym):
779  def matches_asym(s):
780  # Match AsymUnit or AsymUnitRange
781  return s == asym or hasattr(s, 'asym') and s.asym == asym
782 
783  # Consolidate starting models if the same model was used for this
784  # asym in a different state or for a different model_id
785  smf = _StartingModelFinder(
786  asym, [s for s in self._system.orphan_starting_models
787  if matches_asym(s.asym_unit)],
788  self._system, self._datasets)
789  segfactory = _RepSegmentFactory(asym)
790  offset = asym.auth_seq_id_map
791  for p in ps:
792  starting_model = smf.find(p)
793  seg = segfactory.add(p, starting_model)
794  if seg:
795  self._representation.append(seg)
796  self._add_atom_or_sphere(p, asym, offset)
797  last = segfactory.get_last()
798  if last:
799  self._representation.append(last)
800 
801  def _add_atom_or_sphere(self, p, asym, offset):
802  if isinstance(p, IMP.atom.Atom):
803  residue = IMP.atom.get_residue(p)
804  xyz = IMP.core.XYZ(p).get_coordinates()
805  element = p.get_element()
806  element = IMP.atom.get_element_table().get_name(element)
807  atom_name = p.get_atom_type().get_string()
808  het = atom_name.startswith('HET:')
809  if het:
810  atom_name = atom_name[4:]
811  self._atoms.append(ihm.model.Atom(
812  asym_unit=asym, seq_id=residue.get_index() - offset,
813  atom_id=atom_name, type_symbol=element,
814  x=xyz[0], y=xyz[1], z=xyz[2], het=het,
815  biso=p.get_temperature_factor(),
816  occupancy=p.get_occupancy()))
817  else:
818  if isinstance(p, IMP.atom.Fragment):
819  resinds = p.get_residue_indexes()
820  sbegin = resinds[0]
821  send = resinds[-1]
822  else: # residue
823  sbegin = send = p.get_index()
824  xyzr = IMP.core.XYZR(p)
825  xyz = xyzr.get_coordinates()
826  self._spheres.append(ihm.model.Sphere(
827  asym_unit=asym, seq_id_range=(sbegin - offset, send - offset),
828  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius()))
829 
830  def get_structure_particles(self, h):
831  """Return particles sorted by residue index"""
832  ps = []
833  if h.get_number_of_children() == 0:
834  return []
835  if not h.get_is_valid():
836  raise ValueError("Invalid hierarchy as input")
837  for p in IMP.atom.Selection(
838  hierarchy=h, resolution=0.).get_selected_particles():
840  residue = IMP.atom.Residue(p)
841  ps.append((residue.get_index(), residue))
843  fragment = IMP.atom.Fragment(p)
844  resinds = fragment.get_residue_indexes()
845  _check_sequential(fragment, resinds)
846  resind = resinds[len(resinds) // 2]
847  ps.append((resind, fragment))
849  atom = IMP.atom.Atom(p)
850  residue = IMP.atom.get_residue(atom)
851  ps.append((residue.get_index(), atom))
852  return [p[1] for p in sorted(ps, key=operator.itemgetter(0))]
853 
854 
855 class _ModelAssemblies:
856  def __init__(self, system):
857  self.system = system
858  self._seen_assemblies = {}
859 
860  def add(self, asyms):
861  # list isn't hashable but tuple is
862  asyms = tuple(asyms)
863  if asyms not in self._seen_assemblies:
864  assembly = ihm.Assembly(
865  asyms, name="Modeled assembly",
866  description="All components modeled by IMP")
867  self.system.orphan_assemblies.append(assembly)
868  self._seen_assemblies[asyms] = assembly
869  return self._seen_assemblies[asyms]
870 
871 
872 class _Representations:
873  def __init__(self, system):
874  self.system = system
875 
876  def add(self, rep):
877  # Representation isn't hashable or sortable, so just compare dicts
878  # with existing representations. This should still be performant as
879  # we generally don't have more than one or two representations.
880  for existing in self.system.orphan_representations:
881  if (len(existing) == len(rep)
882  and all(type(x) == type(y) # noqa: E721
883  and x.__dict__ == y.__dict__
884  for (x, y) in zip(existing, rep))):
885  return existing
886  self.system.orphan_representations.append(rep)
887  return rep
Select non water and non hydrogen atoms.
Definition: pdb.h:314
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
Hierarchy read_pdb_or_mmcif(TextInput input, Model *model, PDBSelector *selector=get_default_pdb_selector(), bool select_first_model=true)
Read all the molecules in the first model of the PDB or mmCIF file.
Definition: mmcif.h:40
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:158
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:562
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: atom/Atom.h:245
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:43
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
Track creation of a system fragment by combination.
Definition: provenance.h:280
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:238
Track creation of a system fragment by filtering.
Definition: provenance.h:340
The type for a residue.
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:137
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: provenance.h:584
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:256
Track creation of a system fragment from running a script.
Definition: provenance.h:511
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27