IMP logo
IMP Reference Guide  develop.e6c37e56ed,2022/01/25
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 from __future__ import print_function
6 import IMP.atom
7 import ihm.location
8 import ihm.metadata
9 import ihm.startmodel
10 import ihm.analysis
11 import ihm.protocol
12 import ihm.model
13 
14 
15 def get_molecule(h):
16  """Given a Hierarchy, walk up and find the parent Molecule"""
17  while h:
19  return IMP.atom.Molecule(h)
20  h = h.get_parent()
21  return None
22 
23 
24 class _EntityMapper(dict):
25  """Handle mapping from IMP chains to CIF entities.
26  Multiple components may map to the same entity if they
27  share sequence."""
28  def __init__(self, system):
29  self.system = system
30  super(_EntityMapper, self).__init__()
31  self._sequence_dict = {}
32  self._entities = []
33 
34  def add(self, chain):
35  # todo: handle non-protein sequences
36  sequence = chain.get_sequence()
37  if sequence == '':
38  raise ValueError("Chain %s has no sequence" % chain)
39  if sequence not in self._sequence_dict:
40  entity = ihm.Entity(sequence)
41  self.system.entities.append(entity)
42  self._entities.append(entity)
43  self._sequence_dict[sequence] = entity
44  self[chain] = self._sequence_dict[sequence]
45  return self[chain]
46 
47  def get_all(self):
48  """Yield all entities"""
49  return self._entities
50 
51 
52 def _assign_id(obj, seen_objs, obj_by_id):
53  """Assign a unique ID to obj, and track all ids in obj_by_id."""
54  if obj not in seen_objs:
55  if not hasattr(obj, 'id'):
56  obj_by_id.append(obj)
57  obj.id = len(obj_by_id)
58  seen_objs[obj] = obj.id
59  else:
60  obj.id = seen_objs[obj]
61 
62 
63 class _Component(object):
64  """An mmCIF component. This is an instance of an _Entity. Multiple
65  _Components may map to the same _Entity but must have unique
66  asym_ids. A _Component is similar to an IMP Chain but multiple
67  Chains may map to the same _Component (the Chains represent the
68  same structure, just in different states, and potentially in
69  different IMP Models). A _Component may also represent something
70  that is described by an experiment but was not modeled by IMP, and
71  so no Chains map to it but a string name does."""
72  def __init__(self, entity, asym_id, name):
73  self.entity, self.asym_id, self.name = entity, asym_id, name
74 
75 
76 class _ComponentMapper(object):
77  """Handle mapping from chains to CIF components."""
78  def __init__(self, system):
79  super(_ComponentMapper, self).__init__()
80  self.system = system
81  self._used_entities = set()
82  self._all_components = []
83  self._all_modeled_components = []
84  self._map = {}
85 
86  def __getitem__(self, chain):
87  modeled, asym_id, map_key, name = self._handle_chain(chain)
88  return self._map[map_key]
89 
90  def _handle_chain(self, chain):
91  if isinstance(chain, IMP.atom.Chain):
92  modeled = True
93  mol = get_molecule(chain)
94  asym_id = map_key = chain.get_id()
95  name = mol.get_name() if mol else None
96  else:
97  modeled = False
98  asym_id = None
99  name = map_key = chain.name
100  return modeled, asym_id, map_key, name
101 
102  def add(self, chain, entity):
103  """Add a chain (either an IMP Chain object for a modeled component,
104  or a NonModeledChain object for a non-modeled component)"""
105  modeled, asym_id, map_key, name = self._handle_chain(chain)
106  if map_key not in self._map:
107  component = _Component(entity, asym_id, name)
108  if entity not in self._used_entities:
109  self._used_entities.add(entity)
110  # Assign entity name from the component; strip out anything
111  # after a @ or .
112  entity.description = component.name.split("@")[0].split(".")[0]
113  self._all_components.append(component)
114  if modeled:
115  asym = ihm.AsymUnit(entity, name, id=asym_id)
116  self.system.asym_units.append(asym)
117  component.asym_unit = asym
118  self._all_modeled_components.append(component)
119  self._map[map_key] = component
120  else:
121  component = self._map[map_key]
122  if component.entity != entity:
123  raise ValueError("Two chains have the same ID (%s) but "
124  "different sequences - rename one of the "
125  "chains" % map_key)
126  return component
127 
128  def get_all(self):
129  """Get all components"""
130  return self._all_components
131 
132  def get_all_modeled(self):
133  """Get all modeled components"""
134  return self._all_modeled_components
135 
136 
137 class _RepSegmentFactory(object):
138  """Make ihm.representation.Segment objects for each set of contiguous
139  particles with the same representation"""
140  def __init__(self, component):
141  self.component = component
142  self.particles = []
143  self.residue_range = () # inclusive range
144 
145  def add(self, particle, starting_model):
146  """Add a new particle to the last segment (and return None).
147  Iff the particle could not be added, return the segment and start
148  a new one."""
149  resrange, rigid_body, is_res = self._get_particle_info(particle)
150 
151  def start_new_segment():
152  self.particles = [particle]
153  self.residue_range = resrange
154  self.rigid_body = rigid_body
155  self.is_res = is_res
156  self.starting_model = starting_model
157  if not self.particles:
158  # First particle in a segment
159  start_new_segment()
160  elif (type(particle) == type(self.particles[0]) # noqa: E721
161  and is_res == self.is_res
162  and resrange[0] == self.residue_range[1] + 1
163  and starting_model == self.starting_model
164  and self._same_rigid_body(rigid_body)):
165  # Continue an existing segment
166  self.particles.append(particle)
167  self.residue_range = (self.residue_range[0], resrange[1])
168  else:
169  # Make a new segment
170  seg = self.get_last()
171  start_new_segment()
172  return seg
173 
174  def get_last(self):
175  """Return the last segment, or None"""
176  if self.particles:
177  asym = self.component.asym_unit(*self.residue_range)
178  if self.is_res:
179  return ihm.representation.ResidueSegment(
180  asym_unit=asym,
181  rigid=self.rigid_body is not None, primitive='sphere',
182  starting_model=self.starting_model)
183  else:
184  return ihm.representation.FeatureSegment(
185  asym_unit=asym,
186  rigid=self.rigid_body is not None, primitive='sphere',
187  count=len(self.particles),
188  starting_model=self.starting_model)
189 
190  def _same_rigid_body(self, rigid_body):
191  # Note: can't just use self.rigid_body == rigid_body as IMP may
192  # crash when comparing a RigidBody object against None
193  if self.rigid_body is None and rigid_body is None:
194  return True
195  elif self.rigid_body is None or rigid_body is None:
196  return False
197  else:
198  return self.rigid_body == rigid_body
199 
200  def _get_particle_info(self, p):
201  # Note that we consider nonrigid members to not be rigid here
203  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
204  else:
205  rigid_body = None
206  if isinstance(p, IMP.atom.Residue):
207  return (p.get_index(), p.get_index()), rigid_body, True
208  elif isinstance(p, IMP.atom.Fragment):
209  resinds = p.get_residue_indexes()
210  return (resinds[0], resinds[-1]), rigid_body, False
211  raise TypeError("Unknown particle ", p)
212 
213 
214 def _get_all_structure_provenance(p):
215  """Yield all StructureProvenance decorators for the given particle."""
217 
218 
219 class _StartingModelAtomHandler(object):
220  def __init__(self, templates, asym):
221  self._seq_dif = []
222  self._last_res_index = None
223  self.templates = templates
224  self.asym = asym
225 
226  def _residue_first_atom(self, res):
227  """Return True iff we're looking at the first atom in this residue"""
228  # Only add one seq_dif record per residue
229  ind = res.get_index()
230  if ind != self._last_res_index:
231  self._last_res_index = ind
232  return True
233 
234  def handle_residue(self, res, comp_id, seq_id, offset):
235  res_name = res.get_residue_type().get_string()
236  # MSE in the original PDB is automatically mutated
237  # by IMP to MET, so reflect that in the output,
238  # and pass back to populate the seq_dif category.
239  if res_name == 'MSE' and comp_id == 'MET':
240  if self._residue_first_atom(res):
241  # This should only happen when we're using
242  # a crystal structure as the source (a
243  # comparative model would use MET in
244  # the sequence)
245  assert(len(self.templates) == 0)
246  self._seq_dif.append(ihm.startmodel.MSESeqDif(
247  res.get_index(), seq_id))
248  elif res_name != comp_id:
249  if self._residue_first_atom(res):
250  print("WARNING: Starting model residue %s does not match "
251  "that in the output model (%s) for chain %s residue %d. "
252  "Check offset (currently %d)."
253  % (res_name, comp_id, self.asym._id, seq_id, offset))
254  self._seq_dif.append(ihm.startmodel.SeqDif(
255  db_seq_id=res.get_index(), seq_id=seq_id,
256  db_comp_id=res_name,
257  details="Mutation of %s to %s" % (res_name, comp_id)))
258 
259  def get_ihm_atoms(self, particles, offset):
260  for a in particles:
261  coord = IMP.core.XYZ(a).get_coordinates()
262  atom = IMP.atom.Atom(a)
263  element = atom.get_element()
264  element = IMP.atom.get_element_table().get_name(element)
265  atom_name = atom.get_atom_type().get_string()
266  het = atom_name.startswith('HET:')
267  if het:
268  atom_name = atom_name[4:]
269  res = IMP.atom.get_residue(atom)
270 
271  seq_id = res.get_index() + offset
272  comp_id = self.asym.entity.sequence[seq_id-1].id
273  self.handle_residue(res, comp_id, seq_id, offset)
274  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
275  atom_id=atom_name, type_symbol=element,
276  x=coord[0], y=coord[1], z=coord[2],
277  het=het, biso=atom.get_temperature_factor())
278 
279 
280 class _StartingModel(ihm.startmodel.StartingModel):
281  _eq_keys = ['filename', 'asym_id', 'offset']
282 
283  def __init__(self, asym_unit, struc_prov):
284  self.filename = struc_prov[0].get_filename()
285  super(_StartingModel, self).__init__(
286  asym_unit=asym_unit(0, 0), # will update in _add_residue()
287  # will fill in later with _set_sources_datasets()
288  dataset=None,
289  asym_id=struc_prov[0].get_chain_id(),
290  offset=struc_prov[0].get_residue_offset())
291 
292  def _add_residue(self, resind):
293  # Update seq_id_range to accommodate this residue
294  seq_id_end = resind + self.offset
295  seq_id_begin = self.asym_unit.seq_id_range[0]
296  if seq_id_begin == 0:
297  seq_id_begin = seq_id_end
298  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
299 
300  # Two starting models with same filename, chain ID, and offset
301  # compare identical
302  # note: this results in separate starting models if only the
303  # offset differs; maybe consolidate into one?
304  def _eq_vals(self):
305  return tuple([self.__class__]
306  + [getattr(self, x) for x in self._eq_keys])
307 
308  def __eq__(self, other):
309  return other is not None and self._eq_vals() == other._eq_vals()
310 
311  def __hash__(self):
312  return hash(self._eq_vals())
313 
314  def _set_sources_datasets(self, system):
315  # Attempt to identify PDB file vs. comparative model
316  p = ihm.metadata.PDBParser()
317  r = p.parse_file(self.filename)
318  system.system.software.extend(r['software'])
319  dataset = system.datasets.add(r['dataset'])
320  # We only want the templates that model the starting model chain
321  templates = r['templates'].get(self.asym_id, [])
322  for t in templates:
323  if t.alignment_file:
324  system.system.locations.append(t.alignment_file)
325  if t.dataset:
326  system.datasets.add(t.dataset)
327  self.dataset = dataset
328  self.templates = templates
329  self.metadata = r['metadata']
330 
331  def _read_coords(self):
332  """Read the coordinates for this starting model"""
333  m = IMP.Model()
334  # todo: support reading other subsets of the atoms (e.g. CA/CB)
335  slt = IMP.atom.ChainPDBSelector(self.asym_id) \
337  hier = IMP.atom.read_pdb(self.filename, m, slt)
338  rng = self.asym_unit.seq_id_range
339  sel = IMP.atom.Selection(
340  hier, residue_indexes=list(range(rng[0] - self.offset,
341  rng[1] + 1 - self.offset)))
342  return m, sel
343 
344  def get_seq_dif(self):
345  return self._seq_dif # filled in by get_atoms()
346 
347  def get_atoms(self):
348  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
349  m, sel = self._read_coords()
350  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
351  yield a
352  self._seq_dif = mh._seq_dif
353 
354 
355 class _StartingModelFinder(object):
356  """Map IMP particles to starting model objects"""
357  def __init__(self, component, existing_starting_models):
358  self._seen_particles = {}
359  self._component = component
360  self._seen_starting_models = dict.fromkeys(existing_starting_models)
361 
362  def find(self, particle, system):
363  """Return a StartingModel object, or None, for this particle"""
364  def _get_starting_model(sp, resind):
365  s = _StartingModel(self._component.asym_unit, sp)
366  if s not in self._seen_starting_models:
367  self._seen_starting_models[s] = s
368  s._set_sources_datasets(system)
369  system.system.orphan_starting_models.append(s)
370  s = self._seen_starting_models[s]
371  if s:
372  s._add_residue(resind)
373  return s
374  resind = None
375  if IMP.atom.Residue.get_is_setup(particle):
376  resind = IMP.atom.Residue(particle).get_index()
377  sp = list(_get_all_structure_provenance(particle))
378  if sp:
379  return _get_starting_model(sp, resind)
380  elif IMP.atom.Hierarchy.get_is_setup(particle):
381  h = IMP.atom.Hierarchy(particle).get_parent()
382  # Remember all nodes we inspect
383  seen_parents = []
384  while h:
386  resind = IMP.atom.Residue(h).get_index()
387  pi = h.get_particle_index()
388  seen_parents.append(pi)
389  # If we inspected this node before, return the cached result
390  if pi in self._seen_particles:
391  sp = self._seen_particles[pi]
392  if sp and sp[0] and resind is not None:
393  sp[0]._add_residue(resind)
394  return sp[0] if sp else None
395  else:
396  sp = list(_get_all_structure_provenance(h))
397  self._seen_particles[pi] = []
398  if sp:
399  s = _get_starting_model(sp, resind)
400  # Set cache for this node and all the children we
401  # inspected on the way up
402  for spi in seen_parents:
403  self._seen_particles[spi].append(s)
404  return s
405  h = h.get_parent()
406 
407 
408 class _Datasets(object):
409  """Store all datasets used."""
410  def __init__(self, system):
411  super(_Datasets, self).__init__()
412  self._datasets = {}
413  self.system = system
414 
415  def add(self, d):
416  """Add and return a new dataset."""
417  if d not in self._datasets:
418  self._datasets[d] = d
419  self.system.orphan_datasets.append(d)
420  return self._datasets[d]
421 
422  def get_all(self):
423  """Yield all datasets"""
424  return self._datasets.keys()
425 
426 
427 class _AllSoftware(object):
428  """Keep track of all Software objects."""
429  def __init__(self, system):
430  self.system = system
431  super(_AllSoftware, self).__init__()
432 
433  def add_hierarchy(self, h):
434  # todo: if no SoftwareProvenance available, use RMF producer field
436  h, types=[IMP.core.SoftwareProvenance]):
437  self.system.software.append(
438  ihm.Software(name=p.get_software_name(),
439  classification='integrative model building',
440  description=None,
441  version=p.get_version(),
442  location=p.get_location()))
443 
444 
445 class _ExternalFiles(object):
446  """Track all externally-referenced files
447  (i.e. anything that refers to a Location that isn't
448  a DatabaseLocation)."""
449  def __init__(self, system):
450  self.system = system
451 
452  def add(self, location):
453  """Add a new externally-referenced file.
454  Note that ids are assigned later."""
455  self.system.locations.append(location)
456 
457  def add_hierarchy(self, h):
458  # Add all Python scripts that were used in the modeling
460  h, types=[IMP.core.ScriptProvenance]):
461  # todo: set details
462  loc = ihm.location.WorkflowFileLocation(
463  path=p.get_filename(),
464  details='Integrative modeling Python script')
465  self.add(loc)
466 
467 
468 class _ProtocolStep(ihm.protocol.Step):
469  """A single step (e.g. sampling, refinement) in a protocol."""
470  def __init__(self, prov, num_models_begin, assembly):
471  method = prov.get_method()
472  if prov.get_number_of_replicas() > 1:
473  method = "Replica exchange " + method
474  super(_ProtocolStep, self).__init__(
475  assembly=assembly,
476  # todo: fill in useful value for dataset_group
477  dataset_group=None,
478  method=method, name='Sampling',
479  num_models_begin=num_models_begin,
480  num_models_end=prov.get_number_of_frames(),
481  # todo: support multiple states, time ordered
482  multi_state=False, ordered=False,
483  # todo: revisit assumption all models are multiscale
484  multi_scale=True)
485 
486  def add_combine(self, prov):
487  self.num_models_end = prov.get_number_of_frames()
488  return self.num_models_end
489 
490 
491 class _Protocol(ihm.protocol.Protocol):
492  """A modeling protocol.
493  Each protocol consists of a number of protocol steps (e.g. sampling,
494  refinement) followed by a number of postprocessing steps (e.g.
495  filtering, rescoring, clustering)"""
496 
497  def add_step(self, prov, num_models, assembly):
498  if isinstance(prov, IMP.core.CombineProvenance):
499  # Fold CombineProvenance into a previous sampling step
500  if len(self.steps) == 0:
501  raise ValueError("CombineProvenance with no previous sampling")
502  return self.steps[-1].add_combine(prov)
503  else:
504  ps = _ProtocolStep(prov, num_models, assembly)
505  self.steps.append(ps)
506  return ps.num_models_end
507 
508  def add_postproc(self, prov, num_models, assembly):
509  if not self.analyses:
510  self.analyses.append(ihm.analysis.Analysis())
511  if isinstance(prov, IMP.core.FilterProvenance):
512  pp = ihm.analysis.FilterStep(
513  feature='energy/score', assembly=assembly,
514  num_models_begin=num_models,
515  num_models_end=prov.get_number_of_frames())
516  elif isinstance(prov, IMP.core.ClusterProvenance):
517  # Assume clustering uses all models
518  pp = ihm.analysis.ClusterStep(
519  feature='RMSD', assembly=assembly, num_models_begin=num_models,
520  num_models_end=num_models)
521  else:
522  raise ValueError("Unhandled provenance", prov)
523  self.analyses[-1].steps.append(pp)
524  return pp.num_models_end
525 
526 
527 class _Protocols(object):
528  """Track all modeling protocols used."""
529  def __init__(self, system):
530  self.system = system
531 
532  def _add_protocol(self, prot):
533  self.system.orphan_protocols.append(prot)
534 
535  def _add_hierarchy(self, h, modeled_assembly):
536  num_models = 0 # assume we always start with no models
539  in_postproc = False
540  prot = _Protocol()
541  for p in reversed(list(IMP.core.get_all_provenance(
542  h, types=prot_types + pp_types))):
543  if isinstance(p, pp_types):
544  num_models = prot.add_postproc(p, num_models, modeled_assembly)
545  in_postproc = True
546  else:
547  if in_postproc:
548  # Start a new protocol
549  self._add_protocol(prot)
550  prot = _Protocol()
551  num_models = prot.add_step(p, num_models, modeled_assembly)
552  in_postproc = False
553  if len(prot.steps) > 0:
554  self._add_protocol(prot)
555 
556 
557 class _Model(ihm.model.Model):
558  def __init__(self, frame, state):
559  self._frame = frame
560  self.state = state # state already a weakproxy
561  super(_Model, self).__init__(
562  assembly=state.modeled_assembly,
563  # todo: add protocol, support multiple representations
564  protocol=None, representation=self.state.system.representation,
565  name=frame.name)
566 
567  def _get_structure_particles(self):
568  self.state._load_frame(self._frame)
569  for h in self.state.hiers:
570  chains = [IMP.atom.Chain(c)
571  for c in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)]
572  for chain in chains:
573  asym = self.state.system.components[chain].asym_unit
574  for p in self.state.system._get_structure_particles(chain):
575  yield asym, p
576 
577  def get_spheres(self):
578  # todo: support atomic output too
579  for asym, p in self._get_structure_particles():
580  if isinstance(p, IMP.atom.Fragment):
581  resinds = p.get_residue_indexes()
582  # todo: handle non-contiguous fragments
583  sbegin = resinds[0]
584  send = resinds[-1]
585  else: # residue
586  sbegin = send = p.get_index()
587  xyzr = IMP.core.XYZR(p)
588  xyz = xyzr.get_coordinates()
589  yield ihm.model.Sphere(
590  asym_unit=asym, seq_id_range=(sbegin, send),
591  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius())
Select non water and non hydrogen atoms.
Definition: pdb.h:245
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:158
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:562
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:15
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
Track creation of a system fragment by combination.
Definition: provenance.h:280
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:73
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:234
Track creation of a system fragment by filtering.
Definition: provenance.h:340
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:137
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
Store info for a chain of a protein.
Definition: Chain.h:61
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:189
Track creation of a system fragment from running a script.
Definition: provenance.h:511
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27