IMP logo
IMP Reference Guide  2.19.0
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 from __future__ import print_function
6 import IMP.atom
7 import ihm.location
8 import ihm.metadata
9 import ihm.startmodel
10 import ihm.analysis
11 import ihm.protocol
12 import ihm.model
13 import ihm.citations
14 
15 
16 def get_molecule(h):
17  """Given a Hierarchy, walk up and find the parent Molecule"""
18  while h:
20  return IMP.atom.Molecule(h)
21  h = h.get_parent()
22  return None
23 
24 
25 class _EntityMapper(dict):
26  """Handle mapping from IMP chains to CIF entities.
27  Multiple components may map to the same entity if they
28  share sequence."""
29  def __init__(self, system):
30  self.system = system
31  super(_EntityMapper, self).__init__()
32  self._sequence_dict = {}
33  self._entities = []
34 
35  def add(self, chain):
36  # todo: handle non-protein sequences
37  sequence = chain.get_sequence()
38  if sequence == '':
39  raise ValueError("Chain %s has no sequence" % chain)
40  if sequence not in self._sequence_dict:
41  entity = ihm.Entity(sequence)
42  self.system.entities.append(entity)
43  self._entities.append(entity)
44  self._sequence_dict[sequence] = entity
45  self[chain] = self._sequence_dict[sequence]
46  return self[chain]
47 
48  def get_all(self):
49  """Yield all entities"""
50  return self._entities
51 
52 
53 def _assign_id(obj, seen_objs, obj_by_id):
54  """Assign a unique ID to obj, and track all ids in obj_by_id."""
55  if obj not in seen_objs:
56  if not hasattr(obj, 'id'):
57  obj_by_id.append(obj)
58  obj.id = len(obj_by_id)
59  seen_objs[obj] = obj.id
60  else:
61  obj.id = seen_objs[obj]
62 
63 
64 class _Component(object):
65  """An mmCIF component. This is an instance of an _Entity. Multiple
66  _Components may map to the same _Entity but must have unique
67  asym_ids. A _Component is similar to an IMP Chain but multiple
68  Chains may map to the same _Component (the Chains represent the
69  same structure, just in different states, and potentially in
70  different IMP Models). A _Component may also represent something
71  that is described by an experiment but was not modeled by IMP, and
72  so no Chains map to it but a string name does."""
73  def __init__(self, entity, asym_id, name):
74  self.entity, self.asym_id, self.name = entity, asym_id, name
75 
76 
77 class _ComponentMapper(object):
78  """Handle mapping from chains to CIF components."""
79  def __init__(self, system):
80  super(_ComponentMapper, self).__init__()
81  self.system = system
82  self._used_entities = set()
83  self._all_components = []
84  self._all_modeled_components = []
85  self._map = {}
86 
87  def __getitem__(self, chain):
88  modeled, asym_id, map_key, name = self._handle_chain(chain)
89  return self._map[map_key]
90 
91  def _handle_chain(self, chain):
92  if isinstance(chain, IMP.atom.Chain):
93  modeled = True
94  mol = get_molecule(chain)
95  asym_id = map_key = chain.get_id()
96  name = mol.get_name() if mol else None
97  else:
98  modeled = False
99  asym_id = None
100  name = map_key = chain.name
101  return modeled, asym_id, map_key, name
102 
103  def add(self, chain, entity):
104  """Add a chain (either an IMP Chain object for a modeled component,
105  or a NonModeledChain object for a non-modeled component)"""
106  modeled, asym_id, map_key, name = self._handle_chain(chain)
107  if map_key not in self._map:
108  component = _Component(entity, asym_id, name)
109  if entity not in self._used_entities:
110  self._used_entities.add(entity)
111  # Assign entity name from the component; strip out anything
112  # after a @ or .
113  entity.description = component.name.split("@")[0].split(".")[0]
114  self._all_components.append(component)
115  if modeled:
116  asym = ihm.AsymUnit(entity, name, id=asym_id)
117  self.system.asym_units.append(asym)
118  component.asym_unit = asym
119  self._all_modeled_components.append(component)
120  self._map[map_key] = component
121  else:
122  component = self._map[map_key]
123  if component.entity != entity:
124  raise ValueError("Two chains have the same ID (%s) but "
125  "different sequences - rename one of the "
126  "chains" % map_key)
127  return component
128 
129  def get_all(self):
130  """Get all components"""
131  return self._all_components
132 
133  def get_all_modeled(self):
134  """Get all modeled components"""
135  return self._all_modeled_components
136 
137 
138 class _RepSegmentFactory(object):
139  """Make ihm.representation.Segment objects for each set of contiguous
140  particles with the same representation"""
141  def __init__(self, component):
142  self.component = component
143  self.particles = []
144  self.residue_range = () # inclusive range
145 
146  def add(self, particle, starting_model):
147  """Add a new particle to the last segment (and return None).
148  Iff the particle could not be added, return the segment and start
149  a new one."""
150  resrange, rigid_body, is_res = self._get_particle_info(particle)
151 
152  def start_new_segment():
153  self.particles = [particle]
154  self.residue_range = resrange
155  self.rigid_body = rigid_body
156  self.is_res = is_res
157  self.starting_model = starting_model
158  if not self.particles:
159  # First particle in a segment
160  start_new_segment()
161  elif (type(particle) == type(self.particles[0]) # noqa: E721
162  and is_res == self.is_res
163  and resrange[0] == self.residue_range[1] + 1
164  and starting_model == self.starting_model
165  and self._same_rigid_body(rigid_body)):
166  # Continue an existing segment
167  self.particles.append(particle)
168  self.residue_range = (self.residue_range[0], resrange[1])
169  else:
170  # Make a new segment
171  seg = self.get_last()
172  start_new_segment()
173  return seg
174 
175  def get_last(self):
176  """Return the last segment, or None"""
177  if self.particles:
178  asym = self.component.asym_unit(*self.residue_range)
179  if self.is_res:
180  return ihm.representation.ResidueSegment(
181  asym_unit=asym,
182  rigid=self.rigid_body is not None, primitive='sphere',
183  starting_model=self.starting_model)
184  else:
185  return ihm.representation.FeatureSegment(
186  asym_unit=asym,
187  rigid=self.rigid_body is not None, primitive='sphere',
188  count=len(self.particles),
189  starting_model=self.starting_model)
190 
191  def _same_rigid_body(self, rigid_body):
192  # Note: can't just use self.rigid_body == rigid_body as IMP may
193  # crash when comparing a RigidBody object against None
194  if self.rigid_body is None and rigid_body is None:
195  return True
196  elif self.rigid_body is None or rigid_body is None:
197  return False
198  else:
199  return self.rigid_body == rigid_body
200 
201  def _get_particle_info(self, p):
202  # Note that we consider nonrigid members to not be rigid here
204  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
205  else:
206  rigid_body = None
207  if isinstance(p, IMP.atom.Residue):
208  return (p.get_index(), p.get_index()), rigid_body, True
209  elif isinstance(p, IMP.atom.Fragment):
210  resinds = p.get_residue_indexes()
211  return (resinds[0], resinds[-1]), rigid_body, False
212  raise TypeError("Unknown particle ", p)
213 
214 
215 def _get_all_structure_provenance(p):
216  """Yield all StructureProvenance decorators for the given particle."""
218 
219 
220 class _StartingModelAtomHandler(object):
221  def __init__(self, templates, asym):
222  self._seq_dif = []
223  self._last_res_index = None
224  self.templates = templates
225  self.asym = asym
226 
227  def _residue_first_atom(self, res):
228  """Return True iff we're looking at the first atom in this residue"""
229  # Only add one seq_dif record per residue
230  ind = res.get_index()
231  if ind != self._last_res_index:
232  self._last_res_index = ind
233  return True
234 
235  def handle_residue(self, res, comp_id, seq_id, offset):
236  res_name = res.get_residue_type().get_string()
237  # MSE in the original PDB is automatically mutated
238  # by IMP to MET, so reflect that in the output,
239  # and pass back to populate the seq_dif category.
240  if res_name == 'MSE' and comp_id == 'MET':
241  if self._residue_first_atom(res):
242  # This should only happen when we're using
243  # a crystal structure as the source (a
244  # comparative model would use MET in
245  # the sequence)
246  assert len(self.templates) == 0
247  self._seq_dif.append(ihm.startmodel.MSESeqDif(
248  res.get_index(), seq_id))
249  elif res_name != comp_id:
250  if self._residue_first_atom(res):
251  print("WARNING: Starting model residue %s does not match "
252  "that in the output model (%s) for chain %s residue %d. "
253  "Check offset (currently %d)."
254  % (res_name, comp_id, self.asym._id, seq_id, offset))
255  self._seq_dif.append(ihm.startmodel.SeqDif(
256  db_seq_id=res.get_index(), seq_id=seq_id,
257  db_comp_id=res_name,
258  details="Mutation of %s to %s" % (res_name, comp_id)))
259 
260  def get_ihm_atoms(self, particles, offset):
261  for a in particles:
262  coord = IMP.core.XYZ(a).get_coordinates()
263  atom = IMP.atom.Atom(a)
264  element = atom.get_element()
265  element = IMP.atom.get_element_table().get_name(element)
266  atom_name = atom.get_atom_type().get_string()
267  het = atom_name.startswith('HET:')
268  if het:
269  atom_name = atom_name[4:]
270  res = IMP.atom.get_residue(atom)
271 
272  seq_id = res.get_index() + offset
273  comp_id = self.asym.entity.sequence[seq_id-1].id
274  self.handle_residue(res, comp_id, seq_id, offset)
275  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
276  atom_id=atom_name, type_symbol=element,
277  x=coord[0], y=coord[1], z=coord[2],
278  het=het, biso=atom.get_temperature_factor())
279 
280 
281 class _StartingModel(ihm.startmodel.StartingModel):
282  _eq_keys = ['filename', 'asym_id', 'offset']
283 
284  def __init__(self, asym_unit, struc_prov):
285  self.filename = struc_prov[0].get_filename()
286  super(_StartingModel, self).__init__(
287  asym_unit=asym_unit(0, 0), # will update in _add_residue()
288  # will fill in later with _set_sources_datasets()
289  dataset=None,
290  asym_id=struc_prov[0].get_chain_id(),
291  offset=struc_prov[0].get_residue_offset())
292 
293  def _add_residue(self, resind):
294  # Update seq_id_range to accommodate this residue
295  seq_id_end = resind + self.offset
296  seq_id_begin = self.asym_unit.seq_id_range[0]
297  if seq_id_begin == 0:
298  seq_id_begin = seq_id_end
299  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
300 
301  # Two starting models with same filename, chain ID, and offset
302  # compare identical
303  # note: this results in separate starting models if only the
304  # offset differs; maybe consolidate into one?
305  def _eq_vals(self):
306  return tuple([self.__class__]
307  + [getattr(self, x) for x in self._eq_keys])
308 
309  def __eq__(self, other):
310  return other is not None and self._eq_vals() == other._eq_vals()
311 
312  def __hash__(self):
313  return hash(self._eq_vals())
314 
315  def _set_sources_datasets(self, system):
316  # Attempt to identify PDB file vs. comparative model
317  p = ihm.metadata.PDBParser()
318  r = p.parse_file(self.filename)
319  system.system.software.extend(r['software'])
320  dataset = system.datasets.add(r['dataset'])
321  # We only want the templates that model the starting model chain
322  templates = r['templates'].get(self.asym_id, [])
323  for t in templates:
324  if t.alignment_file:
325  system.system.locations.append(t.alignment_file)
326  if t.dataset:
327  system.datasets.add(t.dataset)
328  self.dataset = dataset
329  self.templates = templates
330  self.metadata = r['metadata']
331 
332  def _read_coords(self):
333  """Read the coordinates for this starting model"""
334  m = IMP.Model()
335  # todo: support reading other subsets of the atoms (e.g. CA/CB)
336  slt = IMP.atom.ChainPDBSelector(self.asym_id) \
338  hier = IMP.atom.read_pdb(self.filename, m, slt)
339  rng = self.asym_unit.seq_id_range
340  sel = IMP.atom.Selection(
341  hier, residue_indexes=list(range(rng[0] - self.offset,
342  rng[1] + 1 - self.offset)))
343  return m, sel
344 
345  def get_seq_dif(self):
346  return self._seq_dif # filled in by get_atoms()
347 
348  def get_atoms(self):
349  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
350  m, sel = self._read_coords()
351  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
352  yield a
353  self._seq_dif = mh._seq_dif
354 
355 
356 class _StartingModelFinder(object):
357  """Map IMP particles to starting model objects"""
358  def __init__(self, component, existing_starting_models):
359  self._seen_particles = {}
360  self._component = component
361  self._seen_starting_models = dict.fromkeys(existing_starting_models)
362 
363  def find(self, particle, system):
364  """Return a StartingModel object, or None, for this particle"""
365  def _get_starting_model(sp, resind):
366  s = _StartingModel(self._component.asym_unit, sp)
367  if s not in self._seen_starting_models:
368  self._seen_starting_models[s] = s
369  s._set_sources_datasets(system)
370  system.system.orphan_starting_models.append(s)
371  s = self._seen_starting_models[s]
372  if s:
373  s._add_residue(resind)
374  return s
375  resind = None
376  if IMP.atom.Residue.get_is_setup(particle):
377  resind = IMP.atom.Residue(particle).get_index()
378  sp = list(_get_all_structure_provenance(particle))
379  if sp:
380  return _get_starting_model(sp, resind)
381  elif IMP.atom.Hierarchy.get_is_setup(particle):
382  h = IMP.atom.Hierarchy(particle).get_parent()
383  # Remember all nodes we inspect
384  seen_parents = []
385  while h:
387  resind = IMP.atom.Residue(h).get_index()
388  pi = h.get_particle_index()
389  seen_parents.append(pi)
390  # If we inspected this node before, return the cached result
391  if pi in self._seen_particles:
392  sp = self._seen_particles[pi]
393  if sp and sp[0] and resind is not None:
394  sp[0]._add_residue(resind)
395  return sp[0] if sp else None
396  else:
397  sp = list(_get_all_structure_provenance(h))
398  self._seen_particles[pi] = []
399  if sp:
400  s = _get_starting_model(sp, resind)
401  # Set cache for this node and all the children we
402  # inspected on the way up
403  for spi in seen_parents:
404  self._seen_particles[spi].append(s)
405  return s
406  h = h.get_parent()
407 
408 
409 class _Datasets(object):
410  """Store all datasets used."""
411  def __init__(self, system):
412  super(_Datasets, self).__init__()
413  self._datasets = {}
414  self.system = system
415 
416  def add(self, d):
417  """Add and return a new dataset."""
418  if d not in self._datasets:
419  self._datasets[d] = d
420  self.system.orphan_datasets.append(d)
421  return self._datasets[d]
422 
423  def get_all(self):
424  """Yield all datasets"""
425  return self._datasets.keys()
426 
427 
428 class _AllSoftware(object):
429  """Keep track of all Software objects."""
430 
431  # IMP/RMF doesn't store citation info for software, so provide it
432  # for known software packages
433  cites = {'Integrative Modeling Platform (IMP)': ihm.citations.imp,
434  'IMP PMI module': ihm.citations.pmi}
435 
436  def __init__(self, system):
437  self.system = system
438  self._by_namever = {}
439  super(_AllSoftware, self).__init__()
440 
441  def add_hierarchy(self, h):
442  # todo: if no SoftwareProvenance available, use RMF producer field
444  h, types=[IMP.core.SoftwareProvenance]):
445  self._add_provenance(p)
446 
447  def _add_provenance(self, p):
448  """Add Software from SoftwareProvenance"""
449  # Only reference the same version of a given software package once
450  name = p.get_software_name()
451  version = p.get_version()
452  if (name, version) not in self._by_namever:
453  s = ihm.Software(name=name,
454  classification='integrative model building',
455  description=None, version=version,
456  location=p.get_location(),
457  citation=self.cites.get(name))
458  self.system.software.append(s)
459  self._by_namever[name, version] = s
460  return self._by_namever[name, version]
461 
462  def _add_previous_provenance(self, prov):
463  """Add Software from a previous SoftwareProvenance, if any"""
464  while prov:
466  return self._add_provenance(IMP.core.SoftwareProvenance(prov))
467  prov = prov.get_previous()
468 
469 
470 class _ExternalFiles(object):
471  """Track all externally-referenced files
472  (i.e. anything that refers to a Location that isn't
473  a DatabaseLocation)."""
474  def __init__(self, system):
475  self.system = system
476 
477  def add(self, location):
478  """Add a new externally-referenced file.
479  Note that ids are assigned later."""
480  self.system.locations.append(location)
481 
482  def add_hierarchy(self, h):
483  # Add all Python scripts that were used in the modeling
485  h, types=[IMP.core.ScriptProvenance]):
486  # todo: set details
487  loc = ihm.location.WorkflowFileLocation(
488  path=p.get_filename(),
489  details='Integrative modeling Python script')
490  self.add(loc)
491 
492 
493 class _ProtocolStep(ihm.protocol.Step):
494  """A single step (e.g. sampling, refinement) in a protocol."""
495  def __init__(self, prov, num_models_begin, assembly, all_software):
496  method = prov.get_method()
497  if prov.get_number_of_replicas() > 1:
498  method = "Replica exchange " + method
499  super(_ProtocolStep, self).__init__(
500  assembly=assembly,
501  # todo: fill in useful value for dataset_group
502  dataset_group=None,
503  method=method, name='Sampling',
504  num_models_begin=num_models_begin,
505  num_models_end=prov.get_number_of_frames(),
506  # todo: support multiple states, time ordered
507  multi_state=False, ordered=False,
508  # todo: revisit assumption all models are multiscale
509  multi_scale=True,
510  software=all_software._add_previous_provenance(prov))
511 
512  def add_combine(self, prov):
513  self.num_models_end = prov.get_number_of_frames()
514  return self.num_models_end
515 
516 
517 class _Protocol(ihm.protocol.Protocol):
518  """A modeling protocol.
519  Each protocol consists of a number of protocol steps (e.g. sampling,
520  refinement) followed by a number of postprocessing steps (e.g.
521  filtering, rescoring, clustering)"""
522 
523  def add_step(self, prov, num_models, assembly, all_software):
524  if isinstance(prov, IMP.core.CombineProvenance):
525  # Fold CombineProvenance into a previous sampling step
526  if len(self.steps) == 0:
527  raise ValueError("CombineProvenance with no previous sampling")
528  return self.steps[-1].add_combine(prov)
529  else:
530  ps = _ProtocolStep(prov, num_models, assembly, all_software)
531  self.steps.append(ps)
532  return ps.num_models_end
533 
534  def add_postproc(self, prov, num_models, assembly):
535  if not self.analyses:
536  self.analyses.append(ihm.analysis.Analysis())
537  if isinstance(prov, IMP.core.FilterProvenance):
538  pp = ihm.analysis.FilterStep(
539  feature='energy/score', assembly=assembly,
540  num_models_begin=num_models,
541  num_models_end=prov.get_number_of_frames())
542  elif isinstance(prov, IMP.core.ClusterProvenance):
543  # Assume clustering uses all models
544  pp = ihm.analysis.ClusterStep(
545  feature='RMSD', assembly=assembly, num_models_begin=num_models,
546  num_models_end=num_models)
547  else:
548  raise ValueError("Unhandled provenance", prov)
549  self.analyses[-1].steps.append(pp)
550  return pp.num_models_end
551 
552 
553 class _Protocols(object):
554  """Track all modeling protocols used."""
555  def __init__(self, system):
556  self.system = system
557 
558  def _add_protocol(self, prot):
559  self.system.orphan_protocols.append(prot)
560 
561  def _add_hierarchy(self, h, modeled_assembly, all_software):
562  num_models = 0 # assume we always start with no models
565  in_postproc = False
566  prot = _Protocol()
567  for p in reversed(list(IMP.core.get_all_provenance(
568  h, types=prot_types + pp_types))):
569  if isinstance(p, pp_types):
570  num_models = prot.add_postproc(p, num_models, modeled_assembly)
571  in_postproc = True
572  else:
573  if in_postproc:
574  # Start a new protocol
575  self._add_protocol(prot)
576  prot = _Protocol()
577  num_models = prot.add_step(p, num_models, modeled_assembly,
578  all_software)
579  in_postproc = False
580  if len(prot.steps) > 0:
581  self._add_protocol(prot)
582 
583 
584 class _Model(ihm.model.Model):
585  def __init__(self, frame, state):
586  self._frame = frame
587  self.state = state # state already a weakproxy
588  super(_Model, self).__init__(
589  assembly=state.modeled_assembly,
590  # todo: add protocol, support multiple representations
591  protocol=None, representation=self.state.system.representation,
592  name=frame.name)
593 
594  def _get_structure_particles(self):
595  self.state._load_frame(self._frame)
596  for h in self.state.hiers:
597  chains = [IMP.atom.Chain(c)
598  for c in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)]
599  for chain in chains:
600  asym = self.state.system.components[chain].asym_unit
601  for p in self.state.system._get_structure_particles(chain):
602  yield asym, p
603 
604  def get_spheres(self):
605  # todo: support atomic output too
606  for asym, p in self._get_structure_particles():
607  if isinstance(p, IMP.atom.Fragment):
608  resinds = p.get_residue_indexes()
609  # todo: handle non-contiguous fragments
610  sbegin = resinds[0]
611  send = resinds[-1]
612  else: # residue
613  sbegin = send = p.get_index()
614  xyzr = IMP.core.XYZR(p)
615  xyz = xyzr.get_coordinates()
616  yield ihm.model.Sphere(
617  asym_unit=asym, seq_id_range=(sbegin, send),
618  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius())
Select non water and non hydrogen atoms.
Definition: pdb.h:248
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:158
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:562
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:16
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
Track creation of a system fragment by combination.
Definition: provenance.h:280
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:238
Track creation of a system fragment by filtering.
Definition: provenance.h:340
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:137
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
Store info for a chain of a protein.
Definition: Chain.h:61
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: provenance.h:584
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:192
Track creation of a system fragment from running a script.
Definition: provenance.h:511
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27