IMP logo
IMP Reference Guide  2.13.0
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 from __future__ import print_function
6 import IMP.atom
7 import ihm.location
8 import ihm.metadata
9 import ihm.startmodel
10 import ihm.analysis
11 import ihm.protocol
12 import ihm.model
13 
14 def get_molecule(h):
15  """Given a Hierarchy, walk up and find the parent Molecule"""
16  while h:
18  return IMP.atom.Molecule(h)
19  h = h.get_parent()
20  return None
21 
22 
23 class _EntityMapper(dict):
24  """Handle mapping from IMP chains to CIF entities.
25  Multiple components may map to the same entity if they share sequence."""
26  def __init__(self, system):
27  self.system = system
28  super(_EntityMapper, self).__init__()
29  self._sequence_dict = {}
30  self._entities = []
31 
32  def add(self, chain):
33  # todo: handle non-protein sequences
34  sequence = chain.get_sequence()
35  if sequence == '':
36  raise ValueError("Chain %s has no sequence" % chain)
37  if sequence not in self._sequence_dict:
38  entity = ihm.Entity(sequence)
39  self.system.entities.append(entity)
40  self._entities.append(entity)
41  self._sequence_dict[sequence] = entity
42  self[chain] = self._sequence_dict[sequence]
43  return self[chain]
44 
45  def get_all(self):
46  """Yield all entities"""
47  return self._entities
48 
49 
50 def _assign_id(obj, seen_objs, obj_by_id):
51  """Assign a unique ID to obj, and track all ids in obj_by_id."""
52  if obj not in seen_objs:
53  if not hasattr(obj, 'id'):
54  obj_by_id.append(obj)
55  obj.id = len(obj_by_id)
56  seen_objs[obj] = obj.id
57  else:
58  obj.id = seen_objs[obj]
59 
60 
61 class _Component(object):
62  """An mmCIF component. This is an instance of an _Entity. Multiple
63  _Components may map to the same _Entity but must have unique
64  asym_ids. A _Component is similar to an IMP Chain but multiple
65  Chains may map to the same _Component (the Chains represent the
66  same structure, just in different states, and potentially in
67  different IMP Models). A _Component may also represent something
68  that is described by an experiment but was not modeled by IMP, and
69  so no Chains map to it but a string name does."""
70  def __init__(self, entity, asym_id, name):
71  self.entity, self.asym_id, self.name = entity, asym_id, name
72 
73 
74 class _ComponentMapper(object):
75  """Handle mapping from chains to CIF components."""
76  def __init__(self, system):
77  super(_ComponentMapper, self).__init__()
78  self.system = system
79  self._used_entities = set()
80  self._all_components = []
81  self._all_modeled_components = []
82  self._map = {}
83 
84  def __getitem__(self, chain):
85  modeled, asym_id, map_key, name = self._handle_chain(chain)
86  return self._map[map_key]
87 
88  def _handle_chain(self, chain):
89  if isinstance(chain, IMP.atom.Chain):
90  modeled = True
91  mol = get_molecule(chain)
92  asym_id = map_key = chain.get_id()
93  name = mol.get_name() if mol else None
94  else:
95  modeled = False
96  asym_id = None
97  name = map_key = chain.name
98  return modeled, asym_id, map_key, name
99 
100  def add(self, chain, entity):
101  """Add a chain (either an IMP Chain object for a modeled component,
102  or a NonModeledChain object for a non-modeled component)"""
103  modeled, asym_id, map_key, name = self._handle_chain(chain)
104  if map_key not in self._map:
105  component = _Component(entity, asym_id, name)
106  if entity not in self._used_entities:
107  self._used_entities.add(entity)
108  # Assign entity name from the component; strip out anything
109  # after a @ or .
110  entity.description = component.name.split("@")[0].split(".")[0]
111  self._all_components.append(component)
112  if modeled:
113  asym = ihm.AsymUnit(entity, name, id=asym_id)
114  self.system.asym_units.append(asym)
115  component.asym_unit = asym
116  self._all_modeled_components.append(component)
117  self._map[map_key] = component
118  else:
119  component = self._map[map_key]
120  if component.entity != entity:
121  raise ValueError("Two chains have the same ID (%s) but "
122  "different sequences - rename one of the "
123  "chains" % map_key)
124  return component
125 
126  def get_all(self):
127  """Get all components"""
128  return self._all_components
129 
130  def get_all_modeled(self):
131  """Get all modeled components"""
132  return self._all_modeled_components
133 
134 
135 class _RepSegmentFactory(object):
136  """Make ihm.representation.Segment objects for each set of contiguous
137  particles with the same representation"""
138  def __init__(self, component):
139  self.component = component
140  self.particles = []
141  self.residue_range = () # inclusive range
142 
143  def add(self, particle, starting_model):
144  """Add a new particle to the last segment (and return None).
145  Iff the particle could not be added, return the segment and start
146  a new one."""
147  resrange, rigid_body, is_res = self._get_particle_info(particle)
148  def start_new_segment():
149  self.particles = [particle]
150  self.residue_range = resrange
151  self.rigid_body = rigid_body
152  self.is_res = is_res
153  self.starting_model = starting_model
154  if not self.particles:
155  # First particle in a segment
156  start_new_segment()
157  elif type(particle) == type(self.particles[0]) \
158  and is_res == self.is_res \
159  and resrange[0] == self.residue_range[1] + 1 \
160  and starting_model == self.starting_model \
161  and self._same_rigid_body(rigid_body):
162  # Continue an existing segment
163  self.particles.append(particle)
164  self.residue_range = (self.residue_range[0], resrange[1])
165  else:
166  # Make a new segment
167  seg = self.get_last()
168  start_new_segment()
169  return seg
170 
171  def get_last(self):
172  """Return the last segment, or None"""
173  if self.particles:
174  asym = self.component.asym_unit(*self.residue_range)
175  if self.is_res:
176  return ihm.representation.ResidueSegment(
177  asym_unit=asym,
178  rigid=self.rigid_body is not None, primitive='sphere',
179  starting_model=self.starting_model)
180  else:
181  return ihm.representation.FeatureSegment(
182  asym_unit=asym,
183  rigid=self.rigid_body is not None, primitive='sphere',
184  count=len(self.particles),
185  starting_model=self.starting_model)
186 
187  def _same_rigid_body(self, rigid_body):
188  # Note: can't just use self.rigid_body == rigid_body as IMP may
189  # crash when comparing a RigidBody object against None
190  if self.rigid_body is None and rigid_body is None:
191  return True
192  elif self.rigid_body is None or rigid_body is None:
193  return False
194  else:
195  return self.rigid_body == rigid_body
196 
197  def _get_particle_info(self, p):
198  # Note that we consider nonrigid members to not be rigid here
200  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
201  else:
202  rigid_body = None
203  if isinstance(p, IMP.atom.Residue):
204  return (p.get_index(), p.get_index()), rigid_body, True
205  elif isinstance(p, IMP.atom.Fragment):
206  resinds = p.get_residue_indexes()
207  return (resinds[0], resinds[-1]), rigid_body, False
208  raise TypeError("Unknown particle ", p)
209 
210 
211 def _get_all_structure_provenance(p):
212  """Yield all StructureProvenance decorators for the given particle."""
214 
215 
216 class _StartingModelAtomHandler(object):
217  def __init__(self, templates, asym):
218  self._seq_dif = []
219  self._last_res_index = None
220  self.templates = templates
221  self.asym = asym
222 
223  def _residue_first_atom(self, res):
224  """Return True iff we're looking at the first atom in this residue"""
225  # Only add one seq_dif record per residue
226  ind = res.get_index()
227  if ind != self._last_res_index:
228  self._last_res_index = ind
229  return True
230 
231  def handle_residue(self, res, comp_id, seq_id, offset):
232  res_name = res.get_residue_type().get_string()
233  # MSE in the original PDB is automatically mutated
234  # by IMP to MET, so reflect that in the output,
235  # and pass back to populate the seq_dif category.
236  if res_name == 'MSE' and comp_id == 'MET':
237  if self._residue_first_atom(res):
238  # This should only happen when we're using
239  # a crystal structure as the source (a
240  # comparative model would use MET in
241  # the sequence)
242  assert(len(self.templates) == 0)
243  self._seq_dif.append(ihm.startmodel.MSESeqDif(
244  res.get_index(), seq_id))
245  elif res_name != comp_id:
246  if self._residue_first_atom(res):
247  print("WARNING: Starting model residue %s does not match "
248  "that in the output model (%s) for chain %s residue %d. "
249  "Check offset (currently %d)."
250  % (res_name, comp_id, self.asym._id, seq_id, offset))
251  self._seq_dif.append(ihm.startmodel.SeqDif(
252  db_seq_id=res.get_index(), seq_id=seq_id,
253  db_comp_id=res_name,
254  details="Mutation of %s to %s" % (res_name, comp_id)))
255 
256  def get_ihm_atoms(self, particles, offset):
257  for a in particles:
258  coord = IMP.core.XYZ(a).get_coordinates()
259  atom = IMP.atom.Atom(a)
260  element = atom.get_element()
261  element = IMP.atom.get_element_table().get_name(element)
262  atom_name = atom.get_atom_type().get_string()
263  het = atom_name.startswith('HET:')
264  if het:
265  atom_name = atom_name[4:]
266  res = IMP.atom.get_residue(atom)
267 
268  seq_id = res.get_index() + offset
269  comp_id = self.asym.entity.sequence[seq_id-1].id
270  self.handle_residue(res, comp_id, seq_id, offset)
271  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
272  atom_id=atom_name, type_symbol=element,
273  x=coord[0], y=coord[1], z=coord[2],
274  het=het, biso=atom.get_temperature_factor())
275 
276 
277 class _StartingModel(ihm.startmodel.StartingModel):
278  _eq_keys = ['filename', 'asym_id', 'offset']
279 
280  def __init__(self, asym_unit, struc_prov):
281  self.filename = struc_prov[0].get_filename()
282  super(_StartingModel, self).__init__(
283  asym_unit=asym_unit(0,0), # will update in _add_residue()
284  dataset=None, # will fill in later with _set_sources_datasets()
285  asym_id=struc_prov[0].get_chain_id(),
286  offset=struc_prov[0].get_residue_offset())
287 
288  def _add_residue(self, resind):
289  # Update seq_id_range to accommodate this residue
290  seq_id_end = resind + self.offset
291  seq_id_begin = self.asym_unit.seq_id_range[0]
292  if seq_id_begin == 0:
293  seq_id_begin = seq_id_end
294  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
295 
296  # Two starting models with same filename, chain ID, and offset
297  # compare identical
298  # note: this results in separate starting models if only the offset differs;
299  # maybe consolidate into one?
300  def _eq_vals(self):
301  return tuple([self.__class__]
302  + [getattr(self, x) for x in self._eq_keys])
303  def __eq__(self, other):
304  return other is not None and self._eq_vals() == other._eq_vals()
305  def __hash__(self):
306  return hash(self._eq_vals())
307 
308  def _set_sources_datasets(self, system):
309  # Attempt to identify PDB file vs. comparative model
310  p = ihm.metadata.PDBParser()
311  r = p.parse_file(self.filename)
312  system.system.software.extend(r['software'])
313  dataset = system.datasets.add(r['dataset'])
314  # We only want the templates that model the starting model chain
315  templates = r['templates'].get(self.asym_id, [])
316  for t in templates:
317  if t.alignment_file:
318  system.system.locations.append(t.alignment_file)
319  if t.dataset:
320  system.datasets.add(t.dataset)
321  self.dataset = dataset
322  self.templates = templates
323  self.metadata = r['metadata']
324 
325  def _read_coords(self):
326  """Read the coordinates for this starting model"""
327  m = IMP.Model()
328  # todo: support reading other subsets of the atoms (e.g. CA/CB)
329  slt = IMP.atom.ChainPDBSelector(self.asym_id) \
331  hier = IMP.atom.read_pdb(self.filename, m, slt)
332  rng = self.asym_unit.seq_id_range
333  sel = IMP.atom.Selection(hier,
334  residue_indexes=list(range(rng[0] - self.offset,
335  rng[1] + 1 - self.offset)))
336  return m, sel
337 
338  def get_seq_dif(self):
339  return self._seq_dif # filled in by get_atoms()
340 
341  def get_atoms(self):
342  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
343  m, sel = self._read_coords()
344  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
345  yield a
346  self._seq_dif = mh._seq_dif
347 
348 
349 class _StartingModelFinder(object):
350  """Map IMP particles to starting model objects"""
351  def __init__(self, component, existing_starting_models):
352  self._seen_particles = {}
353  self._component = component
354  self._seen_starting_models = dict.fromkeys(existing_starting_models)
355 
356  def find(self, particle, system):
357  """Return a StartingModel object, or None, for this particle"""
358  def _get_starting_model(sp, resind):
359  s = _StartingModel(self._component.asym_unit, sp)
360  if s not in self._seen_starting_models:
361  self._seen_starting_models[s] = s
362  s._set_sources_datasets(system)
363  system.system.orphan_starting_models.append(s)
364  s = self._seen_starting_models[s]
365  if s:
366  s._add_residue(resind)
367  return s
368  resind = None
369  if IMP.atom.Residue.get_is_setup(particle):
370  resind = IMP.atom.Residue(particle).get_index()
371  sp = list(_get_all_structure_provenance(particle))
372  if sp:
373  return _get_starting_model(sp, resind)
374  elif IMP.atom.Hierarchy.get_is_setup(particle):
375  h = IMP.atom.Hierarchy(particle).get_parent()
376  # Remember all nodes we inspect
377  seen_parents = []
378  while h:
380  resind = IMP.atom.Residue(h).get_index()
381  pi = h.get_particle_index()
382  seen_parents.append(pi)
383  # If we inspected this node before, return the cached result
384  if pi in self._seen_particles:
385  sp = self._seen_particles[pi]
386  if sp and sp[0] and resind is not None:
387  sp[0]._add_residue(resind)
388  return sp[0] if sp else None
389  else:
390  sp = list(_get_all_structure_provenance(h))
391  self._seen_particles[pi] = []
392  if sp:
393  s = _get_starting_model(sp, resind)
394  # Set cache for this node and all the children we
395  # inspected on the way up
396  for spi in seen_parents:
397  self._seen_particles[spi].append(s)
398  return s
399  h = h.get_parent()
400 
401 
402 class _Datasets(object):
403  """Store all datasets used."""
404  def __init__(self, system):
405  super(_Datasets, self).__init__()
406  self._datasets = {}
407  self.system = system
408 
409  def add(self, d):
410  """Add and return a new dataset."""
411  if d not in self._datasets:
412  self._datasets[d] = d
413  self.system.orphan_datasets.append(d)
414  return self._datasets[d]
415 
416  def get_all(self):
417  """Yield all datasets"""
418  return self._datasets.keys()
419 
420 
421 class _AllSoftware(object):
422  """Keep track of all Software objects."""
423  def __init__(self, system):
424  self.system = system
425  super(_AllSoftware, self).__init__()
426 
427  def add_hierarchy(self, h):
428  # todo: if no SoftwareProvenance available, use RMF producer field
429  for p in IMP.core.get_all_provenance(h,
431  self.system.software.append(
432  ihm.Software(name=p.get_software_name(),
433  classification='integrative model building',
434  description=None,
435  version=p.get_version(),
436  location=p.get_location()))
437 
438 
439 class _ExternalFiles(object):
440  """Track all externally-referenced files
441  (i.e. anything that refers to a Location that isn't
442  a DatabaseLocation)."""
443  def __init__(self, system):
444  self.system = system
445 
446  def add(self, location):
447  """Add a new externally-referenced file.
448  Note that ids are assigned later."""
449  self.system.locations.append(location)
450 
451  def add_hierarchy(self, h):
452  # Add all Python scripts that were used in the modeling
453  for p in IMP.core.get_all_provenance(h,
454  types=[IMP.core.ScriptProvenance]):
455  # todo: set details
456  l = ihm.location.WorkflowFileLocation(path=p.get_filename(),
457  details='Integrative modeling Python script')
458  self.add(l)
459 
460 
461 class _ProtocolStep(ihm.protocol.Step):
462  """A single step (e.g. sampling, refinement) in a protocol."""
463  def __init__(self, prov, num_models_begin, assembly):
464  method = prov.get_method()
465  if prov.get_number_of_replicas() > 1:
466  method = "Replica exchange " + method
467  super(_ProtocolStep, self).__init__(
468  assembly=assembly,
469  # todo: fill in useful value for dataset_group
470  dataset_group=None,
471  method=method, name='Sampling',
472  num_models_begin=num_models_begin,
473  num_models_end=prov.get_number_of_frames(),
474  # todo: support multiple states, time ordered
475  multi_state=False, ordered=False,
476  # todo: revisit assumption all models are multiscale
477  multi_scale=True)
478 
479  def add_combine(self, prov):
480  self.num_models_end = prov.get_number_of_frames()
481  return self.num_models_end
482 
483 
484 class _Protocol(ihm.protocol.Protocol):
485  """A modeling protocol.
486  Each protocol consists of a number of protocol steps (e.g. sampling,
487  refinement) followed by a number of postprocessing steps (e.g.
488  filtering, rescoring, clustering)"""
489 
490  def add_step(self, prov, num_models, assembly):
491  if isinstance(prov, IMP.core.CombineProvenance):
492  # Fold CombineProvenance into a previous sampling step
493  if len(self.steps) == 0:
494  raise ValueError("CombineProvenance with no previous sampling")
495  return self.steps[-1].add_combine(prov)
496  else:
497  ps = _ProtocolStep(prov, num_models, assembly)
498  self.steps.append(ps)
499  return ps.num_models_end
500 
501  def add_postproc(self, prov, num_models, assembly):
502  if not self.analyses:
503  self.analyses.append(ihm.analysis.Analysis())
504  if isinstance(prov, IMP.core.FilterProvenance):
505  pp = ihm.analysis.FilterStep(feature='energy/score',
506  assembly=assembly, num_models_begin=num_models,
507  num_models_end = prov.get_number_of_frames())
508  elif isinstance(prov, IMP.core.ClusterProvenance):
509  # Assume clustering uses all models
510  pp = ihm.analysis.ClusterStep(feature='RMSD',
511  assembly=assembly, num_models_begin=num_models,
512  num_models_end=num_models)
513  else:
514  raise ValueError("Unhandled provenance", prov)
515  self.analyses[-1].steps.append(pp)
516  return pp.num_models_end
517 
518 
519 class _Protocols(object):
520  """Track all modeling protocols used."""
521  def __init__(self, system):
522  self.system = system
523 
524  def _add_protocol(self, prot):
525  self.system.orphan_protocols.append(prot)
526 
527  def _add_hierarchy(self, h, modeled_assembly):
528  num_models = 0 # assume we always start with no models
531  in_postproc = False
532  prot = _Protocol()
533  for p in reversed(list(IMP.core.get_all_provenance(
534  h, types=prot_types + pp_types))):
535  if isinstance(p, pp_types):
536  num_models = prot.add_postproc(p, num_models, modeled_assembly)
537  in_postproc = True
538  else:
539  if in_postproc:
540  # Start a new protocol
541  self._add_protocol(prot)
542  prot = _Protocol()
543  num_models = prot.add_step(p, num_models, modeled_assembly)
544  in_postproc = False
545  if len(prot.steps) > 0:
546  self._add_protocol(prot)
547 
548 
549 class _Model(ihm.model.Model):
550  def __init__(self, frame, state):
551  self._frame = frame
552  self.state = state # state already a weakproxy
553  super(_Model, self).__init__(
554  assembly=state.modeled_assembly,
555  # todo: add protocol, support multiple representations
556  protocol=None, representation=self.state.system.representation,
557  name=frame.name)
558 
559  def _get_structure_particles(self):
560  self.state._load_frame(self._frame)
561  for h in self.state.hiers:
562  chains = [IMP.atom.Chain(c)
563  for c in IMP.atom.get_by_type(h, IMP.atom.CHAIN_TYPE)]
564  for chain in chains:
565  asym = self.state.system.components[chain].asym_unit
566  for p in self.state.system._get_structure_particles(chain):
567  yield asym, p
568 
569  def get_spheres(self):
570  # todo: support atomic output too
571  for asym, p in self._get_structure_particles():
572  if isinstance(p, IMP.atom.Fragment):
573  resinds = p.get_residue_indexes()
574  # todo: handle non-contiguous fragments
575  sbegin = resinds[0]
576  send = resinds[-1]
577  else: # residue
578  sbegin = send = p.get_index()
579  xyzr = IMP.core.XYZR(p)
580  xyz = xyzr.get_coordinates()
581  yield ihm.model.Sphere(asym_unit=asym, seq_id_range=(sbegin, send),
582  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius())
Select non water and non hydrogen atoms.
Definition: pdb.h:245
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:156
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:516
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:14
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
Track creation of a system fragment by combination.
Definition: provenance.h:280
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:72
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:234
Track creation of a system fragment by filtering.
Definition: provenance.h:340
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:135
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
Store info for a chain of a protein.
Definition: Chain.h:61
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:189
Track creation of a system fragment from running a script.
Definition: provenance.h:465
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27