IMP logo
IMP Reference Guide  develop.45c11de31d,2024/03/27
The Integrative Modeling Platform
data.py
1 """@namespace IMP.mmcif.data
2  @brief Classes to represent data structures used in mmCIF.
3 """
4 
5 from __future__ import print_function
6 import IMP.atom
7 import ihm.location
8 import ihm.metadata
9 import ihm.startmodel
10 import ihm.analysis
11 import ihm.protocol
12 import ihm.model
13 import ihm.citations
14 import ihm.reference
15 import operator
16 import inspect
17 
18 
19 # Map from IMP ResidueType to ihm ChemComp
20 _imp_to_ihm = {}
21 
22 
23 def _fill_imp_to_ihm():
24  d = dict(x for x in inspect.getmembers(IMP.atom)
25  if isinstance(x[1], IMP.atom.ResidueType))
26  # Handle standard amino acids plus some extras like MSE, UNK
27  for comp in ihm.LPeptideAlphabet._comps.values():
28  if comp.id in d:
29  _imp_to_ihm[d[comp.id]] = comp
30  # Handle RNA and DNA
31  alpha = ihm.RNAAlphabet()
32  for code in ['ADE', 'CYT', 'GUA', 'URA']:
33  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
34  alpha = ihm.DNAAlphabet()
35  for code in ['DADE', 'DCYT', 'DGUA', 'DTHY']:
36  _imp_to_ihm[d[code]] = alpha[d[code].get_string()]
37  # Pass through missing IMP residue
38  _imp_to_ihm[None] = None
39 
40 
41 _fill_imp_to_ihm()
42 
43 
44 def get_molecule(h):
45  """Given a Hierarchy, walk up and find the parent Molecule"""
46  while h:
48  return IMP.atom.Molecule(h)
49  h = h.get_parent()
50  return None
51 
52 
53 def _check_sequential(fragment, resinds):
54  for i in range(1, len(resinds)):
55  if resinds[i - 1] + 1 != resinds[i]:
56  raise ValueError(
57  "%s: non-sequential residue indices are not supported"
58  % str(fragment))
59 
60 
61 def _get_all_state_provenance(state_h, top_h, types):
62  """Yield all provenance information for the given state.
63  If the given State Hierarchy node contains no provenance information,
64  fall back to any provenance information for the top-level node
65  (if provided)."""
66  count = 0
67  for p in IMP.core.get_all_provenance(state_h, types=types):
68  count += 1
69  yield p
70  if count == 0 and top_h is not None:
71  for p in IMP.core.get_all_provenance(top_h, types=types):
72  yield p
73 
74 
75 class _CustomDNAAlphabet(ihm.Alphabet):
76  """Custom DNA alphabet that maps A,C,G,T (rather than DA,DC,DG,DT
77  as in python-ihm)"""
78  _comps = dict([cc.code_canonical, cc]
79  for cc in ihm.DNAAlphabet._comps.values())
80 
81 
82 class _EntityMapper(dict):
83  """Handle mapping from IMP chains to CIF entities.
84  Multiple components may map to the same entity if they
85  share sequence."""
86  def __init__(self, system):
87  self.system = system
88  super(_EntityMapper, self).__init__()
89  self._sequence_dict = {}
90  self._entities = []
91  self._alphabet_map = {
92  IMP.atom.UnknownChainType: ihm.LPeptideAlphabet,
93  IMP.atom.Protein: ihm.LPeptideAlphabet,
94  IMP.atom.RNA: ihm.RNAAlphabet,
95  IMP.atom.DNA: _CustomDNAAlphabet}
96 
97  def _get_sequence_from_residues(self, chain, seq_from_res):
98  seq_id_begin, seq = seq_from_res
99  if not seq:
100  raise ValueError("Chain %s has no sequence and no residues"
101  % chain)
102  missing_seq = [ind + seq_id_begin
103  for (ind, res) in enumerate(seq) if res is None]
104  if missing_seq:
105  raise ValueError(
106  "Chain %s has no declared sequence; tried to determine the "
107  "sequence from Residues, but the following residue indices "
108  "have no residue type (perhaps covered only by Fragments): %s"
109  % (chain, str(missing_seq)))
110  return seq_id_begin - 1, tuple(seq)
111 
112  def add(self, chain, seq_from_res=None):
113  sequence = chain.get_sequence()
114  offset = chain.get_sequence_offset()
115  if sequence == '':
116  if seq_from_res is not None:
117  offset, sequence = self._get_sequence_from_residues(
118  chain, seq_from_res)
119  else:
120  raise ValueError("Chain %s has no sequence" % chain)
121  else:
122  # Map one-letter codes to ihm.ChemComp
123  alphabet = self._alphabet_map[chain.get_chain_type()]()
124  sequence = tuple(alphabet[x] for x in sequence)
125  if sequence not in self._sequence_dict:
126  entity = ihm.Entity(sequence)
127  self.system.entities.append(entity)
128  self._entities.append(entity)
129  self._sequence_dict[sequence] = entity
130  uniprot = chain.get_uniprot_accession()
131  if uniprot:
132  up = ihm.reference.UniProtSequence.from_accession(uniprot)
133  entity.references.append(up)
134  self[chain] = self._sequence_dict[sequence]
135  return self[chain], offset
136 
137  def get_all(self):
138  """Yield all entities"""
139  return self._entities
140 
141 
142 def _assign_id(obj, seen_objs, obj_by_id):
143  """Assign a unique ID to obj, and track all ids in obj_by_id."""
144  if obj not in seen_objs:
145  if not hasattr(obj, 'id'):
146  obj_by_id.append(obj)
147  obj.id = len(obj_by_id)
148  seen_objs[obj] = obj.id
149  else:
150  obj.id = seen_objs[obj]
151 
152 
153 class _Component(object):
154  """An mmCIF component. This is an instance of an _Entity. Multiple
155  _Components may map to the same _Entity but must have unique
156  asym_ids. A _Component is similar to an IMP Chain but multiple
157  Chains may map to the same _Component (the Chains represent the
158  same structure, just in different states, and potentially in
159  different IMP Models). A _Component may also represent something
160  that is described by an experiment but was not modeled by IMP, and
161  so no Chains map to it but a string name does."""
162  def __init__(self, entity, asym_id, name):
163  self.entity, self.asym_id, self.name = entity, asym_id, name
164 
165 
166 class _ComponentMapper(object):
167  """Handle mapping from IMP Chains to CIF AsymUnits."""
168  def __init__(self, system):
169  super(_ComponentMapper, self).__init__()
170  self.system = system
171  self._used_entities = set()
172  self._all_components = []
173  self._map = {}
174 
175  def __getitem__(self, chain):
176  asym_id, map_key, name = self._handle_chain(chain)
177  return self._map[map_key]
178 
179  def _handle_chain(self, chain):
180  mol = get_molecule(chain)
181  asym_id = chain.get_id()
182  name = mol.get_name() if mol else None
183  # Avoid conflict between name="A" and asym_id="A"
184  if name:
185  map_key = "name", name
186  else:
187  map_key = "asym_id", asym_id
188  return asym_id, map_key, name
189 
190  def add(self, chain, entity, offset):
191  """Add a chain (an IMP Chain object)"""
192  asym_id, map_key, name = self._handle_chain(chain)
193  if map_key not in self._map:
194  component = _Component(entity, asym_id, name)
195  if entity not in self._used_entities:
196  self._used_entities.add(entity)
197  # Assign entity name from the component; strip out anything
198  # after a @ or .
199  if component.name:
200  entity.description = \
201  component.name.split("@")[0].split(".")[0]
202  self._all_components.append(component)
203  asym = ihm.AsymUnit(entity, name, id=asym_id,
204  auth_seq_id_map=offset)
205  self.system.asym_units.append(asym)
206  component.asym_unit = asym
207  self._map[map_key] = component
208  else:
209  component = self._map[map_key]
210  if component.entity != entity:
211  raise ValueError("Two chains have the same ID (%s) but "
212  "different sequences - rename one of the "
213  "chains" % component.asym_unit.id)
214  if component.asym_unit.auth_seq_id_map != offset:
215  raise ValueError(
216  "Two chains have the same ID (%s) but different offsets "
217  "(%d, %d) - this is not supported"
218  % (component.asym_unit.id,
219  component.asym_unit.auth_seq_id_map, offset))
220  return component
221 
222  def get_all(self):
223  """Get all components"""
224  return self._all_components
225 
226 
227 class _RepSegmentFactory(object):
228  """Make ihm.representation.Segment objects for each set of contiguous
229  particles with the same representation"""
230  def __init__(self, asym):
231  self.asym = asym
232  # Offset from IHM to IMP numbering
233  self.offset = asym.auth_seq_id_map
234  self.particles = []
235  self.imp_residue_range = () # inclusive range, using IMP numbering
236 
237  def add(self, particle, starting_model):
238  """Add a new particle to the last segment (and return None).
239  Iff the particle could not be added, return the segment and start
240  a new one."""
241  (resrange, rigid_body,
242  is_res, is_atom) = self._get_particle_info(particle)
243 
244  def start_new_segment():
245  self.particles = [particle]
246  self.imp_residue_range = resrange
247  self.rigid_body = rigid_body
248  self.is_res = is_res
249  self.is_atom = is_atom
250  self.starting_model = starting_model
251  if not self.particles:
252  # First particle in a segment
253  start_new_segment()
254  elif (type(particle) == type(self.particles[0]) # noqa: E721
255  and is_res == self.is_res
256  and is_atom == self.is_atom
257  and resrange[0] <= self.imp_residue_range[1] + 1
258  and starting_model == self.starting_model
259  and self._same_rigid_body(rigid_body)):
260  # Continue an existing segment
261  self.particles.append(particle)
262  self.imp_residue_range = (self.imp_residue_range[0], resrange[1])
263  else:
264  # Make a new segment
265  seg = self.get_last()
266  start_new_segment()
267  return seg
268 
269  def get_last(self):
270  """Return the last segment, or None"""
271  if self.particles:
272  # Convert residue_range from IMP to IHM
273  asym = self.asym(self.imp_residue_range[0] - self.offset,
274  self.imp_residue_range[1] - self.offset)
275  if self.is_atom:
276  return ihm.representation.AtomicSegment(
277  asym_unit=asym, rigid=self.rigid_body is not None,
278  starting_model=self.starting_model)
279  elif self.is_res:
280  return ihm.representation.ResidueSegment(
281  asym_unit=asym,
282  rigid=self.rigid_body is not None, primitive='sphere',
283  starting_model=self.starting_model)
284  else:
285  return ihm.representation.FeatureSegment(
286  asym_unit=asym,
287  rigid=self.rigid_body is not None, primitive='sphere',
288  count=len(self.particles),
289  starting_model=self.starting_model)
290 
291  def _same_rigid_body(self, rigid_body):
292  # Note: can't just use self.rigid_body == rigid_body as IMP may
293  # crash when comparing a RigidBody object against None
294  if self.rigid_body is None and rigid_body is None:
295  return True
296  elif self.rigid_body is None or rigid_body is None:
297  return False
298  else:
299  return self.rigid_body == rigid_body
300 
301  def _get_particle_info(self, p):
302  # Note that we consider nonrigid members to not be rigid here
304  rigid_body = IMP.core.RigidMember(p).get_rigid_body()
305  else:
306  rigid_body = None
307  if isinstance(p, IMP.atom.Residue):
308  return (p.get_index(), p.get_index()), rigid_body, True, False
309  elif isinstance(p, IMP.atom.Atom):
310  res = IMP.atom.get_residue(p)
311  return (res.get_index(), res.get_index()), rigid_body, False, True
312  elif isinstance(p, IMP.atom.Fragment):
313  resinds = p.get_residue_indexes()
314  return (resinds[0], resinds[-1]), rigid_body, False, False
315  raise TypeError("Unknown particle ", p)
316 
317 
318 def _get_all_structure_provenance(p):
319  """Yield all StructureProvenance decorators for the given particle."""
321 
322 
323 class _StartingModelAtomHandler(object):
324  def __init__(self, templates, asym):
325  self._seq_dif = []
326  self._last_res_index = None
327  self.templates = templates
328  self.asym = asym
329 
330  def _residue_first_atom(self, res):
331  """Return True iff we're looking at the first atom in this residue"""
332  # Only add one seq_dif record per residue
333  ind = res.get_index()
334  if ind != self._last_res_index:
335  self._last_res_index = ind
336  return True
337 
338  def handle_residue(self, res, comp_id, seq_id, offset):
339  res_name = res.get_residue_type().get_string()
340  # MSE in the original PDB is automatically mutated
341  # by IMP to MET, so reflect that in the output,
342  # and pass back to populate the seq_dif category.
343  if res_name == 'MSE' and comp_id == 'MET':
344  if self._residue_first_atom(res):
345  # This should only happen when we're using
346  # a crystal structure as the source (a
347  # comparative model would use MET in
348  # the sequence)
349  assert len(self.templates) == 0
350  self._seq_dif.append(ihm.startmodel.MSESeqDif(
351  res.get_index(), seq_id))
352  elif res_name != comp_id:
353  if self._residue_first_atom(res):
354  print("WARNING: Starting model residue %s does not match "
355  "that in the output model (%s) for chain %s residue %d. "
356  "Check offset (currently %d)."
357  % (res_name, comp_id, self.asym._id, seq_id, offset))
358  self._seq_dif.append(ihm.startmodel.SeqDif(
359  db_seq_id=res.get_index(), seq_id=seq_id,
360  db_comp_id=res_name,
361  details="Mutation of %s to %s" % (res_name, comp_id)))
362 
363  def get_ihm_atoms(self, particles, offset):
364  for a in particles:
365  coord = IMP.core.XYZ(a).get_coordinates()
366  atom = IMP.atom.Atom(a)
367  element = atom.get_element()
368  element = IMP.atom.get_element_table().get_name(element)
369  atom_name = atom.get_atom_type().get_string()
370  het = atom_name.startswith('HET:')
371  if het:
372  atom_name = atom_name[4:]
373  res = IMP.atom.get_residue(atom)
374 
375  seq_id = res.get_index() + offset
376  comp_id = self.asym.entity.sequence[seq_id-1].id
377  self.handle_residue(res, comp_id, seq_id, offset)
378  yield ihm.model.Atom(asym_unit=self.asym, seq_id=seq_id,
379  atom_id=atom_name, type_symbol=element,
380  x=coord[0], y=coord[1], z=coord[2],
381  het=het, biso=atom.get_temperature_factor())
382 
383 
384 class _StartingModel(ihm.startmodel.StartingModel):
385  _eq_keys = ['filename', 'asym_id', 'offset']
386 
387  def __init__(self, asym_unit, struc_prov):
388  self.filename = struc_prov[0].get_filename()
389  super(_StartingModel, self).__init__(
390  asym_unit=asym_unit(0, 0), # will update in _add_residue()
391  # will fill in later with _set_sources_datasets()
392  dataset=None,
393  asym_id=struc_prov[0].get_chain_id(),
394  offset=struc_prov[0].get_residue_offset())
395 
396  def _add_residue(self, resind):
397  # Update seq_id_range to accommodate this residue
398  seq_id_end = resind
399  seq_id_begin = self.asym_unit.seq_id_range[0]
400  if seq_id_begin == 0:
401  seq_id_begin = seq_id_end
402  self.asym_unit = self.asym_unit.asym(seq_id_begin, seq_id_end)
403 
404  # Two starting models with same filename, chain ID, and offset
405  # compare identical
406  # note: this results in separate starting models if only the
407  # offset differs; maybe consolidate into one?
408  def _eq_vals(self):
409  return tuple([self.__class__]
410  + [getattr(self, x) for x in self._eq_keys])
411 
412  def __eq__(self, other):
413  return other is not None and self._eq_vals() == other._eq_vals()
414 
415  def __hash__(self):
416  return hash(self._eq_vals())
417 
418  def _set_sources_datasets(self, system, datasets):
419  # Attempt to identify PDB file vs. comparative model
420  if (hasattr(ihm.metadata, 'CIFParser')
421  and self.filename.endswith('.cif')):
422  p = ihm.metadata.CIFParser()
423  else:
424  p = ihm.metadata.PDBParser()
425  r = p.parse_file(self.filename)
426  system.software.extend(r.get('software', []))
427  dataset = datasets.add(r['dataset'])
428  # We only want the templates that model the starting model chain
429  templates = r.get('templates', {}).get(self.asym_id, [])
430  for t in templates:
431  if t.alignment_file:
432  system.locations.append(t.alignment_file)
433  if t.dataset:
434  datasets.add(t.dataset)
435  self.dataset = dataset
436  self.templates = templates
437  self.metadata = r.get('metadata', [])
438 
439  def _read_coords(self):
440  """Read the coordinates for this starting model"""
441  m = IMP.Model()
442  # todo: support reading other subsets of the atoms (e.g. CA/CB)
443  slt = IMP.atom.ChainPDBSelector([self.asym_id]) \
445  hier = IMP.atom.read_pdb_or_mmcif(self.filename, m, slt)
446  rng = self.asym_unit.seq_id_range
447  sel = IMP.atom.Selection(
448  hier, residue_indexes=list(range(rng[0] - self.offset,
449  rng[1] + 1 - self.offset)))
450  return m, sel
451 
452  def get_seq_dif(self):
453  return self._seq_dif # filled in by get_atoms()
454 
455  def get_atoms(self):
456  mh = _StartingModelAtomHandler(self.templates, self.asym_unit)
457  m, sel = self._read_coords()
458  for a in mh.get_ihm_atoms(sel.get_selected_particles(), self.offset):
459  yield a
460  self._seq_dif = mh._seq_dif
461 
462 
463 class _StartingModelFinder(object):
464  """Map IMP particles to starting model objects"""
465  def __init__(self, asym, existing_starting_models, system, datasets):
466  self._seen_particles = {}
467  self._asym = asym
468  self._seen_starting_models = {}
469  for sm in existing_starting_models:
470  self._seen_starting_models[sm] = sm
471  self._system = system
472  self._datasets = datasets
473 
474  def find(self, particle):
475  """Return a StartingModel object, or None, for this particle"""
476  def _get_starting_model(sp, resind):
477  s = _StartingModel(self._asym, sp)
478  if s not in self._seen_starting_models:
479  self._seen_starting_models[s] = s
480  s._set_sources_datasets(self._system, self._datasets)
481  self._system.orphan_starting_models.append(s)
482  s = self._seen_starting_models[s]
483  if s:
484  s._add_residue(resind)
485  return s
486  resind = None
487  if IMP.atom.Residue.get_is_setup(particle):
488  resind = IMP.atom.Residue(particle).get_index()
489  sp = list(_get_all_structure_provenance(particle))
490  if sp:
491  return _get_starting_model(sp, resind)
492  elif IMP.atom.Hierarchy.get_is_setup(particle):
493  h = IMP.atom.Hierarchy(particle).get_parent()
494  # Remember all nodes we inspect
495  seen_parents = []
496  while h:
498  resind = IMP.atom.Residue(h).get_index()
499  pi = h.get_particle_index()
500  seen_parents.append(pi)
501  # If we inspected this node before, return the cached result
502  if pi in self._seen_particles:
503  sp = self._seen_particles[pi]
504  if sp and sp[0] and resind is not None:
505  sp[0]._add_residue(resind)
506  return sp[0] if sp else None
507  else:
508  sp = list(_get_all_structure_provenance(h))
509  self._seen_particles[pi] = []
510  if sp:
511  s = _get_starting_model(sp, resind)
512  # Set cache for this node and all the children we
513  # inspected on the way up
514  for spi in seen_parents:
515  self._seen_particles[spi].append(s)
516  return s
517  h = h.get_parent()
518 
519 
520 class _Datasets(object):
521  """Store all datasets used."""
522  def __init__(self, system):
523  super(_Datasets, self).__init__()
524  self._datasets = {}
525  self._groups = {}
526  self.system = system
527 
528  def add(self, d):
529  """Add and return a new dataset."""
530  if d not in self._datasets:
531  self._datasets[d] = d
532  self.system.orphan_datasets.append(d)
533  return self._datasets[d]
534 
535  def add_group(self, datasets, name):
536  """Add and return a new group of datasets"""
537  seen = set()
538  # Remove duplicates
539  d = []
540  for dataset in datasets:
541  if dataset not in seen:
542  d.append(dataset)
543  seen.add(dataset)
544  d = tuple(d)
545  if d not in self._groups:
546  g = ihm.dataset.DatasetGroup(d, name=name)
547  self._groups[d] = g
548  self.system.orphan_dataset_groups.append(g)
549  return self._groups[d]
550 
551  def get_all(self):
552  """Yield all datasets"""
553  return self._datasets.keys()
554 
555 
556 class _AllSoftware(object):
557  """Keep track of all Software objects."""
558 
559  # IMP/RMF doesn't store citation info for software, so provide it
560  # for known software packages
561  cites = {'Integrative Modeling Platform (IMP)': ihm.citations.imp,
562  'IMP PMI module': ihm.citations.pmi}
563 
564  def __init__(self, system):
565  self.system = system
566  self._by_namever = {}
567  super(_AllSoftware, self).__init__()
568 
569  def add_hierarchy(self, h, top_h=None):
570  # todo: if no SoftwareProvenance available, use RMF producer field
571  for p in _get_all_state_provenance(
572  h, top_h, types=[IMP.core.SoftwareProvenance]):
573  self._add_provenance(p)
574 
575  def _add_provenance(self, p):
576  """Add Software from SoftwareProvenance"""
577  # Only reference the same version of a given software package once
578  name = p.get_software_name()
579  version = p.get_version()
580  if (name, version) not in self._by_namever:
581  s = ihm.Software(name=name,
582  classification='integrative model building',
583  description=None, version=version,
584  location=p.get_location(),
585  citation=self.cites.get(name))
586  self.system.software.append(s)
587  self._by_namever[name, version] = s
588  return self._by_namever[name, version]
589 
590  def _add_previous_provenance(self, prov):
591  """Add Software from a previous SoftwareProvenance, if any"""
592  while prov:
594  return self._add_provenance(IMP.core.SoftwareProvenance(prov))
595  prov = prov.get_previous()
596 
597 
598 class _ExternalFiles(object):
599  """Track all externally-referenced files
600  (i.e. anything that refers to a Location that isn't
601  a DatabaseLocation)."""
602  def __init__(self, system):
603  self.system = system
604  self._by_path = {}
605 
606  def add_hierarchy(self, h, top_h=None):
607  # Add all Python scripts that were used in the modeling
608  for p in _get_all_state_provenance(
609  h, top_h, types=[IMP.core.ScriptProvenance]):
610  self._add_provenance(p)
611 
612  def _add_provenance(self, p):
613  """Add external file from ScriptProvenance"""
614  # Only reference the same path once
615  path = p.get_filename()
616  if path not in self._by_path:
617  loc = ihm.location.WorkflowFileLocation(
618  path=p.get_filename(),
619  details='Integrative modeling Python script')
620  self.system.locations.append(loc)
621  self._by_path[path] = loc
622  return self._by_path[path]
623 
624 
625 class _ProtocolStep(ihm.protocol.Step):
626  """A single step (e.g. sampling, refinement) in a protocol."""
627  def __init__(self, prov, num_models_begin, assembly, all_software):
628  method = prov.get_method()
629  if prov.get_number_of_replicas() > 1:
630  method = "Replica exchange " + method
631  super(_ProtocolStep, self).__init__(
632  assembly=assembly,
633  # todo: fill in useful value for dataset_group
634  dataset_group=None,
635  method=method, name='Sampling',
636  num_models_begin=num_models_begin,
637  num_models_end=prov.get_number_of_frames(),
638  # todo: support multiple states, time ordered
639  multi_state=False, ordered=False,
640  # todo: revisit assumption all models are multiscale
641  multi_scale=True,
642  software=all_software._add_previous_provenance(prov))
643 
644  def add_combine(self, prov):
645  self.num_models_end = prov.get_number_of_frames()
646  return self.num_models_end
647 
648 
649 class _Protocol(ihm.protocol.Protocol):
650  """A modeling protocol.
651  Each protocol consists of a number of protocol steps (e.g. sampling,
652  refinement) followed by a number of postprocessing steps (e.g.
653  filtering, rescoring, clustering)"""
654 
655  def add_step(self, prov, num_models, assembly, all_software):
656  if isinstance(prov, IMP.core.CombineProvenance):
657  # Fold CombineProvenance into a previous sampling step
658  if len(self.steps) == 0:
659  raise ValueError("CombineProvenance with no previous sampling")
660  return self.steps[-1].add_combine(prov)
661  else:
662  ps = _ProtocolStep(prov, num_models, assembly, all_software)
663  self.steps.append(ps)
664  return ps.num_models_end
665 
666  def add_postproc(self, prov, num_models, assembly):
667  if not self.analyses:
668  self.analyses.append(ihm.analysis.Analysis())
669  if isinstance(prov, IMP.core.FilterProvenance):
670  pp = ihm.analysis.FilterStep(
671  feature='energy/score', assembly=assembly,
672  num_models_begin=num_models,
673  num_models_end=prov.get_number_of_frames())
674  elif isinstance(prov, IMP.core.ClusterProvenance):
675  # Assume clustering uses all models
676  pp = ihm.analysis.ClusterStep(
677  feature='RMSD', assembly=assembly, num_models_begin=num_models,
678  num_models_end=num_models)
679  else:
680  raise ValueError("Unhandled provenance", prov)
681  self.analyses[-1].steps.append(pp)
682  return pp.num_models_end
683 
684 
685 class _Protocols(object):
686  """Track all modeling protocols used."""
687  def __init__(self, system):
688  self.system = system
689 
690  def _add_protocol(self, prot):
691  # Protocol isn't hashable or sortable, so just compare dicts
692  # with existing protocols. This should still be performant as
693  # we generally don't have more than one or two protocols.
694  # We exclude dataset_group from the comparison as this is typically
695  # filled in later.
696  def step_equal(x, y):
697  def get_dict(d):
698  return {x: y for x, y in d.__dict__.items()
699  if x != 'dataset_group'}
700 
701  return (type(x) == type(y) # noqa: E721
702  and get_dict(x) == get_dict(y))
703 
704  def analysis_equal(x, y):
705  return (len(x.steps) == len(y.steps)
706  and all(step_equal(a, b)
707  for (a, b) in zip(x.steps, y.steps)))
708 
709  for existing in self.system.orphan_protocols:
710  if (len(existing.steps) == len(prot.steps)
711  and len(existing.analyses) == len(prot.analyses)
712  and all(step_equal(x, y)
713  for (x, y) in zip(existing.steps, prot.steps))
714  and all(analysis_equal(x, y)
715  for (x, y) in zip(existing.analyses, prot.analyses))):
716  return existing
717  self.system.orphan_protocols.append(prot)
718  return prot
719 
720  def _add_hierarchy(self, h, top_h, modeled_assembly, all_software):
721  num_models = 0 # assume we always start with no models
724  in_postproc = False
725  prot = _Protocol()
726  for p in reversed(list(_get_all_state_provenance(
727  h, top_h, types=prot_types + pp_types))):
728  if isinstance(p, pp_types):
729  num_models = prot.add_postproc(p, num_models, modeled_assembly)
730  in_postproc = True
731  else:
732  if in_postproc:
733  # Start a new protocol
734  self._add_protocol(prot)
735  prot = _Protocol()
736  num_models = prot.add_step(p, num_models, modeled_assembly,
737  all_software)
738  in_postproc = False
739  if len(prot.steps) > 0:
740  return self._add_protocol(prot)
741 
742 
743 class _CoordinateHandler(object):
744  def __init__(self, system, datasets):
745  self._system = system
746  self._datasets = datasets
747  self._representation = ihm.representation.Representation()
748  # IHM atoms/spheres corresponding to IMP beads/residues/atoms
749  # We build them up front (rather than on the fly) as the original
750  # IMP objects may have been destroyed or changed (e.g. if we read
751  # multiple frames from an RMF file) by the time we write the mmCIF.
752  self._atoms = []
753  self._spheres = []
754 
755  def get_residue_sequence(self, ps):
756  """Determine the primary sequence based on Residue particles.
757  Return the index of the first residue and the sequence, as a list
758  of ihm.ChemComp objects (or None)"""
759  restyp = {}
760  for p in ps:
761  if isinstance(p, IMP.atom.Atom):
762  residue = IMP.atom.get_residue(p)
763  restyp[residue.get_index()] = residue.get_residue_type()
764  elif isinstance(p, IMP.atom.Residue):
765  restyp[p.get_index()] = p.get_residue_type()
766  else: # fragment
767  resinds = p.get_residue_indexes()
768  for ri in resinds:
769  if ri not in restyp: # don't overwrite residue/atom info
770  restyp[ri] = None
771  if not restyp:
772  return 1, []
773  seq_id_begin = min(restyp.keys())
774  seq_id_end = max(restyp.keys())
775  return (seq_id_begin,
776  [_imp_to_ihm[restyp.get(x)]
777  for x in range(seq_id_begin, seq_id_end + 1)])
778 
779  def add_chain(self, ps, asym):
780  def matches_asym(s):
781  # Match AsymUnit or AsymUnitRange
782  return s == asym or hasattr(s, 'asym') and s.asym == asym
783 
784  # Consolidate starting models if the same model was used for this
785  # asym in a different state or for a different model_id
786  smf = _StartingModelFinder(
787  asym, [s for s in self._system.orphan_starting_models
788  if matches_asym(s.asym_unit)],
789  self._system, self._datasets)
790  segfactory = _RepSegmentFactory(asym)
791  offset = asym.auth_seq_id_map
792  for p in ps:
793  starting_model = smf.find(p)
794  seg = segfactory.add(p, starting_model)
795  if seg:
796  self._representation.append(seg)
797  self._add_atom_or_sphere(p, asym, offset)
798  last = segfactory.get_last()
799  if last:
800  self._representation.append(last)
801 
802  def _add_atom_or_sphere(self, p, asym, offset):
803  if isinstance(p, IMP.atom.Atom):
804  residue = IMP.atom.get_residue(p)
805  xyz = IMP.core.XYZ(p).get_coordinates()
806  element = p.get_element()
807  element = IMP.atom.get_element_table().get_name(element)
808  atom_name = p.get_atom_type().get_string()
809  het = atom_name.startswith('HET:')
810  if het:
811  atom_name = atom_name[4:]
812  self._atoms.append(ihm.model.Atom(
813  asym_unit=asym, seq_id=residue.get_index() - offset,
814  atom_id=atom_name, type_symbol=element,
815  x=xyz[0], y=xyz[1], z=xyz[2], het=het,
816  biso=p.get_temperature_factor(),
817  occupancy=p.get_occupancy()))
818  else:
819  if isinstance(p, IMP.atom.Fragment):
820  resinds = p.get_residue_indexes()
821  sbegin = resinds[0]
822  send = resinds[-1]
823  else: # residue
824  sbegin = send = p.get_index()
825  xyzr = IMP.core.XYZR(p)
826  xyz = xyzr.get_coordinates()
827  self._spheres.append(ihm.model.Sphere(
828  asym_unit=asym, seq_id_range=(sbegin - offset, send - offset),
829  x=xyz[0], y=xyz[1], z=xyz[2], radius=xyzr.get_radius()))
830 
831  def get_structure_particles(self, h):
832  """Return particles sorted by residue index"""
833  ps = []
834  if h.get_number_of_children() == 0:
835  return []
836  if not h.get_is_valid():
837  raise ValueError("Invalid hierarchy as input")
838  for p in IMP.atom.Selection(
839  hierarchy=h, resolution=0.).get_selected_particles():
841  residue = IMP.atom.Residue(p)
842  ps.append((residue.get_index(), residue))
844  fragment = IMP.atom.Fragment(p)
845  resinds = fragment.get_residue_indexes()
846  _check_sequential(fragment, resinds)
847  resind = resinds[len(resinds) // 2]
848  ps.append((resind, fragment))
850  atom = IMP.atom.Atom(p)
851  residue = IMP.atom.get_residue(atom)
852  ps.append((residue.get_index(), atom))
853  return [p[1] for p in sorted(ps, key=operator.itemgetter(0))]
854 
855 
856 class _ModelAssemblies(object):
857  def __init__(self, system):
858  self.system = system
859  self._seen_assemblies = {}
860 
861  def add(self, asyms):
862  # list isn't hashable but tuple is
863  asyms = tuple(asyms)
864  if asyms not in self._seen_assemblies:
865  assembly = ihm.Assembly(
866  asyms, name="Modeled assembly",
867  description="All components modeled by IMP")
868  self.system.orphan_assemblies.append(assembly)
869  self._seen_assemblies[asyms] = assembly
870  return self._seen_assemblies[asyms]
871 
872 
873 class _Representations(object):
874  def __init__(self, system):
875  self.system = system
876 
877  def add(self, rep):
878  # Representation isn't hashable or sortable, so just compare dicts
879  # with existing representations. This should still be performant as
880  # we generally don't have more than one or two representations.
881  for existing in self.system.orphan_representations:
882  if (len(existing) == len(rep)
883  and all(type(x) == type(y) # noqa: E721
884  and x.__dict__ == y.__dict__
885  for (x, y) in zip(existing, rep))):
886  return existing
887  self.system.orphan_representations.append(rep)
888  return rep
Select non water and non hydrogen atoms.
Definition: pdb.h:314
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
Hierarchy read_pdb_or_mmcif(TextInput input, Model *model, PDBSelector *selector=get_default_pdb_selector(), bool select_first_model=true)
Read all the molecules in the first model of the PDB or mmCIF file.
Definition: mmcif.h:40
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Residue.h:158
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
Track creation of a system fragment from running some software.
Definition: provenance.h:562
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: atom/Atom.h:245
ElementTable & get_element_table()
Track creation of a system fragment from sampling.
Definition: provenance.h:173
def get_molecule
Given a Hierarchy, walk up and find the parent Molecule.
Definition: data.py:44
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
Track creation of a system fragment by combination.
Definition: provenance.h:280
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
void add_hierarchy(RMF::FileHandle fh, atom::Hierarchy hs)
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
A decorator for a particle representing an atom.
Definition: atom/Atom.h:238
Track creation of a system fragment by filtering.
Definition: provenance.h:340
The type for a residue.
Track creation of a system fragment from a PDB file.
Definition: provenance.h:86
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
A decorator for a residue.
Definition: Residue.h:137
static bool get_is_setup(Model *m, ParticleIndex p)
Check if the particle has the needed attributes for a cast to succeed.
Residue get_residue(Atom d, bool nothrow=false)
Return the Residue containing this atom.
Track creation of a system fragment from clustering.
Definition: provenance.h:426
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: provenance.h:584
Functionality for loading, creating, manipulating and scoring atomic structures.
std::string get_chain_id(Hierarchy h)
Walk up the hierarchy to determine the chain id.
def get_all_provenance
Yield all provenance decorators of the given types for the particle.
A decorator for a molecule.
Definition: Molecule.h:24
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Select all ATOM and HETATM records with the given chain ids.
Definition: pdb.h:256
Track creation of a system fragment from running a script.
Definition: provenance.h:511
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27