IMP logo
IMP Reference Guide  develop.d97d4ead1f,2024/11/21
The Integrative Modeling Platform
rmsd_calculation.py
1 """@namespace IMP.sampcon.rmsd_calculation
2  Utilities to help with RMSD calculation."""
3 
4 from __future__ import print_function
5 import pyRMSD.RMSDCalculator
6 from pyRMSD.condensedMatrix import CondensedMatrix
7 
8 import numpy as np
9 import os
10 import glob
11 
12 import IMP
13 import IMP.atom
14 import IMP.rmf
15 import RMF
16 import multiprocessing as mp
17 import sys
18 from itertools import combinations
19 
20 
21 def parse_symm_groups_for_pyrmsd(s):
22  output = []
23  for grp in s:
24  n = len(grp[0])
25  if n == 2:
26  output.append(grp)
27  continue
28  for j, k in combinations(np.arange(n), 2):
29  sub_grp = np.array(grp)[:, np.array([j, k])]
30  output.append(sub_grp.tolist())
31  return output
32 
33 
34 def parse_rmsd_selection(h, selection, resolution=1):
35  s0 = None
36  for domain_list in selection.values():
37  # each element of the dictionary is a list of domains
38  # each domain is a tuple like (start,end,protein) or (protein)
39  # to add copy number use (start,end,protein.copy_number)
40  # or (protein.copy_number)
41 
42  for domain in domain_list:
43 
44  start_res = int(domain[0])
45  end_res = int(domain[1])
46 
47  prot_plus_copy = domain[2]
48 
49  if "." in prot_plus_copy:
50  copy_number = int(prot_plus_copy.split(".")[1])
51  prot_name = prot_plus_copy.split(".")[0]
52 
53  else:
54  copy_number = 0
55  prot_name = prot_plus_copy
56 
57  s = IMP.atom.Selection(h, resolution=resolution,
58  molecule=prot_name,
59  copy_index=copy_number,
60  residue_indexes=range(start_res, end_res+1))
61 
62  if s0:
63  s0 |= s
64  else:
65  s0 = s
66  return s0
67 
68 
69 def get_pdbs_coordinates(path, idfile_A, idfile_B):
70  pts = []
71  conform = []
72  num = 0
73  masses = []
74  radii = []
75 
76  models_name = []
77 
78  with open(idfile_A, 'w+') as f1:
79  for str_file in sorted(
80  glob.glob("%s/sample_A/*.pdb" % path),
81  key=lambda x: int(x.split('/')[-1].split('.')[0])):
82  print(str_file, num, file=f1)
83  models_name.append(str_file)
84 
85  m = IMP.Model()
86  mh = IMP.atom.read_pdb(str_file, m,
88  mps = IMP.core.get_leaves(mh)
89  pts = [IMP.core.XYZ(p).get_coordinates() for p in mps]
90  if num == 0:
91  masses = [IMP.atom.Mass(p).get_mass() for p in mps]
92  radii = [IMP.core.XYZR(p).get_radius() for p in mps]
93  conform.append(pts)
94  pts = []
95  num = num + 1
96 
97  with open(idfile_B, 'w+') as f2:
98  for str_file in sorted(
99  glob.glob("%s/sample_B/*.pdb" % path),
100  key=lambda x: int(x.split('/')[-1].split('.')[0])):
101  print(str_file, num, file=f2)
102  models_name.append(str_file)
103 
104  m = IMP.Model()
105  mh = IMP.atom.read_pdb(str_file, m,
107  mps = IMP.core.get_leaves(mh)
108  pts = [IMP.core.XYZ(p).get_coordinates() for p in mps]
109  conform.append(pts)
110  pts = []
111  num = num + 1
112 
113  return np.array(conform), masses, radii, models_name
114 
115 
116 def get_rmfs_coordinates(path, idfile_A, idfile_B,
117  subunit_name=None, selection=None, resolution=1):
118 
119  conform = []
120  num = 0
121  masses = []
122  radii = []
123  ps_names = []
124 
125  f1 = open(idfile_A, 'w+')
126  f2 = open(idfile_B, 'w+')
127 
128  models_name = []
129 
130  for sample_name, sample_id_file in zip(['A', 'B'], [f1, f2]):
131 
132  for str_file in sorted(
133  glob.glob("%s/sample_%s/*.rmf3" % (path, sample_name)),
134  key=lambda x: int(x.split('/')[-1].split('.')[0])):
135  print(str_file, num, file=sample_id_file)
136  models_name.append(str_file)
137 
138  m = IMP.Model()
139  inf = RMF.open_rmf_file_read_only(str_file)
140  h = IMP.rmf.create_hierarchies(inf, m)[0]
141  IMP.rmf.load_frame(inf, 0)
142 
143  pts = []
144 
145  if subunit_name:
146  s0 = IMP.atom.Selection(h, resolution=resolution,
147  molecule=subunit_name)
148  elif selection is not None:
149  s0 = parse_rmsd_selection(h, selection, resolution)
150  else:
151  s0 = IMP.atom.Selection(h, resolution=resolution)
152 
153  for leaf in s0.get_selected_particles():
154 
155  p = IMP.core.XYZR(leaf)
156  pts.append([p.get_coordinates()[i] for i in range(3)])
157 
158  if num == 0 and sample_name == 'A':
159  masses.append(IMP.atom.Mass(leaf).get_mass())
160  radii.append(p.get_radius())
161  mol_name = \
163  # traverse up the Hierarchy to get copy number of the
164  # molecule that the bead belongs to
165  copy_number = \
167 
169  # TODO not tested on non-fragment systems
170  residues_in_bead = \
171  IMP.atom.Fragment(leaf).get_residue_indexes()
172 
173  ps_names.append(
174  mol_name + "_" + str(min(residues_in_bead)) + "_"
175  + str(max(residues_in_bead)) + "_"
176  + str(copy_number))
177 
178  else:
179  residue_in_bead = \
180  str(IMP.atom.Residue(leaf).get_index())
181 
182  ps_names.append(mol_name + "_" + residue_in_bead + "_"
183  + residue_in_bead + "_"
184  + str(copy_number))
185 
186  conform.append(pts)
187  pts = []
188  num = num + 1
189  f1.close()
190  f2.close()
191 
192  return ps_names, masses, radii, np.array(conform), models_name
193 
194 
195 def parse_symmetric_groups_file(symm_groups_file):
196  symm_groups = []
197  member_to_symm_group = {}
198  first_group_member = []
199  curr_particle_index_in_group = []
200 
201  sgf = open(symm_groups_file, 'r')
202 
203  for indx, ln in enumerate(sgf.readlines()):
204 
205  symm_groups.append([]) # create new symm group list
206  # particle index for new symmetric group
207  curr_particle_index_in_group.append(-1)
208  fields = ln.strip().split()
209 
210  for fld in fields:
211  # group that the current protein copy belongs to
212  for subfld in fld.split('/'):
213  member_to_symm_group[subfld] = indx
214 
215  # the first group member is special! We create a symm group list
216  # of particles for the first group member
217  first_group_member.append(fields[0].split('/'))
218 
219  sgf.close()
220 
221  return (symm_groups, member_to_symm_group, curr_particle_index_in_group,
222  first_group_member)
223 
224 
225 def get_conforms_per_frame_batch(arg_bundle):
226  rmf_file, frames, mod_id_start = arg_bundle[:3]
227  resolution, subunit_name, selection, path = arg_bundle[3:]
228  from collections import defaultdict
229  m = IMP.Model()
230  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_file))
231  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
232  result = defaultdict(list)
233  mod_id = mod_id_start
234  for f in frames:
235  IMP.rmf.load_frame(rmf_fh, f)
236  m.update()
237  if subunit_name:
238  s0 = IMP.atom.Selection(h, resolution=resolution,
239  molecule=subunit_name)
240  elif selection is not None:
241  s0 = parse_rmsd_selection(h, selection, resolution)
242  else:
243  s0 = IMP.atom.Selection(h, resolution=resolution)
244 
245  particles = s0.get_selected_particles()
246 
247  # Copy particle coordinates
248  for i in range(len(particles)):
249  # i is an index over all particles in the system
250 
251  leaf = particles[i]
252  p = IMP.core.XYZR(leaf)
253  pxyz = p.get_coordinates()
254  result[mod_id].append(list(pxyz))
255  mod_id += 1
256  return result
257 
258 
259 def get_rmfs_coordinates_one_rmf(path, rmf_A, rmf_B,
260  subunit_name=None,
261  symm_groups_file=None, selection=None,
262  resolution=1, n_cores=None):
263  '''Modified RMF coordinates function to work with symmetric copies'''
264  if n_cores is None:
265  n_cores = 1
266  # Open RMFs and get total number of models
267  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_A))
268  n_models = [rmf_fh.get_number_of_frames()]
269  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_B))
270  n_models.append(rmf_fh.get_number_of_frames())
271 
272  masses = []
273  radii = []
274  ps_names = []
275 
276  models_name = []
277 
278  # Build hierarchy from the RMF file
279  m = IMP.Model()
280  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
281  IMP.rmf.load_frame(rmf_fh, 0)
282  m.update()
283 
284  ######
285  # Initialize output array
286  pts = 0 # number of particles in each model
287 
288  # Get selection
289  if subunit_name:
290  s0 = IMP.atom.Selection(h, resolution=resolution,
291  molecule=subunit_name)
292  elif selection is not None:
293  s0 = parse_rmsd_selection(h, selection, resolution)
294  else:
295  s0 = IMP.atom.Selection(h, resolution=resolution)
296 
297  # Count particles
298  for leaf in s0.get_selected_particles():
299  p = IMP.core.XYZR(leaf)
300  pts += 1
301 
302  # Initialize array
303  conform = np.empty([n_models[0]+n_models[1], pts, 3])
304 
305  # Initialize the symmetric group particles list, and protein to
306  # symmetry group mapping
307  if symm_groups_file:
308  (symm_groups, group_member_to_symm_group_map,
309  curr_particle_index_in_group, first_group_member) = \
310  parse_symmetric_groups_file(symm_groups_file)
311  else:
312  symm_groups = None
313 
314  # add masses, particle names and radii
315  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_A))
316  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
317  IMP.rmf.load_frame(rmf_fh, 0)
318  m.update()
319  if subunit_name:
320  s0 = IMP.atom.Selection(h, resolution=resolution,
321  molecule=subunit_name)
322  elif selection is not None:
323  s0 = parse_rmsd_selection(h, selection, resolution)
324  else:
325  s0 = IMP.atom.Selection(h, resolution=resolution)
326  particles = s0.get_selected_particles()
327 
328  # Copy particle coordinates
329  for i in range(len(particles)):
330  # i is an index over all particles in the system
331 
332  leaf = particles[i]
333  p = IMP.core.XYZR(leaf)
334  masses.append(IMP.atom.Mass(leaf).get_mass())
335  radii.append(p.get_radius())
336  mol_name = IMP.atom.get_molecule_name(
337  IMP.atom.Hierarchy(leaf))
338  # traverse up the Hierarchy to get copy number of
339  # the molecule that the bead belongs to
340  copy_number = \
342 
343  # Add to symmetric groups if needed
344  if symm_groups_file:
345 
346  protein_plus_copy = mol_name+'.'+str(copy_number)
347  if protein_plus_copy in group_member_to_symm_group_map:
348  # protein copy is in a symmetric group
349  group_index = group_member_to_symm_group_map[
350  protein_plus_copy]
351  curr_particle_index_in_group[group_index] += 1
352 
353  if protein_plus_copy \
354  in first_group_member[group_index]:
355  symm_groups[group_index].append([i])
356 
357  else:
358  j = curr_particle_index_in_group[group_index] \
359  % len(symm_groups[group_index])
360  symm_groups[group_index][j].append(i)
361 
362  # TODO not tested on non-fragment systems
364  residues_in_bead = \
365  IMP.atom.Fragment(leaf).get_residue_indexes()
366  ps_names.append(
367  mol_name + "_" + str(min(residues_in_bead)) + "_"
368  + str(max(residues_in_bead)) + "_"
369  + str(copy_number))
370  else:
371  residue_in_bead = \
372  str(IMP.atom.Residue(leaf).get_index())
373  ps_names.append(
374  mol_name + "_" + residue_in_bead + "_" +
375  residue_in_bead + "_" + str(copy_number))
376 
377  mod_id = 0 # index for each model in conform.
378  for rmf_file in [rmf_A, rmf_B]:
379 
380  models_name.append(rmf_file)
381  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_file))
382  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
383  n_frames = rmf_fh.get_number_of_frames()
384  print("Opening RMF file:", rmf_file, "with",
385  n_frames, "frames")
386 
387  n_cores = min(n_cores, n_frames) # to ensure n_per_core > 0
388  n_per_core = n_frames // n_cores
389  spacing = np.arange(0, n_per_core * n_cores, n_per_core)
390  mod_id_starts = spacing + mod_id
391  frame_number_starts = spacing
392  frame_number_ends = spacing + n_per_core
393  frame_number_ends[-1] = n_frames - 1
394  frame_lists = []
395  for i in range(n_cores):
396  a = frame_number_starts[i]
397  b = frame_number_ends[i]
398  frame_lists.append(np.arange(a, b + 1))
399  p = mp.Pool(n_cores)
400  args_list = [(rmf_file, frame_lists[i], mod_id_starts[i], resolution,
401  subunit_name, selection, path) for i in range(n_cores)]
402  results = p.map(get_conforms_per_frame_batch, args_list)
403  for res in results:
404  for m_id in res:
405  conform[m_id, :, :] = np.array(res[m_id])
406  mod_id += n_frames
407 
408  if symm_groups_file:
409  for grp in symm_groups:
410  if len(grp) == 0:
411  print("Warning. Symmetry option specified but created "
412  "symmetry group is empty. Cross-check the "
413  "specification of symmetry groups.")
414  check = [len(x) for x in grp]
415  message = "Unequal no. of particles in same symm group."
416  message2 = "Cross-check the specification of symm groups."
417  assert len(set(check)) == 1, message + message2
418 
419  return ps_names, masses, radii, conform, symm_groups, models_name, n_models
420 
421 
422 def get_rmsds_matrix(conforms, mode, sup, cores, symm_groups=None):
423 
424  if (mode == "cpu_serial" and not sup) or (mode == "cpu_omp" and not sup):
425  calculator_name = "NOSUP_OMP_CALCULATOR"
426 
427  elif mode == "cpu_omp" and sup:
428  calculator_name = "QCP_OMP_CALCULATOR"
429  print("we are using QCP_OMP to compute RMSD")
430  elif mode == "cuda" and sup:
431  calculator_name = "QCP_CUDA_MEM_CALCULATOR"
432  else:
433  print("Wrong values to pyRMSD ! Please Fix")
434  exit()
435 
436  if symm_groups:
437  print("We have ambiguity.")
438  if calculator_name == 'NOSUP_OMP_CALCULATOR':
439  # calc_symm_groups are enough without any fitting
440  s1 = parse_symm_groups_for_pyrmsd(symm_groups)
441  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
442  calculator_name,
443  fittingCoordsets=conforms,
444  calculationCoordsets=conforms,
445  calcSymmetryGroups=s1,
446  fitSymmetryGroups=[])
447  else:
448  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
449  calculator_name,
450  fittingCoordsets=conforms,
451  calcSymmetryGroups=[],
452  fitSymmetryGroups=symm_groups)
453 
454  else:
455  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
456  calculator_name, conforms)
457 
458  if not symm_groups:
459  if mode == "cpu_omp":
460  # additionally set number of cores for parallel calculator
461  calculator.setNumberOfOpenMPThreads(cores)
462  rmsd = calculator.pairwiseRMSDMatrix()
463  elif sys.version_info[0] == 3: # python 3 with symm groups
464  if mode == "cpu_omp":
465  # additionally set number of cores for parallel calculator
466  calculator.setNumberOfOpenMPThreads(1)
467  rmsd = []
468  p = mp.Pool(cores)
469  gen = p.imap(calculator.oneVsFollowing, list(range(len(conforms) - 1)))
470  for i in gen:
471  rmsd += list(i)
472  p.close()
473  p.terminate()
474  else: # python 2 with symm groups
475  if mode == "cpu_omp":
476  # additionally set number of cores for parallel calculator
477  calculator.setNumberOfOpenMPThreads(cores)
478  rmsd = []
479  for i in range(len(conforms) - 1):
480  rmsd += list(calculator.oneVsFollowing(i))
481  rmsd_matrix = CondensedMatrix(rmsd)
482  inner_data = rmsd_matrix.get_data()
483  np.save("Distances_Matrix.data", inner_data)
484 
485  return inner_data
Select non water and non hydrogen atoms.
Definition: pdb.h:314
Add mass to a particle.
Definition: Mass.h:23
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
double get_mass(ResidueType c)
Get the mass from the residue type.
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
def get_rmfs_coordinates_one_rmf
Modified RMF coordinates function to work with symmetric copies.
std::string get_molecule_name(Hierarchy h)
A decorator for a residue.
Definition: Residue.h:137
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27