IMP logo
IMP Reference Guide  2.20.0
The Integrative Modeling Platform
rmsd_calculation.py
1 from __future__ import print_function
2 import pyRMSD.RMSDCalculator
3 from pyRMSD.condensedMatrix import CondensedMatrix
4 
5 import numpy as np
6 import os
7 import glob
8 
9 import IMP
10 import IMP.atom
11 import IMP.rmf
12 import RMF
13 import multiprocessing as mp
14 import sys
15 from itertools import combinations
16 
17 
18 def parse_symm_groups_for_pyrmsd(s):
19  output = []
20  for grp in s:
21  n = len(grp[0])
22  if n == 2:
23  output.append(grp)
24  continue
25  for j, k in combinations(np.arange(n), 2):
26  sub_grp = np.array(grp)[:, np.array([j, k])]
27  output.append(sub_grp.tolist())
28  return output
29 
30 
31 def parse_rmsd_selection(h, selection, resolution=1):
32  s0 = None
33  for domain_list in selection.values():
34  # each element of the dictionary is a list of domains
35  # each domain is a tuple like (start,end,protein) or (protein)
36  # to add copy number use (start,end,protein.copy_number)
37  # or (protein.copy_number)
38 
39  for domain in domain_list:
40 
41  start_res = int(domain[0])
42  end_res = int(domain[1])
43 
44  prot_plus_copy = domain[2]
45 
46  if "." in prot_plus_copy:
47  copy_number = int(prot_plus_copy.split(".")[1])
48  prot_name = prot_plus_copy.split(".")[0]
49 
50  else:
51  copy_number = 0
52  prot_name = prot_plus_copy
53 
54  s = IMP.atom.Selection(h, resolution=resolution,
55  molecule=prot_name,
56  copy_index=copy_number,
57  residue_indexes=range(start_res, end_res+1))
58 
59  if s0:
60  s0 |= s
61  else:
62  s0 = s
63  return s0
64 
65 
66 def get_pdbs_coordinates(path, idfile_A, idfile_B):
67  pts = []
68  conform = []
69  num = 0
70  masses = []
71  radii = []
72 
73  models_name = []
74 
75  with open(idfile_A, 'w+') as f1:
76  for str_file in sorted(
77  glob.glob("%s/sample_A/*.pdb" % path),
78  key=lambda x: int(x.split('/')[-1].split('.')[0])):
79  print(str_file, num, file=f1)
80  models_name.append(str_file)
81 
82  m = IMP.Model()
83  mh = IMP.atom.read_pdb(str_file, m,
85  mps = IMP.core.get_leaves(mh)
86  pts = [IMP.core.XYZ(p).get_coordinates() for p in mps]
87  if num == 0:
88  masses = [IMP.atom.Mass(p).get_mass() for p in mps]
89  radii = [IMP.core.XYZR(p).get_radius() for p in mps]
90  conform.append(pts)
91  pts = []
92  num = num + 1
93 
94  with open(idfile_B, 'w+') as f2:
95  for str_file in sorted(
96  glob.glob("%s/sample_B/*.pdb" % path),
97  key=lambda x: int(x.split('/')[-1].split('.')[0])):
98  print(str_file, num, file=f2)
99  models_name.append(str_file)
100 
101  m = IMP.Model()
102  mh = IMP.atom.read_pdb(str_file, m,
104  mps = IMP.core.get_leaves(mh)
105  pts = [IMP.core.XYZ(p).get_coordinates() for p in mps]
106  conform.append(pts)
107  pts = []
108  num = num + 1
109 
110  return np.array(conform), masses, radii, models_name
111 
112 
113 def get_rmfs_coordinates(path, idfile_A, idfile_B,
114  subunit_name=None, selection=None, resolution=1):
115 
116  conform = []
117  num = 0
118  masses = []
119  radii = []
120  ps_names = []
121 
122  f1 = open(idfile_A, 'w+')
123  f2 = open(idfile_B, 'w+')
124 
125  models_name = []
126 
127  for sample_name, sample_id_file in zip(['A', 'B'], [f1, f2]):
128 
129  for str_file in sorted(
130  glob.glob("%s/sample_%s/*.rmf3" % (path, sample_name)),
131  key=lambda x: int(x.split('/')[-1].split('.')[0])):
132  print(str_file, num, file=sample_id_file)
133  models_name.append(str_file)
134 
135  m = IMP.Model()
136  inf = RMF.open_rmf_file_read_only(str_file)
137  h = IMP.rmf.create_hierarchies(inf, m)[0]
138  IMP.rmf.load_frame(inf, 0)
139 
140  pts = []
141 
142  if subunit_name:
143  s0 = IMP.atom.Selection(h, resolution=resolution,
144  molecule=subunit_name)
145  elif selection is not None:
146  s0 = parse_rmsd_selection(h, selection, resolution)
147  else:
148  s0 = IMP.atom.Selection(h, resolution=resolution)
149 
150  for leaf in s0.get_selected_particles():
151 
152  p = IMP.core.XYZR(leaf)
153  pts.append([p.get_coordinates()[i] for i in range(3)])
154 
155  if num == 0 and sample_name == 'A':
156  masses.append(IMP.atom.Mass(leaf).get_mass())
157  radii.append(p.get_radius())
158  mol_name = \
160  # traverse up the Hierarchy to get copy number of the
161  # molecule that the bead belongs to
162  copy_number = \
164 
166  # TODO not tested on non-fragment systems
167  residues_in_bead = \
168  IMP.atom.Fragment(leaf).get_residue_indexes()
169 
170  ps_names.append(
171  mol_name + "_" + str(min(residues_in_bead)) + "_"
172  + str(max(residues_in_bead)) + "_"
173  + str(copy_number))
174 
175  else:
176  residue_in_bead = \
177  str(IMP.atom.Residue(leaf).get_index())
178 
179  ps_names.append(mol_name + "_" + residue_in_bead + "_"
180  + residue_in_bead + "_"
181  + str(copy_number))
182 
183  conform.append(pts)
184  pts = []
185  num = num + 1
186  f1.close()
187  f2.close()
188 
189  return ps_names, masses, radii, np.array(conform), models_name
190 
191 
192 def parse_symmetric_groups_file(symm_groups_file):
193  symm_groups = []
194  member_to_symm_group = {}
195  first_group_member = []
196  curr_particle_index_in_group = []
197 
198  sgf = open(symm_groups_file, 'r')
199 
200  for indx, ln in enumerate(sgf.readlines()):
201 
202  symm_groups.append([]) # create new symm group list
203  # particle index for new symmetric group
204  curr_particle_index_in_group.append(-1)
205  fields = ln.strip().split()
206 
207  for fld in fields:
208  # group that the current protein copy belongs to
209  for subfld in fld.split('/'):
210  member_to_symm_group[subfld] = indx
211 
212  # the first group member is special! We create a symm group list
213  # of particles for the first group member
214  first_group_member.append(fields[0].split('/'))
215 
216  sgf.close()
217 
218  return (symm_groups, member_to_symm_group, curr_particle_index_in_group,
219  first_group_member)
220 
221 
222 def get_conforms_per_frame_batch(arg_bundle):
223  rmf_file, frames, mod_id_start = arg_bundle[:3]
224  resolution, subunit_name, selection, path = arg_bundle[3:]
225  from collections import defaultdict
226  m = IMP.Model()
227  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_file))
228  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
229  result = defaultdict(list)
230  mod_id = mod_id_start
231  for f in frames:
232  IMP.rmf.load_frame(rmf_fh, f)
233  m.update()
234  if subunit_name:
235  s0 = IMP.atom.Selection(h, resolution=resolution,
236  molecule=subunit_name)
237  elif selection is not None:
238  s0 = parse_rmsd_selection(h, selection, resolution)
239  else:
240  s0 = IMP.atom.Selection(h, resolution=resolution)
241 
242  particles = s0.get_selected_particles()
243 
244  # Copy particle coordinates
245  for i in range(len(particles)):
246  # i is an index over all particles in the system
247 
248  leaf = particles[i]
249  p = IMP.core.XYZR(leaf)
250  pxyz = p.get_coordinates()
251  result[mod_id].append(list(pxyz))
252  mod_id += 1
253  return result
254 
255 
256 def get_rmfs_coordinates_one_rmf(path, rmf_A, rmf_B,
257  subunit_name=None,
258  symm_groups_file=None, selection=None,
259  resolution=1, n_cores=None):
260  '''Modified RMF coordinates function to work with symmetric copies'''
261  if n_cores is None:
262  n_cores = 1
263  # Open RMFs and get total number of models
264  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_A))
265  n_models = [rmf_fh.get_number_of_frames()]
266  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_B))
267  n_models.append(rmf_fh.get_number_of_frames())
268 
269  masses = []
270  radii = []
271  ps_names = []
272 
273  models_name = []
274 
275  # Build hierarchy from the RMF file
276  m = IMP.Model()
277  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
278  IMP.rmf.load_frame(rmf_fh, 0)
279  m.update()
280 
281  ######
282  # Initialize output array
283  pts = 0 # number of particles in each model
284 
285  # Get selection
286  if subunit_name:
287  s0 = IMP.atom.Selection(h, resolution=resolution,
288  molecule=subunit_name)
289  elif selection is not None:
290  s0 = parse_rmsd_selection(h, selection, resolution)
291  else:
292  s0 = IMP.atom.Selection(h, resolution=resolution)
293 
294  # Count particles
295  for leaf in s0.get_selected_particles():
296  p = IMP.core.XYZR(leaf)
297  pts += 1
298 
299  # Initialize array
300  conform = np.empty([n_models[0]+n_models[1], pts, 3])
301 
302  # Initialize the symmetric group particles list, and protein to
303  # symmetry group mapping
304  if symm_groups_file:
305  (symm_groups, group_member_to_symm_group_map,
306  curr_particle_index_in_group, first_group_member) = \
307  parse_symmetric_groups_file(symm_groups_file)
308  else:
309  symm_groups = None
310 
311  # add masses, particle names and radii
312  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_A))
313  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
314  IMP.rmf.load_frame(rmf_fh, 0)
315  m.update()
316  if subunit_name:
317  s0 = IMP.atom.Selection(h, resolution=resolution,
318  molecule=subunit_name)
319  elif selection is not None:
320  s0 = parse_rmsd_selection(h, selection, resolution)
321  else:
322  s0 = IMP.atom.Selection(h, resolution=resolution)
323  particles = s0.get_selected_particles()
324 
325  # Copy particle coordinates
326  for i in range(len(particles)):
327  # i is an index over all particles in the system
328 
329  leaf = particles[i]
330  p = IMP.core.XYZR(leaf)
331  masses.append(IMP.atom.Mass(leaf).get_mass())
332  radii.append(p.get_radius())
333  mol_name = IMP.atom.get_molecule_name(
334  IMP.atom.Hierarchy(leaf))
335  # traverse up the Hierarchy to get copy number of
336  # the molecule that the bead belongs to
337  copy_number = \
339 
340  # Add to symmetric groups if needed
341  if symm_groups_file:
342 
343  protein_plus_copy = mol_name+'.'+str(copy_number)
344  if protein_plus_copy in group_member_to_symm_group_map:
345  # protein copy is in a symmetric group
346  group_index = group_member_to_symm_group_map[
347  protein_plus_copy]
348  curr_particle_index_in_group[group_index] += 1
349 
350  if protein_plus_copy \
351  in first_group_member[group_index]:
352  symm_groups[group_index].append([i])
353 
354  else:
355  j = curr_particle_index_in_group[group_index] \
356  % len(symm_groups[group_index])
357  symm_groups[group_index][j].append(i)
358 
359  # TODO not tested on non-fragment systems
361  residues_in_bead = \
362  IMP.atom.Fragment(leaf).get_residue_indexes()
363  ps_names.append(
364  mol_name + "_" + str(min(residues_in_bead)) + "_"
365  + str(max(residues_in_bead)) + "_"
366  + str(copy_number))
367  else:
368  residue_in_bead = \
369  str(IMP.atom.Residue(leaf).get_index())
370  ps_names.append(
371  mol_name + "_" + residue_in_bead + "_" +
372  residue_in_bead + "_" + str(copy_number))
373 
374  mod_id = 0 # index for each model in conform.
375  for rmf_file in [rmf_A, rmf_B]:
376 
377  models_name.append(rmf_file)
378  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_file))
379  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
380  n_frames = rmf_fh.get_number_of_frames()
381  print("Opening RMF file:", rmf_file, "with",
382  n_frames, "frames")
383 
384  n_cores = min(n_cores, n_frames) # to ensure n_per_core > 0
385  n_per_core = n_frames // n_cores
386  spacing = np.arange(0, n_per_core * n_cores, n_per_core)
387  mod_id_starts = spacing + mod_id
388  frame_number_starts = spacing
389  frame_number_ends = spacing + n_per_core
390  frame_number_ends[-1] = n_frames - 1
391  frame_lists = []
392  for i in range(n_cores):
393  a = frame_number_starts[i]
394  b = frame_number_ends[i]
395  frame_lists.append(np.arange(a, b + 1))
396  p = mp.Pool(n_cores)
397  args_list = [(rmf_file, frame_lists[i], mod_id_starts[i], resolution,
398  subunit_name, selection, path) for i in range(n_cores)]
399  results = p.map(get_conforms_per_frame_batch, args_list)
400  for res in results:
401  for m_id in res:
402  conform[m_id, :, :] = np.array(res[m_id])
403  mod_id += n_frames
404 
405  if symm_groups_file:
406  for grp in symm_groups:
407  if len(grp) == 0:
408  print("Warning. Symmetry option specified but created "
409  "symmetry group is empty. Cross-check the "
410  "specification of symmetry groups.")
411  check = [len(x) for x in grp]
412  message = "Unequal no. of particles in same symm group."
413  message2 = "Cross-check the specification of symm groups."
414  assert len(set(check)) == 1, message + message2
415 
416  return ps_names, masses, radii, conform, symm_groups, models_name, n_models
417 
418 
419 def get_rmsds_matrix(conforms, mode, sup, cores, symm_groups=None):
420 
421  if (mode == "cpu_serial" and not sup) or (mode == "cpu_omp" and not sup):
422  calculator_name = "NOSUP_OMP_CALCULATOR"
423 
424  elif mode == "cpu_omp" and sup:
425  calculator_name = "QCP_OMP_CALCULATOR"
426  print("we are using QCP_OMP to compute RMSD")
427  elif mode == "cuda" and sup:
428  calculator_name = "QCP_CUDA_MEM_CALCULATOR"
429  else:
430  print("Wrong values to pyRMSD ! Please Fix")
431  exit()
432 
433  if symm_groups:
434  print("We have ambiguity.")
435  if calculator_name == 'NOSUP_OMP_CALCULATOR':
436  # calc_symm_groups are enough without any fitting
437  s1 = parse_symm_groups_for_pyrmsd(symm_groups)
438  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
439  calculator_name,
440  fittingCoordsets=conforms,
441  calculationCoordsets=conforms,
442  calcSymmetryGroups=s1,
443  fitSymmetryGroups=[])
444  else:
445  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
446  calculator_name,
447  fittingCoordsets=conforms,
448  calcSymmetryGroups=[],
449  fitSymmetryGroups=symm_groups)
450 
451  else:
452  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
453  calculator_name, conforms)
454 
455  if not symm_groups:
456  if mode == "cpu_omp":
457  # additionally set number of cores for parallel calculator
458  calculator.setNumberOfOpenMPThreads(cores)
459  rmsd = calculator.pairwiseRMSDMatrix()
460  elif sys.version_info[0] == 3: # python 3 with symm groups
461  if mode == "cpu_omp":
462  # additionally set number of cores for parallel calculator
463  calculator.setNumberOfOpenMPThreads(1)
464  rmsd = []
465  p = mp.Pool(cores)
466  gen = p.imap(calculator.oneVsFollowing, list(range(len(conforms) - 1)))
467  for i in gen:
468  rmsd += list(i)
469  p.close()
470  p.terminate()
471  else: # python 2 with symm groups
472  if mode == "cpu_omp":
473  # additionally set number of cores for parallel calculator
474  calculator.setNumberOfOpenMPThreads(cores)
475  rmsd = []
476  for i in range(len(conforms) - 1):
477  rmsd += list(calculator.oneVsFollowing(i))
478  rmsd_matrix = CondensedMatrix(rmsd)
479  inner_data = rmsd_matrix.get_data()
480  np.save("Distances_Matrix.data", inner_data)
481 
482  return inner_data
Select non water and non hydrogen atoms.
Definition: pdb.h:331
Add mass to a particle.
Definition: Mass.h:23
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
double get_mass(ResidueType c)
Get the mass from the residue type.
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
std::string get_molecule_name(Hierarchy h)
A decorator for a residue.
Definition: Residue.h:137
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27