IMP logo
IMP Reference Guide  2.17.0
The Integrative Modeling Platform
rmsd_calculation.py
1 from __future__ import print_function
2 import pyRMSD.RMSDCalculator
3 from pyRMSD.condensedMatrix import CondensedMatrix
4 
5 import numpy as np
6 import os
7 import glob
8 
9 import IMP
10 import IMP.atom
11 import IMP.rmf
12 import RMF
13 import multiprocessing as mp
14 from itertools import combinations
15 
16 
17 def parse_symm_groups_for_pyrmsd(s):
18  output = []
19  for grp in s:
20  n = len(grp[0])
21  if n == 2:
22  output.append(grp)
23  continue
24  for j, k in combinations(np.arange(n), 2):
25  sub_grp = np.array(grp)[:, np.array([j, k])]
26  output.append(sub_grp.tolist())
27  return output
28 
29 
30 def parse_rmsd_selection(h, selection, resolution=1):
31  s0 = None
32  for domain_list in selection.values():
33  # each element of the dictionary is a list of domains
34  # each domain is a tuple like (start,end,protein) or (protein)
35  # to add copy number use (start,end,protein.copy_number)
36  # or (protein.copy_number)
37 
38  for domain in domain_list:
39 
40  start_res = int(domain[0])
41  end_res = int(domain[1])
42 
43  prot_plus_copy = domain[2]
44 
45  if "." in prot_plus_copy:
46  copy_number = int(prot_plus_copy.split(".")[1])
47  prot_name = prot_plus_copy.split(".")[0]
48 
49  else:
50  copy_number = 0
51  prot_name = prot_plus_copy
52 
53  s = IMP.atom.Selection(h, resolution=resolution,
54  molecule=prot_name,
55  copy_index=copy_number,
56  residue_indexes=range(start_res, end_res+1))
57 
58  if s0:
59  s0 |= s
60  else:
61  s0 = s
62  return s0
63 
64 
65 def get_pdbs_coordinates(path, idfile_A, idfile_B):
66  pts = []
67  conform = []
68  num = 0
69  masses = []
70  radii = []
71 
72  models_name = []
73 
74  with open(idfile_A, 'w+') as f1:
75  for str_file in sorted(
76  glob.glob("%s/sample_A/*.pdb" % path),
77  key=lambda x: int(x.split('/')[-1].split('.')[0])):
78  print(str_file, num, file=f1)
79  models_name.append(str_file)
80 
81  m = IMP.Model()
82  mh = IMP.atom.read_pdb(str_file, m,
84  mps = IMP.core.get_leaves(mh)
85  pts = [IMP.core.XYZ(p).get_coordinates() for p in mps]
86  if num == 0:
87  masses = [IMP.atom.Mass(p).get_mass() for p in mps]
88  radii = [IMP.core.XYZR(p).get_radius() for p in mps]
89  conform.append(pts)
90  pts = []
91  num = num + 1
92 
93  with open(idfile_B, 'w+') as f2:
94  for str_file in sorted(
95  glob.glob("%s/sample_B/*.pdb" % path),
96  key=lambda x: int(x.split('/')[-1].split('.')[0])):
97  print(str_file, num, file=f2)
98  models_name.append(str_file)
99 
100  m = IMP.Model()
101  mh = IMP.atom.read_pdb(str_file, m,
103  mps = IMP.core.get_leaves(mh)
104  pts = [IMP.core.XYZ(p).get_coordinates() for p in mps]
105  conform.append(pts)
106  pts = []
107  num = num + 1
108 
109  return np.array(conform), masses, radii, models_name
110 
111 
112 def get_rmfs_coordinates(path, idfile_A, idfile_B,
113  subunit_name=None, selection=None, resolution=1):
114 
115  conform = []
116  num = 0
117  masses = []
118  radii = []
119  ps_names = []
120 
121  f1 = open(idfile_A, 'w+')
122  f2 = open(idfile_B, 'w+')
123 
124  models_name = []
125 
126  for sample_name, sample_id_file in zip(['A', 'B'], [f1, f2]):
127 
128  for str_file in sorted(
129  glob.glob("%s/sample_%s/*.rmf3" % (path, sample_name)),
130  key=lambda x: int(x.split('/')[-1].split('.')[0])):
131  print(str_file, num, file=sample_id_file)
132  models_name.append(str_file)
133 
134  m = IMP.Model()
135  inf = RMF.open_rmf_file_read_only(str_file)
136  h = IMP.rmf.create_hierarchies(inf, m)[0]
137  IMP.rmf.load_frame(inf, 0)
138 
139  pts = []
140 
141  if subunit_name:
142  s0 = IMP.atom.Selection(h, resolution=resolution,
143  molecule=subunit_name)
144  elif selection is not None:
145  s0 = parse_rmsd_selection(h, selection, resolution)
146  else:
147  s0 = IMP.atom.Selection(h, resolution=resolution)
148 
149  for leaf in s0.get_selected_particles():
150 
151  p = IMP.core.XYZR(leaf)
152  pts.append([p.get_coordinates()[i] for i in range(3)])
153 
154  if num == 0 and sample_name == 'A':
155  masses.append(IMP.atom.Mass(leaf).get_mass())
156  radii.append(p.get_radius())
157  mol_name = \
159  # traverse up the Hierarchy to get copy number of the
160  # molecule that the bead belongs to
161  copy_number = \
163 
165  # TODO not tested on non-fragment systems
166  residues_in_bead = \
167  IMP.atom.Fragment(leaf).get_residue_indexes()
168 
169  ps_names.append(
170  mol_name + "_" + str(min(residues_in_bead)) + "_"
171  + str(max(residues_in_bead)) + "_"
172  + str(copy_number))
173 
174  else:
175  residue_in_bead = \
176  str(IMP.atom.Residue(leaf).get_index())
177 
178  ps_names.append(mol_name + "_" + residue_in_bead + "_"
179  + residue_in_bead + "_"
180  + str(copy_number))
181 
182  conform.append(pts)
183  pts = []
184  num = num + 1
185  f1.close()
186  f2.close()
187 
188  return ps_names, masses, radii, np.array(conform), models_name
189 
190 
191 def parse_symmetric_groups_file(symm_groups_file):
192  symm_groups = []
193  member_to_symm_group = {}
194  first_group_member = []
195  curr_particle_index_in_group = []
196 
197  sgf = open(symm_groups_file, 'r')
198 
199  for indx, ln in enumerate(sgf.readlines()):
200 
201  symm_groups.append([]) # create new symm group list
202  # particle index for new symmetric group
203  curr_particle_index_in_group.append(-1)
204  fields = ln.strip().split()
205 
206  for fld in fields:
207  # group that the current protein copy belongs to
208  member_to_symm_group[fld] = indx
209 
210  # the first group member is special! We create a symm group list
211  # of particles for the first group member
212  first_group_member.append(fields[0])
213 
214  sgf.close()
215 
216  return (symm_groups, member_to_symm_group, curr_particle_index_in_group,
217  first_group_member)
218 
219 
220 def get_conforms_per_frame_batch(arg_bundle):
221  rmf_file, frames, mod_id_start = arg_bundle[:3]
222  resolution, subunit_name, selection, path = arg_bundle[3:]
223  from collections import defaultdict
224  m = IMP.Model()
225  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_file))
226  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
227  result = defaultdict(list)
228  mod_id = mod_id_start
229  for f in frames:
230  IMP.rmf.load_frame(rmf_fh, f)
231  m.update()
232  if subunit_name:
233  s0 = IMP.atom.Selection(h, resolution=resolution,
234  molecule=subunit_name)
235  elif selection is not None:
236  s0 = parse_rmsd_selection(h, selection, resolution)
237  else:
238  s0 = IMP.atom.Selection(h, resolution=resolution)
239 
240  particles = s0.get_selected_particles()
241 
242  # Copy particle coordinates
243  for i in range(len(particles)):
244  # i is an index over all particles in the system
245 
246  leaf = particles[i]
247  p = IMP.core.XYZR(leaf)
248  pxyz = p.get_coordinates()
249  result[mod_id].append(list(pxyz))
250  mod_id += 1
251  return result
252 
253 
254 def get_rmfs_coordinates_one_rmf(path, rmf_A, rmf_B,
255  subunit_name=None,
256  symm_groups_file=None, selection=None,
257  resolution=1, n_cores=None):
258  '''Modified RMF coordinates function to work with symmetric copies'''
259  if n_cores is None:
260  n_cores = 1
261  # Open RMFs and get total number of models
262  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_A))
263  n_models = [rmf_fh.get_number_of_frames()]
264  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_B))
265  n_models.append(rmf_fh.get_number_of_frames())
266 
267  masses = []
268  radii = []
269  ps_names = []
270 
271  models_name = []
272 
273  # Build hierarchy from the RMF file
274  m = IMP.Model()
275  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
276  IMP.rmf.load_frame(rmf_fh, 0)
277  m.update()
278 
279  ######
280  # Initialize output array
281  pts = 0 # number of particles in each model
282 
283  # Get selection
284  if subunit_name:
285  s0 = IMP.atom.Selection(h, resolution=resolution,
286  molecule=subunit_name)
287  elif selection is not None:
288  s0 = parse_rmsd_selection(h, selection, resolution)
289  else:
290  s0 = IMP.atom.Selection(h, resolution=resolution)
291 
292  # Count particles
293  for leaf in s0.get_selected_particles():
294  p = IMP.core.XYZR(leaf)
295  pts += 1
296 
297  # Initialize array
298  conform = np.empty([n_models[0]+n_models[1], pts, 3])
299 
300  # Initialize the symmetric group particles list, and protein to
301  # symmetry group mapping
302  if symm_groups_file:
303  (symm_groups, group_member_to_symm_group_map,
304  curr_particle_index_in_group, first_group_member) = \
305  parse_symmetric_groups_file(symm_groups_file)
306  else:
307  symm_groups = None
308 
309  # add masses, particle names and radii
310  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_A))
311  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
312  IMP.rmf.load_frame(rmf_fh, 0)
313  m.update()
314  if subunit_name:
315  s0 = IMP.atom.Selection(h, resolution=resolution,
316  molecule=subunit_name)
317  elif selection is not None:
318  s0 = parse_rmsd_selection(h, selection, resolution)
319  else:
320  s0 = IMP.atom.Selection(h, resolution=resolution)
321  particles = s0.get_selected_particles()
322 
323  # Copy particle coordinates
324  for i in range(len(particles)):
325  # i is an index over all particles in the system
326 
327  leaf = particles[i]
328  p = IMP.core.XYZR(leaf)
329  masses.append(IMP.atom.Mass(leaf).get_mass())
330  radii.append(p.get_radius())
331  mol_name = IMP.atom.get_molecule_name(
332  IMP.atom.Hierarchy(leaf))
333  # traverse up the Hierarchy to get copy number of
334  # the molecule that the bead belongs to
335  copy_number = \
337 
338  # Add to symmetric groups if needed
339  if symm_groups_file:
340 
341  protein_plus_copy = mol_name+'.'+str(copy_number)
342  if protein_plus_copy in group_member_to_symm_group_map:
343  # protein copy is in a symmetric group
344  group_index = group_member_to_symm_group_map[
345  protein_plus_copy]
346  curr_particle_index_in_group[group_index] += 1
347 
348  if protein_plus_copy \
349  == first_group_member[group_index]:
350  symm_groups[group_index].append([i])
351 
352  else:
353  j = curr_particle_index_in_group[group_index] \
354  % len(symm_groups[group_index])
355  symm_groups[group_index][j].append(i)
356 
357  # TODO not tested on non-fragment systems
359  residues_in_bead = \
360  IMP.atom.Fragment(leaf).get_residue_indexes()
361  ps_names.append(
362  mol_name + "_" + str(min(residues_in_bead)) + "_"
363  + str(max(residues_in_bead)) + "_"
364  + str(copy_number))
365  else:
366  residue_in_bead = \
367  str(IMP.atom.Residue(leaf).get_index())
368  ps_names.append(
369  mol_name + "_" + residue_in_bead + "_" +
370  residue_in_bead + "_" + str(copy_number))
371 
372  mod_id = 0 # index for each model in conform.
373  for rmf_file in [rmf_A, rmf_B]:
374 
375  models_name.append(rmf_file)
376  rmf_fh = RMF.open_rmf_file_read_only(os.path.join(path, rmf_file))
377  h = IMP.rmf.create_hierarchies(rmf_fh, m)[0]
378  n_frames = rmf_fh.get_number_of_frames()
379  print("Opening RMF file:", rmf_file, "with",
380  n_frames, "frames")
381 
382  n_cores = min(n_cores, n_frames) # to ensure n_per_core > 0
383  n_per_core = n_frames // n_cores
384  spacing = np.arange(0, n_per_core * n_cores, n_per_core)
385  mod_id_starts = spacing + mod_id
386  frame_number_starts = spacing
387  frame_number_ends = spacing + n_per_core
388  frame_number_ends[-1] = n_frames - 1
389  frame_lists = []
390  for i in range(n_cores):
391  a = frame_number_starts[i]
392  b = frame_number_ends[i]
393  frame_lists.append(np.arange(a, b + 1))
394  p = mp.Pool(n_cores)
395  args_list = [(rmf_file, frame_lists[i], mod_id_starts[i], resolution,
396  subunit_name, selection, path) for i in range(n_cores)]
397  results = p.map(get_conforms_per_frame_batch, args_list)
398  for res in results:
399  for m_id in res:
400  conform[m_id, :, :] = np.array(res[m_id])
401  mod_id += n_frames
402 
403  if symm_groups_file:
404  for grp in symm_groups:
405  if len(grp) == 0:
406  print("Warning. Symmetry option specified but created "
407  "symmetry group is empty. Cross-check the "
408  "specification of symmetry groups.")
409 
410  return ps_names, masses, radii, conform, symm_groups, models_name, n_models
411 
412 
413 def get_rmsds_matrix(conforms, mode, sup, cores, symm_groups=None):
414 
415  if (mode == "cpu_serial" and not sup) or (mode == "cpu_omp" and not sup):
416  calculator_name = "NOSUP_OMP_CALCULATOR"
417 
418  elif mode == "cpu_omp" and sup:
419  calculator_name = "QCP_OMP_CALCULATOR"
420  print("we are using QCP_OMP to compute RMSD")
421  elif mode == "cuda" and sup:
422  calculator_name = "QCP_CUDA_MEM_CALCULATOR"
423  else:
424  print("Wrong values to pyRMSD ! Please Fix")
425  exit()
426 
427  if symm_groups:
428  print("We have ambiguity.")
429  s1 = parse_symm_groups_for_pyrmsd(symm_groups)
430  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
431  calculator_name,
432  fittingCoordsets=conforms,
433  calcSymmetryGroups=s1,
434  fitSymmetryGroups=s1)
435 
436  else:
437  calculator = pyRMSD.RMSDCalculator.RMSDCalculator(
438  calculator_name, conforms)
439 
440  # additionally set number of cores for parallel calculator
441  if mode == "cpu_omp":
442  calculator.setNumberOfOpenMPThreads(int(cores))
443 
444  if not symm_groups:
445  rmsd = calculator.pairwiseRMSDMatrix()
446  else:
447  rmsd = []
448  for i in range(len(conforms) - 1):
449  rmsd += list(calculator.oneVsFollowing(i))
450  rmsd_matrix = CondensedMatrix(rmsd)
451  inner_data = rmsd_matrix.get_data()
452  np.save("Distances_Matrix.data", inner_data)
453 
454  return inner_data
Select non water and non hydrogen atoms.
Definition: pdb.h:245
Add mass to a particle.
Definition: Mass.h:23
A decorator to associate a particle with a part of a protein/DNA/RNA.
Definition: Fragment.h:20
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
double get_mass(ResidueType c)
Get the mass from the residue type.
GenericHierarchies get_leaves(Hierarchy mhd)
Get all the leaves of the bit of hierarchy.
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:73
static bool get_is_setup(Model *m, ParticleIndex pi)
Definition: Fragment.h:46
The standard decorator for manipulating molecular structures.
Ints get_index(const ParticlesTemp &particles, const Subset &subset, const Subsets &excluded)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
std::string get_molecule_name(Hierarchy h)
A decorator for a residue.
Definition: Residue.h:137
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
A decorator for a particle with x,y,z coordinates and a radius.
Definition: XYZR.h:27