IMP logo
IMP Reference Guide  2.19.0
The Integrative Modeling Platform
em2d/clustering_of_pdb_models.py

This example clusters pdb models of an structure, chosen from a selection file. It is assumed that all the pdb files belong to the same structure and that the order of the atoms in the pdb files is the same in all files.

After the clustering procedure, a linkage matrix is generated.

1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 from __future__ import print_function
12 import IMP
13 import IMP.algebra
14 import IMP.core
15 import IMP.atom
16 import IMP.em2d
17 import sys
18 import csv
19 
20 IMP.setup_from_argv(sys.argv, "clustering of PDB models")
21 
22 """
23  Clustering of pdb models.
24  This script clusters pdb models of an structure, chosen from a
25  selection file.
26  - It is assumed that all the pdb files belong to the same structure
27  and that the order of the atoms in the pdb files is the same in all files
28  - After the clustering procedure, a linkage matrix is generated.
29 
30 
31 """
32 
33 if sys.platform == 'win32':
34  sys.stderr.write("this example does not currently work on Windows\n")
35  sys.exit(0)
36 
37 
38 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
39  """ ge the columns of a file:
40  cols - a list of columns to extract. E.g [0,3,5]
41  If empty, all the columns are extracted
42  lines starting with the comment character are ignored """
43  columns = [[] for i in cols]
44  # get a reader for the file
45  reader = csv.reader(
46  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
47  for row in reader:
48  if(row != [] and row[0][0] != comment): # not empty or comment row
49  if(cols == []):
50  for i in range(0, len(row)):
51  columns[i].append(row[i])
52  else:
53  for i in range(0, len(cols)):
54  columns[i].append(row[cols[i]])
55  return columns
56 
57 
58 def argmin(sequence):
59  """ Argmin function: Returns the pair (min_value,min_index),
60  where min_index is the index of the minimum value
61  """
62  min_value = sequence[0]
63  min_index = 0
64  for i in range(0, len(sequence)):
65  if(sequence[i] < min_value):
66  min_value = sequence[i]
67  min_index = i
68  return min_value, min_index
69 
70 
71 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
72 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
73 # Load models
74 print("Reading models ...")
75 model = IMP.Model()
77 coords = []
78 fn_models = IMP.em2d.read_selection_file(fn_selection)
79 n_models = len(fn_models)
80 hierarchies = []
81 for fn in fn_models:
82  fn_model = IMP.em2d.get_example_path(fn)
83  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
84  hierarchies.append(h)
86  coords.append([x.get_coordinates() for x in xyz])
87 
88 print("Computing matrix of RMSD ...")
89 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
90 transformations = [[[] for i in range(0, n_models)]
91  for j in range(0, n_models)]
92 # fill rmsd and transformations
93 for i in range(0, n_models):
94  for j in range(i + 1, n_models):
95  if(i != j):
97  coords[i],
98  coords[j])
99  transformations[i][j] = t
100  transformations[j][i] = t.get_inverse()
101  temp = [t.get_transformed(v) for v in coords[i]]
102  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
103  rmsds[i][j] = rmsd
104  rmsds[j][i] = rmsd
105 
106 # cluster
107 print("Clustering (Complete linkage method)...")
108 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
109 mat2 = cluster_set.get_linkage_matrix()
110 print("Complete Linkage Matrix")
111 for m in mat2:
112  print(m)
113 
114 # Read scores from the scores file
115 em2d_scores = get_columns(fn_em2d_scores, [1])
116 em2d_scores = em2d_scores[0]
117 
118 # get biggest clusters below a certain rmsd
119 rmsd_cutoff = 1.4
120 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
121 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
122 for c in clusters:
123  elems = cluster_set.get_cluster_elements(c)
124  scores_elements = []
125  for cid in elems:
126  scores_elements.append(em2d_scores[cid])
127  print("Cluster", c, ":", elems, scores_elements, end=' ')
128  # find model with best score
129  min_value, min_index = argmin(scores_elements)
130  min_elem_id = elems[min_index]
131  # The representative element is the one with the minimum em2d score
132  print("representative element", min_elem_id, min_value)
133  for i in elems:
134  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
135 
136  if(i != min_elem_id):
137  print("Writing element", i, "aligned to ", min_elem_id, ":",
138  pdb_name)
139  T = IMP.core.Transform(transformations[i][min_elem_id])
140  ps = IMP.atom.get_leaves(hierarchies[i])
141  for p in ps:
142  T.apply_index(model, p.get_particle_index())
143  else:
144  print("Writing representative element", min_elem_id, ":", pdb_name)
145  IMP.atom.write_pdb(hierarchies[i], pdb_name)