IMP logo
IMP Reference Guide  develop.4390929e3e,2024/11/12
The Integrative Modeling Platform
em2d/clustering_of_pdb_models.py

This example clusters pdb models of an structure, chosen from a selection file. It is assumed that all the pdb files belong to the same structure and that the order of the atoms in the pdb files is the same in all files.

After the clustering procedure, a linkage matrix is generated.

1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 import IMP
12 import IMP.algebra
13 import IMP.core
14 import IMP.atom
15 import IMP.em2d
16 import sys
17 import csv
18 
19 IMP.setup_from_argv(sys.argv, "clustering of PDB models")
20 
21 """
22  Clustering of pdb models.
23  This script clusters pdb models of an structure, chosen from a
24  selection file.
25  - It is assumed that all the pdb files belong to the same structure
26  and that the order of the atoms in the pdb files is the same in all files
27  - After the clustering procedure, a linkage matrix is generated.
28 
29 
30 """
31 
32 if sys.platform == 'win32':
33  sys.stderr.write("this example does not currently work on Windows\n")
34  sys.exit(0)
35 
36 
37 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
38  """ ge the columns of a file:
39  cols - a list of columns to extract. E.g [0,3,5]
40  If empty, all the columns are extracted
41  lines starting with the comment character are ignored """
42  columns = [[] for i in cols]
43  # get a reader for the file
44  reader = csv.reader(
45  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
46  for row in reader:
47  if(row != [] and row[0][0] != comment): # not empty or comment row
48  if(cols == []):
49  for i in range(0, len(row)):
50  columns[i].append(row[i])
51  else:
52  for i in range(0, len(cols)):
53  columns[i].append(row[cols[i]])
54  return columns
55 
56 
57 def argmin(sequence):
58  """ Argmin function: Returns the pair (min_value,min_index),
59  where min_index is the index of the minimum value
60  """
61  min_value = sequence[0]
62  min_index = 0
63  for i in range(0, len(sequence)):
64  if(sequence[i] < min_value):
65  min_value = sequence[i]
66  min_index = i
67  return min_value, min_index
68 
69 
70 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
71 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
72 # Load models
73 print("Reading models ...")
74 model = IMP.Model()
76 coords = []
77 fn_models = IMP.em2d.read_selection_file(fn_selection)
78 n_models = len(fn_models)
79 hierarchies = []
80 for fn in fn_models:
81  fn_model = IMP.em2d.get_example_path(fn)
82  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
83  hierarchies.append(h)
85  coords.append([x.get_coordinates() for x in xyz])
86 
87 print("Computing matrix of RMSD ...")
88 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
89 transformations = [[[] for i in range(0, n_models)]
90  for j in range(0, n_models)]
91 # fill rmsd and transformations
92 for i in range(0, n_models):
93  for j in range(i + 1, n_models):
94  if(i != j):
96  coords[i],
97  coords[j])
98  transformations[i][j] = t
99  transformations[j][i] = t.get_inverse()
100  temp = [t.get_transformed(v) for v in coords[i]]
101  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
102  rmsds[i][j] = rmsd
103  rmsds[j][i] = rmsd
104 
105 # cluster
106 print("Clustering (Complete linkage method)...")
107 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
108 mat2 = cluster_set.get_linkage_matrix()
109 print("Complete Linkage Matrix")
110 for m in mat2:
111  print(m)
112 
113 # Read scores from the scores file
114 em2d_scores = get_columns(fn_em2d_scores, [1])
115 em2d_scores = em2d_scores[0]
116 
117 # get biggest clusters below a certain rmsd
118 rmsd_cutoff = 1.4
119 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
120 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
121 for c in clusters:
122  elems = cluster_set.get_cluster_elements(c)
123  scores_elements = []
124  for cid in elems:
125  scores_elements.append(em2d_scores[cid])
126  print("Cluster", c, ":", elems, scores_elements, end=' ')
127  # find model with best score
128  min_value, min_index = argmin(scores_elements)
129  min_elem_id = elems[min_index]
130  # The representative element is the one with the minimum em2d score
131  print("representative element", min_elem_id, min_value)
132  for i in elems:
133  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
134 
135  if(i != min_elem_id):
136  print("Writing element", i, "aligned to ", min_elem_id, ":",
137  pdb_name)
138  T = IMP.core.Transform(transformations[i][min_elem_id])
139  ps = IMP.atom.get_leaves(hierarchies[i])
140  for p in ps:
141  T.apply_index(model, p.get_particle_index())
142  else:
143  print("Writing representative element", min_elem_id, ":", pdb_name)
144  IMP.atom.write_pdb(hierarchies[i], pdb_name)