IMP logo
IMP Reference Guide  develop.879b8d2544,2020/09/30
The Integrative Modeling Platform
em2d/clustering_of_pdb_models.py

This example clusters pdb models of an structure, chosen from a selection file. It is assumed that all the pdb files belong to the same structure and that the order of the atoms in the pdb files is the same in all files.

After the clustering procedure, a linkage matrix is generated.

1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 from __future__ import print_function
12 import IMP
13 import IMP.algebra
14 import IMP.core
15 import IMP.atom
16 import IMP.em2d
17 import os
18 import sys
19 import csv
20 
21 IMP.setup_from_argv(sys.argv, "clustering of PDB models")
22 
23 """
24  Clustering of pdb models.
25  This script clusters pdb models of an structure, chosen from a
26  selection file.
27  - It is assumed that all the pdb files belong to the same structure
28  and that the order of the atoms in the pdb files is the same in all files
29  - After the clustering procedure, a linkage matrix is generated.
30 
31 
32 """
33 
34 if sys.platform == 'win32':
35  sys.stderr.write("this example does not currently work on Windows\n")
36  sys.exit(0)
37 
38 
39 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
40  """ ge the columns of a file:
41  cols - a list of columns to extract. E.g [0,3,5]
42  If empty, all the columns are extracted
43  lines starting with the comment character are ignored """
44  columns = [[] for i in cols]
45  # get a reader for the file
46  reader = csv.reader(
47  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
48  for row in reader:
49  if(row != [] and row[0][0] != comment): # not empty or comment row
50  if(cols == []):
51  for i in range(0, len(row)):
52  columns[i].append(row[i])
53  else:
54  for i in range(0, len(cols)):
55  columns[i].append(row[cols[i]])
56  return columns
57 
58 
59 def argmin(sequence):
60  """ Argmin function: Returns the pair (min_value,min_index),
61  where min_index is the index of the minimum value
62  """
63  min_value = sequence[0]
64  min_index = 0
65  for i in range(0, len(sequence)):
66 # print "argmin - checking ",sequence[i]
67  if(sequence[i] < min_value):
68  min_value = sequence[i]
69  min_index = i
70 # print "argmin - selecting ",min_value,min_index
71  return min_value, min_index
72 
73 #***************************
74 
75 
76 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
77 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
78 # Load models
79 print("Reading models ...")
80 model = IMP.Model()
82 coords = []
83 fn_models = IMP.em2d.read_selection_file(fn_selection)
84 n_models = len(fn_models)
85 hierarchies = []
86 for fn in fn_models:
87  fn_model = IMP.em2d.get_example_path(fn)
88  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
89  hierarchies.append(h)
91  coords.append([x.get_coordinates() for x in xyz])
92 
93 print("Computing matrix of RMSD ...")
94 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
95 transformations = [[[] for i in range(0, n_models)]
96  for j in range(0, n_models)]
97 # fill rmsd and transformations
98 for i in range(0, n_models):
99  for j in range(i + 1, n_models):
100  if(i != j):
102  coords[i],
103  coords[j])
104  transformations[i][j] = t
105  transformations[j][i] = t.get_inverse()
106  temp = [t.get_transformed(v) for v in coords[i]]
107  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
108  rmsds[i][j] = rmsd
109  rmsds[j][i] = rmsd
110 
111 # cluster
112 print("Clustering (Complete linkage method)...")
113 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
114 mat2 = cluster_set.get_linkage_matrix()
115 print("Complete Linkage Matrix")
116 for m in mat2:
117  print(m)
118 
119 # Read scores from the scores file
120 em2d_scores = get_columns(fn_em2d_scores, [1])
121 em2d_scores = em2d_scores[0]
122 
123 # get biggest clusters below a certain rmsd
124 rmsd_cutoff = 1.4
125 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
126 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
127 for c in clusters:
128  elems = cluster_set.get_cluster_elements(c)
129  scores_elements = []
130  for cid in elems:
131  scores_elements.append(em2d_scores[cid])
132  print("Cluster", c, ":", elems, scores_elements, end=' ')
133  # find model with best score
134  min_value, min_index = argmin(scores_elements)
135  min_elem_id = elems[min_index]
136  # The representative element is the one with the minimum em2d score
137  print("representative element", min_elem_id, min_value)
138  for i in elems:
139  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
140 
141  if(i != min_elem_id):
142  print("Writing element", i, "aligned to ", min_elem_id, ":", pdb_name)
143  T = IMP.core.Transform(transformations[i][min_elem_id])
144  ps = IMP.atom.get_leaves(hierarchies[i])
145  for p in ps:
146  T.apply_index(model, p.get_particle_index())
147  else:
148  print("Writing representative element", min_elem_id, ":", pdb_name)
149  IMP.atom.write_pdb(hierarchies[i], pdb_name)