IMP  2.1.1
The Integrative Modeling Platform
em2d/clustering_of_pdb_models.py

This example clusters pdb models of an structure, chosen from a selection file. It is assumed that all the pdb files belong to the same structure and that the order of the atoms in the pdb files is the same in all files.

After the clustering procedure, a linkage matrix is generated.

1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 import IMP
12 import IMP.core as core
13 import IMP.atom as atom
14 import IMP.em2d as em2d
15 import os
16 import sys
17 import csv
18 """
19  Clustering of pdb models.
20  This script clusters pdb models of an structure, chosen from a
21  selection file.
22  - It is assumed that all the pdb files belong to the same structure
23  and that the order of the atoms in the pdb files is the same in all files
24  - After the clustering procedure, a linkage matrix is generated.
25 
26 
27 """
28 
29 if sys.platform == 'win32':
30  sys.stderr.write("this example does not currently work on Windows\n")
31  sys.exit(0)
32 
33 
34 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
35  """ ge the columns of a file:
36  cols - a list of columns to extract. E.g [0,3,5]
37  If empty, all the columns are extracted
38  lines starting with the comment character are ignored """
39  columns = [[] for i in cols]
40  # get a reader for the file
41  reader = csv.reader(
42  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
43  for row in reader:
44  if(row != [] and row[0][0] != comment): # not empty or comment row
45  if(cols == []):
46  for i in range(0, len(row)):
47  columns[i].append(row[i])
48  else:
49  for i in range(0, len(cols)):
50  columns[i].append(row[cols[i]])
51  return columns
52 
53 
54 def argmin(sequence):
55  """ Argmin function: Returns the pair (min_value,min_index),
56  where min_index is the index of the minimum value
57  """
58  min_value = sequence[0]
59  min_index = 0
60  for i in range(0, len(sequence)):
61 # print "argmin - checking ",sequence[i]
62  if(sequence[i] < min_value):
63  min_value = sequence[i]
64  min_index = i
65 # print "argmin - selecting ",min_value,min_index
66  return min_value, min_index
67 
68 #***************************
69 
70 
71 fn_selection = em2d.get_example_path("all-models-1z5s.sel")
72 fn_em2d_scores = em2d.get_example_path("em2d_scores_for_clustering.data")
73 # Load models
74 print "Reading models ..."
75 model = IMP.kernel.Model()
76 ssel = atom.ATOMPDBSelector()
77 coords = []
78 fn_models = em2d.read_selection_file(fn_selection)
79 n_models = len(fn_models)
80 hierarchies = []
81 for fn in fn_models:
82  fn_model = em2d.get_example_path(fn)
83  h = atom.read_pdb(fn_model, model, ssel, True)
84  hierarchies.append(h)
85  xyz = core.XYZs(atom.get_leaves(h))
86  coords.append([x.get_coordinates() for x in xyz])
87 
88 print "Computing matrix of RMSD ..."
89 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
90 transformations = [[[] for i in range(0, n_models)]
91  for j in range(0, n_models)]
92 # fill rmsd and transformations
93 for i in xrange(0, n_models):
94  for j in xrange(i + 1, n_models):
95  if(i != j):
97  coords[i],
98  coords[j])
99  transformations[i][j] = t
100  transformations[j][i] = t.get_inverse()
101  temp = [t.get_transformed(v) for v in coords[i]]
102  rmsd = IMP.atom.get_rmsd(temp, coords[j])
103  rmsds[i][j] = rmsd
104  rmsds[j][i] = rmsd
105 
106 # cluster
107 print "Clustering (Complete linkage method)..."
108 cluster_set = em2d.do_hierarchical_clustering_complete_linkage(rmsds)
109 mat2 = cluster_set.get_linkage_matrix()
110 print "Complete Linkage Matrix"
111 for m in mat2:
112  print m
113 
114 # Read scores from the scores file
115 em2d_scores = get_columns(fn_em2d_scores, [1])
116 em2d_scores = em2d_scores[0]
117 
118 # get biggest clusters below a certain rmsd
119 rmsd_cutoff = 1.4
120 print "clusters below cutoff", rmsd_cutoff, "Angstroms"
121 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
122 for c in clusters:
123  elems = cluster_set.get_cluster_elements(c)
124  scores_elements = []
125  for cid in elems:
126  scores_elements.append(em2d_scores[cid])
127  print "Cluster", c, ":", elems, scores_elements,
128  # find model with best score
129  min_value, min_index = argmin(scores_elements)
130  min_elem_id = elems[min_index]
131  # The representative element is the one with the minimum em2d score
132  print "representative element", min_elem_id, min_value
133  for i in elems:
134  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
135 
136  if(i != min_elem_id):
137  print "Writing element", i, "aligned to ", min_elem_id, ":", pdb_name
138  T = core.Transform(transformations[i][min_elem_id])
139  ps = atom.get_leaves(hierarchies[i])
140  for p in ps:
141  T.apply(p)
142  else:
143  print "Writing representative element", min_elem_id, ":", pdb_name
144  atom.write_pdb(hierarchies[i], pdb_name)