IMP  2.3.1
The Integrative Modeling Platform
em2d/clustering_of_pdb_models.py

This example clusters pdb models of an structure, chosen from a selection file. It is assumed that all the pdb files belong to the same structure and that the order of the atoms in the pdb files is the same in all files.

After the clustering procedure, a linkage matrix is generated.

1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 import IMP
12 import IMP.algebra
13 import IMP.core
14 import IMP.atom
15 import IMP.em2d
16 import os
17 import sys
18 import csv
19 """
20  Clustering of pdb models.
21  This script clusters pdb models of an structure, chosen from a
22  selection file.
23  - It is assumed that all the pdb files belong to the same structure
24  and that the order of the atoms in the pdb files is the same in all files
25  - After the clustering procedure, a linkage matrix is generated.
26 
27 
28 """
29 
30 if sys.platform == 'win32':
31  sys.stderr.write("this example does not currently work on Windows\n")
32  sys.exit(0)
33 
34 
35 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
36  """ ge the columns of a file:
37  cols - a list of columns to extract. E.g [0,3,5]
38  If empty, all the columns are extracted
39  lines starting with the comment character are ignored """
40  columns = [[] for i in cols]
41  # get a reader for the file
42  reader = csv.reader(
43  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
44  for row in reader:
45  if(row != [] and row[0][0] != comment): # not empty or comment row
46  if(cols == []):
47  for i in range(0, len(row)):
48  columns[i].append(row[i])
49  else:
50  for i in range(0, len(cols)):
51  columns[i].append(row[cols[i]])
52  return columns
53 
54 
55 def argmin(sequence):
56  """ Argmin function: Returns the pair (min_value,min_index),
57  where min_index is the index of the minimum value
58  """
59  min_value = sequence[0]
60  min_index = 0
61  for i in range(0, len(sequence)):
62 # print "argmin - checking ",sequence[i]
63  if(sequence[i] < min_value):
64  min_value = sequence[i]
65  min_index = i
66 # print "argmin - selecting ",min_value,min_index
67  return min_value, min_index
68 
69 #***************************
70 
71 
72 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
73 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
74 # Load models
75 print "Reading models ..."
76 model = IMP.kernel.Model()
78 coords = []
79 fn_models = IMP.em2d.read_selection_file(fn_selection)
80 n_models = len(fn_models)
81 hierarchies = []
82 for fn in fn_models:
83  fn_model = IMP.em2d.get_example_path(fn)
84  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
85  hierarchies.append(h)
87  coords.append([x.get_coordinates() for x in xyz])
88 
89 print "Computing matrix of RMSD ..."
90 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
91 transformations = [[[] for i in range(0, n_models)]
92  for j in range(0, n_models)]
93 # fill rmsd and transformations
94 for i in xrange(0, n_models):
95  for j in xrange(i + 1, n_models):
96  if(i != j):
98  coords[i],
99  coords[j])
100  transformations[i][j] = t
101  transformations[j][i] = t.get_inverse()
102  temp = [t.get_transformed(v) for v in coords[i]]
103  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
104  rmsds[i][j] = rmsd
105  rmsds[j][i] = rmsd
106 
107 # cluster
108 print "Clustering (Complete linkage method)..."
109 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
110 mat2 = cluster_set.get_linkage_matrix()
111 print "Complete Linkage Matrix"
112 for m in mat2:
113  print m
114 
115 # Read scores from the scores file
116 em2d_scores = get_columns(fn_em2d_scores, [1])
117 em2d_scores = em2d_scores[0]
118 
119 # get biggest clusters below a certain rmsd
120 rmsd_cutoff = 1.4
121 print "clusters below cutoff", rmsd_cutoff, "Angstroms"
122 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
123 for c in clusters:
124  elems = cluster_set.get_cluster_elements(c)
125  scores_elements = []
126  for cid in elems:
127  scores_elements.append(em2d_scores[cid])
128  print "Cluster", c, ":", elems, scores_elements,
129  # find model with best score
130  min_value, min_index = argmin(scores_elements)
131  min_elem_id = elems[min_index]
132  # The representative element is the one with the minimum em2d score
133  print "representative element", min_elem_id, min_value
134  for i in elems:
135  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
136 
137  if(i != min_elem_id):
138  print "Writing element", i, "aligned to ", min_elem_id, ":", pdb_name
139  T = IMP.core.Transform(transformations[i][min_elem_id])
140  ps = IMP.atom.get_leaves(hierarchies[i])
141  for p in ps:
142  T.apply(p)
143  else:
144  print "Writing representative element", min_elem_id, ":", pdb_name
145  IMP.atom.write_pdb(hierarchies[i], pdb_name)