IMP  2.2.1
The Integrative Modeling Platform
clustering_of_pdb_models.py
1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 import IMP
12 import IMP.algebra
13 import IMP.core
14 import IMP.atom
15 import IMP.em2d
16 import os
17 import sys
18 import csv
19 """
20  Clustering of pdb models.
21  This script clusters pdb models of an structure, chosen from a
22  selection file.
23  - It is assumed that all the pdb files belong to the same structure
24  and that the order of the atoms in the pdb files is the same in all files
25  - After the clustering procedure, a linkage matrix is generated.
26 
27 
28 """
29 
30 if sys.platform == 'win32':
31  sys.stderr.write("this example does not currently work on Windows\n")
32  sys.exit(0)
33 
34 
35 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
36  """ ge the columns of a file:
37  cols - a list of columns to extract. E.g [0,3,5]
38  If empty, all the columns are extracted
39  lines starting with the comment character are ignored """
40  columns = [[] for i in cols]
41  # get a reader for the file
42  reader = csv.reader(
43  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
44  for row in reader:
45  if(row != [] and row[0][0] != comment): # not empty or comment row
46  if(cols == []):
47  for i in range(0, len(row)):
48  columns[i].append(row[i])
49  else:
50  for i in range(0, len(cols)):
51  columns[i].append(row[cols[i]])
52  return columns
53 
54 
55 def argmin(sequence):
56  """ Argmin function: Returns the pair (min_value,min_index),
57  where min_index is the index of the minimum value
58  """
59  min_value = sequence[0]
60  min_index = 0
61  for i in range(0, len(sequence)):
62 # print "argmin - checking ",sequence[i]
63  if(sequence[i] < min_value):
64  min_value = sequence[i]
65  min_index = i
66 # print "argmin - selecting ",min_value,min_index
67  return min_value, min_index
68 
69 #***************************
70 
71 
72 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
73 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
74 # Load models
75 print "Reading models ..."
76 model = IMP.kernel.Model()
78 coords = []
79 fn_models = IMP.em2d.read_selection_file(fn_selection)
80 n_models = len(fn_models)
81 hierarchies = []
82 for fn in fn_models:
83  fn_model = IMP.em2d.get_example_path(fn)
84  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
85  hierarchies.append(h)
87  coords.append([x.get_coordinates() for x in xyz])
88 
89 print "Computing matrix of RMSD ..."
90 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
91 transformations = [[[] for i in range(0, n_models)]
92  for j in range(0, n_models)]
93 # fill rmsd and transformations
94 for i in xrange(0, n_models):
95  for j in xrange(i + 1, n_models):
96  if(i != j):
98  coords[i],
99  coords[j])
100  transformations[i][j] = t
101  transformations[j][i] = t.get_inverse()
102  temp = [t.get_transformed(v) for v in coords[i]]
103  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
104  rmsds[i][j] = rmsd
105  rmsds[j][i] = rmsd
106 
107 # cluster
108 print "Clustering (Complete linkage method)..."
109 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
110 mat2 = cluster_set.get_linkage_matrix()
111 print "Complete Linkage Matrix"
112 for m in mat2:
113  print m
114 
115 # Read scores from the scores file
116 em2d_scores = get_columns(fn_em2d_scores, [1])
117 em2d_scores = em2d_scores[0]
118 
119 # get biggest clusters below a certain rmsd
120 rmsd_cutoff = 1.4
121 print "clusters below cutoff", rmsd_cutoff, "Angstroms"
122 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
123 for c in clusters:
124  elems = cluster_set.get_cluster_elements(c)
125  scores_elements = []
126  for cid in elems:
127  scores_elements.append(em2d_scores[cid])
128  print "Cluster", c, ":", elems, scores_elements,
129  # find model with best score
130  min_value, min_index = argmin(scores_elements)
131  min_elem_id = elems[min_index]
132  # The representative element is the one with the minimum em2d score
133  print "representative element", min_elem_id, min_value
134  for i in elems:
135  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
136 
137  if(i != min_elem_id):
138  print "Writing element", i, "aligned to ", min_elem_id, ":", pdb_name
139  T = IMP.core.Transform(transformations[i][min_elem_id])
140  ps = IMP.atom.get_leaves(hierarchies[i])
141  for p in ps:
142  T.apply(p)
143  else:
144  print "Writing representative element", min_elem_id, ":", pdb_name
145  IMP.atom.write_pdb(hierarchies[i], pdb_name)
void write_pdb(const Selection &mhd, base::TextOutput out, unsigned int model=1)
See IMP.em2d for more information.
Definition: align2D.h:18
std::string get_example_path(std::string file_name)
Return the path to installed example data for this module.
Select all non-alternative ATOM records.
Definition: pdb.h:63
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Apply a transformation to a passed particle.
Definition: Transform.h:22
See IMP.core for more information.
See IMP.algebra for more information.
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
See IMP.atom for more information.
void read_pdb(base::TextInput input, int model, Hierarchy h)
Hierarchies get_leaves(const Selection &h)
Class for storing model, its restraints, constraints, and particles.
Definition: kernel/Model.h:72