IMP logo
IMP Reference Guide  2.7.0
The Integrative Modeling Platform
clustering_of_pdb_models.py
1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 from __future__ import print_function
12 import IMP
13 import IMP.algebra
14 import IMP.core
15 import IMP.atom
16 import IMP.em2d
17 import os
18 import sys
19 import csv
20 
21 IMP.setup_from_argv(sys.argv, "clustering of PDB models")
22 
23 """
24  Clustering of pdb models.
25  This script clusters pdb models of an structure, chosen from a
26  selection file.
27  - It is assumed that all the pdb files belong to the same structure
28  and that the order of the atoms in the pdb files is the same in all files
29  - After the clustering procedure, a linkage matrix is generated.
30 
31 
32 """
33 
34 if sys.platform == 'win32':
35  sys.stderr.write("this example does not currently work on Windows\n")
36  sys.exit(0)
37 
38 
39 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
40  """ ge the columns of a file:
41  cols - a list of columns to extract. E.g [0,3,5]
42  If empty, all the columns are extracted
43  lines starting with the comment character are ignored """
44  columns = [[] for i in cols]
45  # get a reader for the file
46  reader = csv.reader(
47  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
48  for row in reader:
49  if(row != [] and row[0][0] != comment): # not empty or comment row
50  if(cols == []):
51  for i in range(0, len(row)):
52  columns[i].append(row[i])
53  else:
54  for i in range(0, len(cols)):
55  columns[i].append(row[cols[i]])
56  return columns
57 
58 
59 def argmin(sequence):
60  """ Argmin function: Returns the pair (min_value,min_index),
61  where min_index is the index of the minimum value
62  """
63  min_value = sequence[0]
64  min_index = 0
65  for i in range(0, len(sequence)):
66 # print "argmin - checking ",sequence[i]
67  if(sequence[i] < min_value):
68  min_value = sequence[i]
69  min_index = i
70 # print "argmin - selecting ",min_value,min_index
71  return min_value, min_index
72 
73 #***************************
74 
75 
76 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
77 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
78 # Load models
79 print("Reading models ...")
80 model = IMP.Model()
82 coords = []
83 fn_models = IMP.em2d.read_selection_file(fn_selection)
84 n_models = len(fn_models)
85 hierarchies = []
86 for fn in fn_models:
87  fn_model = IMP.em2d.get_example_path(fn)
88  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
89  hierarchies.append(h)
91  coords.append([x.get_coordinates() for x in xyz])
92 
93 print("Computing matrix of RMSD ...")
94 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
95 transformations = [[[] for i in range(0, n_models)]
96  for j in range(0, n_models)]
97 # fill rmsd and transformations
98 for i in range(0, n_models):
99  for j in range(i + 1, n_models):
100  if(i != j):
102  coords[i],
103  coords[j])
104  transformations[i][j] = t
105  transformations[j][i] = t.get_inverse()
106  temp = [t.get_transformed(v) for v in coords[i]]
107  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
108  rmsds[i][j] = rmsd
109  rmsds[j][i] = rmsd
110 
111 # cluster
112 print("Clustering (Complete linkage method)...")
113 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
114 mat2 = cluster_set.get_linkage_matrix()
115 print("Complete Linkage Matrix")
116 for m in mat2:
117  print(m)
118 
119 # Read scores from the scores file
120 em2d_scores = get_columns(fn_em2d_scores, [1])
121 em2d_scores = em2d_scores[0]
122 
123 # get biggest clusters below a certain rmsd
124 rmsd_cutoff = 1.4
125 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
126 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
127 for c in clusters:
128  elems = cluster_set.get_cluster_elements(c)
129  scores_elements = []
130  for cid in elems:
131  scores_elements.append(em2d_scores[cid])
132  print("Cluster", c, ":", elems, scores_elements, end=' ')
133  # find model with best score
134  min_value, min_index = argmin(scores_elements)
135  min_elem_id = elems[min_index]
136  # The representative element is the one with the minimum em2d score
137  print("representative element", min_elem_id, min_value)
138  for i in elems:
139  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
140 
141  if(i != min_elem_id):
142  print("Writing element", i, "aligned to ", min_elem_id, ":", pdb_name)
143  T = IMP.core.Transform(transformations[i][min_elem_id])
144  ps = IMP.atom.get_leaves(hierarchies[i])
145  for p in ps:
146  T.apply_index(model, p.get_particle_index())
147  else:
148  print("Writing representative element", min_elem_id, ":", pdb_name)
149  IMP.atom.write_pdb(hierarchies[i], pdb_name)
Restraints using electron microscopy 2D images (class averages).
Strings setup_from_argv(const Strings &argv, std::string description, std::string positional_description, int num_positional)
std::string get_example_path(std::string file_name)
Return the full path to one of this module's example files.
void write_pdb(const Selection &mhd, TextOutput out, unsigned int model=1)
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:72
Select all non-alternative ATOM records.
Definition: pdb.h:64
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Apply a transformation to a passed particle.
Definition: Transform.h:20
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Functionality for loading, creating, manipulating and scoring atomic structures.
Hierarchies get_leaves(const Selection &h)