IMP logo
IMP Reference Guide  develop.031dafb4d2,2024/05/16
The Integrative Modeling Platform
clustering_of_pdb_models.py
1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 from __future__ import print_function
12 import IMP
13 import IMP.algebra
14 import IMP.core
15 import IMP.atom
16 import IMP.em2d
17 import sys
18 import csv
19 
20 IMP.setup_from_argv(sys.argv, "clustering of PDB models")
21 
22 """
23  Clustering of pdb models.
24  This script clusters pdb models of an structure, chosen from a
25  selection file.
26  - It is assumed that all the pdb files belong to the same structure
27  and that the order of the atoms in the pdb files is the same in all files
28  - After the clustering procedure, a linkage matrix is generated.
29 
30 
31 """
32 
33 if sys.platform == 'win32':
34  sys.stderr.write("this example does not currently work on Windows\n")
35  sys.exit(0)
36 
37 
38 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
39  """ ge the columns of a file:
40  cols - a list of columns to extract. E.g [0,3,5]
41  If empty, all the columns are extracted
42  lines starting with the comment character are ignored """
43  columns = [[] for i in cols]
44  # get a reader for the file
45  reader = csv.reader(
46  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
47  for row in reader:
48  if(row != [] and row[0][0] != comment): # not empty or comment row
49  if(cols == []):
50  for i in range(0, len(row)):
51  columns[i].append(row[i])
52  else:
53  for i in range(0, len(cols)):
54  columns[i].append(row[cols[i]])
55  return columns
56 
57 
58 def argmin(sequence):
59  """ Argmin function: Returns the pair (min_value,min_index),
60  where min_index is the index of the minimum value
61  """
62  min_value = sequence[0]
63  min_index = 0
64  for i in range(0, len(sequence)):
65  if(sequence[i] < min_value):
66  min_value = sequence[i]
67  min_index = i
68  return min_value, min_index
69 
70 
71 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
72 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
73 # Load models
74 print("Reading models ...")
75 model = IMP.Model()
77 coords = []
78 fn_models = IMP.em2d.read_selection_file(fn_selection)
79 n_models = len(fn_models)
80 hierarchies = []
81 for fn in fn_models:
82  fn_model = IMP.em2d.get_example_path(fn)
83  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
84  hierarchies.append(h)
86  coords.append([x.get_coordinates() for x in xyz])
87 
88 print("Computing matrix of RMSD ...")
89 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
90 transformations = [[[] for i in range(0, n_models)]
91  for j in range(0, n_models)]
92 # fill rmsd and transformations
93 for i in range(0, n_models):
94  for j in range(i + 1, n_models):
95  if(i != j):
97  coords[i],
98  coords[j])
99  transformations[i][j] = t
100  transformations[j][i] = t.get_inverse()
101  temp = [t.get_transformed(v) for v in coords[i]]
102  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
103  rmsds[i][j] = rmsd
104  rmsds[j][i] = rmsd
105 
106 # cluster
107 print("Clustering (Complete linkage method)...")
108 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
109 mat2 = cluster_set.get_linkage_matrix()
110 print("Complete Linkage Matrix")
111 for m in mat2:
112  print(m)
113 
114 # Read scores from the scores file
115 em2d_scores = get_columns(fn_em2d_scores, [1])
116 em2d_scores = em2d_scores[0]
117 
118 # get biggest clusters below a certain rmsd
119 rmsd_cutoff = 1.4
120 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
121 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
122 for c in clusters:
123  elems = cluster_set.get_cluster_elements(c)
124  scores_elements = []
125  for cid in elems:
126  scores_elements.append(em2d_scores[cid])
127  print("Cluster", c, ":", elems, scores_elements, end=' ')
128  # find model with best score
129  min_value, min_index = argmin(scores_elements)
130  min_elem_id = elems[min_index]
131  # The representative element is the one with the minimum em2d score
132  print("representative element", min_elem_id, min_value)
133  for i in elems:
134  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
135 
136  if(i != min_elem_id):
137  print("Writing element", i, "aligned to ", min_elem_id, ":",
138  pdb_name)
139  T = IMP.core.Transform(transformations[i][min_elem_id])
140  ps = IMP.atom.get_leaves(hierarchies[i])
141  for p in ps:
142  T.apply_index(model, p.get_particle_index())
143  else:
144  print("Writing representative element", min_elem_id, ":", pdb_name)
145  IMP.atom.write_pdb(hierarchies[i], pdb_name)
Restraints using electron microscopy 2D images (class averages).
Strings setup_from_argv(const Strings &argv, std::string description, std::string positional_description, int num_positional)
std::string get_example_path(std::string file_name)
Return the full path to one of this module's example files.
void write_pdb(const Selection &mhd, TextOutput out, unsigned int model=1)
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
Select all non-alternative ATOM records.
Definition: pdb.h:128
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Apply a transformation to a passed particle.
Definition: Transform.h:23
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Functionality for loading, creating, manipulating and scoring atomic structures.
Hierarchies get_leaves(const Selection &h)