IMP logo
IMP Reference Guide  develop.63b38c487d,2024/12/22
The Integrative Modeling Platform
clustering_of_pdb_models.py
1 ## \example em2d/clustering_of_pdb_models.py
2 # This example clusters pdb models of an structure, chosen from a
3 # selection file.
4 #
5 # It is assumed that all the pdb files belong to the same structure
6 # and that the order of the atoms in the pdb files is the same in all files.
7 #
8 # After the clustering procedure, a linkage matrix is generated.
9 #
10 
11 import IMP
12 import IMP.algebra
13 import IMP.core
14 import IMP.atom
15 import IMP.em2d
16 import sys
17 import csv
18 
19 IMP.setup_from_argv(sys.argv, "clustering of PDB models")
20 
21 """
22  Clustering of pdb models.
23  This script clusters pdb models of an structure, chosen from a
24  selection file.
25  - It is assumed that all the pdb files belong to the same structure
26  and that the order of the atoms in the pdb files is the same in all files
27  - After the clustering procedure, a linkage matrix is generated.
28 
29 
30 """
31 
32 if sys.platform == 'win32':
33  sys.stderr.write("this example does not currently work on Windows\n")
34  sys.exit(0)
35 
36 
37 def get_columns(fn, cols=[], delimiter=" ", comment="#"):
38  """ ge the columns of a file:
39  cols - a list of columns to extract. E.g [0,3,5]
40  If empty, all the columns are extracted
41  lines starting with the comment character are ignored """
42  columns = [[] for i in cols]
43  # get a reader for the file
44  reader = csv.reader(
45  open(fn, "r"), delimiter=delimiter, skipinitialspace=True)
46  for row in reader:
47  if(row != [] and row[0][0] != comment): # not empty or comment row
48  if(cols == []):
49  for i in range(0, len(row)):
50  columns[i].append(row[i])
51  else:
52  for i in range(0, len(cols)):
53  columns[i].append(row[cols[i]])
54  return columns
55 
56 
57 def argmin(sequence):
58  """ Argmin function: Returns the pair (min_value,min_index),
59  where min_index is the index of the minimum value
60  """
61  min_value = sequence[0]
62  min_index = 0
63  for i in range(0, len(sequence)):
64  if(sequence[i] < min_value):
65  min_value = sequence[i]
66  min_index = i
67  return min_value, min_index
68 
69 
70 fn_selection = IMP.em2d.get_example_path("all-models-1z5s.sel")
71 fn_em2d_scores = IMP.em2d.get_example_path("em2d_scores_for_clustering.data")
72 # Load models
73 print("Reading models ...")
74 model = IMP.Model()
76 coords = []
77 fn_models = IMP.em2d.read_selection_file(fn_selection)
78 n_models = len(fn_models)
79 hierarchies = []
80 for fn in fn_models:
81  fn_model = IMP.em2d.get_example_path(fn)
82  h = IMP.atom.read_pdb(fn_model, model, ssel, True)
83  hierarchies.append(h)
85  coords.append([x.get_coordinates() for x in xyz])
86 
87 print("Computing matrix of RMSD ...")
88 rmsds = [[0.0 for i in range(0, n_models)] for n in range(0, n_models)]
89 transformations = [[[] for i in range(0, n_models)]
90  for j in range(0, n_models)]
91 # fill rmsd and transformations
92 for i in range(0, n_models):
93  for j in range(i + 1, n_models):
94  if(i != j):
96  coords[i],
97  coords[j])
98  transformations[i][j] = t
99  transformations[j][i] = t.get_inverse()
100  temp = [t.get_transformed(v) for v in coords[i]]
101  rmsd = IMP.algebra.get_rmsd(temp, coords[j])
102  rmsds[i][j] = rmsd
103  rmsds[j][i] = rmsd
104 
105 # cluster
106 print("Clustering (Complete linkage method)...")
107 cluster_set = IMP.em2d.do_hierarchical_clustering_complete_linkage(rmsds)
108 mat2 = cluster_set.get_linkage_matrix()
109 print("Complete Linkage Matrix")
110 for m in mat2:
111  print(m)
112 
113 # Read scores from the scores file
114 em2d_scores = get_columns(fn_em2d_scores, [1])
115 em2d_scores = em2d_scores[0]
116 
117 # get biggest clusters below a certain rmsd
118 rmsd_cutoff = 1.4
119 print("clusters below cutoff", rmsd_cutoff, "Angstroms")
120 clusters = cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
121 for c in clusters:
122  elems = cluster_set.get_cluster_elements(c)
123  scores_elements = []
124  for cid in elems:
125  scores_elements.append(em2d_scores[cid])
126  print("Cluster", c, ":", elems, scores_elements, end=' ')
127  # find model with best score
128  min_value, min_index = argmin(scores_elements)
129  min_elem_id = elems[min_index]
130  # The representative element is the one with the minimum em2d score
131  print("representative element", min_elem_id, min_value)
132  for i in elems:
133  pdb_name = "cluster-%03d-elem-%03d.pdb" % (c, i)
134 
135  if(i != min_elem_id):
136  print("Writing element", i, "aligned to ", min_elem_id, ":",
137  pdb_name)
138  T = IMP.core.Transform(transformations[i][min_elem_id])
139  ps = IMP.atom.get_leaves(hierarchies[i])
140  for p in ps:
141  T.apply_index(model, p.get_particle_index())
142  else:
143  print("Writing representative element", min_elem_id, ":", pdb_name)
144  IMP.atom.write_pdb(hierarchies[i], pdb_name)
Restraints using electron microscopy 2D images (class averages).
Strings setup_from_argv(const Strings &argv, std::string description, std::string positional_description, int num_positional)
std::string get_example_path(std::string file_name)
Return the full path to one of this module's example files.
void write_pdb(const Selection &mhd, TextOutput out, unsigned int model=1)
void read_pdb(TextInput input, int model, Hierarchy h)
Class for storing model, its restraints, constraints, and particles.
Definition: Model.h:86
Select all non-alternative ATOM records.
Definition: pdb.h:128
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
Apply a transformation to a passed particle.
Definition: Transform.h:23
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Transformation3D get_transformation_aligning_first_to_second(Vector3Ds a, Vector3Ds b)
Functionality for loading, creating, manipulating and scoring atomic structures.
Hierarchies get_leaves(const Selection &h)