IMP  2.0.1
The Integrative Modeling Platform
clustering_of_pdb_models.py
1 ## \example em2d/clustering_of_pdb_models.py
2 ## This example clusters pdb models of an structure, chosen from a
3 ## selection file.
4 ##
5 ## It is assumed that all the pdb files belong to the same structure
6 ## and that the order of the atoms in the pdb files is the same in all files.
7 ##
8 ## After the clustering procedure, a linkage matrix is generated.
9 ##
10 
11 import IMP
12 import IMP.core as core
13 import IMP.atom as atom
14 import IMP.em2d as em2d
15 import os
16 import sys
17 import csv
18 """
19  Clustering of pdb models.
20  This script clusters pdb models of an structure, chosen from a
21  selection file.
22  - It is assumed that all the pdb files belong to the same structure
23  and that the order of the atoms in the pdb files is the same in all files
24  - After the clustering procedure, a linkage matrix is generated.
25 
26 
27 """
28 
29 if sys.platform == 'win32':
30  sys.stderr.write("this example does not currently work on Windows\n")
31  sys.exit(0)
32 
33 def get_columns(fn,cols=[],delimiter=" ",comment="#"):
34  """ ge the columns of a file:
35  cols - a list of columns to extract. E.g [0,3,5]
36  If empty, all the columns are extracted
37  lines starting with the comment character are ignored """
38  columns=[[] for i in cols]
39  # get a reader for the file
40  reader=csv.reader(open(fn,"r"),delimiter=delimiter,skipinitialspace=True)
41  for row in reader:
42  if(row!=[] and row[0][0]!=comment): # not empty or comment row
43  if(cols==[]):
44  for i in range(0,len(row)):
45  columns[i].append(row[i])
46  else:
47  for i in range(0,len(cols)):
48  columns[i].append(row[cols[i]])
49  return columns
50 
51 
52 def argmin(sequence):
53  """ Argmin function: Returns the pair (min_value,min_index),
54  where min_index is the index of the minimum value
55  """
56  min_value = sequence[0]
57  min_index = 0
58  for i in range(0,len(sequence)):
59 # print "argmin - checking ",sequence[i]
60  if(sequence[i]<min_value):
61  min_value = sequence[i]
62  min_index =i
63 # print "argmin - selecting ",min_value,min_index
64  return min_value,min_index
65 
66 #***************************
67 
68 
69 fn_selection = em2d.get_example_path("all-models-1z5s.sel")
70 fn_em2d_scores = em2d.get_example_path("em2d_scores_for_clustering.data")
71 # Load models
72 print "Reading models ..."
73 model = IMP.Model()
74 ssel = atom.ATOMPDBSelector()
75 coords =[]
76 fn_models = em2d.read_selection_file(fn_selection)
77 n_models = len(fn_models)
78 hierarchies=[]
79 for fn in fn_models:
80  fn_model=em2d.get_example_path(fn)
81  h=atom.read_pdb(fn_model,model,ssel,True)
82  hierarchies.append(h)
83  xyz=core.XYZs(atom.get_leaves(h))
84  coords.append( [x.get_coordinates() for x in xyz])
85 
86 print "Computing matrix of RMSD ..."
87 rmsds=[[0.0 for i in range(0,n_models)] for n in range(0,n_models)]
88 transformations=[[[] for i in range(0,n_models)] for j in range(0,n_models)]
89 # fill rmsd and transformations
90 for i in xrange(0,n_models):
91  for j in xrange(i+1,n_models):
92  if(i!=j):
94  coords[i],
95  coords[j])
96  transformations[i][j]=t
97  transformations[j][i]=t.get_inverse()
98  temp = [t.get_transformed(v) for v in coords[i]]
99  rmsd=IMP.atom.get_rmsd(temp,coords[j])
100  rmsds[i][j]=rmsd
101  rmsds[j][i]=rmsd
102 
103 # cluster
104 print "Clustering (Complete linkage method)..."
105 cluster_set = em2d.do_hierarchical_clustering_complete_linkage(rmsds)
106 mat2=cluster_set.get_linkage_matrix()
107 print "Complete Linkage Matrix"
108 for m in mat2:
109  print m
110 
111 # Read scores from the scores file
112 em2d_scores = get_columns(fn_em2d_scores,[1])
113 em2d_scores = em2d_scores[0]
114 
115 # get biggest clusters below a certain rmsd
116 rmsd_cutoff=1.4
117 print "clusters below cutoff",rmsd_cutoff,"Angstroms"
118 clusters=cluster_set.get_clusters_below_cutoff(rmsd_cutoff)
119 for c in clusters:
120  elems=cluster_set.get_cluster_elements(c)
121  scores_elements=[]
122  for cid in elems:
123  scores_elements.append(em2d_scores[cid])
124  print "Cluster",c,":",elems,scores_elements,
125  # find model with best score
126  min_value,min_index= argmin(scores_elements)
127  min_elem_id=elems[min_index]
128  # The representative element is the one with the minimum em2d score
129  print "representative element",min_elem_id,min_value
130  for i in elems:
131  pdb_name="cluster-%03d-elem-%03d.pdb" % (c,i)
132 
133  if(i!=min_elem_id):
134  print "Writing element",i,"aligned to ",min_elem_id,":",pdb_name
135  T=core.Transform(transformations[i][min_elem_id])
136  ps=atom.get_leaves(hierarchies[i])
137  for p in ps:
138  T.apply(p)
139  else:
140  print "Writing representative element",min_elem_id,":",pdb_name
141  atom.write_pdb(hierarchies[i],pdb_name)