IMP  2.0.0
The Integrative Modeling Platform
hierarchical_clustering.h
Go to the documentation of this file.
1 /**
2  * \file hierarchical_clustering.h
3  * \brief Agglomerative clustering algorithm
4  *
5  * Copyright 2007-2013 IMP Inventors. All rights reserved.
6  *
7  */
8 
9 #ifndef IMPEM2D_HIERARCHICAL_CLUSTERING_H
10 #define IMPEM2D_HIERARCHICAL_CLUSTERING_H
11 
12 #include "IMP/em2d/em2d_config.h"
13 #include "IMP/em2d/scores2D.h"
14 #include "IMP/base_types.h"
15 #include <IMP/log.h>
16 #include <vector>
17 #include <list>
18 #include <algorithm>
19 #include <limits>
20 #include <functional>
21 
22 IMPEM2D_BEGIN_NAMESPACE
23 
24 typedef std::pair<unsigned int ,double> pair_cluster_id_distance;
25 typedef std::list< pair_cluster_id_distance > list_cluster_id_distance;
26 typedef FloatsList VectorOfFloats;
27 typedef IntsList VectorOfInts;
28 
29 
30 
31 template<class T>
32 void print_vector(const std::vector<T> &v) {
33  for (unsigned int i=0;i<v.size();++i) {
34  std::cout << v[i] << " , ";
35  }
36  std::cout << std::endl;
37 }
38 
39 // A class to store the clusters generated during hierarchical clustering;
40 class IMPEM2DEXPORT ClusterSet {
41 public:
42 
43  /*!
44  \param[in] N Number of elements to be clustered
45  */
46  ClusterSet(unsigned int N);
47 
48  // join operation
49  /*!
50  \param[in] cluster_id1 id of the 1st cluster joined
51  \param[in] cluster_id2 id of the 2nd cluster merged
52  \param[in] distance_between_clusters distance between the merged clusters
53  */
54  void do_join_clusters(unsigned int cluster_id1,
55  unsigned int cluster_id2,
56  double distance_between_clusters);
57 
58  // Returns a vector with the ids of the elements that are in a cluster
59  // Does not contain any hierarchical information, just the members
60  /*!
61  \param[in] id of the cluster
62  */
63  Ints get_cluster_elements(unsigned int id) const;
64 
65 
66  // Get the biggest clusters that have distances below a given cutoff
67  /*!
68  \param[in] cutoff distance
69  \param[out] A vector of Ints: Each Ints has the ids of all elements of the
70  cluster
71  */
72  Ints get_clusters_below_cutoff(double cutoff) const;
73 
74  // Return the elements of the cluster formed at a given step
75  /*!
76  */
77  Ints get_cluster_formed_at_step(unsigned int step) const;
78 
79  // Distance in the linkage matrix at a given step
80  double get_distance_at_step(unsigned int step) const;
81 
82  unsigned int get_id_for_cluster_at_step(unsigned int step) const {
83  return step+n_elements_;
84  }
85 
86 
87  // Returns the linkage matrix
88  /*!
89  \note Linkage matrix is a matrix A[N-1][3].
90  A[i][0] - id of the first cluster merged at step i
91  A[i][1] - id of the second cluster merged at step i
92  A[i][2] - distance between the clusters
93  */
94  FloatsList get_linkage_matrix() const;
95 
96  // Returns the linkage matrix compatible with Matlab format
97  /*!
98  \note This function merely adds 1 to the cluster ids, for compatibility
99  with Matlab.
100  Matlab format: http://www.mathworks.com/help/toolbox/stats/linkage.html
101  */
102  FloatsList get_linkage_matrix_in_matlab_format() const;
103 
104  // Returns the number of steps of clustering recorded
105  unsigned int get_number_of_steps() const {
106  return steps_;
107  }
108 
109  void show(std::ostream &out) const;
110 
111 
112 private:
113  void check_step_value(unsigned int s) const;
114  unsigned int get_step_from_id(unsigned int id) const {
115  return (id-n_elements_);
116  }
117 
118  unsigned int steps_;
119  unsigned int n_elements_; // number of elements to cluster
120  Ints joined_ids1_,joined_ids2_;
121  Floats cluster_distances_;
122  // each element of the outermost vector is a vector with all the elements
123  // in a cluster
124  IntsList clusters_elements_;
125 };
127 
128 
129 
130 
131 // Functor for hierarchical clustering based on single linkage
132 class IMPEM2DEXPORT SingleLinkage {
133 public:
134  SingleLinkage(){}
135  /*!
136  \param[in] id1 identity of cluster 1 to merge
137  \param[in] id2 identity of cluster 2 to merge
138  \param[in] linkage matrix describing the contents of clusters so far.
139  \param[in] distances. A NxN matrix of distances(i,j) between the individual
140  members to cluster
141  \param[out] Minimal distance between members of the clusters
142  \note the id of an isolated member n<N is n. The id of the cluster formed
143  at step i is i+N.
144  */
145  double operator()(unsigned int id1,
146  unsigned int id2,
147  const ClusterSet &cluster_set,
148  const FloatsList &distances ) const;
149 
150  void show(std::ostream &out) const {
151  out << "SingleLinkage";
152  };
153 };
154 IMP_VALUES(SingleLinkage,SingleLinkages);
155 
156 
157 
158 // Functor for hierarchical clustering based on complete linkage
159 class IMPEM2DEXPORT CompleteLinkage {
160 public:
161  CompleteLinkage(){}
162  // Distance between the clusters
163  /*!
164  \note See SingleLinkage class for the meaning of the arguments
165  \param[out] Maximal distance between 2 members in the merged cluster
166 
167  */
168  double operator()(unsigned int id1,
169  unsigned int id2,
170  const ClusterSet &cluster_set,
171  const FloatsList &distances );
172 
173  void show(std::ostream &out) const {
174  out << "CompleteLinkage";
175  };
176 
177 };
178 IMP_VALUES(CompleteLinkage,CompleteLinkages);
179 
180 
181 
182 
183 // Functor for hierarchical clustering based on average-linkage
184 class IMPEM2DEXPORT AverageDistanceLinkage {
185 public:
186  AverageDistanceLinkage(){}
187  // Distance between the clusters
188  /*!
189  \note See SingleLinkage class for the meaning of the arguments
190  \return Average between all members of the merged cluster
191  */
192  double operator()(unsigned int id1,
193  unsigned int id2,
194  const ClusterSet &cluster_set,
195  const FloatsList &distances );
196 
197  void show(std::ostream &out) const {
198  out << "AverageDistanceLinkage";
199  };
200 };
201 IMP_VALUES(AverageDistanceLinkage,AverageDistanceLinkages);
202 
203 
204 
205 
206 // Function to perform agglomerative clustering
207 /*!
208  \param[in] distances Vector of Floats containing all the
209  possible distances(i,j) between elements to cluster. Given N elements to
210  cluster, there are N vectors of size N
211  \param[in] Function to calculate distance between clusters.
212  \param[out] a ClusterSet class containing all the clustering steps.
213 */
214 template<class LinkageFunction>
215 ClusterSet
217 // Based on:
218 // http://nlp.stanford.edu/IR-book/html/htmledition/
219 // time-complexity-of-hac-1.html)
220 
222  "starting hierarchical_agglomerative_clustering " << std::endl);
223 
224  unsigned int N = distances.size(); // number of elements
225  // Lists of distances between elements
226  // List n has members (i,distance_n_i).
227  std::vector< list_cluster_id_distance > lists(N);
228  // id of the cluster associated with each list
229  Ints cluster_id(N);
230  // All list active at the beginning
231  std::vector<bool> active_list(N);
232  std::fill(active_list.begin(),active_list.end(),true);
233  // Fill lists
234  for (unsigned int n=0;n<N;++n) {
235  for(unsigned int i=0;i<N;++i) {
236  if(i != n) {
237  lists[n].push_back(std::make_pair(i,distances[n][i]));
238  }
239  }
241  // At the beginning each list is associated with a cluster of one element
242  cluster_id[n]=n;
243  }
244 
245  ClusterSet cluster_set(N);
246  LinkageFunction linkage_function;
247  unsigned int steps = N-1;// Steps of clustering
248  // cluster algorithm
249  for (unsigned int k=0;k<steps;++k) {
250  IMP_LOG_TERSE( std::endl);
251  // Find the list that contains lower distance
252  double minimum_distance=std::numeric_limits<double>::max();
253  unsigned int l1=0;
254 
255  for (unsigned int j=0;j<N;++j) {
256  if(active_list[j]==true) {
257  // closest distance for list j
258  if(lists[j].front().second < minimum_distance) {
259  minimum_distance=lists[j].front().second;
260  l1=j;
261  }
262  }
263  } // list l1 contains lowest distance
264  // lowest distance is between list l1 and list l2
265  unsigned int l2=lists[l1].front().first;
266  minimum_distance=lists[l2].front().second;
267  IMP_LOG_TERSE("step " << k << ": joining clusters " << cluster_id[l1]
268  << " and " << cluster_id[l2]
269  << " to form cluster "
270  << cluster_set.get_id_for_cluster_at_step(k) << " distance = "
271  << minimum_distance << std::endl);
272  cluster_set.do_join_clusters(cluster_id[l1],
273  cluster_id[l2],
274  minimum_distance);
275  active_list[l2]=false;
276  cluster_id[l1]=cluster_set.get_id_for_cluster_at_step(k);
277  // clear list 1. It will be filled with distance values for the new cluster
278  lists[l1].clear();
279  // Update lists of distances
280  for (unsigned int i=0;i<N;++i) {
281  IMP_LOG_TERSE("Updating list of distances " << i << std::endl);
282  if(active_list[i]==true && i!=l1) {
283  IMP_LOG_TERSE("List " << i << " is active " << std::endl);
284  // Delete list elements that store distances to the merged clusters
285  list_cluster_id_distance::iterator it;
286  for (it=lists[i].begin() ; it!=lists[i].end() ; ++it) {
287  if((*it).first == l1 || (*it).first == l2 ) {
288  lists[i].erase(it);
289  --it;
290  }
291  }
292  // Update distances to the merged cluster
293  double dist = linkage_function(cluster_id[l1],
294  cluster_id[i],
295  cluster_set,
296  distances);
297  IMP_LOG_TERSE("Distance by linkage function "<< dist<< std::endl);
298  lists[i].push_back(std::make_pair(l1,dist));
299  lists[l1].push_back(std::make_pair(i,dist));
300  }
301  }
302  // Sort lists
303  for (unsigned int i=0;i<N;++i) {
304  if(active_list[i]==true) {
306  }
307  }
308  }
309  return cluster_set;
310 }
311 
312 IMPEM2D_END_NAMESPACE
313 
314 #endif /* IMPEM2D_HIERARCHICAL_CLUSTERING_H */