IMP  2.1.0
The Integrative Modeling Platform
hierarchical_clustering.h
Go to the documentation of this file.
1 /**
2  * \file hierarchical_clustering.h
3  * \brief Agglomerative clustering algorithm
4  *
5  * Copyright 2007-2013 IMP Inventors. All rights reserved.
6  *
7  */
8 
9 #ifndef IMPEM2D_HIERARCHICAL_CLUSTERING_H
10 #define IMPEM2D_HIERARCHICAL_CLUSTERING_H
11 
12 #include "IMP/em2d/em2d_config.h"
13 #include "IMP/em2d/scores2D.h"
14 #include "IMP/base_types.h"
15 #include <IMP/base/log.h>
16 #include <vector>
17 #include <list>
18 #include <algorithm>
19 #include <limits>
20 #include <functional>
21 
22 IMPEM2D_BEGIN_NAMESPACE
23 
24 typedef std::pair<unsigned int ,double> pair_cluster_id_distance;
25 typedef std::list< pair_cluster_id_distance > list_cluster_id_distance;
26 typedef FloatsList VectorOfFloats;
27 typedef IntsList VectorOfInts;
28 
29 
30 
31 template<class T>
32 void print_vector(const std::vector<T> &v) {
33  for (unsigned int i=0;i<v.size();++i) {
34  std::cout << v[i] << " , ";
35  }
36  std::cout << std::endl;
37 }
38 
39 // A class to store the clusters generated during hierarchical clustering;
40 class IMPEM2DEXPORT ClusterSet {
41 public:
42 
43  /*!
44  \param[in] N Number of elements to be clustered
45  */
46  ClusterSet(unsigned int N);
47 
48  // join operation
49  /*!
50  \param[in] cluster_id1 id of the 1st cluster joined
51  \param[in] cluster_id2 id of the 2nd cluster merged
52  \param[in] distance_between_clusters distance between the merged clusters
53  */
54  void do_join_clusters(unsigned int cluster_id1,
55  unsigned int cluster_id2,
56  double distance_between_clusters);
57 
58  // Returns a vector with the ids of the elements that are in a cluster
59  // Does not contain any hierarchical information, just the members
60  /*!
61  \param[in] id of the cluster
62  */
63  Ints get_cluster_elements(unsigned int id) const;
64 
65 
66  // Get the biggest clusters that have distances below a given cutoff
67  /*!
68  \param[in] cutoff distance
69  \return A vector of Ints: Each Ints has the ids of all elements of the
70  cluster
71  */
72  Ints get_clusters_below_cutoff(double cutoff) const;
73 
74  // Return the elements of the cluster formed at a given step
75  /*!
76  */
77  Ints get_cluster_formed_at_step(unsigned int step) const;
78 
79  // Distance in the linkage matrix at a given step
80  double get_distance_at_step(unsigned int step) const;
81 
82  unsigned int get_id_for_cluster_at_step(unsigned int step) const {
83  return step+n_elements_;
84  }
85 
86 
87  // Returns the linkage matrix
88  /*!
89  \note Linkage matrix is a matrix A[N-1][3].
90  A[i][0] - id of the first cluster merged at step i
91  A[i][1] - id of the second cluster merged at step i
92  A[i][2] - distance between the clusters
93  */
94  FloatsList get_linkage_matrix() const;
95 
96  // Returns the linkage matrix compatible with Matlab format
97  /*!
98  \note This function merely adds 1 to the cluster ids, for compatibility
99  with Matlab.
100  Matlab format: http://www.mathworks.com/help/toolbox/stats/linkage.html
101  */
102  FloatsList get_linkage_matrix_in_matlab_format() const;
103 
104  // Returns the number of steps of clustering recorded
105  unsigned int get_number_of_steps() const {
106  return steps_;
107  }
108 
109  void show(std::ostream &out) const;
110 
111 
112 private:
113  void check_step_value(unsigned int s) const;
114  unsigned int get_step_from_id(unsigned int id) const {
115  return (id-n_elements_);
116  }
117 
118  unsigned int steps_;
119  unsigned int n_elements_; // number of elements to cluster
120  Ints joined_ids1_,joined_ids2_;
121  Floats cluster_distances_;
122  // each element of the outermost vector is a vector with all the elements
123  // in a cluster
124  IntsList clusters_elements_;
125 };
127 
128 
129 
130 
131 // Functor for hierarchical clustering based on single linkage
132 class IMPEM2DEXPORT SingleLinkage {
133 public:
134  SingleLinkage(){}
135  /*!
136  \param[in] id1 identity of cluster 1 to merge
137  \param[in] id2 identity of cluster 2 to merge
138  \param[in] cluster_set linkage matrix describing the contents of
139  clusters so far.
140  \param[in] distances A NxN matrix of distances(i,j) between the individual
141  members to cluster
142  \return Minimal distance between members of the clusters
143  \note the id of an isolated member n<N is n. The id of the cluster formed
144  at step i is i+N.
145  */
146  double operator()(unsigned int id1,
147  unsigned int id2,
148  const ClusterSet &cluster_set,
149  const FloatsList &distances ) const;
150 
151  void show(std::ostream &out) const {
152  out << "SingleLinkage";
153  };
154 };
155 IMP_VALUES(SingleLinkage,SingleLinkages);
156 
157 
158 
159 // Functor for hierarchical clustering based on complete linkage
160 class IMPEM2DEXPORT CompleteLinkage {
161 public:
162  CompleteLinkage(){}
163  // Distance between the clusters
164  /*!
165  \note See SingleLinkage class for the meaning of the arguments
166  \return Maximal distance between 2 members in the merged cluster
167 
168  */
169  double operator()(unsigned int id1,
170  unsigned int id2,
171  const ClusterSet &cluster_set,
172  const FloatsList &distances );
173 
174  void show(std::ostream &out) const {
175  out << "CompleteLinkage";
176  };
177 
178 };
179 IMP_VALUES(CompleteLinkage,CompleteLinkages);
180 
181 
182 
183 
184 // Functor for hierarchical clustering based on average-linkage
185 class IMPEM2DEXPORT AverageDistanceLinkage {
186 public:
187  AverageDistanceLinkage(){}
188  // Distance between the clusters
189  /*!
190  \note See SingleLinkage class for the meaning of the arguments
191  \return Average between all members of the merged cluster
192  */
193  double operator()(unsigned int id1,
194  unsigned int id2,
195  const ClusterSet &cluster_set,
196  const FloatsList &distances );
197 
198  void show(std::ostream &out) const {
199  out << "AverageDistanceLinkage";
200  };
201 };
202 IMP_VALUES(AverageDistanceLinkage,AverageDistanceLinkages);
203 
204 
205 
206 
207 // Function to perform agglomerative clustering
208 /*!
209  \param[in] distances Vector of Floats containing all the
210  possible distances(i,j) between elements to cluster. Given N elements to
211  cluster, there are N vectors of size N
212  \return a ClusterSet class containing all the clustering steps.
213 */
214 template<class LinkageFunction>
215 ClusterSet
217 // Based on:
218 // http://nlp.stanford.edu/IR-book/html/htmledition/
219 // time-complexity-of-hac-1.html)
220 
222  "starting hierarchical_agglomerative_clustering " << std::endl);
223 
224  unsigned int N = distances.size(); // number of elements
225  // Lists of distances between elements
226  // List n has members (i,distance_n_i).
227  std::vector< list_cluster_id_distance > lists(N);
228  // id of the cluster associated with each list
229  Ints cluster_id(N);
230  // All list active at the beginning
231  std::vector<bool> active_list(N);
232  std::fill(active_list.begin(),active_list.end(),true);
233  // Fill lists
234  for (unsigned int n=0;n<N;++n) {
235  for(unsigned int i=0;i<N;++i) {
236  if(i != n) {
237  lists[n].push_back(std::make_pair(i,distances[n][i]));
238  }
239  }
241  // At the beginning each list is associated with a cluster of one element
242  cluster_id[n]=n;
243  }
244 
245  ClusterSet cluster_set(N);
246  LinkageFunction linkage_function;
247  unsigned int steps = N-1;// Steps of clustering
248  // cluster algorithm
249  for (unsigned int k=0;k<steps;++k) {
250  IMP_LOG_TERSE( std::endl);
251  // Find the list that contains lower distance
252  double minimum_distance=std::numeric_limits<double>::max();
253  unsigned int l1=0;
254 
255  for (unsigned int j=0;j<N;++j) {
256  if(active_list[j]==true) {
257  // closest distance for list j
258  if(lists[j].front().second < minimum_distance) {
259  minimum_distance=lists[j].front().second;
260  l1=j;
261  }
262  }
263  } // list l1 contains lowest distance
264  // lowest distance is between list l1 and list l2
265  unsigned int l2=lists[l1].front().first;
266  minimum_distance=lists[l2].front().second;
267  IMP_LOG_TERSE("step " << k << ": joining clusters " << cluster_id[l1]
268  << " and " << cluster_id[l2]
269  << " to form cluster "
270  << cluster_set.get_id_for_cluster_at_step(k) << " distance = "
271  << minimum_distance << std::endl);
272  cluster_set.do_join_clusters(cluster_id[l1],
273  cluster_id[l2],
274  minimum_distance);
275  active_list[l2]=false;
276  cluster_id[l1]=cluster_set.get_id_for_cluster_at_step(k);
277  // clear list 1. It will be filled with distance values for the new cluster
278  lists[l1].clear();
279  // Update lists of distances
280  for (unsigned int i=0;i<N;++i) {
281  IMP_LOG_TERSE("Updating list of distances " << i << std::endl);
282  if(active_list[i]==true && i!=l1) {
283  IMP_LOG_TERSE("List " << i << " is active " << std::endl);
284  // Delete list elements that store distances to the merged clusters
285  list_cluster_id_distance::iterator it;
286  for (it=lists[i].begin() ; it!=lists[i].end() ; ++it) {
287  if((*it).first == l1 || (*it).first == l2 ) {
288  lists[i].erase(it);
289  --it;
290  }
291  }
292  // Update distances to the merged cluster
293  double dist = linkage_function(cluster_id[l1],
294  cluster_id[i],
295  cluster_set,
296  distances);
297  IMP_LOG_TERSE("Distance by linkage function "<< dist<< std::endl);
298  lists[i].push_back(std::make_pair(l1,dist));
299  lists[l1].push_back(std::make_pair(i,dist));
300  }
301  }
302  // Sort lists
303  for (unsigned int i=0;i<N;++i) {
304  if(active_list[i]==true) {
306  }
307  }
308  }
309  return cluster_set;
310 }
311 
312 IMPEM2D_END_NAMESPACE
313 
314 #endif /* IMPEM2D_HIERARCHICAL_CLUSTERING_H */
Import IMP/kernel/base_types.h in the namespace.
ClusterSet do_hierarchical_agglomerative_clustering(const FloatsList &distances)
#define IMP_VALUES(Name, PluralName)
Define the type for storing sets of values.
IMP::base::Vector< Floats > FloatsList
Standard way to pass a bunch of Floats values.
Definition: base/types.h:54
Scoring functions for 2D Copyright 2007-2013 IMP Inventors. All rights reserved.
#define IMP_LOG_TERSE(expr)
IMP::base::Vector< Ints > IntsList
Standard way to pass a bunch of Ints values.
Definition: base/types.h:56
void show(Hierarchy h, std::ostream &out=std::cout)
Print out a molecular hierarchy.
IMP::base::Vector< Float > Floats
Standard way to pass a bunch of Float values.
Definition: base/types.h:47
Logging and error reporting support.
Comparison of pairs by checking the second element.
Definition: scores2D.h:140
IMP::base::Vector< Int > Ints
Standard way to pass a bunch of Int values.
Definition: base/types.h:49