IMP  2.2.0
The Integrative Modeling Platform
hierarchical_clustering.h
Go to the documentation of this file.
1 /**
2  * \file IMP/em2d/hierarchical_clustering.h
3  * \brief Agglomerative clustering algorithm
4  *
5  * Copyright 2007-2014 IMP Inventors. All rights reserved.
6  *
7  */
8 
9 #ifndef IMPEM2D_HIERARCHICAL_CLUSTERING_H
10 #define IMPEM2D_HIERARCHICAL_CLUSTERING_H
11 
12 #include "IMP/em2d/em2d_config.h"
13 #include "IMP/em2d/scores2D.h"
14 #include "IMP/base_types.h"
15 #include <IMP/base/log.h>
16 #include <vector>
17 #include <list>
18 #include <algorithm>
19 #include <limits>
20 #include <functional>
21 
22 IMPEM2D_BEGIN_NAMESPACE
23 
24 typedef std::pair<unsigned int, double> pair_cluster_id_distance;
25 typedef std::list<pair_cluster_id_distance> list_cluster_id_distance;
26 typedef FloatsList VectorOfFloats;
27 typedef IntsList VectorOfInts;
28 
29 template <class T>
30 void print_vector(const std::vector<T> &v) {
31  for (unsigned int i = 0; i < v.size(); ++i) {
32  std::cout << v[i] << " , ";
33  }
34  std::cout << std::endl;
35 }
36 
37 // A class to store the clusters generated during hierarchical clustering;
38 class IMPEM2DEXPORT ClusterSet {
39  public:
40  /*!
41  \param[in] N Number of elements to be clustered
42  */
43  ClusterSet(unsigned int N);
44 
45  // join operation
46  /*!
47  \param[in] cluster_id1 id of the 1st cluster joined
48  \param[in] cluster_id2 id of the 2nd cluster merged
49  \param[in] distance_between_clusters distance between the merged clusters
50  */
51  void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2,
52  double distance_between_clusters);
53 
54  // Returns a vector with the ids of the elements that are in a cluster
55  // Does not contain any hierarchical information, just the members
56  /*!
57  \param[in] id of the cluster
58  */
59  Ints get_cluster_elements(unsigned int id) const;
60 
61  // Get the biggest clusters that have distances below a given cutoff
62  /*!
63  \param[in] cutoff distance
64  \return A vector of Ints: Each Ints has the ids of all elements of the
65  cluster
66  */
67  Ints get_clusters_below_cutoff(double cutoff) const;
68 
69  // Return the elements of the cluster formed at a given step
70  /*!
71  */
72  Ints get_cluster_formed_at_step(unsigned int step) const;
73 
74  // Distance in the linkage matrix at a given step
75  double get_distance_at_step(unsigned int step) const;
76 
77  unsigned int get_id_for_cluster_at_step(unsigned int step) const {
78  return step + n_elements_;
79  }
80 
81  // Returns the linkage matrix
82  /*!
83  \note Linkage matrix is a matrix A[N-1][3].
84  A[i][0] - id of the first cluster merged at step i
85  A[i][1] - id of the second cluster merged at step i
86  A[i][2] - distance between the clusters
87  */
88  FloatsList get_linkage_matrix() const;
89 
90  // Returns the linkage matrix compatible with Matlab format
91  /*!
92  \note This function merely adds 1 to the cluster ids, for compatibility
93  with Matlab.
94  Matlab format: http://www.mathworks.com/help/toolbox/stats/linkage.html
95  */
96  FloatsList get_linkage_matrix_in_matlab_format() const;
97 
98  // Returns the number of steps of clustering recorded
99  unsigned int get_number_of_steps() const { return steps_; }
100 
101  void show(std::ostream &out) const;
102 
103  private:
104  void check_step_value(unsigned int s) const;
105  unsigned int get_step_from_id(unsigned int id) const {
106  return (id - n_elements_);
107  }
108 
109  unsigned int steps_;
110  unsigned int n_elements_; // number of elements to cluster
111  Ints joined_ids1_, joined_ids2_;
112  Floats cluster_distances_;
113  // each element of the outermost vector is a vector with all the elements
114  // in a cluster
115  IntsList clusters_elements_;
116 };
117 IMP_VALUES(ClusterSet, ClusterSets);
118 
119 // Functor for hierarchical clustering based on single linkage
120 class IMPEM2DEXPORT SingleLinkage {
121  public:
122  SingleLinkage() {}
123  /*!
124  \param[in] id1 identity of cluster 1 to merge
125  \param[in] id2 identity of cluster 2 to merge
126  \param[in] cluster_set linkage matrix describing the contents of
127  clusters so far.
128  \param[in] distances A NxN matrix of distances(i,j) between the individual
129  members to cluster
130  \return Minimal distance between members of the clusters
131  \note the id of an isolated member n<N is n. The id of the cluster formed
132  at step i is i+N.
133  */
134  double operator()(unsigned int id1, unsigned int id2,
135  const ClusterSet &cluster_set,
136  const FloatsList &distances) const;
137 
138  void show(std::ostream &out) const {
139  out << "SingleLinkage";
140  };
141 };
142 IMP_VALUES(SingleLinkage, SingleLinkages);
143 
144 // Functor for hierarchical clustering based on complete linkage
145 class IMPEM2DEXPORT CompleteLinkage {
146  public:
147  CompleteLinkage() {}
148  // Distance between the clusters
149  /*!
150  \note See SingleLinkage class for the meaning of the arguments
151  \return Maximal distance between 2 members in the merged cluster
152 
153  */
154  double operator()(unsigned int id1, unsigned int id2,
155  const ClusterSet &cluster_set, const FloatsList &distances);
156 
157  void show(std::ostream &out) const {
158  out << "CompleteLinkage";
159  };
160 };
161 IMP_VALUES(CompleteLinkage, CompleteLinkages);
162 
163 // Functor for hierarchical clustering based on average-linkage
164 class IMPEM2DEXPORT AverageDistanceLinkage {
165  public:
166  AverageDistanceLinkage() {}
167  // Distance between the clusters
168  /*!
169  \note See SingleLinkage class for the meaning of the arguments
170  \return Average between all members of the merged cluster
171  */
172  double operator()(unsigned int id1, unsigned int id2,
173  const ClusterSet &cluster_set, const FloatsList &distances);
174 
175  void show(std::ostream &out) const {
176  out << "AverageDistanceLinkage";
177  };
178 };
179 IMP_VALUES(AverageDistanceLinkage, AverageDistanceLinkages);
180 
181 // Function to perform agglomerative clustering
182 /*!
183  \param[in] distances Vector of Floats containing all the
184  possible distances(i,j) between elements to cluster. Given N elements to
185  cluster, there are N vectors of size N
186  \return a ClusterSet class containing all the clustering steps.
187 */
188 template <class LinkageFunction>
190  const FloatsList &distances) {
191  // Based on:
192  // http://nlp.stanford.edu/IR-book/html/htmledition/
193  // time-complexity-of-hac-1.html)
194 
195  IMP_LOG_TERSE("starting hierarchical_agglomerative_clustering " << std::endl);
196 
197  unsigned int N = distances.size(); // number of elements
198  // Lists of distances between elements
199  // List n has members (i,distance_n_i).
200  std::vector<list_cluster_id_distance> lists(N);
201  // id of the cluster associated with each list
202  Ints cluster_id(N);
203  // All list active at the beginning
204  std::vector<bool> active_list(N);
205  std::fill(active_list.begin(), active_list.end(), true);
206  // Fill lists
207  for (unsigned int n = 0; n < N; ++n) {
208  for (unsigned int i = 0; i < N; ++i) {
209  if (i != n) {
210  lists[n].push_back(std::make_pair(i, distances[n][i]));
211  }
212  }
214  // At the beginning each list is associated with a cluster of one element
215  cluster_id[n] = n;
216  }
217 
218  ClusterSet cluster_set(N);
219  LinkageFunction linkage_function;
220  unsigned int steps = N - 1; // Steps of clustering
221  // cluster algorithm
222  for (unsigned int k = 0; k < steps; ++k) {
223  IMP_LOG_TERSE(std::endl);
224  // Find the list that contains lower distance
225  double minimum_distance = std::numeric_limits<double>::max();
226  unsigned int l1 = 0;
227 
228  for (unsigned int j = 0; j < N; ++j) {
229  if (active_list[j] == true) {
230  // closest distance for list j
231  if (lists[j].front().second < minimum_distance) {
232  minimum_distance = lists[j].front().second;
233  l1 = j;
234  }
235  }
236  } // list l1 contains lowest distance
237  // lowest distance is between list l1 and list l2
238  unsigned int l2 = lists[l1].front().first;
239  minimum_distance = lists[l2].front().second;
240  IMP_LOG_TERSE("step " << k << ": joining clusters " << cluster_id[l1]
241  << " and " << cluster_id[l2] << " to form cluster "
242  << cluster_set.get_id_for_cluster_at_step(k)
243  << " distance = " << minimum_distance << std::endl);
244  cluster_set.do_join_clusters(cluster_id[l1], cluster_id[l2],
245  minimum_distance);
246  active_list[l2] = false;
247  cluster_id[l1] = cluster_set.get_id_for_cluster_at_step(k);
248  // clear list 1. It will be filled with distance values for the new cluster
249  lists[l1].clear();
250  // Update lists of distances
251  for (unsigned int i = 0; i < N; ++i) {
252  IMP_LOG_TERSE("Updating list of distances " << i << std::endl);
253  if (active_list[i] == true && i != l1) {
254  IMP_LOG_TERSE("List " << i << " is active " << std::endl);
255  // Delete list elements that store distances to the merged clusters
256  list_cluster_id_distance::iterator it;
257  for (it = lists[i].begin(); it != lists[i].end(); ++it) {
258  if ((*it).first == l1 || (*it).first == l2) {
259  lists[i].erase(it);
260  --it;
261  }
262  }
263  // Update distances to the merged cluster
264  double dist = linkage_function(cluster_id[l1], cluster_id[i],
265  cluster_set, distances);
266  IMP_LOG_TERSE("Distance by linkage function " << dist << std::endl);
267  lists[i].push_back(std::make_pair(l1, dist));
268  lists[l1].push_back(std::make_pair(i, dist));
269  }
270  }
271  // Sort lists
272  for (unsigned int i = 0; i < N; ++i) {
273  if (active_list[i] == true) {
275  }
276  }
277  }
278  return cluster_set;
279 }
280 
281 IMPEM2D_END_NAMESPACE
282 
283 #endif /* IMPEM2D_HIERARCHICAL_CLUSTERING_H */
Import IMP/kernel/base_types.h in the namespace.
ClusterSet do_hierarchical_agglomerative_clustering(const FloatsList &distances)
#define IMP_VALUES(Name, PluralName)
Define the type for storing sets of values.
IMP::base::Vector< Floats > FloatsList
Standard way to pass a bunch of Floats values.
Definition: base/types.h:54
Scoring functions for 2D Copyright 2007-2014 IMP Inventors. All rights reserved.
#define IMP_LOG_TERSE(expr)
IMP::base::Vector< Ints > IntsList
Standard way to pass a bunch of Ints values.
Definition: base/types.h:56
void show(Hierarchy h, std::ostream &out=std::cout)
Print out a molecular hierarchy.
IMP::base::Vector< Float > Floats
Standard way to pass a bunch of Float values.
Definition: base/types.h:47
Logging and error reporting support.
Comparison of pairs by checking the second element.
Definition: scores2D.h:133
IMP::base::Vector< Int > Ints
Standard way to pass a bunch of Int values.
Definition: base/types.h:49