IMP  2.4.0
The Integrative Modeling Platform
hierarchical_clustering.h
Go to the documentation of this file.
1 /**
2  * \file IMP/em2d/hierarchical_clustering.h
3  * \brief Agglomerative clustering algorithm
4  *
5  * Copyright 2007-2015 IMP Inventors. All rights reserved.
6  *
7  */
8 
9 #ifndef IMPEM2D_HIERARCHICAL_CLUSTERING_H
10 #define IMPEM2D_HIERARCHICAL_CLUSTERING_H
11 
12 #include "IMP/em2d/em2d_config.h"
13 #include "IMP/em2d/scores2D.h"
14 #include "IMP/base_types.h"
15 #include <IMP/base/log.h>
16 #include <vector>
17 #include <list>
18 #include <algorithm>
19 #include <limits>
20 #include <functional>
21 
22 IMPEM2D_BEGIN_NAMESPACE
23 
24 typedef std::pair<unsigned int, double> pair_cluster_id_distance;
25 typedef std::list<pair_cluster_id_distance> list_cluster_id_distance;
26 typedef FloatsList VectorOfFloats;
27 typedef IntsList VectorOfInts;
28 
29 template <class T>
30 void print_vector(const std::vector<T> &v) {
31  for (unsigned int i = 0; i < v.size(); ++i) {
32  std::cout << v[i] << " , ";
33  }
34  std::cout << std::endl;
35 }
36 
37 //! A class to store the clusters generated during hierarchical clustering
38 class IMPEM2DEXPORT ClusterSet {
39  public:
40  /**
41  \param[in] N Number of elements to be clustered
42  */
43  ClusterSet(unsigned int N);
44 
45  //! join operation
46  /** \param[in] cluster_id1 id of the 1st cluster joined
47  \param[in] cluster_id2 id of the 2nd cluster merged
48  \param[in] distance_between_clusters distance between the merged clusters
49  */
50  void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2,
51  double distance_between_clusters);
52 
53  //! Returns a vector with the ids of the elements that are in a cluster
54  /** Does not contain any hierarchical information, just the members
55  \param[in] id of the cluster
56  */
57  Ints get_cluster_elements(unsigned int id) const;
58 
59  //! Get the biggest clusters that have distances below a given cutoff
60  /** \param[in] cutoff distance
61  \return A vector of Ints: Each Ints has the ids of all elements of the
62  cluster
63  */
64  Ints get_clusters_below_cutoff(double cutoff) const;
65 
66  //! Return the elements of the cluster formed at a given step
67  Ints get_cluster_formed_at_step(unsigned int step) const;
68 
69  //! Distance in the linkage matrix at a given step
70  double get_distance_at_step(unsigned int step) const;
71 
72  unsigned int get_id_for_cluster_at_step(unsigned int step) const {
73  return step + n_elements_;
74  }
75 
76  //! Returns the linkage matrix
77  /**
78  \note Linkage matrix is a matrix A[N-1][3].
79  A[i][0] - id of the first cluster merged at step i
80  A[i][1] - id of the second cluster merged at step i
81  A[i][2] - distance between the clusters
82  */
83  FloatsList get_linkage_matrix() const;
84 
85  //! Returns the linkage matrix compatible with Matlab format
86  /**
87  \note This function merely adds 1 to the cluster ids, for compatibility
88  with Matlab.
89  Matlab format: http://www.mathworks.com/help/toolbox/stats/linkage.html
90  */
91  FloatsList get_linkage_matrix_in_matlab_format() const;
92 
93  //! Returns the number of steps of clustering recorded
94  unsigned int get_number_of_steps() const { return steps_; }
95 
96  void show(std::ostream &out) const;
97 
98  private:
99  void check_step_value(unsigned int s) const;
100  unsigned int get_step_from_id(unsigned int id) const {
101  return (id - n_elements_);
102  }
103 
104  unsigned int steps_;
105  unsigned int n_elements_; // number of elements to cluster
106  Ints joined_ids1_, joined_ids2_;
107  Floats cluster_distances_;
108  // each element of the outermost vector is a vector with all the elements
109  // in a cluster
110  IntsList clusters_elements_;
111 };
113 
114 //! Functor for hierarchical clustering based on single linkage
115 class IMPEM2DEXPORT SingleLinkage {
116  public:
117  SingleLinkage() {}
118  /**
119  \param[in] id1 identity of cluster 1 to merge
120  \param[in] id2 identity of cluster 2 to merge
121  \param[in] cluster_set linkage matrix describing the contents of
122  clusters so far.
123  \param[in] distances A NxN matrix of distances(i,j) between the individual
124  members to cluster
125  \return Minimal distance between members of the clusters
126  \note the id of an isolated member n<N is n. The id of the cluster formed
127  at step i is i+N.
128  */
129  double operator()(unsigned int id1, unsigned int id2,
130  const ClusterSet &cluster_set,
131  const FloatsList &distances) const;
132 
133  void show(std::ostream &out) const {
134  out << "SingleLinkage";
135  };
136 };
138 
139 //! Functor for hierarchical clustering based on complete linkage
140 class IMPEM2DEXPORT CompleteLinkage {
141  public:
142  CompleteLinkage() {}
143  //! Distance between the clusters
144  /**
145  \note See SingleLinkage class for the meaning of the arguments
146  \return Maximal distance between 2 members in the merged cluster
147 
148  */
149  double operator()(unsigned int id1, unsigned int id2,
150  const ClusterSet &cluster_set, const FloatsList &distances);
151 
152  void show(std::ostream &out) const {
153  out << "CompleteLinkage";
154  };
155 };
157 
158 //! Functor for hierarchical clustering based on average-linkage
159 class IMPEM2DEXPORT AverageDistanceLinkage {
160  public:
162  //! Distance between the clusters
163  /**
164  \note See SingleLinkage class for the meaning of the arguments
165  \return Average between all members of the merged cluster
166  */
167  double operator()(unsigned int id1, unsigned int id2,
168  const ClusterSet &cluster_set, const FloatsList &distances);
169 
170  void show(std::ostream &out) const {
171  out << "AverageDistanceLinkage";
172  };
173 };
175 
176 //! Function to perform agglomerative clustering
177 /**
178  \param[in] distances Vector of Floats containing all the
179  possible distances(i,j) between elements to cluster. Given N elements to
180  cluster, there are N vectors of size N
181  \return a ClusterSet class containing all the clustering steps.
182 */
183 template <class LinkageFunction>
185  const FloatsList &distances) {
186  // Based on:
187  // http://nlp.stanford.edu/IR-book/html/htmledition/
188  // time-complexity-of-hac-1.html)
189 
190  IMP_LOG_TERSE("starting hierarchical_agglomerative_clustering " << std::endl);
191 
192  unsigned int N = distances.size(); // number of elements
193  // Lists of distances between elements
194  // List n has members (i,distance_n_i).
195  std::vector<list_cluster_id_distance> lists(N);
196  // id of the cluster associated with each list
197  Ints cluster_id(N);
198  // All list active at the beginning
199  std::vector<bool> active_list(N);
200  std::fill(active_list.begin(), active_list.end(), true);
201  // Fill lists
202  for (unsigned int n = 0; n < N; ++n) {
203  for (unsigned int i = 0; i < N; ++i) {
204  if (i != n) {
205  lists[n].push_back(std::make_pair(i, distances[n][i]));
206  }
207  }
209  // At the beginning each list is associated with a cluster of one element
210  cluster_id[n] = n;
211  }
212 
213  ClusterSet cluster_set(N);
214  LinkageFunction linkage_function;
215  unsigned int steps = N - 1; // Steps of clustering
216  // cluster algorithm
217  for (unsigned int k = 0; k < steps; ++k) {
218  IMP_LOG_TERSE(std::endl);
219  // Find the list that contains lower distance
220  double minimum_distance = std::numeric_limits<double>::max();
221  unsigned int l1 = 0;
222 
223  for (unsigned int j = 0; j < N; ++j) {
224  if (active_list[j] == true) {
225  // closest distance for list j
226  if (lists[j].front().second < minimum_distance) {
227  minimum_distance = lists[j].front().second;
228  l1 = j;
229  }
230  }
231  } // list l1 contains lowest distance
232  // lowest distance is between list l1 and list l2
233  unsigned int l2 = lists[l1].front().first;
234  minimum_distance = lists[l2].front().second;
235  IMP_LOG_TERSE("step " << k << ": joining clusters " << cluster_id[l1]
236  << " and " << cluster_id[l2] << " to form cluster "
237  << cluster_set.get_id_for_cluster_at_step(k)
238  << " distance = " << minimum_distance << std::endl);
239  cluster_set.do_join_clusters(cluster_id[l1], cluster_id[l2],
240  minimum_distance);
241  active_list[l2] = false;
242  cluster_id[l1] = cluster_set.get_id_for_cluster_at_step(k);
243  // clear list 1. It will be filled with distance values for the new cluster
244  lists[l1].clear();
245  // Update lists of distances
246  for (unsigned int i = 0; i < N; ++i) {
247  IMP_LOG_TERSE("Updating list of distances " << i << std::endl);
248  if (active_list[i] == true && i != l1) {
249  IMP_LOG_TERSE("List " << i << " is active " << std::endl);
250  // Delete list elements that store distances to the merged clusters
251  list_cluster_id_distance::iterator it;
252  for (it = lists[i].begin(); it != lists[i].end(); ++it) {
253  if ((*it).first == l1 || (*it).first == l2) {
254  lists[i].erase(it);
255  --it;
256  }
257  }
258  // Update distances to the merged cluster
259  double dist = linkage_function(cluster_id[l1], cluster_id[i],
260  cluster_set, distances);
261  IMP_LOG_TERSE("Distance by linkage function " << dist << std::endl);
262  lists[i].push_back(std::make_pair(l1, dist));
263  lists[l1].push_back(std::make_pair(i, dist));
264  }
265  }
266  // Sort lists
267  for (unsigned int i = 0; i < N; ++i) {
268  if (active_list[i] == true) {
270  }
271  }
272  }
273  return cluster_set;
274 }
275 
276 IMPEM2D_END_NAMESPACE
277 
278 #endif /* IMPEM2D_HIERARCHICAL_CLUSTERING_H */
Import IMP/kernel/base_types.h in the namespace.
ClusterSet do_hierarchical_agglomerative_clustering(const FloatsList &distances)
Function to perform agglomerative clustering.
Functor for hierarchical clustering based on single linkage.
IMP::base::Vector< Floats > FloatsList
Standard way to pass a bunch of Floats values.
Definition: types.h:54
#define IMP_LOG_TERSE(expr)
Definition: log_macros.h:83
Scoring functions for 2D Copyright 2007-2015 IMP Inventors. All rights reserved.
void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2, double distance_between_clusters)
join operation
#define IMP_VALUES(Name, PluralName)
Define the type for storing sets of values.
Definition: value_macros.h:23
A class to store the clusters generated during hierarchical clustering.
IMP::base::Vector< Ints > IntsList
Standard way to pass a bunch of Ints values.
Definition: types.h:56
Functor for hierarchical clustering based on complete linkage.
Functor for hierarchical clustering based on average-linkage.
void show(Hierarchy h, std::ostream &out=std::cout)
Print out a molecular hierarchy.
Logging and error reporting support.
unsigned int get_number_of_steps() const
Returns the number of steps of clustering recorded.
Comparison of pairs by checking the second element.
Definition: scores2D.h:133