IMP logo
IMP Reference Guide  2.14.0
The Integrative Modeling Platform
hierarchical_clustering.h
Go to the documentation of this file.
1 /**
2  * \file IMP/em2d/hierarchical_clustering.h
3  * \brief Agglomerative clustering algorithm
4  *
5  * Copyright 2007-2020 IMP Inventors. All rights reserved.
6  *
7  */
8 
9 #ifndef IMPEM2D_HIERARCHICAL_CLUSTERING_H
10 #define IMPEM2D_HIERARCHICAL_CLUSTERING_H
11 
12 #include "IMP/em2d/em2d_config.h"
13 #include "IMP/em2d/scores2D.h"
14 #include <IMP/em2d/internal/clustering_helper.h>
15 #include "IMP/base_types.h"
16 #include <IMP/log.h>
17 #include <vector>
18 #include <list>
19 #include <algorithm>
20 #include <limits>
21 #include <functional>
22 
23 IMPEM2D_BEGIN_NAMESPACE
24 
25 template <class T>
26 void print_vector(const std::vector<T> &v) {
27  for (unsigned int i = 0; i < v.size(); ++i) {
28  std::cout << v[i] << " , ";
29  }
30  std::cout << std::endl;
31 }
32 
33 //! A class to store the clusters generated during hierarchical clustering
34 class IMPEM2DEXPORT ClusterSet {
35  public:
36  /**
37  \param[in] N Number of elements to be clustered
38  */
39  ClusterSet(unsigned int N);
40 
41  //! join operation
42  /** \param[in] cluster_id1 id of the 1st cluster joined
43  \param[in] cluster_id2 id of the 2nd cluster merged
44  \param[in] distance_between_clusters distance between the merged clusters
45  */
46  void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2,
47  double distance_between_clusters);
48 
49  //! Returns a vector with the ids of the elements that are in a cluster
50  /** Does not contain any hierarchical information, just the members
51  \param[in] id of the cluster
52  */
53  Ints get_cluster_elements(unsigned int id) const;
54 
55  //! Get the biggest clusters that have distances below a given cutoff
56  /** \param[in] cutoff distance
57  \return A vector of Ints: Each Ints has the ids of all elements of the
58  cluster
59  */
60  Ints get_clusters_below_cutoff(double cutoff) const;
61 
62  //! Return the elements of the cluster formed at a given step
63  Ints get_cluster_formed_at_step(unsigned int step) const;
64 
65  //! Distance in the linkage matrix at a given step
66  double get_distance_at_step(unsigned int step) const;
67 
68  unsigned int get_id_for_cluster_at_step(unsigned int step) const {
69  return step + n_elements_;
70  }
71 
72  //! Returns the linkage matrix
73  /**
74  \note Linkage matrix is a matrix A[N-1][3].
75  A[i][0] - id of the first cluster merged at step i
76  A[i][1] - id of the second cluster merged at step i
77  A[i][2] - distance between the clusters
78  */
79  FloatsList get_linkage_matrix() const;
80 
81  //! Returns the linkage matrix compatible with Matlab format
82  /**
83  \note This function merely adds 1 to the cluster ids, for compatibility
84  with Matlab.
85  Matlab format: http://www.mathworks.com/help/toolbox/stats/linkage.html
86  */
87  FloatsList get_linkage_matrix_in_matlab_format() const;
88 
89  //! Returns the number of steps of clustering recorded
90  unsigned int get_number_of_steps() const { return steps_; }
91 
92  void show(std::ostream &out) const;
93 
94  private:
95  void check_step_value(unsigned int s) const;
96  unsigned int get_step_from_id(unsigned int id) const {
97  return (id - n_elements_);
98  }
99 
100  unsigned int steps_;
101  unsigned int n_elements_; // number of elements to cluster
102  Ints joined_ids1_, joined_ids2_;
103  Floats cluster_distances_;
104  // each element of the outermost vector is a vector with all the elements
105  // in a cluster
106  IntsList clusters_elements_;
107 };
109 
110 //! Functor for hierarchical clustering based on single linkage
111 class IMPEM2DEXPORT SingleLinkage {
112  public:
113  SingleLinkage() {}
114  /**
115  \param[in] id1 identity of cluster 1 to merge
116  \param[in] id2 identity of cluster 2 to merge
117  \param[in] cluster_set linkage matrix describing the contents of
118  clusters so far.
119  \param[in] distances A NxN matrix of distances(i,j) between the individual
120  members to cluster
121  \return Minimal distance between members of the clusters
122  \note the id of an isolated member n<N is n. The id of the cluster formed
123  at step i is i+N.
124  */
125  double operator()(unsigned int id1, unsigned int id2,
126  const ClusterSet &cluster_set,
127  const FloatsList &distances) const;
128 
129  void show(std::ostream &out) const {
130  out << "SingleLinkage";
131  };
132 };
134 
135 //! Functor for hierarchical clustering based on complete linkage
136 class IMPEM2DEXPORT CompleteLinkage {
137  public:
138  CompleteLinkage() {}
139  //! Distance between the clusters
140  /**
141  \note See SingleLinkage class for the meaning of the arguments
142  \return Maximal distance between 2 members in the merged cluster
143 
144  */
145  double operator()(unsigned int id1, unsigned int id2,
146  const ClusterSet &cluster_set, const FloatsList &distances);
147 
148  void show(std::ostream &out) const {
149  out << "CompleteLinkage";
150  };
151 };
153 
154 //! Functor for hierarchical clustering based on average-linkage
155 class IMPEM2DEXPORT AverageDistanceLinkage {
156  public:
158  //! Distance between the clusters
159  /**
160  \note See SingleLinkage class for the meaning of the arguments
161  \return Average between all members of the merged cluster
162  */
163  double operator()(unsigned int id1, unsigned int id2,
164  const ClusterSet &cluster_set, const FloatsList &distances);
165 
166  void show(std::ostream &out) const {
167  out << "AverageDistanceLinkage";
168  };
169 };
171 
172 //! Function to perform agglomerative clustering
173 /**
174  \param[in] distances Vector of Floats containing all the
175  possible distances(i,j) between elements to cluster. Given N elements to
176  cluster, there are N vectors of size N
177  \return a ClusterSet class containing all the clustering steps.
178 */
179 template <class LinkageFunction>
181  const FloatsList &distances) {
182  // Based on:
183  // http://nlp.stanford.edu/IR-book/html/htmledition/
184  // time-complexity-of-hac-1.html)
185 
186  IMP_LOG_TERSE("starting hierarchical_agglomerative_clustering " << std::endl);
187 
188  unsigned int N = distances.size(); // number of elements
189  // Lists of distances between elements
190  // List n has members (i,distance_n_i).
191  std::vector<internal::list_cluster_id_distance> lists(N);
192  // id of the cluster associated with each list
193  Ints cluster_id(N);
194  // All list active at the beginning
195  std::vector<bool> active_list(N);
196  std::fill(active_list.begin(), active_list.end(), true);
197  // Fill lists
198  for (unsigned int n = 0; n < N; ++n) {
199  for (unsigned int i = 0; i < N; ++i) {
200  if (i != n) {
201  lists[n].push_back(std::make_pair(i, distances[n][i]));
202  }
203  }
205  // At the beginning each list is associated with a cluster of one element
206  cluster_id[n] = n;
207  }
208 
209  ClusterSet cluster_set(N);
210  LinkageFunction linkage_function;
211  unsigned int steps = N - 1; // Steps of clustering
212  // cluster algorithm
213  for (unsigned int k = 0; k < steps; ++k) {
214  IMP_LOG_TERSE(std::endl);
215  // Find the list that contains lower distance
216  double minimum_distance = std::numeric_limits<double>::max();
217  unsigned int l1 = 0;
218 
219  for (unsigned int j = 0; j < N; ++j) {
220  if (active_list[j] == true) {
221  // closest distance for list j
222  if (lists[j].front().second < minimum_distance) {
223  minimum_distance = lists[j].front().second;
224  l1 = j;
225  }
226  }
227  } // list l1 contains lowest distance
228  // lowest distance is between list l1 and list l2
229  unsigned int l2 = lists[l1].front().first;
230  minimum_distance = lists[l2].front().second;
231  IMP_LOG_TERSE("step " << k << ": joining clusters " << cluster_id[l1]
232  << " and " << cluster_id[l2] << " to form cluster "
233  << cluster_set.get_id_for_cluster_at_step(k)
234  << " distance = " << minimum_distance << std::endl);
235  cluster_set.do_join_clusters(cluster_id[l1], cluster_id[l2],
236  minimum_distance);
237  active_list[l2] = false;
238  cluster_id[l1] = cluster_set.get_id_for_cluster_at_step(k);
239  // clear list 1. It will be filled with distance values for the new cluster
240  lists[l1].clear();
241  // Update lists of distances
242  for (unsigned int i = 0; i < N; ++i) {
243  IMP_LOG_TERSE("Updating list of distances " << i << std::endl);
244  if (active_list[i] == true && i != l1) {
245  IMP_LOG_TERSE("List " << i << " is active " << std::endl);
246  // Delete list elements that store distances to the merged clusters
247  lists[i].erase(std::remove_if(lists[i].begin(), lists[i].end(),
248  internal::ListHasDistance(l1, l2)),
249  lists[i].end());
250  // Update distances to the merged cluster
251  double dist = linkage_function(cluster_id[l1], cluster_id[i],
252  cluster_set, distances);
253  IMP_LOG_TERSE("Distance by linkage function " << dist << std::endl);
254  lists[i].push_back(std::make_pair(l1, dist));
255  lists[l1].push_back(std::make_pair(i, dist));
256  }
257  }
258  // Sort lists
259  for (unsigned int i = 0; i < N; ++i) {
260  if (active_list[i] == true) {
262  }
263  }
264  }
265  return cluster_set;
266 }
267 
268 IMPEM2D_END_NAMESPACE
269 
270 #endif /* IMPEM2D_HIERARCHICAL_CLUSTERING_H */
Basic types used by IMP.
ClusterSet do_hierarchical_agglomerative_clustering(const FloatsList &distances)
Function to perform agglomerative clustering.
Functor for hierarchical clustering based on single linkage.
#define IMP_LOG_TERSE(expr)
Definition: log_macros.h:72
Scoring functions for 2D Copyright 2007-2020 IMP Inventors. All rights reserved.
void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2, double distance_between_clusters)
join operation
#define IMP_VALUES(Name, PluralName)
Define the type for storing sets of values.
Definition: value_macros.h:23
A class to store the clusters generated during hierarchical clustering.
std::ostream & show(Hierarchy h, std::ostream &out=std::cout)
Print the hierarchy using a given decorator to display each node.
Functor for hierarchical clustering based on complete linkage.
Functor for hierarchical clustering based on average-linkage.
Logging and error reporting support.
unsigned int get_number_of_steps() const
Returns the number of steps of clustering recorded.
Comparison of pairs by checking the second element.
Definition: scores2D.h:133