IMP logo
IMP Reference Guide  2.18.0
The Integrative Modeling Platform
hierarchical_clustering.h
Go to the documentation of this file.
1 /**
2  * \file IMP/em2d/hierarchical_clustering.h
3  * \brief Agglomerative clustering algorithm
4  *
5  * Copyright 2007-2022 IMP Inventors. All rights reserved.
6  *
7  */
8 
9 #ifndef IMPEM2D_HIERARCHICAL_CLUSTERING_H
10 #define IMPEM2D_HIERARCHICAL_CLUSTERING_H
11 
12 #include <IMP/em2d/em2d_config.h>
13 #include "IMP/em2d/scores2D.h"
14 #include <IMP/em2d/internal/clustering_helper.h>
15 #include "IMP/base_types.h"
16 #include <IMP/log.h>
17 #include <vector>
18 #include <list>
19 #include <algorithm>
20 #include <limits>
21 #include <functional>
22 #include <boost/serialization/access.hpp>
23 
24 IMPEM2D_BEGIN_NAMESPACE
25 
26 template <class T>
27 void print_vector(const std::vector<T> &v) {
28  for (unsigned int i = 0; i < v.size(); ++i) {
29  std::cout << v[i] << " , ";
30  }
31  std::cout << std::endl;
32 }
33 
34 //! A class to store the clusters generated during hierarchical clustering
35 class IMPEM2DEXPORT ClusterSet {
36  public:
37  ClusterSet() : steps_(0), n_elements_(0) {}
38 
39  /**
40  \param[in] N Number of elements to be clustered
41  */
42  ClusterSet(unsigned int N);
43 
44  //! join operation
45  /** \param[in] cluster_id1 id of the 1st cluster joined
46  \param[in] cluster_id2 id of the 2nd cluster merged
47  \param[in] distance_between_clusters distance between the merged clusters
48  */
49  void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2,
50  double distance_between_clusters);
51 
52  //! Returns a vector with the ids of the elements that are in a cluster
53  /** Does not contain any hierarchical information, just the members
54  \param[in] id of the cluster
55  */
56  Ints get_cluster_elements(unsigned int id) const;
57 
58  //! Get the biggest clusters that have distances below a given cutoff
59  /** \param[in] cutoff distance
60  \return A vector of Ints: Each Ints has the ids of all elements of the
61  cluster
62  */
63  Ints get_clusters_below_cutoff(double cutoff) const;
64 
65  //! Return the elements of the cluster formed at a given step
66  Ints get_cluster_formed_at_step(unsigned int step) const;
67 
68  //! Distance in the linkage matrix at a given step
69  double get_distance_at_step(unsigned int step) const;
70 
71  unsigned int get_id_for_cluster_at_step(unsigned int step) const {
72  return step + n_elements_;
73  }
74 
75  //! Returns the linkage matrix
76  /**
77  \note Linkage matrix is a matrix A[N-1][3].
78  A[i][0] - id of the first cluster merged at step i
79  A[i][1] - id of the second cluster merged at step i
80  A[i][2] - distance between the clusters
81  */
82  FloatsList get_linkage_matrix() const;
83 
84  //! Returns the linkage matrix compatible with Matlab format
85  /**
86  \note This function merely adds 1 to the cluster ids, for compatibility
87  with Matlab.
88  Matlab format: http://www.mathworks.com/help/toolbox/stats/linkage.html
89  */
90  FloatsList get_linkage_matrix_in_matlab_format() const;
91 
92  //! Returns the number of steps of clustering recorded
93  unsigned int get_number_of_steps() const { return steps_; }
94 
95  void show(std::ostream &out) const;
96 
97  private:
98  void check_step_value(unsigned int s) const;
99  unsigned int get_step_from_id(unsigned int id) const {
100  return (id - n_elements_);
101  }
102 
103  unsigned int steps_;
104  unsigned int n_elements_; // number of elements to cluster
105  Ints joined_ids1_, joined_ids2_;
106  Floats cluster_distances_;
107  // each element of the outermost vector is a vector with all the elements
108  // in a cluster
109  IntsList clusters_elements_;
110 
111 private:
112  friend class boost::serialization::access;
113 
114  template<class Archive> void serialize(Archive &ar, const unsigned int) {
115  ar & steps_ & n_elements_ & joined_ids1_ & joined_ids2_
116  & cluster_distances_ & clusters_elements_;
117  }
118 };
120 
121 //! Functor for hierarchical clustering based on single linkage
122 class IMPEM2DEXPORT SingleLinkage {
123  public:
124  SingleLinkage() {}
125  /**
126  \param[in] id1 identity of cluster 1 to merge
127  \param[in] id2 identity of cluster 2 to merge
128  \param[in] cluster_set linkage matrix describing the contents of
129  clusters so far.
130  \param[in] distances A NxN matrix of distances(i,j) between the individual
131  members to cluster
132  \return Minimal distance between members of the clusters
133  \note the id of an isolated member n<N is n. The id of the cluster formed
134  at step i is i+N.
135  */
136  double operator()(unsigned int id1, unsigned int id2,
137  const ClusterSet &cluster_set,
138  const FloatsList &distances) const;
139 
140  void show(std::ostream &out) const {
141  out << "SingleLinkage";
142  };
143 private:
144  friend class boost::serialization::access;
145 
146  template<class Archive> void serialize(Archive &, const unsigned int) {}
147 };
149 
150 //! Functor for hierarchical clustering based on complete linkage
151 class IMPEM2DEXPORT CompleteLinkage {
152  public:
153  CompleteLinkage() {}
154  //! Distance between the clusters
155  /**
156  \note See SingleLinkage class for the meaning of the arguments
157  \return Maximal distance between 2 members in the merged cluster
158 
159  */
160  double operator()(unsigned int id1, unsigned int id2,
161  const ClusterSet &cluster_set, const FloatsList &distances);
162 
163  void show(std::ostream &out) const {
164  out << "CompleteLinkage";
165  };
166 private:
167  friend class boost::serialization::access;
168 
169  template<class Archive> void serialize(Archive &, const unsigned int) {}
170 };
172 
173 //! Functor for hierarchical clustering based on average-linkage
174 class IMPEM2DEXPORT AverageDistanceLinkage {
175  public:
177  //! Distance between the clusters
178  /**
179  \note See SingleLinkage class for the meaning of the arguments
180  \return Average between all members of the merged cluster
181  */
182  double operator()(unsigned int id1, unsigned int id2,
183  const ClusterSet &cluster_set, const FloatsList &distances);
184 
185  void show(std::ostream &out) const {
186  out << "AverageDistanceLinkage";
187  };
188 private:
189  friend class boost::serialization::access;
190 
191  template<class Archive> void serialize(Archive &, const unsigned int) {}
192 };
194 
195 //! Function to perform agglomerative clustering
196 /**
197  \param[in] distances Vector of Floats containing all the
198  possible distances(i,j) between elements to cluster. Given N elements to
199  cluster, there are N vectors of size N
200  \return a ClusterSet class containing all the clustering steps.
201 */
202 template <class LinkageFunction>
204  const FloatsList &distances) {
205  // Based on:
206  // http://nlp.stanford.edu/IR-book/html/htmledition/
207  // time-complexity-of-hac-1.html)
208 
209  IMP_LOG_TERSE("starting hierarchical_agglomerative_clustering " << std::endl);
210 
211  unsigned int N = distances.size(); // number of elements
212  // Lists of distances between elements
213  // List n has members (i,distance_n_i).
214  std::vector<internal::list_cluster_id_distance> lists(N);
215  // id of the cluster associated with each list
216  Ints cluster_id(N);
217  // All list active at the beginning
218  std::vector<bool> active_list(N);
219  std::fill(active_list.begin(), active_list.end(), true);
220  // Fill lists
221  for (unsigned int n = 0; n < N; ++n) {
222  for (unsigned int i = 0; i < N; ++i) {
223  if (i != n) {
224  lists[n].push_back(std::make_pair(i, distances[n][i]));
225  }
226  }
228  // At the beginning each list is associated with a cluster of one element
229  cluster_id[n] = n;
230  }
231 
232  ClusterSet cluster_set(N);
233  LinkageFunction linkage_function;
234  unsigned int steps = N - 1; // Steps of clustering
235  // cluster algorithm
236  for (unsigned int k = 0; k < steps; ++k) {
237  IMP_LOG_TERSE(std::endl);
238  // Find the list that contains lower distance
239  double minimum_distance = std::numeric_limits<double>::max();
240  unsigned int l1 = 0;
241 
242  for (unsigned int j = 0; j < N; ++j) {
243  if (active_list[j] == true) {
244  // closest distance for list j
245  if (lists[j].front().second < minimum_distance) {
246  minimum_distance = lists[j].front().second;
247  l1 = j;
248  }
249  }
250  } // list l1 contains lowest distance
251  // lowest distance is between list l1 and list l2
252  unsigned int l2 = lists[l1].front().first;
253  minimum_distance = lists[l2].front().second;
254  IMP_LOG_TERSE("step " << k << ": joining clusters " << cluster_id[l1]
255  << " and " << cluster_id[l2] << " to form cluster "
256  << cluster_set.get_id_for_cluster_at_step(k)
257  << " distance = " << minimum_distance << std::endl);
258  cluster_set.do_join_clusters(cluster_id[l1], cluster_id[l2],
259  minimum_distance);
260  active_list[l2] = false;
261  cluster_id[l1] = cluster_set.get_id_for_cluster_at_step(k);
262  // clear list 1. It will be filled with distance values for the new cluster
263  lists[l1].clear();
264  // Update lists of distances
265  for (unsigned int i = 0; i < N; ++i) {
266  IMP_LOG_TERSE("Updating list of distances " << i << std::endl);
267  if (active_list[i] == true && i != l1) {
268  IMP_LOG_TERSE("List " << i << " is active " << std::endl);
269  // Delete list elements that store distances to the merged clusters
270  lists[i].erase(std::remove_if(lists[i].begin(), lists[i].end(),
271  internal::ListHasDistance(l1, l2)),
272  lists[i].end());
273  // Update distances to the merged cluster
274  double dist = linkage_function(cluster_id[l1], cluster_id[i],
275  cluster_set, distances);
276  IMP_LOG_TERSE("Distance by linkage function " << dist << std::endl);
277  lists[i].push_back(std::make_pair(l1, dist));
278  lists[l1].push_back(std::make_pair(i, dist));
279  }
280  }
281  // Sort lists
282  for (unsigned int i = 0; i < N; ++i) {
283  if (active_list[i] == true) {
285  }
286  }
287  }
288  return cluster_set;
289 }
290 
291 IMPEM2D_END_NAMESPACE
292 
293 #endif /* IMPEM2D_HIERARCHICAL_CLUSTERING_H */
Basic types used by IMP.
ClusterSet do_hierarchical_agglomerative_clustering(const FloatsList &distances)
Function to perform agglomerative clustering.
Functor for hierarchical clustering based on single linkage.
#define IMP_LOG_TERSE(expr)
Definition: log_macros.h:72
Scoring functions for 2D Copyright 2007-2022 IMP Inventors. All rights reserved.
void do_join_clusters(unsigned int cluster_id1, unsigned int cluster_id2, double distance_between_clusters)
join operation
#define IMP_VALUES(Name, PluralName)
Define the type for storing sets of values.
Definition: value_macros.h:23
A class to store the clusters generated during hierarchical clustering.
std::ostream & show(Hierarchy h, std::ostream &out=std::cout)
Print the hierarchy using a given decorator to display each node.
Functor for hierarchical clustering based on complete linkage.
Functor for hierarchical clustering based on average-linkage.
Logging and error reporting support.
unsigned int get_number_of_steps() const
Returns the number of steps of clustering recorded.
Comparison of pairs by checking the second element.
Definition: scores2D.h:129