IMP  2.2.0
The Integrative Modeling Platform
kmeans/KMeans.h
Go to the documentation of this file.
1 /**
2  * \file IMP/kmeans/KMeans.h
3  * \brief an interface to k-means open source library (stored internally)
4  *
5  * Copyright 2007-2014 IMP Inventors. All rights reserved.
6 */
7 
8 #ifndef IMPKMEANS_KMEANS_H
9 #define IMPKMEANS_KMEANS_H
10 
11 #include <IMP/kmeans/kmeans_config.h>
12 #include "IMP/kmeans/internal/KMlocal.h" // k-means algorithms
13 #include "IMP/kmeans/internal/KMdata.h" // k-means algorithms
14 #include "IMP/kmeans/internal/KMterm.h"
15 #include <IMP/base/Pointer.h>
17 #include <IMP/base/object_macros.h>
19 #include <IMP/base/Object.h>
20 #include <IMP/base/types.h>
21 #include <IMP/base/enums.h>
22 #include <cstdlib> // C standard includes
23 #include <iostream> // C++ I/O
24 #include <string> // C++ strings
25 #include <string.h> // strcmp
26 
27 IMPKMEANS_BEGIN_NAMESPACE
28 
29 /*********************** Typedefs **************************/
30 
31 /** Different k-means algorithm variants that are
32  implemented in the library, see also
33  http://www.cs.umd.edu/~mount/Projects/KMeans/
34 */
36  /**
37  Repeatedly applies Lloyd's algorithm with randomly sampled starting
38  points.
39  */
40  KM_LLOYDS = 1,
41  /**
42  A local search heuristic, which works by performing swaps between existing
43  centers and a set of candidate centers.
44  */
46  /**
47  A simple hybrid algorithm, which does one swap followed by some number of
48  iterations of Lloyd's.
49  */
51  /**
52  A more complex hybrid of Lloyd's and Swap, which performs some number of
53  swaps followed by some number of iterations of Lloyd's algorithm. To
54  avoid getting trapped in local minima, an approach similar to simulated
55  annealing is included as well.
56  */
58 };
59 
60 /*********************** Class Definition **************************/
61 
62 /** Class that wraps and provides an interface to the K-means
63  library by David Mount (GPL license), downloaded and adapted
64  to IMP from http://www.cs.umd.edu/~mount/Projects/KMeans/
65 
66  For a simple usage example, see
67  modules/kmeans/examples/kmeans_example.py
68 
69  \untested{KMeans}
70  \unstable{KMeans}
71  */
72 class IMPKMEANSEXPORT KMeans : public IMP::base::Object {
73 
74  /*********************** Constructors **************************/
75  public:
76  /**
77  Initialize the KMeans object with data from fname_data,
78  assuming input data of dimension dim
79 
80  @param[in] fname_data Input filename. Input is assumed to be in
81  textual (ascii) whitespace separated format, with a
82  fixed number of columns dim. Each row represents a
83  single data point of fixed dimension dim. For
84  example, a file with three examples of dimension 4
85  would look as follows:\n
86  10.3 0.7 1.3 11.1\n
87  2.1 1.5 20.1 0.2\n
88  10.1 0.9 2.1 10.9
89  @param[in] dim Dimension of each data point
90  @param[in] max_nPts Maximal number of points to be read from file
91  */
92  KMeans(const std::string& fname_data, int dim, unsigned int max_nPts);
93 
94  /** Empty constructor for all default initializations -
95  object data is not considered initialized after this call
96  */
97  KMeans();
98 
99  /************ Object virtual methods / destructor ************/
100 
102 
103  /*********************** Public methods **************************/
104  public:
105  /**
106  Execute a kmeans algorithm variant on the data points stored.
107 
108  @param[in] k number of clusters
109  @param[in] alg_type The k-means algorithm variant to use
110  \see KM_ALG_TYPE
111  @param[in] stages Number of k-means iterations
112  */
113  void execute(unsigned int k, KM_ALG_TYPE alg_type = KM_LLOYDS,
114  int stages = 100);
115 
116  /**
117  Add a data point for clustering.
118 
119  @param[in] p point to be added
120  */
121  void add_data_pt(const IMP::Floats& p);
122 
123  /**
124  Clears all data in object.
125  */
126  void clear_data();
127 
128  /** Returns the i'th point in the dataset
129 
130  @param[in] i Center number in range (0,...,nPts-1)
131  */
132  const IMP::Floats& get_data_point(unsigned int i) const;
133 
134  /** @return The number of data points */
135  unsigned int get_number_of_data_points() const { return STLDataPts_.size(); }
136 
137  /**
138  Print the centers (assuming exectute() was applied) to log
139 
140  @param ll the log level for printout
141  */
142  void print_centers(base::LogLevel ll = base::PROGRESS) const;
143 
144  /** Returns the i'th center
145  Must be called only following a succesful execute() invokation
146 
147  @param[in] i Center number in range (0,...,k-1)
148  */
149  IMP::Floats get_center(unsigned int i) const;
150 
151  /** Returns the cluster assignment of each data point */
153  // TODO: exception instead of assertion?
154  assert(is_executed_);
155  return centerAssignments_;
156  }
157 
158  /** Returns the squared distance of each data point to its
159  respective cluster center */
161  // TODO: exception instead of assertion?
162  assert(is_executed_);
163  return ptsSqrDistToCenters_;
164  }
165 
166  /** @return The number of centers after a succeful execution */
167  unsigned int get_number_of_centers() const {
168  assert(is_executed_); // TODO: exception?
169  return pCenters_->getK();
170  }
171 
172  /*********************** Private methods **************************/
173  private:
174  /** Updates the wrapped data pts structure from the internal 2D STL vector
175  array (IMP::Float).
176  This method invalidates any prior information about clustering results,
177  unless the data was already synced (in which case no sync was needed)
178  */
179  void sync_KMdata_pts_from_STL();
180 
181  /**
182  Read a point from a stream into p
183 
184  @param[in] in input stream to read from
185  @param[out] p output point
186  @param[in] dim dimension of each data point
187 
188  @return false on error or EOF.
189  */
190  bool read_pt_from_stream(std::istream& in, IMP::Floats& p, unsigned int dim);
191 
192  /**
193  Read up to max_nPts from a stream
194 
195  @param[in] in input stream to read from
196  @param[in] dim dimension of each data point
197  @param[in] max_nPts maximal number of points to read from stream
198  */
199  void read_data_pts_from_stream(std::istream& in, unsigned int dim,
200  unsigned int max_nPts);
201 
202  /**
203  print a point
204 
205  @param[in] out stream for printing the point
206  @param[in] p the point
207  */
208  void print_pt_to_stream(std::ostream& out, const IMP::Floats& p);
209 
210  /* print final summary using stored data and centers after execution
211  to progress log
212 
213  @param theAlg the algorithm used for running
214  @param ll the log level in which to print the summary
215  */
216  void print_summary(const internal::KMlocal& theAlg,
217  base::LogLevel ll = base::PROGRESS); // the algorithm
218 
219  /*********************** Private Variables **************************/
220  private:
221  // was k-means executed succesfully
222  bool is_executed_;
223 
224  // The data points in STL format
225  IMP::FloatsList STLDataPts_;
226 
227  // data points in wrapped internal::KMdata strcture
228  // (should be synced from STLDataPts_ before usage)
230 
231  // was STL data updated to wrapped internal::KMdata points
232  bool is_KM_data_synced_;
233 
234  // the center points from a clustering execution
236 
237  // cluster of each point
238  IMP::Ints centerAssignments_;
239 
240  // data points squared distances from respective cluster centers
241  IMP::Floats ptsSqrDistToCenters_;
242 
243  //----------------------------------------------------------------------
244  // Termination conditions
245  // These are explained in the files internal/KMterm.h and internal/KMlocal.h
246  // Unless you are into fine tuning, don't worry about changing these.
247  //----------------------------------------------------------------------
248  internal::KMterm terminationConditions_;
249 
250 }; /*********************** class KMeans **************************/
251 
252 IMPKMEANS_END_NAMESPACE
253 
254 #endif /* IMPKMEANS_KMEANS_H */
Basic types used by IMP.
Various general useful macros for IMP.
A nullptr-initialized pointer to an IMP Object.
LogLevel
The log levels supported by IMP.
Definition: base/enums.h:20
IMP::Ints get_assignments() const
IMP::Floats get_squared_distance_to_centers() const
Various general useful macros for IMP.
unsigned int get_number_of_centers() const
#define IMP_OBJECT_METHODS(Name)
Define the basic things needed by any Object.
Basic types used by IMP.
Common base class for heavy weight IMP objects.
Definition: base/Object.h:106
Various general useful macros for IMP.
A shared base class to help in debugging and things.
unsigned int get_number_of_data_points() const