IMP  2.1.0
The Integrative Modeling Platform
kmeans/KMeans.h
Go to the documentation of this file.
1 /**
2  * \file IMP/kmeans/KMeans.h
3  * \brief an interface to k-means open source library (stored internally)
4  *
5  * Copyright 2007-2013 IMP Inventors. All rights reserved.
6 */
7 
8 #ifndef IMPKMEANS_KMEANS_H
9 #define IMPKMEANS_KMEANS_H
10 
11 #include <IMP/kmeans/kmeans_config.h>
12 #include "IMP/kmeans/internal/KMlocal.h" // k-means algorithms
13 #include "IMP/kmeans/internal/KMdata.h" // k-means algorithms
14 #include "IMP/kmeans/internal/KMterm.h"
15 #include <IMP/base/Pointer.h>
17 #include <IMP/base/object_macros.h>
19 #include <IMP/base/Object.h>
20 #include <IMP/base/types.h>
21 #include <IMP/base/enums.h>
22 #include <cstdlib> // C standard includes
23 #include <iostream> // C++ I/O
24 #include <string> // C++ strings
25 #include <string.h> // strcmp
26 
27 IMPKMEANS_BEGIN_NAMESPACE
28 
29 /*********************** Typedefs **************************/
30 
31 /** Different k-means algorithm variants that are
32  implemented in the library, see also
33  http://www.cs.umd.edu/~mount/Projects/KMeans/
34 */
36 {
37  /**
38  Repeatedly applies Lloyd's algorithm with randomly sampled starting
39  points.
40  */
41  KM_LLOYDS = 1,
42  /**
43  A local search heuristic, which works by performing swaps between existing
44  centers and a set of candidate centers.
45  */
47  /**
48  A simple hybrid algorithm, which does one swap followed by some number of
49  iterations of Lloyd's.
50  */
52  /**
53  A more complex hybrid of Lloyd's and Swap, which performs some number of
54  swaps followed by some number of iterations of Lloyd's algorithm. To
55  avoid getting trapped in local minima, an approach similar to simulated
56  annealing is included as well.
57  */
59 };
60 
61 /*********************** Class Definition **************************/
62 
63 /** Class that wraps and provides an interface to the K-means
64  library by David Mount (GPL license), downloaded and adapted
65  to IMP from http://www.cs.umd.edu/~mount/Projects/KMeans/
66 
67  For a simple usage example, see
68  modules/kmeans/examples/kmeans_example.py
69 
70  \untested{KMeans}
71  \unstable{KMeans}
72  */
73 class IMPKMEANSEXPORT
74 KMeans : public IMP::base::Object {
75 
76  /*********************** Constructors **************************/
77  public:
78 
79  /**
80  Initialize the KMeans object with data from fname_data,
81  assuming input data of dimension dim
82 
83  @param[in] fname_data Input filename. Input is assumed to be in
84  textual (ascii) whitespace separated format, with a
85  fixed number of columns dim. Each row represents a
86  single data point of fixed dimension dim. For
87  example, a file with three examples of dimension 4
88  would look as follows:\n
89  10.3 0.7 1.3 11.1\n
90  2.1 1.5 20.1 0.2\n
91  10.1 0.9 2.1 10.9
92  @param[in] dim Dimension of each data point
93  @param[in] max_nPts Maximal number of points to be read from file
94  */
95  KMeans
96  (const std::string& fname_data,
97  int dim,
98  unsigned int max_nPts);
99 
100  /** Empty constructor for all default initializations -
101  object data is not considered initialized after this call
102  */
103  KMeans();
104 
105  /************ Object virtual methods / destructor ************/
106 
108 
109  /*********************** Public methods **************************/
110  public:
111 
112  /**
113  Execute a kmeans algorithm variant on the data points stored.
114 
115  @param[in] k number of clusters
116  @param[in] alg_type The k-means algorithm variant to use
117  \see KM_ALG_TYPE
118  @param[in] stages Number of k-means iterations
119  */
120  void execute
121  (unsigned int k,
122  KM_ALG_TYPE alg_type = KM_LLOYDS,
123  int stages = 100);
124 
125  /**
126  Add a data point for clustering.
127 
128  @param[in] p point to be added
129  */
130  void add_data_pt(const IMP::Floats& p);
131 
132  /**
133  Clears all data in object.
134  */
135  void clear_data();
136 
137  /** Returns the i'th point in the dataset
138 
139  @param[in] i Center number in range (0,...,nPts-1)
140  */
141  const IMP::Floats& get_data_point(unsigned int i) const;
142 
143  /** @return The number of data points */
144  unsigned int get_number_of_data_points() const
145  {
146  return STLDataPts_.size();
147  }
148 
149  /**
150  Print the centers (assuming exectute() was applied) to log
151 
152  @param ll the log level for printout
153  */
154  void print_centers(base::LogLevel ll = base::PROGRESS) const;
155 
156  /** Returns the i'th center
157  Must be called only following a succesful execute() invokation
158 
159  @param[in] i Center number in range (0,...,k-1)
160  */
161  IMP::Floats get_center(unsigned int i) const;
162 
163  /** Returns the cluster assignment of each data point */
165  {
166  // TODO: exception instead of assertion?
167  assert(is_executed_);
168  return centerAssignments_;
169  }
170 
171  /** Returns the squared distance of each data point to its
172  respective cluster center */
174  {
175  // TODO: exception instead of assertion?
176  assert(is_executed_);
177  return ptsSqrDistToCenters_;
178  }
179 
180  /** @return The number of centers after a succeful execution */
181  unsigned int get_number_of_centers() const
182  {
183  assert( is_executed_ ); // TODO: exception?
184  return pCenters_->getK();
185  }
186 
187  /*********************** Private methods **************************/
188  private:
189 
190  /** Updates the wrapped data pts structure from the internal 2D STL vector
191  array (IMP::Float).
192  This method invalidates any prior information about clustering results,
193  unless the data was already synced (in which case no sync was needed)
194  */
195  void sync_KMdata_pts_from_STL();
196 
197  /**
198  Read a point from a stream into p
199 
200  @param[in] in input stream to read from
201  @param[out] p output point
202  @param[in] dim dimension of each data point
203 
204  @return false on error or EOF.
205  */
206  bool read_pt_from_stream
207  (std::istream& in,
208  IMP::Floats& p,
209  unsigned int dim);
210 
211  /**
212  Read up to max_nPts from a stream
213 
214  @param[in] in input stream to read from
215  @param[in] dim dimension of each data point
216  @param[in] max_nPts maximal number of points to read from stream
217  */
218  void read_data_pts_from_stream
219  (std::istream &in,
220  unsigned int dim,
221  unsigned int max_nPts);
222 
223  /**
224  print a point
225 
226  @param[in] out stream for printing the point
227  @param[in] p the point
228  */
229  void print_pt_to_stream
230  (std::ostream& out,
231  const IMP::Floats& p);
232 
233  /* print final summary using stored data and centers after execution
234  to progress log
235 
236  @param theAlg the algorithm used for running
237  @param ll the log level in which to print the summary
238  */
239  void print_summary(const internal::KMlocal& theAlg,
240  base::LogLevel ll = base::PROGRESS); // the algorithm
241 
242  /*********************** Private Variables **************************/
243  private:
244 
245  // was k-means executed succesfully
246  bool is_executed_;
247 
248  // The data points in STL format
249  IMP::FloatsList STLDataPts_;
250 
251  // data points in wrapped internal::KMdata strcture
252  // (should be synced from STLDataPts_ before usage)
254 
255  // was STL data updated to wrapped internal::KMdata points
256  bool is_KM_data_synced_;
257 
258  // the center points from a clustering execution
260 
261  // cluster of each point
262  IMP::Ints centerAssignments_;
263 
264  // data points squared distances from respective cluster centers
265  IMP::Floats ptsSqrDistToCenters_;
266 
267  //----------------------------------------------------------------------
268  // Termination conditions
269  // These are explained in the files internal/KMterm.h and internal/KMlocal.h
270  // Unless you are into fine tuning, don't worry about changing these.
271  //----------------------------------------------------------------------
272  internal::KMterm terminationConditions_;
273 
274 }; /*********************** class KMeans **************************/
275 
276 IMPKMEANS_END_NAMESPACE
277 
278 #endif /* IMPKMEANS_KMEANS_H */
Basic types used by IMP.
Various general useful macros for IMP.
A nullptr-initialized pointer to an IMP Object.
LogLevel
The log levels supported by IMP.
Definition: base/enums.h:18
IMP::Ints get_assignments() const
IMP::Floats get_squared_distance_to_centers() const
Various general useful macros for IMP.
unsigned int get_number_of_centers() const
#define IMP_OBJECT_METHODS(Name)
Define the basic things needed by any Object.
Basic types used by IMP.
Common base class for heavy weight IMP objects.
Various general useful macros for IMP.
A shared base class to help in debugging and things.
unsigned int get_number_of_data_points() const