IMP  2.0.1
The Integrative Modeling Platform
kmeans/KMeans.h
Go to the documentation of this file.
1 /**
2  * \file IMP/kmeans/KMeans.h
3  * \brief an interface to k-means open source library (stored internally)
4  *
5  * Copyright 2007-2013 IMP Inventors. All rights reserved.
6 */
7 
8 #ifndef IMPKMEANS_KMEANS_H
9 #define IMPKMEANS_KMEANS_H
10 
11 #include <IMP/kmeans/kmeans_config.h>
12 #include "IMP/kmeans/internal/KMlocal.h" // k-means algorithms
13 #include "IMP/kmeans/internal/KMdata.h" // k-means algorithms
14 #include "IMP/kmeans/internal/KMterm.h"
15 #include "IMP/Pointer.h"
17 #include "IMP/base/object_macros.h"
19 #include "IMP/base/Object.h"
20 #include "IMP/base/types.h"
21 #include <cstdlib> // C standard includes
22 #include <iostream> // C++ I/O
23 #include <string> // C++ strings
24 #include <string.h> // strcmp
25 
26 IMPKMEANS_BEGIN_NAMESPACE
27 
28 /*********************** Typedefs **************************/
29 
30 /** Different k-means algorithm variants that are
31  implemented in the library, see also
32  http://www.cs.umd.edu/~mount/Projects/KMeans/
33 */
35 {
36  /**
37  Repeatedly applies Lloyd's algorithm with randomly sampled starting
38  points.
39  */
40  KM_LLOYDS = 1,
41  /**
42  A local search heuristic, which works by performing swaps between existing
43  centers and a set of candidate centers.
44  */
46  /**
47  A simple hybrid algorithm, which does one swap followed by some number of
48  iterations of Lloyd's.
49  */
51  /**
52  A more complex hybrid of Lloyd's and Swap, which performs some number of
53  swaps followed by some number of iterations of Lloyd's algorithm. To
54  avoid getting trapped in local minima, an approach similar to simulated
55  annealing is included as well.
56  */
58 };
59 
60 /*********************** Class Definition **************************/
61 
62 /** Class that wraps and provides an interface to the K-means
63  library by David Mount (GPL license), downloaded and adapted
64  to IMP from http://www.cs.umd.edu/~mount/Projects/KMeans/
65 
66  For a simple usage example, see
67  modules/kmeans/examples/kmeans_example.py
68 
69  \untested{KMeans}
70  \unstable{KMeans}
71  */
72 class IMPKMEANSEXPORT
73 KMeans : public IMP::base::Object {
74 
75  /*********************** Constructors **************************/
76  public:
77 
78  /**
79  Initialize the KMeans object with data from fname_data,
80  assuming input data of dimension dim
81 
82  @param[in] fname_data Input filename. Input is assumed to be in
83  textual (ascii) whitespace separated format, with a
84  fixed number of columns dim. Each row represents a
85  single data point of fixed dimension dim. For
86  example, a file with three examples of dimension 4
87  would look as follows:\n
88  10.3 0.7 1.3 11.1\n
89  2.1 1.5 20.1 0.2\n
90  10.1 0.9 2.1 10.9
91  @param[in] dim Dimension of each data point
92  @param[in] max_nPts Maximal number of points to be read from file
93  */
94  KMeans
95  (const std::string& fname_data,
96  int dim,
97  unsigned int max_nPts);
98 
99  /** Empty constructor for all default initializations -
100  object data is not considered initialized after this call
101  */
102  KMeans();
103 
104  /************ Object virtual methods / destructor ************/
105 
107 
108  /*********************** Public methods **************************/
109  public:
110 
111  /**
112  Execute a kmeans algorithm variant on the data points stored.
113 
114  @param[in] k number of clusters
115  @param[in] alg_type The k-means algorithm variant to use
116  \see KM_ALG_TYPE
117  @param[in] stages Number of k-means iterations
118  */
119  void execute
120  (unsigned int k,
121  KM_ALG_TYPE alg_type = KM_LLOYDS,
122  int stages = 100);
123 
124  /**
125  Add a data point for clustering.
126 
127  @param[in] p point to be added
128  */
129  void add_data_pt(const IMP::Floats& p);
130 
131  /**
132  Clears all data in object.
133  */
134  void clear_data();
135 
136  /** Returns the i'th point in the dataset
137 
138  @param[in] i Center number in range (0,...,nPts-1)
139  */
140  const IMP::Floats& get_data_point(unsigned int i) const;
141 
142  /** @return The number of data points */
143  unsigned int get_number_of_data_points() const
144  {
145  return STLDataPts_.size();
146  }
147 
148  /**
149  Print the centers (assuming exectute() was applied)
150  */
151  void print_centers() const;
152 
153  /** Returns the i'th center
154  Must be called only following a succesful execute() invokation
155 
156  @param[in] i Center number in range (0,...,k-1)
157  */
158  IMP::Floats get_center(unsigned int i) const;
159 
160  /** Returns the cluster assignment of each data point */
161  IMP::Ints get_assignments() const
162  {
163  // TODO: exception instead of assertion?
164  assert(is_executed_);
165  return centerAssignments_;
166  }
167 
168  /** Returns the squared distance of each data point to its
169  respective cluster center */
170  IMP::Floats get_squared_distance_to_centers() const
171  {
172  // TODO: exception instead of assertion?
173  assert(is_executed_);
174  return ptsSqrDistToCenters_;
175  }
176 
177  /** @return The number of centers after a succeful execution */
178  unsigned int get_number_of_centers() const
179  {
180  assert( is_executed_ ); // TODO: exception?
181  return pCenters_->getK();
182  }
183 
184  /*********************** Private methods **************************/
185  private:
186 
187  /** Updates the wrapped data pts structure from the internal 2D STL vector
188  array (IMP::Float).
189  This method invalidates any prior information about clustering results,
190  unless the data was already synced (in which case no sync was needed)
191  */
192  void sync_KMdata_pts_from_STL();
193 
194  /**
195  Read a point from a stream into p
196 
197  @param[in] in input stream to read from
198  @param[out] p output point
199  @param[in] dim dimension of each data point
200 
201  @return false on error or EOF.
202  */
203  bool read_pt_from_stream
204  (std::istream& in,
205  IMP::Floats& p,
206  unsigned int dim);
207 
208  /**
209  Read up to max_nPts from a stream
210 
211  @param[in] in input stream to read from
212 1 @param[in] dim dimension of each data point
213  @param[in] max_nPts maximal number of points to read from stream
214  */
215  void read_data_pts_from_stream
216  (std::istream &in,
217  unsigned int dim,
218  unsigned int max_nPts);
219 
220  /**
221  print a point
222 
223  @param[in] out stream for printing the point
224  @param[in] p the point
225  */
226  void print_pt_to_stream
227  (std::ostream& out,
228  const IMP::Floats& p);
229 
230  // print final summary using stored data and centers after execution
231  void print_summary(const internal::KMlocal& theAlg); // the algorithm
232 
233  /*********************** Private Variables **************************/
234  private:
235 
236  // was k-means executed succesfully
237  bool is_executed_;
238 
239  // The data points in STL format
240  IMP::FloatsList STLDataPts_;
241 
242  // data points in wrapped internal::KMdata strcture
243  // (should be synced from STLDataPts_ before usage)
244  IMP::Pointer<internal::KMdata> pKMDataPts_;
245 
246  // was STL data updated to wrapped internal::KMdata points
247  bool is_KM_data_synced_;
248 
249  // the center points from a clustering execution
250  IMP::Pointer<internal::KMfilterCenters> pCenters_;
251 
252  // cluster of each point
253  IMP::Ints centerAssignments_;
254 
255  // data points squared distances from respective cluster centers
256  IMP::Floats ptsSqrDistToCenters_;
257 
258  //----------------------------------------------------------------------
259  // Termination conditions
260  // These are explained in the files internal/KMterm.h and internal/KMlocal.h
261  // Unless you are into fine tuning, don't worry about changing these.
262  //----------------------------------------------------------------------
263  internal::KMterm terminationConditions_;
264 
265 }; /*********************** class KMeans **************************/
266 
267 IMPKMEANS_END_NAMESPACE
268 
269 #endif /* IMPKMEANS_KMEANS_H */