IMP logo
IMP Reference Guide  develop.7cb8855c60,2024/10/04
The Integrative Modeling Platform
kmeans/KMeans.h
Go to the documentation of this file.
1 /**
2  * \file IMP/kmeans/KMeans.h
3  * \brief an interface to k-means open source library (stored internally)
4  *
5  * Copyright 2007-2022 IMP Inventors. All rights reserved.
6 */
7 
8 #ifndef IMPKMEANS_KMEANS_H
9 #define IMPKMEANS_KMEANS_H
10 
11 #include <IMP/kmeans/kmeans_config.h>
12 #include "IMP/kmeans/internal/KMlocal.h" // k-means algorithms
13 #include "IMP/kmeans/internal/KMdata.h" // k-means algorithms
14 #include "IMP/kmeans/internal/KMterm.h"
15 #include <IMP/Pointer.h>
16 #include <IMP/doxygen_macros.h>
17 #include <IMP/object_macros.h>
18 #include <IMP/warning_macros.h>
19 #include <IMP/Object.h>
20 #include <IMP/types.h>
21 #include <IMP/enums.h>
22 #include <cstdlib> // C standard includes
23 #include <iostream> // C++ I/O
24 #include <string> // C++ strings
25 #include <string.h> // strcmp
26 
27 IMPKMEANS_BEGIN_NAMESPACE
28 
29 /*********************** Typedefs **************************/
30 
31 /** Different k-means algorithm variants that are
32  implemented in the library, see also
33  http://www.cs.umd.edu/~mount/Projects/KMeans/
34 */
36  /**
37  Repeatedly applies Lloyd's algorithm with randomly sampled starting
38  points.
39  */
40  KM_LLOYDS = 1,
41  /**
42  A local search heuristic, which works by performing swaps between existing
43  centers and a set of candidate centers.
44  */
46  /**
47  A simple hybrid algorithm, which does one swap followed by some number of
48  iterations of Lloyd's.
49  */
51  /**
52  A more complex hybrid of Lloyd's and Swap, which performs some number of
53  swaps followed by some number of iterations of Lloyd's algorithm. To
54  avoid getting trapped in local minima, an approach similar to simulated
55  annealing is included as well.
56  */
58 };
59 
60 /*********************** Class Definition **************************/
61 
62 /** Class that wraps and provides an interface to the K-means
63  library by David Mount (GPL license), downloaded and adapted
64  to IMP from http://www.cs.umd.edu/~mount/Projects/KMeans/
65 
66  For a simple usage example, see
67  modules/kmeans/examples/kmeans_example.py
68 
69  \untested{KMeans}
70  \unstable{KMeans}
71  */
72 class IMPKMEANSEXPORT KMeans : public IMP::Object {
73 
74  /*********************** Constructors **************************/
75  public:
76  /**
77  Initialize the KMeans object with data from fname_data,
78  assuming input data of dimension dim
79 
80  @param[in] fname_data Input filename. Input is assumed to be in
81  textual (ascii) whitespace separated format, with a
82  fixed number of columns dim. Each row represents a
83  single data point of fixed dimension dim. For
84  example, a file with three examples of dimension 4
85  would look as follows:\n
86  10.3 0.7 1.3 11.1\n
87  2.1 1.5 20.1 0.2\n
88  10.1 0.9 2.1 10.9
89  @param[in] dim Dimension of each data point
90  @param[in] max_nPts Maximal number of points to be read from file
91  */
92  KMeans(const std::string& fname_data, int dim, unsigned int max_nPts);
93 
94  /** Empty constructor for default initialization -
95  object data is not considered initialized after this call
96  */
97  KMeans();
98 
99  /************ Object virtual methods / destructor ************/
100 
102 
103  /*********************** Public methods **************************/
104  public:
105  /**
106  Execute a kmeans algorithm variant on the data points stored.
107 
108  @param[in] k number of clusters
109  @param[in] alg_type The k-means algorithm variant to use
110  \see KM_ALG_TYPE
111  @param[in] stages Number of k-means iterations
112  */
113  void execute(unsigned int k, KM_ALG_TYPE alg_type = KM_LLOYDS,
114  int stages = 100);
115 
116  /**
117  Add a data point for clustering.
118 
119  @param[in] p point to be added
120  */
121  void add_data_pt(const IMP::Floats& p);
122 
123  //! Clears all data in object.
124  void clear_data();
125 
126  //! Returns the i'th point in the dataset
127  /**
128  @param[in] i Center number in range (0,...,nPts-1)
129  */
130  const IMP::Floats& get_data_point(unsigned int i) const;
131 
132  //! @return The number of data points
133  unsigned int get_number_of_data_points() const { return STLDataPts_.size(); }
134 
135  //! Print the centers (assuming execute() was applied) to log
136  /**
137  @param ll the log level for printout
138  */
139  void print_centers(LogLevel ll = PROGRESS) const;
140 
141  //! Returns the i'th center
142  /** Must be called only following a successful execute() invocation
143 
144  @param[in] i Center number in range (0,...,k-1)
145  */
146  IMP::Floats get_center(unsigned int i) const;
147 
148  //! Returns the cluster assignment of each data point
150  // TODO: exception instead of assertion?
151  assert(is_executed_);
152  return centerAssignments_;
153  }
154 
155  /** Returns the squared distance of each data point to its
156  respective cluster center */
158  // TODO: exception instead of assertion?
159  assert(is_executed_);
160  return ptsSqrDistToCenters_;
161  }
162 
163  /** @return The number of centers after a successful execution */
164  unsigned int get_number_of_centers() const {
165  assert(is_executed_); // TODO: exception?
166  return pCenters_->getK();
167  }
168 
169  /*********************** Private methods **************************/
170  private:
171  /** Updates the wrapped data pts structure from the internal 2D STL vector
172  array (IMP::Float).
173  This method invalidates any prior information about clustering results,
174  unless the data was already synced (in which case no sync was needed)
175  */
176  void sync_KMdata_pts_from_STL();
177 
178  /**
179  Read a point from a stream into p
180 
181  @param[in] in input stream to read from
182  @param[out] p output point
183  @param[in] dim dimension of each data point
184 
185  @return false on error or EOF.
186  */
187  bool read_pt_from_stream(std::istream& in, IMP::Floats& p, unsigned int dim);
188 
189  /**
190  Read up to max_nPts from a stream
191 
192  @param[in] in input stream to read from
193  @param[in] dim dimension of each data point
194  @param[in] max_nPts maximal number of points to read from stream
195  */
196  void read_data_pts_from_stream(std::istream& in, unsigned int dim,
197  unsigned int max_nPts);
198 
199  /**
200  print a point
201 
202  @param[in] out stream for printing the point
203  @param[in] p the point
204  */
205  void print_pt_to_stream(std::ostream& out, const IMP::Floats& p);
206 
207  /* print final summary using stored data and centers after execution
208  to progress log
209 
210  @param theAlg the algorithm used for running
211  @param ll the log level in which to print the summary
212  */
213  void print_summary(const internal::KMlocal& theAlg,
214  LogLevel ll = PROGRESS); // the algorithm
215 
216  /*********************** Private Variables **************************/
217  private:
218  // was k-means executed successfully
219  bool is_executed_;
220 
221  // The data points in STL format
222  IMP::FloatsList STLDataPts_;
223 
224  // data points in wrapped internal::KMdata structure
225  // (should be synced from STLDataPts_ before usage)
226  Pointer<internal::KMdata> pKMDataPts_;
227 
228  // was STL data updated to wrapped internal::KMdata points
229  bool is_KM_data_synced_;
230 
231  // the center points from a clustering execution
233 
234  // cluster of each point
235  IMP::Ints centerAssignments_;
236 
237  // data points squared distances from respective cluster centers
238  IMP::Floats ptsSqrDistToCenters_;
239 
240  //----------------------------------------------------------------------
241  // Termination conditions
242  // These are explained in the files internal/KMterm.h and internal/KMlocal.h
243  // Unless you are into fine tuning, don't worry about changing these.
244  //----------------------------------------------------------------------
245  internal::KMterm terminationConditions_;
246 
247 }; /*********************** class KMeans **************************/
248 
249 IMPKMEANS_END_NAMESPACE
250 
251 #endif /* IMPKMEANS_KMEANS_H */
Helper macros for implementing IMP Objects.
Basic types used by IMP.
#define IMP_OBJECT_METHODS(Name)
Define the basic things needed by any Object.
Definition: object_macros.h:25
LogLevel
The log levels supported by IMP.
Definition: enums.h:19
Helper macros for writing doxygen documentation.
IMP::Ints get_assignments() const
Returns the cluster assignment of each data point.
IMP::Floats get_squared_distance_to_centers() const
Common base class for heavy weight IMP objects.
Definition: Object.h:111
Macros to control compiler warnings.
unsigned int get_number_of_centers() const
Basic enumeration types used by IMP.
A nullptr-initialized pointer to an IMP Object.
A shared base class to help in debugging and things.
unsigned int get_number_of_data_points() const