00001 /** 00002 * \file KMCentersTree.h \brief A tree that handles point partition 00003 * 00004 * Copyright 2007-2010 IMP Inventors. All rights reserved. 00005 * 00006 */ 00007 00008 #ifndef IMPSTATISTICS_KM_CENTERS_TREE_H 00009 #define IMPSTATISTICS_KM_CENTERS_TREE_H 00010 00011 #include "KMData.h" 00012 #include "KMCenters.h" 00013 #include "KMCentersNode.h" 00014 #include "KMCentersNodeLeaf.h" 00015 #include "KMCentersNodeSplit.h" 00016 00017 IMPSTATISTICS_BEGIN_NAMESPACE 00018 00019 #ifndef IMP_DOXYGEN 00020 00021 //! A data structure for efficeint assignment of points to clusters 00022 /** 00023 \unstable{KMCentersTree} 00024 */ 00025 class KMCentersTree { 00026 public: 00027 //! Constructor 00028 /** 00029 \param[in] data_points all data points 00030 \param[in] centers the centers 00031 /parm[in] bb_lo Bounding box left-bottom coordinates (default: compute 00032 bounding box from points). 00033 /parm[in] bb_hi Bounding box right_upper coordiantes (default: compute 00034 bounding box from points). 00035 /note As long as the number of points is nonzero, or if a bounding box 00036 is provided, then the constructor will build a tree with at least one 00037 node (even an empty leaf). Otherwise, it returns with a null tree. 00038 */ 00039 KMCentersTree( KMData *data_points, KMCenters *centers, 00040 KMPoint *bb_lo = NULL, KMPoint *bb_hi = NULL); 00041 00042 00043 //! Compute the neighboring data points of each center 00044 /** This is the heart of the filter-based k-means algorithm. 00045 \param[in] sums an array of center sums to update 00046 \param[in] sums_sqs an array of center sums of squares to update 00047 \param[in] weights the number of points associated with each center 00048 /note From these parameters the final centroid and distortion 00049 (mean squared error) can be computed. This is done by determining the set of 00050 candidates for each node in the tree. 00051 When the number of center candidates for a node is equal to 1 (it cannot be 0) 00052 then all of the points in the subtree rooted at this node are assigned as 00053 neighbors to this center. This means that the centroid and weight for this 00054 cell is added into the neighborhood centroid sum for this center. If this 00055 node is a leaf, then we compute (by brute-force) the distance from each 00056 candidate to each data point, and assign the data point to the closest 00057 center. 00058 */ 00059 void get_neighbors( 00060 KMPointArray *sums, std::vector<double> *sum_sqs,std::vector<int> *weights); 00061 00062 //! Compute assignment of data points to closest center 00063 /** A structural copy of the procedure get_neighbors, but rather than 00064 incrementing the various sums and sums of squares, it simply records 00065 the assignment of each data point to its closest center. Unlike the 00066 filtering search, when only one candidate remains, it does not stop the 00067 search, but continues to traverse all the leaves descended from this node in 00068 order to perform the assignments. 00069 \param[out] close_center will contain the closest center index for 00070 each of the data points 00071 */ 00072 void get_assignments(std::vector<int> &close_center); 00073 00074 ~KMCentersTree(); 00075 //! sample a center point c 00076 /** This implements an approach suggested by Matoushek for sampling a 00077 center point. 00078 1. A node of the tree is selected at random. 00079 1.1 If this is an interior node, a point is sampled uniformly from a 3x 00080 expansion of its bounding rectangle. 00081 1.2 If the node is a leaf, then a data point is sampled at random from 00082 the associated bucket. 00083 */ 00084 00085 //! Sample a center point 00086 /** A node of is selected at random. If this is an interior node, a point is 00087 sampled uniformly from a 3x expansion of the cell about its center. 00088 If the node is a leaf, then a data point is sampled at random from the 00089 associated bucket. 00090 */ 00091 KMPoint sample_center(); 00092 void show(std::ostream &out=std::cout); 00093 protected: 00094 //! Initializes the basic tree elements (without building the tree) 00095 /** 00096 \param[in] bb_lo bounding box low point (optional) 00097 \param[in] bb_hi bounding box high point (optional) 00098 \param[in] p_id point indices (optional) 00099 /note If p_id is NULL then the constructor should initialize the array of 00100 indices 00101 */ 00102 void skeleton_tree(const std::vector<int> &pi, 00103 KMPoint *bb_lo=NULL, KMPoint *bb_hi=NULL); 00104 //!Recursive construction of the tree from a set of points. 00105 /** 00106 \param[in] pa the points 00107 \param[in] pidx point indices to store in subtree 00108 \param[in] n number of points 00109 \param[in] dim the dimension of space 00110 \param[in] bnd_box bounding box for current node 00111 /note The construction is based on a standard algorithm for constructing 00112 the kc-tree (see Friedman, Bentley, and Finkel, ``An algorithm for finding 00113 best matches in logarithmic expected time,'' ACM Transactions on Mathematical 00114 Software, 3(3):209-226, 1977). The procedure operates by a simple 00115 divide-and-conquer strategy, which determines an appropriate orthogonal 00116 cutting plane, and splits the points. When the number of points falls 00117 below 1, we simply store the points in a leaf node's bucket. 00118 This procedure selects a cutting dimension and cutting value, partitions 00119 ps about these values, and returns the number of points on the low side 00120 of the cut. Note that this procedure is not only used for constructing full 00121 trees, but is also used by the insertion routine to rebuild a subtree. 00122 */ 00123 KMCentersNode * build_tree(int start_ind,int end_ind,int level); 00124 00125 //! split a set of points about a cutting plane along a given cutting dimension. 00126 /** 00127 /note split the points whose indexes are stored in p_id [start_ind,end_ind] 00128 \param[in] start_ind the start index 00129 \param[in] end_ind the last index 00130 \param[in] p_id point indexes 00131 \param[in] dim the dimension to split upon 00132 \param[in] cv cutting value 00133 \param[out] a pair (a,b) of breaks such that ps[start_ind,...,a-1] < cv, 00134 ps[a,b-1]=cv and ps[b,end_ind] > cv 00135 /note the partition is stored in p_id 00136 */ 00137 std::pair<int,int> split_by_plane( 00138 int start_ind, int end_ind, int dim, double cv); 00139 00140 //! Split the points according to value of the middle point of 00141 //! a certain dimension 00142 /** 00143 /note Use the midpoint rule by bisecting the longest side. If there are ties, 00144 the dimension with the maximum spread is selected. 00145 If, however, the midpoint split produces a trivial split (no points on one side 00146 of the splitting plane) then we slide the splitting (maintaining its 00147 orientation) until it produces a nontrivial split. For example, if the 00148 splitting plane is along the x-axis, and all the data points have x-coordinate 00149 less than the x-bisector, then the split is taken along the maximum x-coordinate 00150 of the data points. Intuitively, this rule cannot generate trivial splits, 00151 and hence avoids midpt_split's tendency to produce trees with a very large 00152 number of nodes. 00153 \param[in] start_ind the index of the first point in p_id_ 00154 \param[in] end_ind the index of the last point in p_id_ 00155 \param[out] cut_dim the index of cutting dimension 00156 \param[out] cut_val the cutting value 00157 \param[out] n_lo the number of points in the low side 00158 */ 00159 void split_by_mid_point(int start_ind, int end_ind, 00160 int &cut_dim, double &cut_val, int &n_lo); 00161 double get_value(int p_id, int dim) const; 00162 //! Find the bounding rectangle of points represented by 00163 //! data_points_[p_id_[i]] i in [start_ind,end_ind] 00164 KMRectangle* bounding_rectangle(int start_ind,int end_ind); 00165 //! Find the lenght of bounding rectangle of points 00166 //! represented by data_points_[p_id_[i]] i in [start_ind,end_ind] 00167 //! in a certain dimension 00168 double spread(int start_ind, int end_ind,int dim); 00169 //! Find the limits of the edge in the bounding rectangle of points 00170 //! represented by data_points_[p_id_[i]] i in [start_ind,end_ind] in 00171 //! a certain dimension 00172 std::pair<double,double> 00173 limits_along_dimension(int start_ind, int end_ind, int dim); 00174 00175 KMData *data_points_; //all of the data points 00176 KMCenters *centers_; 00177 std::vector<int> p_id_; //the indexes of the data points sorted by 00178 //the plane splitting algorithm 00179 KMCentersNode *root_; 00180 KMRectangle *bnd_box_; 00181 }; 00182 00183 #endif 00184 00185 IMPSTATISTICS_END_NAMESPACE 00186 #endif /* IMPSTATISTICS_KM_CENTERS_TREE_H */