New k-means: Removed argument point id from argument list

Jira: MADLIB-522

Now that we have a function to sample vectors directly, it is unnecessary to supply a point id argument to k-means (it was only used for weighted sampling in k-means++). This should improve performance and usability.

Other changes:
- Added return type to closest_column. On Greenplum, we would otherwise see an error of the form "cannot serialize transient record type".
- Adapted k-means unit tests.
- New unit test for kmeans with initial centroids provided in table
- Fixed oversight that squaredDistNorm1 did not return the square of the 1-norm.
- C++ AL: Fixed bug in NativeArrayToMappedMatrix() and NativeArrayToMappedVector()
apache · Sep 14, 2012 · e6e99f1 · e6e99f1
1 parent c81c018
commit e6e99f1
Show file tree

Hide file tree

Showing 12 changed files with 131 additions and 134 deletions.
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
@@ -37,7 +37,7 @@ function(define_m4_macros OUT_M4_CMD_LINE OUT_M4_CODE)
     )
     list_replace("^(.+)$" "-D\\\\1" ${OUT_M4_CMD_LINE} ${MACROS})
     list_replace("^([^=]+)$" "m4_define(`\\\\1')" ${OUT_M4_CODE} ${MACROS})
-    list_replace("^([^=]+)=(.+)$" "m4_define(`\\\\1', ``\\\\2'')" ${OUT_M4_CODE}
+    list_replace("^([^=]+)=(.+)$" "m4_define(`\\\\1', `\\\\2')" ${OUT_M4_CODE}
         ${${OUT_M4_CODE}})
     string(REGEX REPLACE ";" "\\n" ${OUT_M4_CODE} "${${OUT_M4_CODE}}")
 

diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
@@ -56,6 +56,9 @@ and included third-party libraries can be found inside the
         @defgroup grp_kmeans k-Means Clustering
         @ingroup grp_unsuplearn
 
+        @defgroup grp_kmeans_new k-Means Clustering (new implementation)
+        @ingroup grp_unsuplearn
+
         @defgroup grp_lmf Low-rank Matrix Factorization
         @ingroup grp_unsuplearn
 

diff --git a/src/modules/linalg/metric.cpp b/src/modules/linalg/metric.cpp
@@ -45,7 +45,8 @@ squaredDistNorm1(
     const MappedColumnVector& inX,
     const MappedColumnVector& inY) {
 
-    return (inX - inY).lpNorm<1>();
+    double l1norm = (inX - inY).lpNorm<1>();
+    return l1norm * l1norm;
 }
 
 double

diff --git a/src/modules/sample/WeightedSample_impl.hpp b/src/modules/sample/WeightedSample_impl.hpp
@@ -86,8 +86,9 @@ prepareSample(
     WeightedSampleAccumulator<Container, MappedColumnVector>& ioAccumulator,
     const MappedColumnVector& inX) {
 
-    if (inX.size() != ioAccumulator.header.width) {
-        ioAccumulator.header.width = static_cast<uint32_t>(inX.size());
+    uint32_t width = static_cast<uint32_t>(inX.size());
+    if (width != ioAccumulator.header.width) {
+        ioAccumulator.header.width = width;
         ioAccumulator.resize();
     }
 }

diff --git a/src/ports/greenplum/4.0/config/Modules.yml b/src/ports/greenplum/4.0/config/Modules.yml
@@ -14,15 +14,19 @@ modules:
       depends: ['sketch']
 #    - name: cart
     - name: kmeans
-      depends: ['array_ops','sample','svec','utilities']
+      depends: ['array_ops','svec']
+    - name: kmeans_new
+      depends: ['sample']
     - name: kernel_machines
       depends: ['svec']
     - name: linalg
     - name: plda
     - name: prob
     - name: quantile
     - name: regress
+      depends: ['utilities']
     - name: sample
+      depends: ['utilities']
     - name: sketch
     - name: stats
     - name: svd_mf

diff --git a/src/ports/postgres/dbconnector/AnyType_impl.hpp b/src/ports/postgres/dbconnector/AnyType_impl.hpp
@@ -194,7 +194,7 @@ AnyType::getAs() const {
         throw std::invalid_argument(errorMsg.str());
     }
 
-    if (mDatum) {
+    if (mContent.empty()) {
         bool needMutableClone = (TypeTraits<T>::isMutable && !mIsMutable);
         return TypeTraits<T>::toCXXType(mDatum, needMutableClone, mSysInfo);
     } else {

diff --git a/src/ports/postgres/dbconnector/EigenIntegration_impl.hpp b/src/ports/postgres/dbconnector/EigenIntegration_impl.hpp
@@ -235,7 +235,7 @@ NativeArrayToMappedMatrix(Datum inDatum, bool inNeedMutableClone) {
 
     ArrayType* array = reinterpret_cast<ArrayType*>(
         madlib_DatumGetArrayTypeP(inDatum));
-    size_t arraySize = ARR_DIMS(array)[0];
+    size_t arraySize = ARR_DIMS(array)[0] * ARR_DIMS(array)[1];
 
     if (ARR_NDIM(array) != 2) {
         std::stringstream errorMsg;
@@ -270,9 +270,14 @@ NativeArrayToMappedVector(Datum inDatum, bool inNeedMutableClone) {
 
     ArrayType* array = reinterpret_cast<ArrayType*>(
         madlib_DatumGetArrayTypeP(inDatum));
-    size_t arraySize = ARR_DIMS(array)[0] * ARR_DIMS(array)[1];
+    size_t arraySize = ARR_NDIM(array) == 1
+        ? ARR_DIMS(array)[0]
+        : ARR_DIMS(array)[0] * ARR_DIMS(array)[1];
+
+    if (!(ARR_NDIM(array) == 1
+        || (ARR_NDIM(array) == 2
+            && (ARR_DIMS(array)[0] == 1 || ARR_DIMS(array)[1] == 1)))) {
 
-    if (ARR_NDIM(array) != 1) {
         std::stringstream errorMsg;
         errorMsg << "Invalid type conversion to matrix. Expected one-"
             "dimensional array but got " << ARR_NDIM(array)

diff --git a/src/ports/postgres/modules/kmeans_new/kmeans_new.py_in b/src/ports/postgres/modules/kmeans_new/kmeans_new.py_in
@@ -1,11 +1,12 @@
 # coding=utf-8
+m4_changequote(<!,!>)
 
 """
-@file kmeans.py_in
+@file kmeans_new.py_in
 
 @brief k-Means: Driver functions
 
-@namespace kmeans
+@namespace kmeans_new
 
 @brief k-Means: Driver functions
 """
@@ -182,7 +183,7 @@ class IterationController:
                 """.format(iteration = self.iteration, **self.kwargs))
 
 def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
-    expr_id, expr_point, **kwargs):
+    expr_point, **kwargs):
     """
     Driver function for k-Means++ seeding
 
@@ -192,7 +193,6 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
     @rel_state Name of the (temporary) table containing the inter-iteration
         states
     @param rel_source Name of the relation containing input points
-    @param expr_id Expression containing the unique identifiers
     @param expr_point Expression containing the point coordinates
     @param kwargs We allow the caller to specify additional arguments (all of
         which will be ignored though). The purpose of this is to allow the
@@ -208,46 +208,36 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
         truncAfterIteration = True,
         schema_madlib = schema_madlib, # Identifiers start here
         rel_source = rel_source,
-        expr_id = expr_id,
         expr_point = expr_point)
     with iterationCtrl as it:
         if it.test("_args.initial_centroids IS NULL"):
             it.update("""
-                SELECT ARRAY[(
-                    SELECT CAST(_src.{expr_point} AS DOUBLE PRECISION[])
-                    FROM {rel_source} AS _src
-                    WHERE _src.{expr_id} = (
-                        SELECT {schema_madlib}.weighted_sample(_src.{expr_id}, 1)
-                        FROM {rel_source} AS _src
-                    )
-                )]
+                SELECT
+                    ARRAY[{schema_madlib}.weighted_sample(_src.{expr_point}, 1)]
+                FROM {rel_source} AS _src
                 """)
         else:
             it.update("""
                 SELECT _args.initial_centroids FROM {rel_args} AS _args
                 """)
         while it.test("array_upper(_state._state, 1) < _args.k"):
             it.update("""
-                SELECT _state._state || _src.{expr_point}
-                FROM {rel_source} AS _src, {rel_state} AS _state
-                WHERE
-                    _src.{expr_id} = (
-                        SELECT
-                            {schema_madlib}.weighted_sample(
-                                _src.{expr_id},
-                                ({schema_madlib}.closest_column(
-                                    _state._state,
-                                    _src.{expr_point},
-                                    _args.fn_squared_dist
-                                )).distance
-                            )
-                        FROM
-                            {rel_source} AS _src, {rel_args} AS _args,
-                            {rel_state} AS _state
-                        WHERE
-                            _state._iteration = {iteration}
-                    )
-                    AND _state._iteration = {iteration}
+                SELECT
+                    (
+                        SELECT _state FROM {rel_state}
+                        WHERE _iteration = {iteration}
+                    ) || {schema_madlib}.weighted_sample(
+                            _src.{expr_point},
+                            ({schema_madlib}.closest_column(
+                                (
+                                    SELECT _state FROM {rel_state}
+                                    WHERE _iteration = {iteration}
+                                ),
+                                _src.{expr_point},
+                                (SELECT fn_squared_dist FROM {rel_args})
+                            )).distance
+                        )
+                FROM {rel_source} AS _src
                 """)
     return iterationCtrl.iteration
 
@@ -313,7 +303,7 @@ def compute_kmeans_random_seeding(schema_madlib, rel_args, rel_state,
             m = it.evaluate("_args.k - coalesce(array_upper(_state._state, 1), 0)")
     return iterationCtrl.iteration
 
-def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
+def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source,
     expr_point, agg_mean, **kwargs):
     """
     Driver function for Lloyd's k-means local-search heuristic
@@ -324,7 +314,6 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
     @rel_state Name of the (temporary) table containing the inter-iteration
         states
     @param rel_source Name of the relation containing input points
-    @param expr_id Expression containing the unique identifiers
     @param expr_point Expression containing the point coordinates
     @param kwargs We allow the caller to specify additional arguments (all of
         which will be ignored though). The purpose of this is to allow the
@@ -341,7 +330,6 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
         truncAfterIteration = False,
         schema_madlib = schema_madlib,
         rel_source = rel_source,
-        expr_id = expr_id,
         expr_point = expr_point,
         agg_mean = agg_mean)
     with iterationCtrl as it:
@@ -360,6 +348,9 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
                 SELECT
                     CAST((
                         {schema_madlib}.matrix_agg(_centroid),
+m4_ifdef(<!__GREENPLUM__!>,<!m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,,<!
+                        {schema_madlib}.
+!>)!>)
                         array_agg(_new_centroid_id),
                         sum(_objective_fn),
                         CAST(sum(_num_reassigned) AS DOUBLE PRECISION)
@@ -419,7 +410,6 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
                         CAST(((_state._state).centroids
                             || {schema_madlib}.kmeanspp_seeding(
                                 '{rel_source}',
-                                '{expr_id}',
                                 '{expr_point}',
                                 CAST(_args.k AS INT2),
                                 textin(regprocout(_args.fn_squared_dist)),
@@ -430,3 +420,5 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
                     WHERE _state._iteration = {iteration}
                     """)
     return iterationCtrl.iteration
+
+m4_changequote(<!`!>,<!'!>)