Skip to content

Commit

Permalink
New k-means: Removed the point-id argument from the argument list
Browse files Browse the repository at this point in the history
Jira: MADLIB-522

Now that we have a function to sample vectors directly, it is unnecessary to
supply a point id argument to k-means (it was only used for weighted sampling
in k-means++). This should improve performance and usability.

Other changes:
- Added return type to closest_column. On Greenplum, we would otherwise see an
  error of the form "cannot serialize transient record type".
- Adapted k-means unit tests.
- Added a unit test for k-means with initial centroids provided in a table
- Fixed an oversight in squaredDistNorm1: it returned the 1-norm itself instead
  of the square of the 1-norm.
- C++ AL: Fixed the array-size computation in NativeArrayToMappedMatrix() and
  NativeArrayToMappedVector()
  • Loading branch information
Florian Schoppmann authored and Florian Schoppmann committed Sep 14, 2012
1 parent c81c018 commit e6e99f1
Show file tree
Hide file tree
Showing 12 changed files with 131 additions and 134 deletions.
2 changes: 1 addition & 1 deletion cmake/Utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ function(define_m4_macros OUT_M4_CMD_LINE OUT_M4_CODE)
)
list_replace("^(.+)$" "-D\\\\1" ${OUT_M4_CMD_LINE} ${MACROS})
list_replace("^([^=]+)$" "m4_define(`\\\\1')" ${OUT_M4_CODE} ${MACROS})
list_replace("^([^=]+)=(.+)$" "m4_define(`\\\\1', ``\\\\2'')" ${OUT_M4_CODE}
list_replace("^([^=]+)=(.+)$" "m4_define(`\\\\1', `\\\\2')" ${OUT_M4_CODE}
${${OUT_M4_CODE}})
string(REGEX REPLACE ";" "\\n" ${OUT_M4_CODE} "${${OUT_M4_CODE}}")

Expand Down
3 changes: 3 additions & 0 deletions doc/mainpage.dox.in
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ and included third-party libraries can be found inside the
@defgroup grp_kmeans k-Means Clustering
@ingroup grp_unsuplearn

@defgroup grp_kmeans_new k-Means Clustering (new implementation)
@ingroup grp_unsuplearn

@defgroup grp_lmf Low-rank Matrix Factorization
@ingroup grp_unsuplearn

Expand Down
3 changes: 2 additions & 1 deletion src/modules/linalg/metric.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ squaredDistNorm1(
const MappedColumnVector& inX,
const MappedColumnVector& inY) {

return (inX - inY).lpNorm<1>();
double l1norm = (inX - inY).lpNorm<1>();
return l1norm * l1norm;
}

double
Expand Down
5 changes: 3 additions & 2 deletions src/modules/sample/WeightedSample_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,9 @@ prepareSample(
WeightedSampleAccumulator<Container, MappedColumnVector>& ioAccumulator,
const MappedColumnVector& inX) {

if (inX.size() != ioAccumulator.header.width) {
ioAccumulator.header.width = static_cast<uint32_t>(inX.size());
uint32_t width = static_cast<uint32_t>(inX.size());
if (width != ioAccumulator.header.width) {
ioAccumulator.header.width = width;
ioAccumulator.resize();
}
}
Expand Down
6 changes: 5 additions & 1 deletion src/ports/greenplum/4.0/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,19 @@ modules:
depends: ['sketch']
# - name: cart
- name: kmeans
depends: ['array_ops','sample','svec','utilities']
depends: ['array_ops','svec']
- name: kmeans_new
depends: ['sample']
- name: kernel_machines
depends: ['svec']
- name: linalg
- name: plda
- name: prob
- name: quantile
- name: regress
depends: ['utilities']
- name: sample
depends: ['utilities']
- name: sketch
- name: stats
- name: svd_mf
Expand Down
2 changes: 1 addition & 1 deletion src/ports/postgres/dbconnector/AnyType_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ AnyType::getAs() const {
throw std::invalid_argument(errorMsg.str());
}

if (mDatum) {
if (mContent.empty()) {
bool needMutableClone = (TypeTraits<T>::isMutable && !mIsMutable);
return TypeTraits<T>::toCXXType(mDatum, needMutableClone, mSysInfo);
} else {
Expand Down
11 changes: 8 additions & 3 deletions src/ports/postgres/dbconnector/EigenIntegration_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ NativeArrayToMappedMatrix(Datum inDatum, bool inNeedMutableClone) {

ArrayType* array = reinterpret_cast<ArrayType*>(
madlib_DatumGetArrayTypeP(inDatum));
size_t arraySize = ARR_DIMS(array)[0];
size_t arraySize = ARR_DIMS(array)[0] * ARR_DIMS(array)[1];

if (ARR_NDIM(array) != 2) {
std::stringstream errorMsg;
Expand Down Expand Up @@ -270,9 +270,14 @@ NativeArrayToMappedVector(Datum inDatum, bool inNeedMutableClone) {

ArrayType* array = reinterpret_cast<ArrayType*>(
madlib_DatumGetArrayTypeP(inDatum));
size_t arraySize = ARR_DIMS(array)[0] * ARR_DIMS(array)[1];
size_t arraySize = ARR_NDIM(array) == 1
? ARR_DIMS(array)[0]
: ARR_DIMS(array)[0] * ARR_DIMS(array)[1];

if (!(ARR_NDIM(array) == 1
|| (ARR_NDIM(array) == 2
&& (ARR_DIMS(array)[0] == 1 || ARR_DIMS(array)[1] == 1)))) {

if (ARR_NDIM(array) != 1) {
std::stringstream errorMsg;
errorMsg << "Invalid type conversion to matrix. Expected one-"
"dimensional array but got " << ARR_NDIM(array)
Expand Down
66 changes: 29 additions & 37 deletions src/ports/postgres/modules/kmeans_new/kmeans_new.py_in
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# coding=utf-8
m4_changequote(<!,!>)

"""
@file kmeans.py_in
@file kmeans_new.py_in

@brief k-Means: Driver functions

@namespace kmeans
@namespace kmeans_new

@brief k-Means: Driver functions
"""
Expand Down Expand Up @@ -182,7 +183,7 @@ class IterationController:
""".format(iteration = self.iteration, **self.kwargs))

def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
expr_id, expr_point, **kwargs):
expr_point, **kwargs):
"""
Driver function for k-Means++ seeding

Expand All @@ -192,7 +193,6 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
@param rel_state Name of the (temporary) table containing the
inter-iteration states
@param rel_source Name of the relation containing input points
@param expr_id Expression containing the unique identifiers
@param expr_point Expression containing the point coordinates
@param kwargs We allow the caller to specify additional arguments (all of
which will be ignored though). The purpose of this is to allow the
Expand All @@ -208,46 +208,36 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
truncAfterIteration = True,
schema_madlib = schema_madlib, # Identifiers start here
rel_source = rel_source,
expr_id = expr_id,
expr_point = expr_point)
with iterationCtrl as it:
if it.test("_args.initial_centroids IS NULL"):
it.update("""
SELECT ARRAY[(
SELECT CAST(_src.{expr_point} AS DOUBLE PRECISION[])
FROM {rel_source} AS _src
WHERE _src.{expr_id} = (
SELECT {schema_madlib}.weighted_sample(_src.{expr_id}, 1)
FROM {rel_source} AS _src
)
)]
SELECT
ARRAY[{schema_madlib}.weighted_sample(_src.{expr_point}, 1)]
FROM {rel_source} AS _src
""")
else:
it.update("""
SELECT _args.initial_centroids FROM {rel_args} AS _args
""")
while it.test("array_upper(_state._state, 1) < _args.k"):
it.update("""
SELECT _state._state || _src.{expr_point}
FROM {rel_source} AS _src, {rel_state} AS _state
WHERE
_src.{expr_id} = (
SELECT
{schema_madlib}.weighted_sample(
_src.{expr_id},
({schema_madlib}.closest_column(
_state._state,
_src.{expr_point},
_args.fn_squared_dist
)).distance
)
FROM
{rel_source} AS _src, {rel_args} AS _args,
{rel_state} AS _state
WHERE
_state._iteration = {iteration}
)
AND _state._iteration = {iteration}
SELECT
(
SELECT _state FROM {rel_state}
WHERE _iteration = {iteration}
) || {schema_madlib}.weighted_sample(
_src.{expr_point},
({schema_madlib}.closest_column(
(
SELECT _state FROM {rel_state}
WHERE _iteration = {iteration}
),
_src.{expr_point},
(SELECT fn_squared_dist FROM {rel_args})
)).distance
)
FROM {rel_source} AS _src
""")
return iterationCtrl.iteration

Expand Down Expand Up @@ -313,7 +303,7 @@ def compute_kmeans_random_seeding(schema_madlib, rel_args, rel_state,
m = it.evaluate("_args.k - coalesce(array_upper(_state._state, 1), 0)")
return iterationCtrl.iteration

def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source,
expr_point, agg_mean, **kwargs):
"""
Driver function for Lloyd's k-means local-search heuristic
Expand All @@ -324,7 +314,6 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
@param rel_state Name of the (temporary) table containing the
inter-iteration states
@param rel_source Name of the relation containing input points
@param expr_id Expression containing the unique identifiers
@param expr_point Expression containing the point coordinates
@param kwargs We allow the caller to specify additional arguments (all of
which will be ignored though). The purpose of this is to allow the
Expand All @@ -341,7 +330,6 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
truncAfterIteration = False,
schema_madlib = schema_madlib,
rel_source = rel_source,
expr_id = expr_id,
expr_point = expr_point,
agg_mean = agg_mean)
with iterationCtrl as it:
Expand All @@ -360,6 +348,9 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
SELECT
CAST((
{schema_madlib}.matrix_agg(_centroid),
m4_ifdef(<!__GREENPLUM__!>,<!m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,,<!
{schema_madlib}.
!>)!>)
array_agg(_new_centroid_id),
sum(_objective_fn),
CAST(sum(_num_reassigned) AS DOUBLE PRECISION)
Expand Down Expand Up @@ -419,7 +410,6 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
CAST(((_state._state).centroids
|| {schema_madlib}.kmeanspp_seeding(
'{rel_source}',
'{expr_id}',
'{expr_point}',
CAST(_args.k AS INT2),
textin(regprocout(_args.fn_squared_dist)),
Expand All @@ -430,3 +420,5 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source, expr_id,
WHERE _state._iteration = {iteration}
""")
return iterationCtrl.iteration

m4_changequote(<!`!>,<!'!>)
Loading

0 comments on commit e6e99f1

Please sign in to comment.