Skip to content

Commit

Permalink
Added a sampling module, currently containing only weighted_sample().
Browse files Browse the repository at this point in the history
Jira: MADLIB-584

Sampling module:
- Added a data-parallel UDA for weighted sampling of a single row
C++ AL:
- Added a boost/TR1-style front-end to PostgreSQL's random number generator.
  • Loading branch information
Florian Schoppmann authored and Florian Schoppmann committed Jul 19, 2012
1 parent 5ef0cd5 commit 1afb10f
Show file tree
Hide file tree
Showing 16 changed files with 385 additions and 14 deletions.
28 changes: 14 additions & 14 deletions doc/design/modules/sampling.tex
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ \subsection{Generating a Random Variate According to a Discrete Probability Dist

\subsubsection{Formal Description}

\begin{algorithm}[DiscreteSample$(A, w)$] \label{alg:DiscreteSample}
\begin{algorithm}[WeightedSample$(A, w)$] \label{alg:WeightedSample}
\alginput{Finite set $A$, Mapping $w$ of each element $a \in A$ to its weight $w[a] \geq 0$}
\algoutput{Random element $\Sample \in A$ sampled according to distribution induced by $w$}
\begin{algorithmic}[1]
\State $W \set 0$
\For{$a \in A$}
\State $W \set W + w[a]$ \label{alg:DiscreteSample:UpdateWeight}
\With{probability $\frac{w[a]}{W}$} \label{alg:DiscreteSample:Prob}
\State $\Sample \set a$ \label{alg:DiscreteSample:SetSample}
\State $W \set W + w[a]$ \label{alg:WeightedSample:UpdateWeight}
\With{probability $\frac{w[a]}{W}$} \label{alg:WeightedSample:Prob}
\State $\Sample \set a$ \label{alg:WeightedSample:SetSample}
\EndWith
\EndFor
\end{algorithmic}
Expand All @@ -79,7 +79,7 @@ \subsubsection{Formal Description}
\item[Correctness]
Let $a_1, \dots, a_n$ be the order in which the algorithm processes the elements. Denote by $\Sample_t$ the value of $\Sample$ at the end of iteration $t \in \Nupto n$. We prove by induction over $t$ that it holds for all $i \in \Nupto t$ that $\Pr[\Sample_t = a_i] = \frac{w[a_i]}{W_t}$ where $W_t := \sum_{j=1}^t w[a_j]$.

The base case $t = 1$ holds immediately by lines~\ref{alg:DiscreteSample:Prob}--\ref{alg:DiscreteSample:SetSample}. To see the induction step $t - 1 \to t$, note that $\Pr[\Sample_t = a_t] = \frac{w[a_t]}{W_t}$ (again by lines~\ref{alg:DiscreteSample:Prob}--\ref{alg:DiscreteSample:SetSample}) and that for all $i \in \Nupto{t-1}$
The base case $t = 1$ holds immediately by lines~\ref{alg:WeightedSample:Prob}--\ref{alg:WeightedSample:SetSample}. To see the induction step $t - 1 \to t$, note that $\Pr[\Sample_t = a_t] = \frac{w[a_t]}{W_t}$ (again by lines~\ref{alg:WeightedSample:Prob}--\ref{alg:WeightedSample:SetSample}) and that for all $i \in \Nupto{t-1}$
\begin{align*}
\Pr[\Sample_t = a_i]
= \Pr[\Sample_t \neq a_t] \cdot \Pr[\Sample_{t-1} = a_i]
Expand All @@ -92,17 +92,17 @@ \subsubsection{Formal Description}
The algorithm can easily be transformed into a divide-and-conquer algorithm, as shown in the following.
\end{description}

\begin{algorithm}[RecursiveDiscreteSample$(A_1, A_2, w)$] \label{alg:RecursiveDiscreteSample}
\begin{algorithm}[RecursiveWeightedSample$(A_1, A_2, w)$] \label{alg:RecursiveWeightedSample}
\alginput{Disjoint finite sets $A_1, A_2$, Mapping $w$ of each element $a \in A_1 \cup A_2$ to its weight $w[a] \geq 0$}
\algoutput{Random element $\Sample \in A_1 \cup A_2$ sampled according to distribution induced by $w$}
\begin{algorithmic}[1]
\State $\tilde A \set \emptyset$
\For{$i = 1,2$}
\State $\Sample_i \set \texttt{DiscreteSample}(A_i, w)$
\State $\Sample_i \set \texttt{WeightedSample}(A_i, w)$
\State $\tilde A \set \tilde A \cup \{ \Sample_i \}$
\State $\tilde w[\Sample_i] \set \sum_{a \in A_i} w[a]$
\EndFor
\State $\Sample \set \texttt{DiscreteSample}(\tilde A, \tilde w)$
\State $\Sample \set \texttt{WeightedSample}(\tilde A, \tilde w)$
\end{algorithmic}
\end{algorithm}

Expand All @@ -113,12 +113,12 @@ \subsubsection{Formal Description}
\paragraph{Numerical Considerations}

\begin{itemize}
\item When Algorithm~\ref{alg:DiscreteSample} is used for large sets $A$, line~\ref{alg:DiscreteSample:UpdateWeight} will eventually add two numbers that are very different in size. Compensated summation should be used \cite{ORO05a}.
\item When Algorithm~\ref{alg:WeightedSample} is used for large sets $A$, line~\ref{alg:WeightedSample:UpdateWeight} will eventually add two numbers that are very different in size. Compensated summation should be used \cite{ORO05a}.
\end{itemize}

\subsubsection{Implementation as User-Defined Aggregate}

Algorithm~\ref{alg:DiscreteSample} is implemented as the user-defined aggregate \symlabel{discrete\_sample}{sym:discrete_sample}. Data-parallelism is implemented as in Algorithm~\ref{alg:RecursiveDiscreteSample}.
Algorithm~\ref{alg:WeightedSample} is implemented as the user-defined aggregate \symlabel{weighted\_sample}{sym:weighted_sample}. Data-parallelism is implemented as in Algorithm~\ref{alg:RecursiveWeightedSample}.

\paragraph{Input} The aggregate expects the following arguments:

Expand Down Expand Up @@ -154,19 +154,19 @@ \subsubsection{Implementation as User-Defined Aggregate}
\textbf{Field Name} & \textbf{Description} & \textbf{Type}
\\\otoprule
\texttt{weight\_sum} &
corresponds to $W$ in Algorithm~\ref{alg:DiscreteSample} &
corresponds to $W$ in Algorithm~\ref{alg:WeightedSample} &
floating-point
\\\midrule
\texttt{sample\_id} &
corresponds to $\Sample$ in Algorithm~\ref{alg:DiscreteSample}, takes value of column \texttt{id} &
corresponds to $\Sample$ in Algorithm~\ref{alg:WeightedSample}, takes value of column \texttt{id} &
integer
\\\bottomrule
\end{tabularx}
\end{center}
\item Transition Function (\texttt{state, id, weight}): Lines~\ref{alg:DiscreteSample:UpdateWeight}--\ref{alg:DiscreteSample:SetSample} of Algorithm~\ref{alg:DiscreteSample}
\item Transition Function (\texttt{state, id, weight}): Lines~\ref{alg:WeightedSample:UpdateWeight}--\ref{alg:WeightedSample:SetSample} of Algorithm~\ref{alg:WeightedSample}
\item Merge Function (\texttt{state1, state2}): It is enough to call the transition function with arguments (\texttt{state1, state2.sample\_id, state2.weight\_sum})
\end{itemize}

\paragraph{Tool Set}

While the user-defined aggregate is simple enough to be implemented in plain SQL, we choose a C++ implementation for performance. Moreover, a C++ implementation allows us to reuse the existing compensated-sum component. \todo{Not documented yet.}
While the user-defined aggregate is simple enough to be implemented in plain SQL, we choose a C++ implementation for performance. \todo{In the future, we want to use compensated summation. (Not documented yet.)}
3 changes: 3 additions & 0 deletions doc/mainpage.dox.in
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ and included third-party libraries can be found inside the
@defgroup grp_textfex_viterbi Feature Extraction
@ingroup grp_support

@defgroup grp_sample Random Sampling
@ingroup grp_support

@defgroup grp_compatibiity Compatibility
@ingroup grp_support

Expand Down
1 change: 1 addition & 0 deletions src/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ modules:
- name: prob
- name: quantile
- name: regress
- name: sample
- name: sketch
- name: stats
depends: ['utilities']
Expand Down
1 change: 1 addition & 0 deletions src/modules/declarations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
#include "linalg/linalg.hpp"
#include "prob/prob.hpp"
#include "regress/regress.hpp"
#include "sample/sample.hpp"
#include "stats/stats.hpp"
9 changes: 9 additions & 0 deletions src/modules/sample/sample.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/* -----------------------------------------------------------------------------
*
* @file sample.hpp
*
* @brief Umbrella header that includes all sampling headers
*
* -------------------------------------------------------------------------- */

#include "weighted_sample.hpp"
115 changes: 115 additions & 0 deletions src/modules/sample/weighted_sample.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/* ----------------------------------------------------------------------- *//**
*
* @file weighted_sample.cpp
*
* @brief Generate a single weighted random sample
*
*//* ----------------------------------------------------------------------- */

#include <dbconnector/dbconnector.hpp>
#include <modules/shared/HandleTraits.hpp>

#include <boost/tr1/random.hpp>

#include "weighted_sample.hpp"

// Import TR1 names (currently used from boost). This can go away once we make
// the switch to C++11.
// NOTE(review): adding declarations to namespace std is technically undefined
// behavior per the C++ standard ([namespace.std]); it works with the compilers
// targeted here, but consider qualifying uses as std::tr1::bernoulli_distribution
// instead.
namespace std {
    using tr1::bernoulli_distribution;
}

namespace madlib {

namespace modules {

namespace sample {

/**
 * @brief Transition state for weighted sample
 *
 * The state is backed by a DOUBLE PRECISION array of length 2:
 * - element 0: running sum of weights seen so far (W in the design document)
 * - element 1: identifier of the currently selected sample row
 *
 * Note: We assume that the DOUBLE PRECISION array is initialized by the
 * database with length 2, and all elements are 0.
 */
template <class Handle>
class WeightedSampleTransitionState {
public:
    // Bind the typed reference members to the two slots of the backing array
    WeightedSampleTransitionState(const AnyType &inArray)
        : mStorage(inArray.getAs<Handle>()),
        sample_id(&mStorage[1]),
        weight_sum(&mStorage[0]) { }

    // Implicit conversion back to AnyType, so a state object can be returned
    // directly from a UDF
    inline operator AnyType() const {
        return mStorage;
    }

private:
    Handle mStorage;

public:
    // identifier of the row currently selected as the sample (stored in
    // element 1 of the array)
    typename HandleTraits<Handle>::ReferenceToInt64 sample_id;
    // running sum of all weights processed so far (element 0 of the array)
    typename HandleTraits<Handle>::ReferenceToDouble weight_sum;
};

/**
 * @brief Perform the weighted-sample transition step
 *
 * Arguments: (state, id, weight). The new row's weight is added to the
 * running total, and the row becomes the current sample with probability
 * weight / weight_sum — yielding a sample distributed according to the
 * weights of all rows seen so far.
 *
 * @param args state (mutable double array), row identifier, row weight
 * @return the updated transition state
 */
AnyType
weighted_sample_transition::run(AnyType& args) {
    WeightedSampleTransitionState<MutableArrayHandle<double> > state = args[0];
    // Keep the identifier signed end-to-end: it is read as int64 and stored
    // in an Int64 state field, so an unsigned intermediate would only add an
    // implicit signed/unsigned round-trip.
    int64_t identifier = args[1].getAs<int64_t>();
    double weight = args[2].getAs<double>();

    // Instead of throwing an error, we simply ignore rows whose weight is not
    // strictly positive (this skips negative, zero, and NaN weights alike)
    if (weight > 0.) {
        state.weight_sum += weight;
        std::bernoulli_distribution success(weight / state.weight_sum);
        // Note that a NativeRandomNumberGenerator object is stateless, so it
        // is not a problem to instantiate an object for each RN generation...
        NativeRandomNumberGenerator generator;
        if (success(generator))
            state.sample_id = identifier;
    }

    return state;
}

/**
 * @brief Perform the merging of two transition states
 *
 * The right state's (sample_id, weight_sum) pair is treated like a single
 * pseudo-row fed into the left state, exactly as in the transition step.
 *
 * @param args left state (mutable), right state (read-only)
 * @return the merged (left) state
 */
AnyType
weighted_sample_merge::run(AnyType &args) {
    WeightedSampleTransitionState<MutableArrayHandle<double> > stateLeft
        = args[0];
    WeightedSampleTransitionState<ArrayHandle<double> > stateRight = args[1];

    // If the right state has not accumulated any weight, keep the left
    // sample as-is. Besides skipping a no-op, this avoids computing 0/0
    // (NaN) when both states are still empty, which would be an invalid
    // parameter for bernoulli_distribution.
    if (stateRight.weight_sum > 0.) {
        // FIXME: Once we have more modular states (independent of
        // transition/merge function), implement using the logic from the
        // transition function
        stateLeft.weight_sum += stateRight.weight_sum;
        std::bernoulli_distribution success(
            stateRight.weight_sum / stateLeft.weight_sum);
        // Note that a NativeRandomNumberGenerator object is stateless, so it
        // is not a problem to instantiate an object for each RN generation...
        NativeRandomNumberGenerator generator;
        if (success(generator))
            stateLeft.sample_id = stateRight.sample_id;
    }

    return stateLeft;
}

/**
 * @brief Perform the weighted-sample final step
 *
 * Simply extracts the identifier of the sampled row from the accumulated
 * transition state and returns it as a 64-bit integer.
 */
AnyType
weighted_sample_final::run(AnyType &args) {
    WeightedSampleTransitionState<ArrayHandle<double> > finalState = args[0];
    return static_cast<int64_t>(finalState.sample_id);
}

} // namespace sample

} // namespace modules

} // namespace madlib
20 changes: 20 additions & 0 deletions src/modules/sample/weighted_sample.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* ----------------------------------------------------------------------- *//**
*
* @file weighted_sample.hpp
*
*//* ----------------------------------------------------------------------- */

/**
 * @brief Weighted random sample: Transition function
 *
 * Folds one (id, weight) row into the running state.
 */
DECLARE_UDF(sample, weighted_sample_transition)

/**
 * @brief Weighted random sample: State merge function
 *
 * Combines two partial states produced on different segments.
 */
DECLARE_UDF(sample, weighted_sample_merge)

/**
 * @brief Weighted random sample: Final function
 *
 * Returns the sampled row identifier from the state.
 */
DECLARE_UDF(sample, weighted_sample_final)
1 change: 1 addition & 0 deletions src/ports/greenplum/4.0/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ modules:
- name: prob
- name: quantile
- name: regress
- name: sample
- name: sketch
# - name: stats
# depends: ['utilities']
Expand Down
2 changes: 2 additions & 0 deletions src/ports/greenplum/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ list(APPEND MAD_DBAL_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/FunctionHandle_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/main.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/NewDelete.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/NativeRandomNumberGenerator_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/NativeRandomNumberGenerator_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/OutputStreamBuffer_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/OutputStreamBuffer_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/PGException_proto.hpp"
Expand Down
1 change: 1 addition & 0 deletions src/ports/postgres/8.4/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ modules:
- name: prob
- name: quantile
- name: regress
- name: sample
- name: sketch
# - name: stats
# depends: ['utilities']
Expand Down
2 changes: 2 additions & 0 deletions src/ports/postgres/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ list(APPEND MAD_DBAL_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/FunctionHandle_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/main.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/NewDelete.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/NativeRandomNumberGenerator_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/NativeRandomNumberGenerator_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/OutputStreamBuffer_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/OutputStreamBuffer_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/PGException_proto.hpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/* ----------------------------------------------------------------------- *//**
*
* @file NativeRandomNumberGenerator_impl.hpp
*
*//* ----------------------------------------------------------------------- */

#ifndef MADLIB_POSTGRES_NATIVERANDOMNUMBERGENERATOR_IMPL_HPP
#define MADLIB_POSTGRES_NATIVERANDOMNUMBERGENERATOR_IMPL_HPP

namespace madlib {

namespace dbconnector {

namespace postgres {

// Trivial default constructor: the generator holds no state of its own and
// merely forwards to the backend's random-number facility.
inline
NativeRandomNumberGenerator::NativeRandomNumberGenerator() { }

/**
 * @brief Sets the current state of the engine
 *
 * All this function does is call the backend's setseed() function.
 *
 * NOTE(review): PostgreSQL's setseed() expects its argument in [-1, 1] —
 * presumably callers pass values in that range; verify before relying on
 * out-of-range seeds.
 */
inline
void
NativeRandomNumberGenerator::seed(result_type inSeed) {
    DirectFunctionCall1(setseed, Float8GetDatum(inSeed));
}

/**
 * @brief Advances the engine's state and returns the generated value
 *
 * Delegates to the backend's drandom() function and converts the resulting
 * Datum to a double. The [0, 1) range implied by min()/max() below is the
 * backend's contract — confirm against the backend documentation.
 */
inline
NativeRandomNumberGenerator::result_type
NativeRandomNumberGenerator::operator()() {
    // There seems to be no DirectFunctionCall0(), so pass a dummy argument.
    return DatumGetFloat8(DirectFunctionCall1(drandom, Datum(0)));
}

/**
 * @brief Return tight lower bound on the set of all values returned by
 *     <tt>operator()</tt>
 *
 * Required by the boost/TR1 uniform random number generator concept; the
 * backend generator never produces values below 0.
 */
inline
NativeRandomNumberGenerator::result_type
NativeRandomNumberGenerator::min() {
    return 0.0;
}

/**
 * @brief Return smallest representable number larger than maximum of all values
 *     returned by <tt>operator()</tt>
 *
 * Boost specifies, if \c result_type is not integer, then return "the smallest
 * representable number larger than the tight upper bound on the set of all
 * values returned by operator(). In any case, the return value of this function
 * shall not change during the lifetime of the object."
 * http://www.boost.org/doc/libs/1_50_0/doc/html/boost_random/reference.html
 *
 * Since the backend generator produces values strictly below 1, returning 1.0
 * satisfies this requirement.
 */
inline
NativeRandomNumberGenerator::result_type
NativeRandomNumberGenerator::max() {
    return 1.0;
}

} // namespace postgres

} // namespace dbconnector

} // namespace madlib

#endif // defined(MADLIB_POSTGRES_NATIVERANDOMNUMBERGENERATOR_IMPL_HPP)
Loading

0 comments on commit 1afb10f

Please sign in to comment.