Skip to content

Commit

Permalink
Added a sampling module, currently containing only weighted_sample().
Browse files Browse the repository at this point in the history
Jira: MADLIB-584

Sampling module:
- Added a data-parallel UDA for weighted sampling of a single row
C++ AL:
- Added a boost/TR1-style front-end to PostgreSQL's random number generator.
  • Loading branch information
Florian Schoppmann authored and Florian Schoppmann committed Jul 19, 2012
1 parent 5ef0cd5 commit 1afb10f
Show file tree
Hide file tree
Showing 16 changed files with 385 additions and 14 deletions.
28 changes: 14 additions & 14 deletions doc/design/modules/sampling.tex
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ \subsection{Generating a Random Variate According to a Discrete Probability Dist

\subsubsection{Formal Description}

\begin{algorithm}[DiscreteSample$(A, w)$] \label{alg:DiscreteSample}
\begin{algorithm}[WeightedSample$(A, w)$] \label{alg:WeightedSample}
\alginput{Finite set $A$, Mapping $w$ of each element $a \in A$ to its weight $w[a] \geq 0$}
\algoutput{Random element $\Sample \in A$ sampled according to distribution induced by $w$}
\begin{algorithmic}[1]
\State $W \set 0$
\For{$a \in A$}
\State $W \set W + w[a]$ \label{alg:DiscreteSample:UpdateWeight}
\With{probability $\frac{w[a]}{W}$} \label{alg:DiscreteSample:Prob}
\State $\Sample \set a$ \label{alg:DiscreteSample:SetSample}
\State $W \set W + w[a]$ \label{alg:WeightedSample:UpdateWeight}
\With{probability $\frac{w[a]}{W}$} \label{alg:WeightedSample:Prob}
\State $\Sample \set a$ \label{alg:WeightedSample:SetSample}
\EndWith
\EndFor
\end{algorithmic}
Expand All @@ -79,7 +79,7 @@ \subsubsection{Formal Description}
\item[Correctness]
Let $a_1, \dots, a_n$ be the order in which the algorithm processes the elements. Denote by $\Sample_t$ the value of $\Sample$ at the end of iteration $t \in \Nupto n$. We prove by induction over $t$ that it holds for all $i \in \Nupto t$ that $\Pr[\Sample_t = a_i] = \frac{w[a_i]}{W_t}$ where $W_t := \sum_{j=1}^t w[a_j]$.

The base case $t = 1$ holds immediately by lines~\ref{alg:DiscreteSample:Prob}--\ref{alg:DiscreteSample:SetSample}. To see the induction step $t - 1 \to t$, note that $\Pr[\Sample_t = a_t] = \frac{w[a_t]}{W_t}$ (again by lines~\ref{alg:DiscreteSample:Prob}--\ref{alg:DiscreteSample:SetSample}) and that for all $i \in \Nupto{t-1}$
The base case $t = 1$ holds immediately by lines~\ref{alg:WeightedSample:Prob}--\ref{alg:WeightedSample:SetSample}. To see the induction step $t - 1 \to t$, note that $\Pr[\Sample_t = a_t] = \frac{w[a_t]}{W_t}$ (again by lines~\ref{alg:WeightedSample:Prob}--\ref{alg:WeightedSample:SetSample}) and that for all $i \in \Nupto{t-1}$
\begin{align*}
\Pr[\Sample_t = a_i]
= \Pr[\Sample_t \neq a_t] \cdot \Pr[\Sample_{t-1} = a_i]
Expand All @@ -92,17 +92,17 @@ \subsubsection{Formal Description}
The algorithm can easily be transformed into a divide-and-conquer algorithm, as shown in the following.
\end{description}

\begin{algorithm}[RecursiveDiscreteSample$(A_1, A_2, w)$] \label{alg:RecursiveDiscreteSample}
\begin{algorithm}[RecursiveWeightedSample$(A_1, A_2, w)$] \label{alg:RecursiveWeightedSample}
\alginput{Disjoint finite sets $A_1, A_2$, Mapping $w$ of each element $a \in A_1 \cup A_2$ to its weight $w[a] \geq 0$}
\algoutput{Random element $\Sample \in A_1 \cup A_2$ sampled according to distribution induced by $w$}
\begin{algorithmic}[1]
\State $\tilde A \set \emptyset$
\For{$i = 1,2$}
\State $\Sample_i \set \texttt{DiscreteSample}(A_i, w)$
\State $\Sample_i \set \texttt{WeightedSample}(A_i, w)$
\State $\tilde A \set \tilde A \cup \{ \Sample_i \}$
\State $\tilde w[\Sample_i] \set \sum_{a \in A_i} w[a]$
\EndFor
\State $\Sample \set \texttt{DiscreteSample}(\tilde A, \tilde w)$
\State $\Sample \set \texttt{WeightedSample}(\tilde A, \tilde w)$
\end{algorithmic}
\end{algorithm}

Expand All @@ -113,12 +113,12 @@ \subsubsection{Formal Description}
\paragraph{Numerical Considerations}

\begin{itemize}
\item When Algorithm~\ref{alg:DiscreteSample} is used for large sets $A$, line~\ref{alg:DiscreteSample:UpdateWeight} will eventually add two numbers that are very different in size. Compensated summation should be used \cite{ORO05a}.
\item When Algorithm~\ref{alg:WeightedSample} is used for large sets $A$, line~\ref{alg:WeightedSample:UpdateWeight} will eventually add two numbers that are very different in size. Compensated summation should be used \cite{ORO05a}.
\end{itemize}

\subsubsection{Implementation as User-Defined Aggregate}

Algorithm~\ref{alg:DiscreteSample} is implemented as the user-defined aggregate \symlabel{discrete\_sample}{sym:discrete_sample}. Data-parallelism is implemented as in Algorithm~\ref{alg:RecursiveDiscreteSample}.
Algorithm~\ref{alg:WeightedSample} is implemented as the user-defined aggregate \symlabel{weighted\_sample}{sym:weighted_sample}. Data-parallelism is implemented as in Algorithm~\ref{alg:RecursiveWeightedSample}.

\paragraph{Input} The aggregate expects the following arguments:

Expand Down Expand Up @@ -154,19 +154,19 @@ \subsubsection{Implementation as User-Defined Aggregate}
\textbf{Field Name} & \textbf{Description} & \textbf{Type}
\\\otoprule
\texttt{weight\_sum} &
corresponds to $W$ in Algorithm~\ref{alg:DiscreteSample} &
corresponds to $W$ in Algorithm~\ref{alg:WeightedSample} &
floating-point
\\\midrule
\texttt{sample\_id} &
corresponds to $\Sample$ in Algorithm~\ref{alg:DiscreteSample}, takes value of column \texttt{id} &
corresponds to $\Sample$ in Algorithm~\ref{alg:WeightedSample}, takes value of column \texttt{id} &
integer
\\\bottomrule
\end{tabularx}
\end{center}
\item Transition Function (\texttt{state, id, weight}): Lines~\ref{alg:DiscreteSample:UpdateWeight}--\ref{alg:DiscreteSample:SetSample} of Algorithm~\ref{alg:DiscreteSample}
\item Transition Function (\texttt{state, id, weight}): Lines~\ref{alg:WeightedSample:UpdateWeight}--\ref{alg:WeightedSample:SetSample} of Algorithm~\ref{alg:WeightedSample}
\item Merge Function (\texttt{state1, state2}): It is enough to call the transition function with arguments (\texttt{state1, state2.sample\_id, state2.weight\_sum})
\end{itemize}

\paragraph{Tool Set}

While the user-defined aggregate is simple enough to be implemented in plain SQL, we choose a C++ implementation for performance. Moreover, a C++ implementation allows us to reuse the existing compensated-sum component. \todo{Not documented yet.}
While the user-defined aggregate is simple enough to be implemented in plain SQL, we choose a C++ implementation for performance. \todo{In the future, we want to use compensated summation. (Not documented yet.)}
3 changes: 3 additions & 0 deletions doc/mainpage.dox.in
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ and included third-party libraries can be found inside the
@defgroup grp_textfex_viterbi Feature Extraction
@ingroup grp_support

@defgroup grp_sample Random Sampling
@ingroup grp_support

@defgroup grp_compatibiity Compatibility
@ingroup grp_support

Expand Down
1 change: 1 addition & 0 deletions src/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ modules:
- name: prob
- name: quantile
- name: regress
- name: sample
- name: sketch
- name: stats
depends: ['utilities']
Expand Down
1 change: 1 addition & 0 deletions src/modules/declarations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
#include "linalg/linalg.hpp"
#include "prob/prob.hpp"
#include "regress/regress.hpp"
#include "sample/sample.hpp"
#include "stats/stats.hpp"
9 changes: 9 additions & 0 deletions src/modules/sample/sample.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/* -----------------------------------------------------------------------------
*
* @file sample.hpp
*
* @brief Umbrella header that includes all sampling headers
*
* -------------------------------------------------------------------------- */

#include "weighted_sample.hpp"
115 changes: 115 additions & 0 deletions src/modules/sample/weighted_sample.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/* ----------------------------------------------------------------------- *//**
*
* @file weighted_sample.cpp
*
* @brief Generate a single weighted random sample
*
*//* ----------------------------------------------------------------------- */

#include <dbconnector/dbconnector.hpp>
#include <modules/shared/HandleTraits.hpp>

#include <boost/tr1/random.hpp>

#include "weighted_sample.hpp"

// Import TR1 names (currently used from boost). This can go away once we make
// the switch to C++11.
// NOTE(review): adding declarations to namespace std is technically undefined
// behavior per the C++ standard ([namespace.std]); it works with the compilers
// targeted here, but consider qualifying uses as std::tr1::bernoulli_distribution
// instead.
namespace std {
    using tr1::bernoulli_distribution;
}

namespace madlib {

namespace modules {

namespace sample {

/**
 * @brief Transition state for weighted sample
 *
 * The state is backed by a DOUBLE PRECISION array of length 2:
 * - element 0: running sum of weights seen so far (W in the design document)
 * - element 1: identifier of the currently selected sample row
 *
 * Note: We assume that the DOUBLE PRECISION array is initialized by the
 * database with length 2, and all elements are 0.
 */
template <class Handle>
class WeightedSampleTransitionState {
public:
    // Bind the typed reference members to the two slots of the backing array
    WeightedSampleTransitionState(const AnyType &inArray)
        : mStorage(inArray.getAs<Handle>()),
        sample_id(&mStorage[1]),
        weight_sum(&mStorage[0]) { }

    // Implicit conversion back to AnyType, so a state object can be returned
    // directly from a UDF
    inline operator AnyType() const {
        return mStorage;
    }

private:
    Handle mStorage;

public:
    // identifier of the row currently selected as the sample (stored in
    // element 1 of the array)
    typename HandleTraits<Handle>::ReferenceToInt64 sample_id;
    // running sum of all weights processed so far (element 0 of the array)
    typename HandleTraits<Handle>::ReferenceToDouble weight_sum;
};

/**
 * @brief Perform the weighted-sample transition step
 *
 * Arguments: (state, id, weight). The new row's weight is added to the
 * running total, and the row becomes the current sample with probability
 * weight / weight_sum — yielding a sample distributed according to the
 * weights of all rows seen so far.
 *
 * @param args state (mutable double array), row identifier, row weight
 * @return the updated transition state
 */
AnyType
weighted_sample_transition::run(AnyType& args) {
    WeightedSampleTransitionState<MutableArrayHandle<double> > state = args[0];
    // Keep the identifier signed end-to-end: it is read as int64 and stored
    // in an Int64 state field, so an unsigned intermediate would only add an
    // implicit signed/unsigned round-trip.
    int64_t identifier = args[1].getAs<int64_t>();
    double weight = args[2].getAs<double>();

    // Instead of throwing an error, we simply ignore rows whose weight is not
    // strictly positive (this skips negative, zero, and NaN weights alike)
    if (weight > 0.) {
        state.weight_sum += weight;
        std::bernoulli_distribution success(weight / state.weight_sum);
        // Note that a NativeRandomNumberGenerator object is stateless, so it
        // is not a problem to instantiate an object for each RN generation...
        NativeRandomNumberGenerator generator;
        if (success(generator))
            state.sample_id = identifier;
    }

    return state;
}

/**
 * @brief Perform the merging of two transition states
 *
 * The right state's (sample_id, weight_sum) pair is treated like a single
 * pseudo-row fed into the left state, exactly as in the transition step.
 *
 * @param args left state (mutable), right state (read-only)
 * @return the merged (left) state
 */
AnyType
weighted_sample_merge::run(AnyType &args) {
    WeightedSampleTransitionState<MutableArrayHandle<double> > stateLeft
        = args[0];
    WeightedSampleTransitionState<ArrayHandle<double> > stateRight = args[1];

    // If the right state has not accumulated any weight, keep the left
    // sample as-is. Besides skipping a no-op, this avoids computing 0/0
    // (NaN) when both states are still empty, which would be an invalid
    // parameter for bernoulli_distribution.
    if (stateRight.weight_sum > 0.) {
        // FIXME: Once we have more modular states (independent of
        // transition/merge function), implement using the logic from the
        // transition function
        stateLeft.weight_sum += stateRight.weight_sum;
        std::bernoulli_distribution success(
            stateRight.weight_sum / stateLeft.weight_sum);
        // Note that a NativeRandomNumberGenerator object is stateless, so it
        // is not a problem to instantiate an object for each RN generation...
        NativeRandomNumberGenerator generator;
        if (success(generator))
            stateLeft.sample_id = stateRight.sample_id;
    }

    return stateLeft;
}

/**
 * @brief Perform the weighted-sample final step
 *
 * Simply extracts the identifier of the sampled row from the accumulated
 * transition state and returns it as a 64-bit integer.
 */
AnyType
weighted_sample_final::run(AnyType &args) {
    WeightedSampleTransitionState<ArrayHandle<double> > finalState = args[0];
    return static_cast<int64_t>(finalState.sample_id);
}

} // namespace sample

} // namespace modules

} // namespace madlib
20 changes: 20 additions & 0 deletions src/modules/sample/weighted_sample.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* ----------------------------------------------------------------------- *//**
*
* @file weighted_sample.hpp
*
*//* ----------------------------------------------------------------------- */

/**
 * @brief Weighted random sample: Transition function
 *
 * Folds one (id, weight) row into the running state.
 */
DECLARE_UDF(sample, weighted_sample_transition)

/**
 * @brief Weighted random sample: State merge function
 *
 * Combines two partial states produced on different segments.
 */
DECLARE_UDF(sample, weighted_sample_merge)

/**
 * @brief Weighted random sample: Final function
 *
 * Returns the sampled row identifier from the state.
 */
DECLARE_UDF(sample, weighted_sample_final)
1 change: 1 addition & 0 deletions src/ports/greenplum/4.0/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ modules:
- name: prob
- name: quantile
- name: regress
- name: sample
- name: sketch
# - name: stats
# depends: ['utilities']
Expand Down
2 changes: 2 additions & 0 deletions src/ports/greenplum/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ list(APPEND MAD_DBAL_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/FunctionHandle_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/main.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/NewDelete.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/NativeRandomNumberGenerator_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/NativeRandomNumberGenerator_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/OutputStreamBuffer_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/OutputStreamBuffer_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/../postgres/dbconnector/PGException_proto.hpp"
Expand Down
1 change: 1 addition & 0 deletions src/ports/postgres/8.4/config/Modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ modules:
- name: prob
- name: quantile
- name: regress
- name: sample
- name: sketch
# - name: stats
# depends: ['utilities']
Expand Down
2 changes: 2 additions & 0 deletions src/ports/postgres/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ list(APPEND MAD_DBAL_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/FunctionHandle_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/main.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/NewDelete.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/NativeRandomNumberGenerator_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/NativeRandomNumberGenerator_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/OutputStreamBuffer_impl.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/OutputStreamBuffer_proto.hpp"
"${CMAKE_CURRENT_SOURCE_DIR}/dbconnector/PGException_proto.hpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/* ----------------------------------------------------------------------- *//**
*
* @file NativeRandomNumberGenerator_impl.hpp
*
*//* ----------------------------------------------------------------------- */

#ifndef MADLIB_POSTGRES_NATIVERANDOMNUMBERGENERATOR_IMPL_HPP
#define MADLIB_POSTGRES_NATIVERANDOMNUMBERGENERATOR_IMPL_HPP

namespace madlib {

namespace dbconnector {

namespace postgres {

// Trivial default constructor: the generator holds no state of its own and
// merely forwards to the backend's random-number facility.
inline
NativeRandomNumberGenerator::NativeRandomNumberGenerator() { }

/**
 * @brief Sets the current state of the engine
 *
 * All this function does is call the backend's setseed() function.
 *
 * NOTE(review): PostgreSQL's setseed() expects its argument in [-1, 1] —
 * presumably callers pass values in that range; verify before relying on
 * out-of-range seeds.
 */
inline
void
NativeRandomNumberGenerator::seed(result_type inSeed) {
    DirectFunctionCall1(setseed, Float8GetDatum(inSeed));
}

/**
 * @brief Advances the engine's state and returns the generated value
 *
 * Delegates to the backend's drandom() function and converts the resulting
 * Datum to a double. The [0, 1) range implied by min()/max() below is the
 * backend's contract — confirm against the backend documentation.
 */
inline
NativeRandomNumberGenerator::result_type
NativeRandomNumberGenerator::operator()() {
    // There seems to be no DirectFunctionCall0(), so pass a dummy argument.
    return DatumGetFloat8(DirectFunctionCall1(drandom, Datum(0)));
}

/**
 * @brief Return tight lower bound on the set of all values returned by
 *     <tt>operator()</tt>
 *
 * Required by the boost/TR1 uniform random number generator concept; the
 * backend generator never produces values below 0.
 */
inline
NativeRandomNumberGenerator::result_type
NativeRandomNumberGenerator::min() {
    return 0.0;
}

/**
 * @brief Return smallest representable number larger than maximum of all values
 *     returned by <tt>operator()</tt>
 *
 * Boost specifies, if \c result_type is not integer, then return "the smallest
 * representable number larger than the tight upper bound on the set of all
 * values returned by operator(). In any case, the return value of this function
 * shall not change during the lifetime of the object."
 * http://www.boost.org/doc/libs/1_50_0/doc/html/boost_random/reference.html
 *
 * Since the backend generator produces values strictly below 1, returning 1.0
 * satisfies this requirement.
 */
inline
NativeRandomNumberGenerator::result_type
NativeRandomNumberGenerator::max() {
    return 1.0;
}

} // namespace postgres

} // namespace dbconnector

} // namespace madlib

#endif // defined(MADLIB_POSTGRES_NATIVERANDOMNUMBERGENERATOR_IMPL_HPP)
Loading

0 comments on commit 1afb10f

Please sign in to comment.