Commit 029a370: add nonparametric module
ddbourgin committed Jun 30, 2019 (1 parent: 693cac0)
Showing 7 changed files with 432 additions and 0 deletions.
18 changes: 18 additions & 0 deletions nonparametric/README.md
@@ -0,0 +1,18 @@
# Nonparametric Models
The nonparametric module implements several popular nonparametric regression
and classification models.

- `kernel_regression.py` implements Nadaraya-Watson kernel regression
([Nadaraya, 1964](https://epubs.siam.org/doi/abs/10.1137/1109020); [Watson,
1964](https://www.jstor.org/stable/pdf/25049340.pdf))
- `knn.py` implements k-nearest neighbors regression and classification
  models using a ball tree (a minimal usage sketch follows this list)
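
A minimal usage sketch on toy data, run from within the `nonparametric`
directory and mirroring the imports and kernel strings used in `plots.py`:

```python
import numpy as np

from kernel_regression import KernelRegression
from knn import KNN

X = np.linspace(-1, 1, 100).reshape(-1, 1)
y = X.flatten() ** 2 + np.random.normal(0, 0.1, size=100)

# kernel regression with an RBF kernel (kernel string as in `plots.py`)
kr = KernelRegression(kernel="RBFKernel(gamma=0.01)")
kr.fit(X, y)
y_kr = kr.predict(X)

# 5-nearest-neighbor regression with uniform weights
knn = KNN(k=5, classifier=False, weights="uniform")
knn.fit(X, y)
y_knn = knn.predict(X)
```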

## Plots
<p align="center">
<strong>k-Nearest Neighbors</strong>
<img src="img/knn_plots.png" align='center' height="550" />

<strong>Nadaraya-Watson Kernel Regression</strong>
<img src="img/kr_plots.png" align='center' height="550" />
</p>
Binary file added nonparametric/img/knn_plots.png
Binary file added nonparametric/img/kr_plots.png
63 changes: 63 additions & 0 deletions nonparametric/kernel_regression.py
@@ -0,0 +1,63 @@
import sys

sys.path.append("..")
from utils.kernels import KernelInitializer


class KernelRegression:
    def __init__(self, kernel=None):
        """
        A Nadaraya-Watson kernel regression model.

            f(x) = sum_i w_i(x) * y_i

        where the sample weighting functions, w_i, are simply

            w_i(x) = k(x, x_i) / sum_j k(x, x_j)

        with k being the kernel function.

        Observe that k-nearest neighbors (KNN) regression is a special case
        of kernel regression where the k closest observations have a weight
        of 1/k, and all others have a weight of 0.

        Parameters
        ----------
        kernel : str, `KernelBase` instance, or dict (default: None)
            The kernel to use. If `None`, default to `LinearKernel`.
        """
        self.parameters = {"X": None, "y": None}
        self.hyperparameters = {"kernel": str(kernel)}
        self.kernel = KernelInitializer(kernel)()

    def fit(self, X, y):
        """
        Fit the regression model to the data and targets in `X` and `y`.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            An array of N training examples, each of dimension M
        y : numpy array of shape (N, ...)
            Targets for the N training examples in `X`
        """
        self.parameters = {"X": X, "y": y}

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape (N', M')
            An array of N' examples to generate predictions on

        Returns
        -------
        y : numpy array of shape (N', ...)
            Predicted targets for the N' rows in `X`
        """
        K = self.kernel
        P = self.parameters
        # pairwise similarities between the training points and the query points
        sim = K(P["X"], X)
        return (sim * P["y"][:, None]).sum(axis=0) / sim.sum(axis=0)
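
To sanity-check the Nadaraya-Watson weighting used in `predict`, the same
estimate can be computed directly with numpy. This is a sketch only: the
`gaussian_kernel` helper below is hypothetical and stands in for the kernel
classes in `utils.kernels` (e.g., the "RBFKernel" used in `plots.py`).

import numpy as np

def gaussian_kernel(A, B, gamma=0.5):
    # hypothetical helper: pairwise similarities k(a, b) = exp(-gamma * ||a - b||^2)
    dists = ((A[:, None, :] - B[None, :, :]) ** 2).sum(axis=-1)  # shape (N, N')
    return np.exp(-gamma * dists)

# toy 1D training data (for illustration only)
X_train = np.array([[0.0], [1.0], [2.0], [3.0]])
y_train = np.array([0.0, 1.0, 4.0, 9.0])
X_query = np.array([[1.5]])

# f(x) = sum_i w_i(x) * y_i, with w_i(x) = k(x, x_i) / sum_j k(x, x_j)
sim = gaussian_kernel(X_train, X_query)                        # shape (4, 1)
pred = (sim * y_train[:, None]).sum(axis=0) / sim.sum(axis=0)  # shape (1,)
print(pred)  # weighted average of y_train, dominated by the points near x = 1.5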
96 changes: 96 additions & 0 deletions nonparametric/knn.py
@@ -0,0 +1,96 @@
import sys
from collections import Counter

import numpy as np

sys.path.append("..")
from utils.data_structures import BallTree


class KNN:
    def __init__(
        self, k=5, leaf_size=40, classifier=True, metric=None, weights="uniform"
    ):
        """
        A k-nearest neighbors (kNN) model relying on a ball tree for
        efficient computation.

        Parameters
        ----------
        k : int (default: 5)
            The number of neighbors to use during prediction
        leaf_size : int (default: 40)
            The maximum number of datapoints at each leaf in the ball tree
        classifier : bool (default: True)
            Whether to treat the values in `y` as class labels (classifier =
            True) or real-valued targets (classifier = False)
        metric : function (default: None)
            The distance metric to use for computing nearest neighbors
        weights : 'uniform' or 'distance' (default: 'uniform')
            How to weight the predictions from each neighbor. 'uniform'
            assigns uniform weights to each neighbor, while 'distance'
            assigns weights proportional to the inverse of the distance from
            the query point
        """
        self._ball_tree = BallTree(leaf_size=leaf_size, metric=metric)
        self.hyperparameters = {
            "id": "KNN",
            "k": k,
            "leaf_size": leaf_size,
            "classifier": classifier,
            "metric": str(metric),
            "weights": weights,
        }

    def fit(self, X, y):
        """
        Fit the model to the data and targets in `X` and `y`.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            An array of N training examples, each of dimension M
        y : numpy array of shape (N, ...)
            Targets for the N training examples in `X`
        """
        if X.ndim != 2:
            raise ValueError("X must be two-dimensional")
        self._ball_tree.fit(X, y)

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape (N', M')
            An array of N' examples to generate predictions on

        Returns
        -------
        y : numpy array of shape (N', ...)
            Predicted targets for the N' rows in `X`
        """
        predictions = []
        H = self.hyperparameters
        for x in X:
            pred = None
            nearest = self._ball_tree.nearest_neighbors(H["k"], x)
            targets = [n.val for n in nearest]

            if H["classifier"]:
                if H["weights"] == "uniform":
                    pred = Counter(targets).most_common(1)[0][0]
                elif H["weights"] == "distance":
                    # the label with the largest summed inverse-distance score wins
                    best_score = -np.inf
                    for label in set(targets):
                        score = np.sum(
                            [1 / n.distance for n in nearest if n.val == label]
                        )
                        if score > best_score:
                            best_score = score
                            pred = label
            else:
                if H["weights"] == "uniform":
                    pred = np.mean(targets)
                elif H["weights"] == "distance":
                    weights = [1 / n.distance for n in nearest]
                    pred = np.average(targets, weights=weights)
            predictions.append(pred)
        return np.array(predictions)
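
For intuition, the classification logic above can be reproduced without the
ball tree via brute-force distance computation. The sketch below makes that
assumption: `knn_classify` is a hypothetical helper, not part of the module,
and it mirrors the 'uniform' and 'distance' branches of `predict` (it assumes
no query point coincides exactly with a training point, so no zero distances).

import numpy as np
from collections import Counter

def knn_classify(X_train, y_train, x, k=5, weights="uniform"):
    # brute-force stand-in for the ball tree: all pairwise distances to `x`
    dists = np.linalg.norm(X_train - x, axis=1)
    idx = np.argsort(dists)[:k]
    if weights == "uniform":
        # majority vote among the k nearest labels
        return Counter(y_train[idx]).most_common(1)[0][0]
    # 'distance': each label scores the sum of its neighbors' inverse distances
    scores = {}
    for i in idx:
        scores[y_train[i]] = scores.get(y_train[i], 0.0) + 1.0 / dists[i]
    return max(scores, key=scores.get)

X_train = np.array([[0.0], [1.0], [2.0], [10.0], [11.0], [12.0]])
y_train = np.array([0, 0, 0, 1, 1, 1])
print(knn_classify(X_train, y_train, np.array([1.5]), k=3))   # -> 0
print(knn_classify(X_train, y_train, np.array([10.5]), k=3))  # -> 1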
177 changes: 177 additions & 0 deletions nonparametric/plots.py
@@ -0,0 +1,177 @@
import sys

sys.path.append("..")

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=0.5)

from linear_models.lm import LinearRegression
from kernel_regression import KernelRegression
from knn import KNN

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression


def random_regression_problem(n_ex, n_in, n_out, d=3, intercept=0, std=1, seed=0):
    coef = np.random.uniform(0, 50, size=d)
    coef[-1] = intercept

    y = []
    X = np.random.uniform(-100, 100, size=(n_ex, n_in))
    for x in X:
        val = np.polyval(coef, x) + np.random.normal(0, std)
        y.append(val)
    y = np.array(y)

    # X, y, coef = make_regression(
    #     n_samples=n_ex,
    #     n_features=n_in,
    #     n_targets=n_out,
    #     bias=intercept,
    #     noise=std,
    #     coef=True,
    #     random_state=seed,
    # )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    return X_train, y_train, X_test, y_test, coef


def plot_regression():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        # grid-search the polynomial kernel hyperparameters for the lowest test MSE
        d = 3
        best_loss = np.inf
        for gamma in np.linspace(1e-10, 1, 100):
            for c0 in np.linspace(-1, 1000, 100):
                kernel = "PolynomialKernel(d={}, gamma={}, c0={})".format(d, gamma, c0)
                KR_poly = KernelRegression(kernel=kernel)
                KR_poly.fit(X_train, y_train)
                y_pred_poly = KR_poly.predict(X_test)
                loss_poly = np.mean((y_test.flatten() - y_pred_poly.flatten()) ** 2)
                if loss_poly <= best_loss:
                    KR_poly_best = kernel
                    best_loss = loss_poly

        print("Best kernel: {} || loss: {:.4f}".format(KR_poly_best, best_loss))
        KR_poly = KernelRegression(kernel=KR_poly_best)
        KR_poly.fit(X_train, y_train)

        KR_rbf = KernelRegression(kernel="RBFKernel(gamma=0.01)")
        KR_rbf.fit(X_train, y_train)
        y_pred_rbf = KR_rbf.predict(X_test)
        loss_rbf = np.mean((y_test.flatten() - y_pred_rbf.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_poly = KR_poly.predict(X_plot)
        y_plot_rbf = KR_rbf.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(
            X_plot, y_plot_poly, label="KR (poly kernel, d={})".format(d), alpha=0.5
        )
        ax.plot(X_plot, y_plot_rbf, label="KR (rbf kernel)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KR (poly): {:.2f}\nKR (rbf): {:.2f}".format(
        #         loss, loss_poly, loss_rbf
        #     )
        # )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/kr_plots.png", dpi=300)
    plt.close("all")


def plot_knn():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        knn_1 = KNN(k=1, classifier=False, leaf_size=10, weights="uniform")
        knn_1.fit(X_train, y_train)
        y_pred_1 = knn_1.predict(X_test)
        loss_1 = np.mean((y_test.flatten() - y_pred_1.flatten()) ** 2)

        knn_5 = KNN(k=5, classifier=False, leaf_size=10, weights="uniform")
        knn_5.fit(X_train, y_train)
        y_pred_5 = knn_5.predict(X_test)
        loss_5 = np.mean((y_test.flatten() - y_pred_5.flatten()) ** 2)

        knn_10 = KNN(k=10, classifier=False, leaf_size=10, weights="uniform")
        knn_10.fit(X_train, y_train)
        y_pred_10 = knn_10.predict(X_test)
        loss_10 = np.mean((y_test.flatten() - y_pred_10.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_1 = knn_1.predict(X_plot)
        y_plot_5 = knn_5.predict(X_plot)
        y_plot_10 = knn_10.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(X_plot, y_plot_1, label="KNN (k=1)", alpha=0.5)
        ax.plot(X_plot, y_plot_5, label="KNN (k=5)", alpha=0.5)
        ax.plot(X_plot, y_plot_10, label="KNN (k=10)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KNN (k=1): {:.2f}\nKNN (k=5): {:.2f} KNN (k=10): {:.2f}".format(
        #         loss, loss_1, loss_5, loss_10
        #     )
        # )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")