Commit 029a370: add nonparametric module
ddbourgin committed Jun 30, 2019 (1 parent: 693cac0)
Showing 7 changed files with 432 additions and 0 deletions.
18 changes: 18 additions & 0 deletions nonparametric/README.md
@@ -0,0 +1,18 @@
# Nonparametric Models
The nonparametric module implements several popular nonparametric regression
and classification models.

- `kernel_regression.py` implements Nadaraya-Watson kernel regression
([Nadaraya, 1964](https://epubs.siam.org/doi/abs/10.1137/1109020); [Watson,
1964](https://www.jstor.org/stable/pdf/25049340.pdf))
- `knn.py` implements k-nearest neighbors regression and classification
  models using a ball tree (a minimal usage sketch follows this list)
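
A minimal usage sketch on toy data, run from within the `nonparametric`
directory and mirroring the imports and kernel strings used in `plots.py`:

```python
import numpy as np

from kernel_regression import KernelRegression
from knn import KNN

X = np.linspace(-1, 1, 100).reshape(-1, 1)
y = X.flatten() ** 2 + np.random.normal(0, 0.1, size=100)

# kernel regression with an RBF kernel (kernel string as in `plots.py`)
kr = KernelRegression(kernel="RBFKernel(gamma=0.01)")
kr.fit(X, y)
y_kr = kr.predict(X)

# 5-nearest-neighbor regression with uniform weights
knn = KNN(k=5, classifier=False, weights="uniform")
knn.fit(X, y)
y_knn = knn.predict(X)
```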

## Plots
<p align="center">
<strong>k-Nearest Neighbors</strong>
<img src="img/knn_plots.png" align='center' height="550" />

<strong>Nadaraya-Watson Kernel Regression</strong>
<img src="img/kr_plots.png" align='center' height="550" />
</p>
Binary file added nonparametric/img/knn_plots.png
Binary file added nonparametric/img/kr_plots.png
63 changes: 63 additions & 0 deletions nonparametric/kernel_regression.py
@@ -0,0 +1,63 @@
import sys

sys.path.append("..")
from utils.kernels import KernelInitializer


class KernelRegression:
    def __init__(self, kernel=None):
        """
        A Nadaraya-Watson kernel regression model.

            f(x) = sum_i w_i(x) * y_i

        where the sample weighting functions, w_i, are simply

            w_i(x) = k(x, x_i) / sum_j k(x, x_j)

        with k being the kernel function.

        Observe that k-nearest neighbors (KNN) regression is a special case
        of kernel regression where the k closest observations have a weight
        of 1/k, and all others have a weight of 0.

        Parameters
        ----------
        kernel : str, `KernelBase` instance, or dict (default: None)
            The kernel to use. If `None`, default to `LinearKernel`.
        """
        self.parameters = {"X": None, "y": None}
        self.hyperparameters = {"kernel": str(kernel)}
        self.kernel = KernelInitializer(kernel)()

    def fit(self, X, y):
        """
        Fit the regression model to the data and targets in `X` and `y`.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            An array of N training examples, each of dimension M
        y : numpy array of shape (N, ...)
            Targets for the N training examples in `X`
        """
        self.parameters = {"X": X, "y": y}

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape (N', M')
            An array of N' examples to generate predictions on

        Returns
        -------
        y : numpy array of shape (N', ...)
            Predicted targets for the N' rows in `X`
        """
        K = self.kernel
        P = self.parameters
        # pairwise similarities between the training points and the query points
        sim = K(P["X"], X)
        return (sim * P["y"][:, None]).sum(axis=0) / sim.sum(axis=0)
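
To sanity-check the Nadaraya-Watson weighting used in `predict`, the same
estimate can be computed directly with numpy. This is a sketch only: the
`gaussian_kernel` helper below is hypothetical and stands in for the kernel
classes in `utils.kernels` (e.g., the "RBFKernel" used in `plots.py`).

import numpy as np

def gaussian_kernel(A, B, gamma=0.5):
    # hypothetical helper: pairwise similarities k(a, b) = exp(-gamma * ||a - b||^2)
    dists = ((A[:, None, :] - B[None, :, :]) ** 2).sum(axis=-1)  # shape (N, N')
    return np.exp(-gamma * dists)

# toy 1D training data (for illustration only)
X_train = np.array([[0.0], [1.0], [2.0], [3.0]])
y_train = np.array([0.0, 1.0, 4.0, 9.0])
X_query = np.array([[1.5]])

# f(x) = sum_i w_i(x) * y_i, with w_i(x) = k(x, x_i) / sum_j k(x, x_j)
sim = gaussian_kernel(X_train, X_query)                        # shape (4, 1)
pred = (sim * y_train[:, None]).sum(axis=0) / sim.sum(axis=0)  # shape (1,)
print(pred)  # weighted average of y_train, dominated by the points near x = 1.5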
96 changes: 96 additions & 0 deletions nonparametric/knn.py
@@ -0,0 +1,96 @@
import sys
from collections import Counter

import numpy as np

sys.path.append("..")
from utils.data_structures import BallTree


class KNN:
    def __init__(
        self, k=5, leaf_size=40, classifier=True, metric=None, weights="uniform"
    ):
        """
        A k-nearest neighbors (kNN) model relying on a ball tree for
        efficient computation.

        Parameters
        ----------
        k : int (default: 5)
            The number of neighbors to use during prediction
        leaf_size : int (default: 40)
            The maximum number of datapoints at each leaf in the ball tree
        classifier : bool (default: True)
            Whether to treat the values in `y` as class labels (classifier =
            True) or real-valued targets (classifier = False)
        metric : function (default: None)
            The distance metric to use for computing nearest neighbors
        weights : 'uniform' or 'distance' (default: 'uniform')
            How to weight the predictions from each neighbor. 'uniform'
            assigns uniform weights to each neighbor, while 'distance'
            assigns weights proportional to the inverse of the distance from
            the query point
        """
        self._ball_tree = BallTree(leaf_size=leaf_size, metric=metric)
        self.hyperparameters = {
            "id": "KNN",
            "k": k,
            "leaf_size": leaf_size,
            "classifier": classifier,
            "metric": str(metric),
            "weights": weights,
        }

    def fit(self, X, y):
        """
        Fit the model to the data and targets in `X` and `y`.

        Parameters
        ----------
        X : numpy array of shape (N, M)
            An array of N training examples, each of dimension M
        y : numpy array of shape (N, ...)
            Targets for the N training examples in `X`
        """
        if X.ndim != 2:
            raise ValueError("X must be two-dimensional")
        self._ball_tree.fit(X, y)

    def predict(self, X):
        """
        Generate predictions for the targets associated with the rows in `X`.

        Parameters
        ----------
        X : numpy array of shape (N', M')
            An array of N' examples to generate predictions on

        Returns
        -------
        y : numpy array of shape (N', ...)
            Predicted targets for the N' rows in `X`
        """
        predictions = []
        H = self.hyperparameters
        for x in X:
            pred = None
            nearest = self._ball_tree.nearest_neighbors(H["k"], x)
            targets = [n.val for n in nearest]

            if H["classifier"]:
                if H["weights"] == "uniform":
                    pred = Counter(targets).most_common(1)[0][0]
                elif H["weights"] == "distance":
                    # the label with the largest summed inverse-distance score wins
                    best_score = -np.inf
                    for label in set(targets):
                        score = np.sum(
                            [1 / n.distance for n in nearest if n.val == label]
                        )
                        if score > best_score:
                            best_score = score
                            pred = label
            else:
                if H["weights"] == "uniform":
                    pred = np.mean(targets)
                elif H["weights"] == "distance":
                    weights = [1 / n.distance for n in nearest]
                    pred = np.average(targets, weights=weights)
            predictions.append(pred)
        return np.array(predictions)
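
For intuition, the classification logic above can be reproduced without the
ball tree via brute-force distance computation. The sketch below makes that
assumption: `knn_classify` is a hypothetical helper, not part of the module,
and it mirrors the 'uniform' and 'distance' branches of `predict` (it assumes
no query point coincides exactly with a training point, so no zero distances).

import numpy as np
from collections import Counter

def knn_classify(X_train, y_train, x, k=5, weights="uniform"):
    # brute-force stand-in for the ball tree: all pairwise distances to `x`
    dists = np.linalg.norm(X_train - x, axis=1)
    idx = np.argsort(dists)[:k]
    if weights == "uniform":
        # majority vote among the k nearest labels
        return Counter(y_train[idx]).most_common(1)[0][0]
    # 'distance': each label scores the sum of its neighbors' inverse distances
    scores = {}
    for i in idx:
        scores[y_train[i]] = scores.get(y_train[i], 0.0) + 1.0 / dists[i]
    return max(scores, key=scores.get)

X_train = np.array([[0.0], [1.0], [2.0], [10.0], [11.0], [12.0]])
y_train = np.array([0, 0, 0, 1, 1, 1])
print(knn_classify(X_train, y_train, np.array([1.5]), k=3))   # -> 0
print(knn_classify(X_train, y_train, np.array([10.5]), k=3))  # -> 1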
177 changes: 177 additions & 0 deletions nonparametric/plots.py
@@ -0,0 +1,177 @@
import sys

sys.path.append("..")

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# https://seaborn.pydata.org/generated/seaborn.set_context.html
# https://seaborn.pydata.org/generated/seaborn.set_style.html
sns.set_style("white")
sns.set_context("paper", font_scale=0.5)

from linear_models.lm import LinearRegression
from kernel_regression import KernelRegression
from knn import KNN

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression


def random_regression_problem(n_ex, n_in, n_out, d=3, intercept=0, std=1, seed=0):
    coef = np.random.uniform(0, 50, size=d)
    coef[-1] = intercept

    y = []
    X = np.random.uniform(-100, 100, size=(n_ex, n_in))
    for x in X:
        val = np.polyval(coef, x) + np.random.normal(0, std)
        y.append(val)
    y = np.array(y)

    # X, y, coef = make_regression(
    #     n_samples=n_ex,
    #     n_features=n_in,
    #     n_targets=n_out,
    #     bias=intercept,
    #     noise=std,
    #     coef=True,
    #     random_state=seed,
    # )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    return X_train, y_train, X_test, y_test, coef


def plot_regression():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        # grid-search the polynomial kernel hyperparameters for the lowest test MSE
        d = 3
        best_loss = np.inf
        for gamma in np.linspace(1e-10, 1, 100):
            for c0 in np.linspace(-1, 1000, 100):
                kernel = "PolynomialKernel(d={}, gamma={}, c0={})".format(d, gamma, c0)
                KR_poly = KernelRegression(kernel=kernel)
                KR_poly.fit(X_train, y_train)
                y_pred_poly = KR_poly.predict(X_test)
                loss_poly = np.mean((y_test.flatten() - y_pred_poly.flatten()) ** 2)
                if loss_poly <= best_loss:
                    KR_poly_best = kernel
                    best_loss = loss_poly

        print("Best kernel: {} || loss: {:.4f}".format(KR_poly_best, best_loss))
        KR_poly = KernelRegression(kernel=KR_poly_best)
        KR_poly.fit(X_train, y_train)

        KR_rbf = KernelRegression(kernel="RBFKernel(gamma=0.01)")
        KR_rbf.fit(X_train, y_train)
        y_pred_rbf = KR_rbf.predict(X_test)
        loss_rbf = np.mean((y_test.flatten() - y_pred_rbf.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_poly = KR_poly.predict(X_plot)
        y_plot_rbf = KR_rbf.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(
            X_plot, y_plot_poly, label="KR (poly kernel, d={})".format(d), alpha=0.5
        )
        ax.plot(X_plot, y_plot_rbf, label="KR (rbf kernel)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KR (poly): {:.2f}\nKR (rbf): {:.2f}".format(
        #         loss, loss_poly, loss_rbf
        #     )
        # )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/kr_plots.png", dpi=300)
    plt.close("all")


def plot_knn():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i
        )

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        knn_1 = KNN(k=1, classifier=False, leaf_size=10, weights="uniform")
        knn_1.fit(X_train, y_train)
        y_pred_1 = knn_1.predict(X_test)
        loss_1 = np.mean((y_test.flatten() - y_pred_1.flatten()) ** 2)

        knn_5 = KNN(k=5, classifier=False, leaf_size=10, weights="uniform")
        knn_5.fit(X_train, y_train)
        y_pred_5 = knn_5.predict(X_test)
        loss_5 = np.mean((y_test.flatten() - y_pred_5.flatten()) ** 2)

        knn_10 = KNN(k=10, classifier=False, leaf_size=10, weights="uniform")
        knn_10.fit(X_train, y_train)
        y_pred_10 = knn_10.predict(X_test)
        loss_10 = np.mean((y_test.flatten() - y_pred_10.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_1 = knn_1.predict(X_plot)
        y_plot_5 = knn_5.predict(X_plot)
        y_plot_10 = knn_10.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(X_plot, y_plot_1, label="KNN (k=1)", alpha=0.5)
        ax.plot(X_plot, y_plot_5, label="KNN (k=5)", alpha=0.5)
        ax.plot(X_plot, y_plot_10, label="KNN (k=10)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KNN (k=1): {:.2f}\nKNN (k=5): {:.2f} KNN (k=10): {:.2f}".format(
        #         loss, loss_1, loss_5, loss_10
        #     )
        # )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")