diff --git a/.Rhistory b/.Rhistory index 259a596..21e5816 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,160 +1,3 @@ -<<<<<<< HEAD -results_2012_sorted <- results_2012[order(results_2012$team),] -## Get the dataset with simulation output for 2012 -simulation <- read.csv("scripts/sim_2012.csv") -sim_sort <- simulation[order(simulation$X),] -======= -<<<<<<< HEAD -inside <- (j*pi*x) -prod(sqrt(2),cos(inside)) -} -X*sqrt(2) -cosine_basis(1, X) -cosine_basis <- function(j,x){ -inside <- (j*pi*x) -cos(inside) -} -cosine_basis(1, X) -1*pi -1*pi*X -cos(1*pi*X) -cos(1*pi*X, digits=3)) -cos(1*pi*X, digits=3) -cos(1*pi*X) -cos(X) -cos(1*X) -cos(pi*X) -cos(2*pi*X) -cos(3*pi*X) -cos(4*pi*X) -cos(5*pi*X) -pi*X -glmFit <- glm(Y ~ x0, family=gaussian(link="identity")) -k <- 3 -kfCV <- cv.glm(data=dfRegr, glmfit=glmFit, K=k) -?cv.glm -library(boot) -?cv.glm -cosine_basis <- function(j,x){ -inside <- (j*pi*x) -cos(inside) -} -pi*X -cos(X) -cos(pi*X) -plot(X, cos(pi*X)) -cosine_basis(1, Xnew) -## Function to compute the cosine basis -cosine_basis <- function(j,x){ -inside <- (j*pi*x) -cos(inside)*sqrt(2) -} -cosine_basis(1, Xnew) -x1 <- cosine_basis(1, Xnew) -x1 -plot(Xnew, x1) -# Get the rest of the covariates -x1 <- cosine_basis(1, Xnew) -x2 <- cosine_basis(2, Xnew) -x3 <- cosine_basis(3, Xnew) -x4 <- cosine_basis(4, Xnew) -x5 <- cosine_basis(5, Xnew) -x6 <- cosine_basis(6, Xnew) -x6 -x5 -plot(Xnew, x5) -plot(Xnew, x6) -plot(Xnew, x2) -plot(Xnew, x3) -plot(Xnew, x0) -for(i in 1:6){print(i)} -plot(Xnew, Y) -mod0 <- glm(Y ~ x0, family=gaussian(link="identity")) -mod0 <- glm(Y ~ x0, family=gaussian(link="identity")) -data(ufcwc) -Y = ufcwc$Height -X = ufcwc$Dbh -n = length(Xnew) -Xnew <- (X-min(X))/(max(X)-min(X)) -x0 <- rep(1, n) -## Function to compute the cosine basis -cosine_basis <- function(j,x){ -inside <- (j*pi*x) -cos(inside)*sqrt(2) -} -# Get the rest of the covariates -x1 <- cosine_basis(1, Xnew) -x2 <- cosine_basis(2, Xnew) -x3 <- cosine_basis(3, Xnew) -x4 <- cosine_basis(4, Xnew) -x5 <- cosine_basis(5, Xnew) -x6 <- cosine_basis(6, Xnew) -## Cost Function -cost = function(y, predy){ -err = mean( (y-predy)^2 ) -return(err) -} -## Get the fits -mod0 <- glm(Y ~ x0, family=gaussian(link="identity")) -mod1 <- glm(Y ~ x0+x1, family=gaussian(link="identity")) -mod2 <- glm(Y ~ x0+x1+x2, family=gaussian(link="identity")) -mod3 <- glm(Y ~ x0+x1+x3, family=gaussian(link="identity")) -mod4 <- glm(Y ~ x0+x1+x3+x4, family=gaussian(link="identity")) -mod5 <- glm(Y ~ x0+x1+x3+x4+x5, family=gaussian(link="identity")) -mod6 <- glm(Y ~ x0+x1+x3+x4+x5+x6, family=gaussian(link="identity")) -points(Xnew, fitted(mod6)) -points(Xnew, fitted(mod6), type="l") -?cv.glm -kfCV <- cv.glm(data=ufcwc, cost=cost, glmfit=glmFit, K=k) -cbind(ufcwc, x0, x1) -ufcwc <- cbind(ufcwc, x0, x1, x2, x3, x4, x5, x6) -data(ufcwc) -Y = ufcwc$Height -X = ufcwc$Dbh -n = length(Xnew) -Xnew <- (X-min(X))/(max(X)-min(X)) -x0 <- rep(1, n) -## Function to compute the cosine basis -cosine_basis <- function(j,x){ -inside <- (j*pi*x) -cos(inside)*sqrt(2) -} -# Get the rest of the covariates -x1 <- cosine_basis(1, Xnew) -x2 <- cosine_basis(2, Xnew) -x3 <- cosine_basis(3, Xnew) -x4 <- cosine_basis(4, Xnew) -x5 <- cosine_basis(5, Xnew) -x6 <- cosine_basis(6, Xnew) -ufcwc <- cbind(ufcwc, x0, x1, x2, x3, x4, x5, x6) -## Cost Function -cost = function(y, predy){ -err = mean( (y-predy)^2 ) -return(err) -} -## Get the fits -mod0 <- glm(Height ~ x0, family=gaussian(link="identity"), data=ufcwc) -mod1 <- glm(Height ~ x0+x1, family=gaussian(link="identity"), 
data=ufcwc) -mod2 <- glm(Height ~ x0+x1+x2, family=gaussian(link="identity"), data=ufcwc) -mod3 <- glm(Height ~ x0+x1+x3, family=gaussian(link="identity"), data=ufcwc) -mod4 <- glm(Height ~ x0+x1+x3+x4, family=gaussian(link="identity"), data=ufcwc) -mod5 <- glm(Height ~ x0+x1+x3+x4+x5, family=gaussian(link="identity"), data=ufcwc) -mod6 <- glm(Height ~ x0+x1+x3+x4+x5+x6, family=gaussian(link="identity"), data=ufcwc) -kfCV <- cv.glm(data=ufcwc, cost=cost, glmfit=mod0, K=4) -names(ufcwc) -data(ufcwc) -Y = ufcwc$Height -X = ufcwc$Dbh -n = length(Xnew) -Xnew <- (X-min(X))/(max(X)-min(X)) -x0 <- rep(1, n) -## Function to compute the cosine basis -cosine_basis <- function(j,x){ -inside <- (j*pi*x) -cos(inside)*sqrt(2) -} -# Get the rest of the covariates -x1 <- cosine_basis(1, Xnew) x2 <- cosine_basis(2, Xnew) x3 <- cosine_basis(3, Xnew) x4 <- cosine_basis(4, Xnew) @@ -247,250 +90,6 @@ x0 <- rep(1, n) cosine_basis <- function(j,x){ inside <- (j*pi*x) cos(inside)*sqrt(2) -======= -<<<<<<< HEAD -y_ord2 <- Y[ord] -points(x_ord2, exp(fitted(log_fit))[ord], type="l") -log_fit <- lm(Height ~ log(Dbh), data=ufcwc) -plot(X, Y) -ord <- order(X) -x_ord2 <- X[ord] -log_fitted <- exp(fitted(log_fit)) -y_ord2 <- Y[ord] -points(x_ord2, exp(fitted(log_fit))[ord], type="l") -log_fit <- lm(log(Height) ~ log(Dbh), data=ufcwc) -plot(X, Y) -ord <- order(X) -log_fitted <- exp(fitted(log_fit)) -x_ord2 <- X[ord] -y_ord2 <- Y[ord] -points(x_ord2, exp(fitted(log_fit))[ord], type="l") -weeks_t <- c(.95, .95, .95, .95, .03, .03, .03, .03, .02, .02, .02, .02) -e_w <- c(.001, .001, .999, .999, .02, .02, .98, .98, .04, .04, .96, .96) -r_e <- c(.9, .1, .2, .8, .9, .1, .2, .8, .9, .1, .2, .8) -probs2 <- cbind(weeks_t, e_w, r_e) -row.names(probs2) <- NULL -R <- apply(probs2, 1, prod) -condensed <- c(sum(R[1], R[3]), sum(R[2], R[4]), sum(R[5], R[7]), -sum(R[6], R[8]),sum(R[9], R[11]), sum(R[10], R[12])) -dist_true <- condensed[c(1, 3, 5)]/sum(condensed[c(1, 3, 5)]) -dist_false <- condensed[c(2, 4, 6)]/sum(condensed[c(2, 4, 6)]) -exp_false <- 1*dist_false[2] + 2*dist_false[3] -dist_true -probs2 -R -######## Part 1 ########### -library(xtable) -wt = c(.95, .95, .95, .95) -e = c(.001,.001,.999,.999) -sp = c(.1, .9, .1, .9) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs <- cbind(wt, e, sp, ap, r) -xtable(probs, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -xtable(probs) -wt = c(.02, .02, .02, .02) -e = c(.04,.04,.96,.96) -sp = c(.2, .8, .2, .8) -wt = c(.02, .02, .02, .02) -e = c(.04,.04,.96,.96) -sp = c(.2, .8, .2, .8) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs <- cbind(wt, e, sp, ap, r) -xtable(probs, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -probs_week2 <- cbind(wt, e, sp, ap, r) -wt = c(.02, .02, .02, .02) -e = c(.04,.04,.96,.96) -sp = c(.2, .8, .2, .8) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs_week2 <- cbind(wt, e, sp, ap, r) -xtable(probs_week2, digits=3) -wt = c(.02, .02, .02, .02) -e = c(.04,.04,.96,.96) -sp = c(.2, .8, .2, .8) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs_week2 <- cbind(wt, e, sp, ap, r) -xtable(probs_week2, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) 
-r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -######## Part 1 ########### -library(xtable) -wt = c(.95, .95, .95, .95) -e = c(.001,.001,.999,.999) -sp = c(.1, .9, .1, .9) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs <- cbind(wt, e, sp, ap, r) -xtable(probs, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -### WEEKS = 2 ### -wt = c(.02, .02, .02, .02) -e = c(.04,.04,.96,.96) -sp = c(.2, .8, .2, .8) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs_week2 <- cbind(wt, e, sp, ap, r) -xtable(probs_week2, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -xtable(dist_trye) -xtable(dist_true) -xtable(cbind(c("0", "1", "2")), dist_true) -names = c("Weeks = 2", "Weeks = 1", "Weeks = 2") -cbind(names, dist_true) -rbind(names, dist_true) -dist_true <- condensed[c(1, 3, 5)]/sum(condensed[c(1, 3, 5)]) -names = c(0, 1, 2) -cbind(names, dist_true) -rbind(names, dist_true) -dist_true <- condensed[c(1, 3, 5)]/sum(condensed[c(1, 3, 5)]) -weeks = c(0, 1, 2) -rbind(weeks, dist_true) -xtable(res, digits=2) -res <- rbind(weeks, dist_true) -xtable(res, digits=2) -xtable(res, digits=3) -weeks_t <- c(.95, .95, .95, .95, .03, .03, .03, .03, .02, .02, .02, .02) -e_w <- c(.001, .001, .999, .999, .02, .02, .98, .98, .04, .04, .96, .96) -r_e <- c(.9, .1, .2, .8, .9, .1, .2, .8, .9, .1, .2, .8) -probs2 <- cbind(weeks_t, e_w, r_e) -row.names(probs2) <- NULL -R <- apply(probs2, 1, prod) -condensed <- c(sum(R[1], R[3]), sum(R[2], R[4]), sum(R[5], R[7]), -sum(R[6], R[8]),sum(R[9], R[11]), sum(R[10], R[12])) -dist_true <- condensed[c(1, 3, 5)]/sum(condensed[c(1, 3, 5)]) -weeks = c(0, 1, 2) -res <- rbind(weeks, dist_true) -xtable(res, digits=2) -dist_false <- condensed[c(2, 4, 6)]/sum(condensed[c(2, 4, 6)]) -dist_false -exp_false <- 1*dist_false[2] + 2*dist_false[3] -exp_false -######## Part 3 ########### -weeks_t <- c(.95, .95, .95, .95, .03, .03, .03, .03, .02, .02, .02, .02) -e_w <- c(.001, .001, .999, .999, .02, .02, .98, .98, .04, .04, .96, .96) -r_e <- c(.9, .1, .2, .8, .9, .1, .2, .8, .9, .1, .2, .8) -probs2 <- cbind(weeks_t, e_w, r_e) -row.names(probs2) <- NULL -R <- apply(probs2, 1, prod) -condensed <- c(sum(R[1], R[3]), sum(R[2], R[4]), sum(R[5], R[7]), -sum(R[6], R[8]),sum(R[9], R[11]), sum(R[10], R[12])) -dist_true <- condensed[c(1, 3, 5)]/sum(condensed[c(1, 3, 5)]) -weeks = c(0, 1, 2) -res <- rbind(weeks, dist_true) -xtable(res, digits=2) -dist_false <- condensed[c(2, 4, 6)]/sum(condensed[c(2, 4, 6)]) -exp_false <- 1*dist_false[2] + 2*dist_false[3] -dist_true -######## Part 1 ########### -library(xtable) -wt = c(.95, .95, .95, .95) -e = c(.001,.001,.999,.999) -sp = c(.1, .9, .1, .9) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs <- cbind(wt, e, sp, ap, r) -xtable(probs, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -######## Part 1 ########### -library(xtable) 
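## A condensed, illustrative sketch of the enumeration pattern used in the
## "Part 1"/"Part 3" blocks above: each row of the table is one joint
## configuration, its probability is the product across the row, and the
## posterior comes from normalizing. Values and names (wt, e, sp, ap, r) are
## copied from the history; note the history reuses r[1] in the r2 line, which
## is harmless here only because r[1] equals r[2].
wt <- c(.95, .95, .95, .95)
e  <- c(.001, .001, .999, .999)
sp <- c(.1, .9, .1, .9)
ap <- c(.99, .95, .95, .01)
r  <- c(.9, .9, .2, .2)
joint <- apply(cbind(wt, e, sp, ap, r), 1, prod)  # joint probability of each row
posterior <- joint / sum(joint)                   # normalize over all rows
sum(posterior[1:2])   # same quantity as sum(r1, r2)/sum(r1, r2, r3, r4)
sum(posterior[3:4])   # same quantity as sum(r3, r4)/sum(r1, r2, r3, r4)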
-wt = c(.95, .95, .95, .95) -e = c(.001,.001,.999,.999) -sp = c(.1, .9, .1, .9) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs <- cbind(wt, e, sp, ap, r) -xtable(probs, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -######## Part 1 ########### -library(xtable) -wt = c(.95, .95, .95, .95) -e = c(.001,.001,.999,.999) -sp = c(.1, .9, .1, .9) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs <- cbind(wt, e, sp, ap, r) -xtable(probs, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -probs -wt = c(.02, .02, .02, .02) -e = c(.04,.04,.96,.96) -sp = c(.2, .8, .2, .8) -ap = c(.99, .95, .95, .01) -r = c(.9, .9, .2, .2) -probs_week2 <- cbind(wt, e, sp, ap, r) -xtable(probs_week2, digits=3) -r1=prod(wt[1], e[1], sp[1], ap[1], r[1]) -r2=prod(wt[2], e[2], sp[2], ap[2], r[1]) -r3=prod(wt[3], e[3], sp[3], ap[3], r[3]) -r4=prod(wt[4], e[4], sp[4], ap[4], r[4]) -sum(r1, r2)/sum(r1, r2, r3, r4) -sum(r3, r4)/sum(r1, r2, r3, r4) -######## Part 3 ########### -weeks_t <- c(.95, .95, .95, .95, .03, .03, .03, .03, .02, .02, .02, .02) -e_w <- c(.001, .001, .999, .999, .02, .02, .98, .98, .04, .04, .96, .96) -r_e <- c(.9, .1, .2, .8, .9, .1, .2, .8, .9, .1, .2, .8) -probs2 <- cbind(weeks_t, e_w, r_e) -row.names(probs2) <- NULL -R <- apply(probs2, 1, prod) -condensed <- c(sum(R[1], R[3]), sum(R[2], R[4]), sum(R[5], R[7]), -sum(R[6], R[8]),sum(R[9], R[11]), sum(R[10], R[12])) -dist_true <- condensed[c(1, 3, 5)]/sum(condensed[c(1, 3, 5)]) -weeks = c(0, 1, 2) -res <- rbind(weeks, dist_true) -xtable(res, digits=2) -dist_false <- condensed[c(2, 4, 6)]/sum(condensed[c(2, 4, 6)]) -exp_false <- 1*dist_false[2] + 2*dist_false[3] -dist_true -exp_false -======= -abline(v=results_2013[results_2013$team == name, "wins"], col="red") -abline(v=results_2013[results_2013$team == name, "lower"], col="blue") -abline(v=results_2013[results_2013$team == name, "upper"], col="blue") ->>>>>>> 7f763f76725fb85f2dbe3f0e30b27aae3d6208b0 } # Get the rest of the covariates x1 <- cosine_basis(1, Xnew) @@ -534,174 +133,286 @@ x_ord <- Xnew[foo] y_ord <- Y[foo] plot(Xnew, Y) lines(x_ord, fitted(mod6)[foo]) -View(dfs) -simulation <- read.csv("scripts/sim_2013_logit.csv") -setwd("C:/Users/leeri_000/basketball_stats/game_simulation") -## Read in the logit sims -simulation <- read.csv("scripts/sim_2013_logit.csv") -means -######################################################################### -## Purpose: Using probabilities for each game result, simulate the season -## to compute a distribution of wins #################################### -######################################################################### -## SET WORKING DIRCTORY ## -setwd("C:/Users/leeri_000/basketball_stats/game_simulation") -## LIBRARIES -library("dplyr") -library("e1071") -## READ IN OUR FEATURE DATASETS -data <- read.csv("scripts/rpm_dataset.csv") -## ADD home feature and win/loss column -data <- mutate(data, home = 1) -data$homeWin <- ifelse(data$home_team_score > data$visit_team_score, 1, 0) -## Set up datasets ## -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:17] -ytest = test[,18] 
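## A condensed sketch of the cosine-basis regression explored earlier in this
## hunk (ufcwc Height against a cosine basis of rescaled Dbh, with the number of
## basis functions chosen by k-fold CV via boot::cv.glm). Assumes ufcwc is
## available as in the history (data(ufcwc), e.g. from the alr3 package); J = 6
## and K = 5 are illustrative choices, not the author's final model.
library(boot)
cosine_basis <- function(j, x) sqrt(2) * cos(j * pi * x)  # cosine basis on [0, 1]
Xnew <- (ufcwc$Dbh - min(ufcwc$Dbh)) / (max(ufcwc$Dbh) - min(ufcwc$Dbh))
J <- 6
basis <- sapply(seq_len(J), cosine_basis, x = Xnew)       # n x J matrix of covariates
colnames(basis) <- paste0("x", seq_len(J))
dat <- data.frame(Height = ufcwc$Height, basis)
mse_cost <- function(y, predy) mean((y - predy)^2)        # same cost as above
cv_err <- sapply(0:J, function(j) {
  form <- if (j == 0) Height ~ 1 else reformulate(paste0("x", seq_len(j)), response = "Height")
  fit  <- glm(form, family = gaussian(link = "identity"), data = dat)
  cv.glm(data = dat, glmfit = fit, cost = mse_cost, K = 5)$delta[1]
})
which.min(cv_err) - 1   # number of cosine terms with the smallest estimated CV error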
-xtrain = train[,9:17] -ytrain = train[,18] -## Logistic regression to get probabilities -mylogit <- glm(homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 + -RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home, data=train, -family = "binomial") -logit_preds <- as.data.frame(predict(mylogit, newdata=xtest, type="response")) -logit_preds$class <- ifelse(logit_preds[,1] >= .5, 1, 0) -logit_preds <- cbind(logit_preds, ytest) -logit_preds$result <- abs(logit_preds[,2] - logit_preds[,3]) -logit_probs <- cbind(test, logit_preds)[, c(2:8, 17:22)] -names(logit_probs)[10] = "home_prob" -##Create the dataset for simulation -num_seasons <- 1000 -season_df <- data.frame(matrix(0, nrow=length(unique(logit_probs$home_team)), ncol= num_seasons)) -row.names(season_df) <- unique(logit_probs$home_team) -colnames(season_df) = paste("season_", 1:1000, sep="") -## Get the unique match Id's for a given season -match_ids <- unique(logit_probs$match_id) -### Loop through each Season -for(i in 1:1000){ -print(paste("season", i)) -## Generate the random outcomes -random_outcomes <- runif(length(logit_probs[,1])) -logit_probs <- cbind(logit_probs, random_outcomes) -## Loop through each match ID -for(match in match_ids){ -# Using the random number, assign the winner of the game to the data frame -if(logit_probs[logit_probs$match_id == match,]$random_outcomes <= logit_probs[logit_probs$match_id == match,]$home_prob){ -## Iterate the season data frame for the Home Team -season_df[as.character(logit_probs[logit_probs$match_id == match,]$home_team), i] = season_df[as.character(logit_probs[logit_probs$match_id == match,]$home_team), i] + 1 +library(XML) +first_date <- "http://www.nba.com/gameline/20091001/" +## Link to where the data is +nba_one <- "http://www.nba.com/games/20101201/CHANOH/gameinfo.html" +sample_game <- readLines(nba_one) +doc <- htmlParse(sample_game) +doc +################################################################## +## Download all of the raw game play by play files from NBA.com ## +################################################################## +library(XML) +first_date <- "http://www.nba.com/gameline/20091001/" +## Link to where the data is +nba_one <- "http://www.nba.com/games/20101201/CHANOH/gameinfo.html" +sample_game <- readLines(nba_one) +doc <- htmlParse(sample_game) +doc +doc <- xmlParse(sample_game) +doc +names(doc) +first_date <- "http://www.nba.com/gameline/20091001/" +doc <- xmlParse(first_date) +doc +names(doc) +doc <- xmlParse(first_date, isHTML=TRUE) +class(doc) +doc <- htmlTreeParse(first_date, isHTML=TRUE) +doc +names(doc) +doc$file +class(doc) +doc <- htmlParse(first_date, isHTML=TRUE) +class(doc) +names(doc) +doc +xmlRoot(doc) +child_doc <- xmlChildren(doc) +class(child_doc) +names(class_doc) +names(child_doc) +child_doc$html +child_doc$comment +links <- xpathSApply(doc, "//a/@href") +links +?xpathSApply +links <- getNodeSet(doc, "//a/@href") +links +links <- getNodeSet(doc) +game_links <- xpathSApply(doc, "gameInfo") +game_links +game_links <- xpathSApply(doc, "gameinfo") +game_links <- xpathSApply(doc, "//a/@href") +library(XML) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- xpathSApply(doc, "//a/@href") +free(game_links) +install.packages("RCurl") +library(RCurl) +#################################################################### +## Download all of the raw game play by play files from NBA.com #### 
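## The 1000-season simulation above re-subsets logit_probs for every match inside
## a double loop. A vectorized sketch of the same idea, assuming a data frame
## logit_probs with columns home_team, visit_team and home_prob as built in the
## lines above (one uniform draw per game decides the winner); illustrative only.
simulate_seasons <- function(logit_probs, n_seasons = 1000) {
  n_games <- nrow(logit_probs)
  teams <- sort(unique(c(as.character(logit_probs$home_team),
                         as.character(logit_probs$visit_team))))
  wins <- matrix(0, nrow = length(teams), ncol = n_seasons,
                 dimnames = list(teams, paste0("season_", seq_len(n_seasons))))
  for (i in seq_len(n_seasons)) {
    home_wins <- runif(n_games) <= logit_probs$home_prob      # simulate every game at once
    winner <- ifelse(home_wins,
                     as.character(logit_probs$home_team),
                     as.character(logit_probs$visit_team))
    tab <- table(winner)
    wins[names(tab), i] <- as.numeric(tab)                    # season win totals by team
  }
  wins
}
## season_df <- simulate_seasons(logit_probs)
## means <- apply(season_df, 1, mean); ses <- apply(season_df, 1, sd)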
+#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- xpathSApply(doc, "//a/@href") +game_links <- strsplit(doc,"a href=") +#################################################################### +## Download all of the raw game play by play files from NBA.com #### +#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- strsplit(doc,"a href=") +game_links <- strsplit(doc,"a href=")[[1]] +query = '//div[@class="nbaFnlMnRecapDiv"]' +xpathSApply(xml, query, xmlValue) +xpathSApply(doc, query, xmlValue) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +query = '//div[@class="nbaFnlMnRecapDiv"]' +xpathSApply(doc, query, xmlValue) +game_links <- xpathSApply(fifa.doc, "//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links <- xpathSApply(doc, "//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links +game_links <- xpathSApply(doc, "//*//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links <- xpathSApply(doc, "//*//*//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links <- xpathSApply(doc, "//*//*//*//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links <- xpathSApply(doc, "//*//*//*//*//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links <- xpathSApply(doc, "//*//*//*//*//*//*/div[@class='nbaFnlMnRecapDiv']", xmlValue) +game_links +game_links <- xpathSApply(doc, "//*/div[@class='nbaMnStatsFtr']", xmlValue) +game_links <- xpathSApply(doc, "//*/div[@class='nbaMnStatsFtr']/a", xmlValue) +game_links <- xpathSApply(doc, "//*/div[@class='nbaMnStatsFtr']//a", xmlValue) +getHTMLLinks(doc) +?getHTMLLinks +game_links <- getHTMLLinks(doc) +#################################################################### +## Download all of the raw game play by play files from NBA.com #### +#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- getHTMLLinks(doc) +game_links <- unique(getHTMLLinks(doc)) +grep(game_links, "gameinfo") +grep(game_links, 'gameinfo') +grep('gameinfo', game_links) +game_links <- grep('gameinfo', unique(getHTMLLinks(doc))) +game_links +game_links <- grep('gameinfo', game_links) +game_links <- unique(getHTMLLinks(doc)) +game_links <- grep('gameinfo', game_links) +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- unique(getHTMLLinks(doc)) +game_string <- grep('gameinfo', game_links) +game_links[game_string] +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- grep('gameinfo', unique(getHTMLLinks(doc))) +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091001/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- grep('gameinfo', 
unique(getHTMLLinks(doc))) +game_links <- unique(getHTMLLinks(doc)) +game_urls <- game_links[grep('gameinfo', game_links)] +game_urls <- paste("http://www.nba.com/", game_links[grep('gameinfo', game_links)], sep="") +game_urls +first_date <- "http://www.nba.com/gameline/20091027/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- unique(getHTMLLinks(doc)) +game_urls <- paste("http://www.nba.com/", game_links[grep('gameinfo', game_links)], sep="") +game_urls +for(g in game_urls){ +print(g) } -else{ -season_df[as.character(logit_probs[logit_probs$match_id == match,]$visit_team), i] = season_df[as.character(logit_probs[logit_probs$match_id == match,]$visit_team), i] + 1 +#################################################################### +## Download all of the raw game play by play files from NBA.com #### +#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091027/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- unique(getHTMLLinks(doc)) +game_urls <- paste("http://www.nba.com/", game_links[grep('gameinfo', game_links)], sep="") +for(g in game_urls){ +print(g) } +#################################################################### +## Download all of the raw game play by play files from NBA.com #### +#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091027/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- unique(getHTMLLinks(doc)) +game_urls <- paste("http://www.nba.com/", game_links[grep('gameinfo', game_links)], sep="") +## Loop through each game to get the play by play data +for(g in game_urls){ +print(g) } -## Remove the random outcomes -logit_probs <- logit_probs[,-14] +#################################################################### +## Download all of the raw game play by play files from NBA.com #### +#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091027/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- unique(getHTMLLinks(doc)) +game_urls <- paste("http://www.nba.com/", game_links[grep('gameinfo', game_links)], sep="") +## Loop through each game to get the play by play data +for(g in game_urls){ +print(g) } -### Check out the results -means <- apply(season_df, 1, mean) -ses <- apply(season_df, 1, sd) -## Save the outcomes -write.csv(season_df, "scripts/sim_2013_logit_df.csv") -write.csv(cbind(means, ses), "scripts/sim_2013_logit.csv") -means -## SET WORKING DIRCTORY ## -setwd("C:/Users/leeri_000/basketball_stats/game_simulation") -## Read in the logit sims -simulation <- read.csv("scripts/sim_2013_logit.csv") -simulation <- read.csv("scripts/sim_2013_logit_df.csv") -head(simulation) -simulation <- read.csv("scripts/sim_2013_logit.csv") -head(simulation) -means <- simulation$means -ses <- simulation$ses -season_df <- read.csv("scripts/sim_2013_logit_df.csv") -## Get dataset with the results -results <- read.csv("data/espn_data/team_wins.csv")[,2:4] -results_2013 <- filter(results, year == 2013)[order(results_2013$team),] -## Add in the confidence interval -results_2013 <- cbind(results_2013, 
means[order(names(means))]) -results_2013 <- cbind(results_2013, ses[order(names(ses))]) -names(results_2013)[4] = "estimate" -names(results_2013)[5] = "se" -results_2013$lower <- results_2013$estimate - 2*results_2013$se -results_2013$upper <- results_2013$estimate + 2*results_2013$se -results_2013$trapped <- ifelse(results_2013$wins >= results_2013$lower & -results_2013$wins <= results_2013$upper, 1, 0) -results <- read.csv("data/espn_data/team_wins.csv")[,2:4] -results_2013 <- filter(results, year == 2013)[order(results_2013$team),] -results_2013 <- filter(results, year == 2013) -results_2013 <- filter(results, year == 2013)[order(results_2013$team),] -results_2013 -## Add in the confidence interval -results_2013 <- cbind(results_2013, means[order(names(means))]) -results_2013 <- cbind(results_2013, ses[order(names(ses))]) -names(results_2013)[4] = "estimate" -names(results_2013)[5] = "se" -results_2013$lower <- results_2013$estimate - 2*results_2013$se -results_2013$upper <- results_2013$estimate + 2*results_2013$se -results_2013$trapped <- ifelse(results_2013$wins >= results_2013$lower & -results_2013$wins <= results_2013$upper, 1, 0) -typeof(means) -as.vector(means) -means <- as.vector(simulation$means) -ses <- as.vector(simulation$ses) -season_df <- read.csv("scripts/sim_2013_logit_df.csv") +#################################################################### +## Download all of the raw game play by play files from NBA.com #### +#################################################################### +library(XML) +library(RCurl) +## Read in the first date +first_date <- "http://www.nba.com/gameline/20091027/" +## Extract all Game links from this home link +doc <- htmlParse(first_date, isHTML=TRUE) +game_links <- unique(getHTMLLinks(doc)) +game_urls <- paste("http://www.nba.com/", game_links[grep('gameinfo', game_links)], sep="") +## Loop through each game to get the play by play data +for(g in game_urls){ +print(g) +} +library(dplyr) ## SET WORKING DIRCTORY ## setwd("C:/Users/leeri_000/basketball_stats/game_simulation") -## Read in the logit sims -simulation <- read.csv("scripts/sim_2013_logit.csv") -means <- as.vector(simulation$means) -ses <- as.vector(simulation$ses) -season_df <- read.csv("scripts/sim_2013_logit_df.csv") ## Get dataset with the results results <- read.csv("data/espn_data/team_wins.csv")[,2:4] +############################################################ +################# 2013 LOGIT ############################### +############################################################ +## Get 2013 Results results_2013 <- filter(results, year == 2013) -results_2013 <- filter(results, year == 2013)[order(results_2013$team),] -## Add in the confidence interval -results_2013 <- cbind(results_2013, means[order(names(means))]) -results_2013 <- cbind(results_2013, ses[order(names(ses))]) -## SET WORKING DIRCTORY ## -setwd("C:/Users/leeri_000/basketball_stats/game_simulation") -## Read in the logit sims +results_2013_sorted <- results_2013[order(results_2013$team),] +## Get the dataset with simulation output for 2013 simulation <- read.csv("scripts/sim_2013_logit.csv") -means <- as.vector(simulation$means) -ses <- as.vector(simulation$ses) -season_df <- read.csv("scripts/sim_2013_logit_df.csv") -## Get dataset with the results -results <- read.csv("data/espn_data/team_wins.csv")[,2:4] -results_2013 <- filter(results, year == 2013) -results_2013 <- filter(results, year == 2013)[order(results_2013$team),] -## Add in the confidence interval -results_2013 <- cbind(results_2013, 
means[order(names(means))]) -results_2013 <- cbind(results_2013, ses[order(names(ses))]) -names(results_2013)[4] = "estimate" -names(results_2013)[5] = "se" -results_2013 -results_2013 <- cbind(results_2013, means[order(names(means))]) -order(names(means)) -names(means) -names(simulation) -names(simulation$means) -simulation -results_2013 <- cbind(results_2013, means[order(simulation$X)]) -results_2013 -results_2013 <- cbind(results_2013, ses[order(simulation$X))]) -results_2013 <- cbind(results_2013, ses[order(simulation$X)]) -names(results_2013)[4] = "estimate" -names(results_2013)[5] = "se" -results_2013$lower <- results_2013$estimate - 2*results_2013$se -results_2013$upper <- results_2013$estimate + 2*results_2013$se -results_2013$trapped <- ifelse(results_2013$wins >= results_2013$lower & -results_2013$wins <= results_2013$upper, 1, 0) -sum(results_2013$trapped) +sim_sort_logit <- simulation[order(simulation$X),] +## Get comparison dataframe and mean and absolute error losses +compare_2013_logit <- cbind(results_2013_sorted, sim_sort_logit) +compare_2013_logit$squared <- (compare_2013_logit$means - compare_2013_logit$wins)^2 +rmse <- sqrt(mean(compare_2013_logit$squared)) +rmse +compare_2013_logit$absolute <- abs(compare_2013_logit$means - compare_2013_logit$wins) +mae <- mean(compare_2013_logit$absolute) +mae +############################################################ +################# 2013 NAIVE BAYES ########################## +############################################################ ## Get 2013 Results results_2013 <- filter(results, year == 2013) results_2013_sorted <- results_2013[order(results_2013$team),] ## Get the dataset with simulation output for 2013 -simulation <- read.csv("scripts/sim_2013_logit.csv") +simulation <- read.csv("scripts/sim_2013.csv") +sim_sort <- simulation[order(simulation$X),] +## Get comparison dataframe and mean and absolute error losses +compare <- cbind(results_2013_sorted, sim_sort) +compare$squared <- (compare$means - compare$wins)^2 +rmse <- sqrt(mean(compare$squared)) +rmse +compare$absolute <- abs(compare$means - compare$wins) +mae <- mean(compare$absolute) +mae +####################################################### +################# 2012 LOGIT ########################## +####################################################### +## Get 2012 results +results_2012 <- filter(results, year == 2012) +results_2012_sorted <- results_2012[order(results_2012$team),] +## Get the dataset with simulation output for 2012 +simulation <- read.csv("scripts/sim_2012_logit.csv") sim_sort_logit <- simulation[order(simulation$X),] ->>>>>>> ba62d0a00c3e38d2836c06ff586c377052d46c27 +## Get comparison dataframe and mean and absolute error losses +compare_2012_logit <- cbind(results_2012_sorted, sim_sort_logit) +compare_2012_logit$squared <- (compare_2012_logit$means - compare_2012_logit$wins)^2 +rmse <- sqrt(mean(compare_2012_logit$squared)) +rmse +compare_2012_logit$absolute <- abs(compare_2012_logit$means - compare_2012_logit$wins) +mae <- mean(compare_2012_logit$absolute) +mae +############################################################# +################# 2012 NAIVE BAYES ########################## +############################################################# +## Get 2012 results +results_2012 <- filter(results, year == 2012) +results_2012_sorted <- results_2012[order(results_2012$team),] +## Get the dataset with simulation output for 2012 +simulation <- read.csv("scripts/sim_2012.csv") +sim_sort <- simulation[order(simulation$X),] ## Get comparison 
dataframe and mean and absolute error losses compare_2012 <- cbind(results_2012_sorted, sim_sort) compare_2012$squared <- (compare_2012$means - compare_2012$wins)^2 @@ -710,565 +421,92 @@ rmse compare_2012$absolute <- abs(compare_2012$means - compare_2012$wins) mae <- mean(compare_2012$absolute) mae -<<<<<<< HEAD -## SET WORKING DIRCTORY ## -setwd("C:/Users/Lee/game_simulation") -## LIBRARIES -library("dplyr") -library("e1071") -library("randomForest") -## READ IN OUR FEATURE DATASETS -data <- read.csv("scripts/rpm_dataset.csv") -data_rpi <- read.csv("scripts/rpi.csv") -## ADD home feature and win/loss column -data <- mutate(data, home = 1) -data <- mutate(data, RPM_dif = RPM_weight.1 - RPM_weight.0) -data$homeWin <- ifelse(data$home_team_score > data$visit_team_score, 1, 0) -## Set up datasets ## -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2011, 2013)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -rf <- randomForest(homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 + -RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home, -data=train, type="classification") -importance(rf) -xtable(importance(rf)) -library(xtable) -xtable(importance(rf)) -======= -<<<<<<< HEAD -par(mfrow=c(6, 5)) -team_names <- unique(results_2013$team) -for(name in team_names){ -season_result <- as.numeric(season_df[name,]) -hist(season_result, main=paste("Wins for", toupper(name), sep=" "), xlab="Wins" -, breaks=10) -abline(v=results_2013[results_2013$team == name, "wins"], col="red") -abline(v=results_2013[results_2013$team == name, "lower"], col="blue") -abline(v=results_2013[results_2013$team == name, "upper"], col="blue") -} -## Plot the outcomes -par(mfrow=c(6, 5)) -team_names <- unique(results_2013$team) -for(name in team_names){ -season_result <- as.numeric(season_df[name,]) -hist(season_result, main=paste("Wins for", toupper(name), sep=" "), xlab="Wins" -, breaks=10) -abline(v=results_2013[results_2013$team == name, "wins"], col="red") -abline(v=results_2013[results_2013$team == name, "lower"], col="blue") -abline(v=results_2013[results_2013$team == name, "upper"], col="blue") -} -team_names -team_names <- unique(results_2013$team) -as.numeric(season_df[1,]) -names(season_df -) -rownames(season_df) -dim(season_df) -season_df[1,] -season_df[,1] -as.numeric(season_df[season_df$X == "tor",]) -## Plot the outcomes -par(mfrow=c(6, 5)) -team_names <- unique(results_2013$team) -for(name in team_names){ -season_result <- as.numeric(season_df[season_df$X == name,]) -hist(season_result, main=paste("Wins for", toupper(name), sep=" "), xlab="Wins" -, breaks=10) -abline(v=results_2013[results_2013$team == name, "wins"], col="red") -abline(v=results_2013[results_2013$team == name, "lower"], col="blue") -abline(v=results_2013[results_2013$team == name, "upper"], col="blue") -} -par(mfrow=c(6, 5)) -team_names <- unique(results_2013$team) -for(name in team_names){ -season_result <- as.numeric(season_df[season_df$X == name,]) -hist(season_result, main=paste("Wins for", toupper(name), sep=" "), xlab="Wins" -, breaks=10) -abline(v=results_2013[results_2013$team == name, "wins"], col="red") -abline(v=results_2013[results_2013$team == name, "lower"], col="blue") -abline(v=results_2013[results_2013$team == name, "upper"], col="blue") -} -======= ->>>>>>> ba62d0a00c3e38d2836c06ff586c377052d46c27 -## Purpose: Figure out most important covariates for prediction +compare_2013 
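## The 2012/2013 logit and naive Bayes comparisons above repeat the same
## cbind / RMSE / MAE computation. A small helper that condenses it, assuming the
## results data frame has columns team and wins and each simulation CSV has a team
## column X and a column means, as in the files read above:
compare_sim <- function(results_year, sim_file) {
  sim <- read.csv(sim_file)
  res <- results_year[order(results_year$team), ]
  sim <- sim[order(sim$X), ]
  err <- sim$means - res$wins
  list(compare = cbind(res, sim),
       rmse = sqrt(mean(err^2)),
       mae  = mean(abs(err)))
}
## Example with the same inputs as above:
## cmp_2013_logit <- compare_sim(filter(results, year == 2013), "scripts/sim_2013_logit.csv")
## cmp_2013_logit$rmse; cmp_2013_logit$mae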
+compare_2012 +sd(compare_2012$means) +sd(compare_2012_logit$means) +sd(compare_2013_logit$means) +sd(compare_2013$means) +sd(compare_2013_nb$means) +sd(compare$means) +## Purpose: Compare simulated season to actual results for 2012 and 2013 library(dplyr) -library(lars) ## SET WORKING DIRCTORY ## setwd("C:/Users/leeri_000/basketball_stats/game_simulation") -# Read in the full feature matrix -full_matrix <- read.csv("featuresAll.csv") -## Set up full and null models -null_mod <- glm(formula = homeWin ~ 1, family = "binomial", data = full_matrix) -full_mod <- glm(homeWin ~ RPM_weight_0 + ORPM_weight_0 + DRPM_weight_0 + PER_weight_0 + -RPM_weight_1 + ORPM_weight_1 + DRPM_weight_1 + PER_weight_1 + avg_scoreDiff + -avg_scoreDiff_home + avg_win_home + avg_scoreDiff_visit + avg_win_visit + home_rpi + -away_rpi + avg_GP + avg_GS+avg_MIN + avg_FG_made + avg_FG_attempted + -avg_FGpercent + avg_ThreeP_made + avg_ThreeP_attempted + avg_ThreePpercent + avg_FT_made + -avg_FT_attempted+ avg_FTpercent+ avg_OR+ avg_DR+ avg_REB+ avg_AST + avg_BLK+ -avg_STL+ avg_PF +avg_TO+ avg_PTS, family="binomial", data=full_matrix) -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -dim(full_matrix) -head(full_matrix) -plot(full_matrix$home_rpi) -plot(full_matrix$home_rpi) -plot(full_matrix$home_rpi) -plot(full_matrix$home_rpi) -plot(full_matrix$home_rpi) -plot(full_matrix$home_rpi) -plot(full_matrix$home_rpi) -full_matrix <- read.csv("featuresAll.csv") -## Set up full and null models -null_mod <- glm(formula = homeWin ~ 1, family = "binomial", data = full_matrix) -full_mod <- glm(homeWin ~ RPM_weight_0 + ORPM_weight_0 + DRPM_weight_0 + PER_weight_0 + -RPM_weight_1 + ORPM_weight_1 + DRPM_weight_1 + PER_weight_1 + avg_scoreDiff + -avg_scoreDiff_home + avg_win_home + avg_scoreDiff_visit + avg_win_visit + log(home_rpi) + -log(away_rpi) + avg_GP + avg_GS+avg_MIN + avg_FG_made + avg_FG_attempted + -avg_FGpercent + avg_ThreeP_made + avg_ThreeP_attempted + avg_ThreePpercent + avg_FT_made + -avg_FT_attempted+ avg_FTpercent+ avg_OR+ avg_DR+ avg_REB+ avg_AST + avg_BLK+ -avg_STL+ avg_PF +avg_TO+ avg_PTS, family="binomial", data=full_matrix) -## Stepwise model selection -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -?step -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both", k=log(n)) -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both", -k = log(n)) -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -importance(rf) -means -results_2013 -library(dplyr) -## SET WORKING DIRCTORY ## -setwd("C:/Users/Lee/game_simulation") -## Read in the logit sims -simulation <- read.csv("scripts/sim_2013_logit.csv") -means <- as.vector(simulation$means) -ses <- as.vector(simulation$ses) -season_df <- read.csv("scripts/sim_2013_logit_df.csv") ## Get dataset with the results results <- read.csv("data/espn_data/team_wins.csv")[,2:4] +############################################################ +################# 2013 LOGIT ############################### +############################################################ +## Get 2013 Results results_2013 <- filter(results, year == 2013) -results_2013 <- filter(results, year == 2013)[order(results_2013$team),] -## Add in the confidence interval -results_2013 <- cbind(results_2013, means[order(simulation$X)]) -results_2013 <- cbind(results_2013, ses[order(simulation$X)]) -names(results_2013)[4] = 
"estimate" -names(results_2013)[5] = "se" -results_2013$lower <- results_2013$estimate - 2*results_2013$se -results_2013$upper <- results_2013$estimate + 2*results_2013$se -results_2013$trapped <- ifelse(results_2013$wins >= results_2013$lower & -results_2013$wins <= results_2013$upper, 1, 0) -results_2013 -rf <- randomForest(homeWin ~ RPM_dif, data=train, type="classification") -train = filter(data, game_year %in% c(2008, 2009, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -## Logistic Regression -mylogit <- glm(homeWin ~ RPM_dif, data=train, family = "binomial") -logit_preds <- as.data.frame(predict(mylogit, newdata=xtest, type="response")) -logit_preds$class <- ifelse(logit_preds[,1] >= .5, 1, 0) -logit_preds <- cbind(logit_preds, ytest) -logit_preds$result <- abs(logit_preds[,2] - logit_preds[,3]) -logit_accurary <- 1 - sum(logit_preds$result)/length(ytest) -logit_accurary -mylinear <- lm(homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 + -RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home, data=train) -linear_preds <- as.data.frame(predict(mylinear, newdata=xtest, type="response")) -linear_preds$class <- ifelse(linear_preds[,1] >= .5, 1, 0) -linear_preds <- cbind(linear_preds, ytest) -linear_preds$result <- abs(linear_preds[,2] - linear_preds[,3]) -linear_accurary <- 1 - sum(linear_preds$result)/length(ytest) -linear_accurary -## Random Forest -rf <- randomForest(homeWin ~ RPM_dif, data=train, type="classification") -rf_preds <- as.data.frame(predict(rf, xtest)) -rf_preds <- cbind(rf_preds, ytest) -rf_preds$class <- ifelse(rf_preds[,1] >= .5, 1, 0) -rf_preds$result <- abs(rf_preds[,2] - rf_preds[,3]) -rf_accurary <- 1 - sum(rf_preds$result)/length(ytest) -rf_accurary -plot(RPM_weight.0, ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -plot(train$RPM_weight.0, train$ORPM_weight.0) -data <- mutate(data, home = 1) -data <- mutate(data, RPM_dif = RPM_weight.1 - RPM_weight.0) -data$homeWin <- ifelse(data$home_team_score > data$visit_team_score, 1, 0) -## Set up datasets ## -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2011, 2011)) -test = filter(data, game_year == 2012) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- 
as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010)) -test = filter(data, game_year == 2011) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011)) -test = filter(data, game_year == 2012) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008)) -test = filter(data, game_year == 2009) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -## Set up datasets ## -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009)) -test = filter(data, game_year == 2010) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010)) -test = filter(data, game_year == 2011) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011)) -test = filter(data, game_year == 2012) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- 
cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -## Logistic Regression -mylogit <- glm(homeWin ~ RPM_weight.0 + ORPM_weight.0 , data=train, family = "binomial") -logit_preds <- as.data.frame(predict(mylogit, newdata=xtest, type="response")) -logit_preds$class <- ifelse(logit_preds[,1] >= .5, 1, 0) -logit_preds <- cbind(logit_preds, ytest) -logit_preds$result <- abs(logit_preds[,2] - logit_preds[,3]) -logit_accurary <- 1 - sum(logit_preds$result)/length(ytest) -logit_accurary -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011)) -test = filter(data, game_year == 2012) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain, ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -## Logistic Regression -mylogit <- glm(homeWin ~ RPM_weight.0 + ORPM_weight.0 , data=train, family = "binomial") -logit_preds <- as.data.frame(predict(mylogit, newdata=xtest, type="response")) -logit_preds$class <- ifelse(logit_preds[,1] >= .5, 1, 0) -logit_preds <- cbind(logit_preds, ytest) -logit_preds$result <- abs(logit_preds[,2] - logit_preds[,3]) -logit_accurary <- 1 - sum(logit_preds$result)/length(ytest) -logit_accurary -mylogit <- glm(homeWin ~ RPM_weight.0 + ORPM_weight.0 + PER_weight.1 + PER_weight.0 , data=train, family = "binomial") -logit_preds <- as.data.frame(predict(mylogit, newdata=xtest, type="response")) -logit_preds$class <- ifelse(logit_preds[,1] >= .5, 1, 0) -logit_preds <- cbind(logit_preds, ytest) -logit_preds$result <- abs(logit_preds[,2] - logit_preds[,3]) -logit_accurary <- 1 - sum(logit_preds$result)/length(ytest) -logit_accurary -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -## Set up full and null models -null_mod <- lm(formula = homeWin ~ 1, family = "binomial", data = full_matrix) -full_mod <- lm(homeWin ~ RPM_weight_0 + ORPM_weight_0 + DRPM_weight_0 + PER_weight_0 + -RPM_weight_1 + ORPM_weight_1 + DRPM_weight_1 + PER_weight_1 + avg_scoreDiff + -avg_scoreDiff_home + avg_win_home + avg_scoreDiff_visit + 
avg_win_visit + log(home_rpi) + -log(away_rpi) + avg_GP + avg_GS+avg_MIN + avg_FG_made + avg_FG_attempted + -avg_FGpercent + avg_ThreeP_made + avg_ThreeP_attempted + avg_ThreePpercent + avg_FT_made + -avg_FT_attempted+ avg_FTpercent+ avg_OR+ avg_DR+ avg_REB+ avg_AST + avg_BLK+ -avg_STL+ avg_PF +avg_TO+ avg_PTS, family="binomial", data=full_matrix) -## Stepwise model selection -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -importance(rf) -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -head(data) -## Set up full and null models -null_mod <- glm(formula = homeWin ~ 1, family = "binomial", data = full_matrix) -full_mod <- glm(homeWin ~ RPM_weight_0 + ORPM_weight_0 + DRPM_weight_0 + PER_weight_0 + -RPM_weight_1 + ORPM_weight_1 + DRPM_weight_1 + PER_weight_1 + avg_scoreDiff + -avg_scoreDiff_home + avg_win_home + avg_scoreDiff_visit + avg_win_visit + log(home_rpi) + -log(away_rpi) + avg_GP + avg_GS+avg_MIN + avg_FG_made + avg_FG_attempted + -avg_FGpercent + avg_ThreeP_made + avg_ThreeP_attempted + avg_ThreePpercent + avg_FT_made + -avg_FT_attempted+ avg_FTpercent+ avg_OR+ avg_DR+ avg_REB+ avg_AST + avg_BLK+ -avg_STL+ avg_PF +avg_TO+ avg_PTS, family="binomial", data=full_matrix) -## Stepwise model selection -stepwise_mod <- step(null_mod, scope=list(lower=null_mod, upper=full_mod), direction="both") -names(stepwise_mod) -stepwise_mod$coeff -head(xtrain) -data <- mutate(data, home = 1) -data <- mutate(data, RPM_dif = RPM_weight.1 - RPM_weight.0) -data$homeWin <- ifelse(data$home_team_score > data$visit_team_score, 1, 0) -## Set up datasets ## -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011)) -test = filter(data, game_year == 2012) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010)) -test = filter(data, game_year == 2011) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -train = filter(data, game_year %in% c(2008)) -test = filter(data, game_year == 2009) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- 
naiveBayes(xtrain[,c(1,5) ], ytrain) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -train = filter(data, game_year %in% c(2008)) -test = filter(data, game_year == 2009) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -train = filter(data, game_year %in% c(2008, 2009)) -test = filter(data, game_year == 2010) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010)) -test = filter(data, game_year == 2011) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011)) -test = filter(data, game_year == 2012) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -years <- c(2008, 2009, 2010, 2011, 2012, 2013) -train = filter(data, game_year %in% c(2008, 2009, 2010, 2011, 2012)) -test = filter(data, game_year == 2013) -xtest = test[,9:18] -ytest = test[,19] -xtrain = train[,9:18] -ytrain = train[,19] -## Naive Bayes -model <- naiveBayes(xtrain[,c(1,5) ], ytrain) -preds <- as.data.frame(predict(model, xtest, type = c("raw"), threshold = 0.001)) -preds$class <- ifelse(preds[,2] > preds[,1], 1, 0) -preds <- cbind(preds, ytest) -preds$result <- abs(preds[,3] - preds[,4]) -accuracy <- 1 - sum(preds$result)/length(ytest) -accuracy -<<<<<<< HEAD -======= ->>>>>>> 9da187c2fd605aa9904fe34ea8e4aab9073fa807 ->>>>>>> 7f763f76725fb85f2dbe3f0e30b27aae3d6208b0 ->>>>>>> ba62d0a00c3e38d2836c06ff586c377052d46c27 +results_2013_sorted <- results_2013[order(results_2013$team),] +## Get the dataset with simulation output for 2013 +simulation <- read.csv("scripts/sim_2013_logit.csv") +sim_sort_logit <- simulation[order(simulation$X),] +## Get comparison dataframe and mean and absolute error losses +compare_2013_logit <- cbind(results_2013_sorted, sim_sort_logit) +compare_2013_logit$squared <- (compare_2013_logit$means - 
compare_2013_logit$wins)^2 +rmse <- sqrt(mean(compare_2013_logit$squared)) +rmse +compare_2013_logit$absolute <- abs(compare_2013_logit$means - compare_2013_logit$wins) +mae <- mean(compare_2013_logit$absolute) +mae +############################################################ +################# 2013 NAIVE BAYES ########################## +############################################################ +## Get 2013 Results +results_2013 <- filter(results, year == 2013) +results_2013_sorted <- results_2013[order(results_2013$team),] +## Get the dataset with simulation output for 2013 +simulation <- read.csv("scripts/sim_2013.csv") +sim_sort <- simulation[order(simulation$X),] +## Get comparison dataframe and mean and absolute error losses +compare <- cbind(results_2013_sorted, sim_sort) +compare$squared <- (compare$means - compare$wins)^2 +rmse <- sqrt(mean(compare$squared)) +rmse +compare$absolute <- abs(compare$means - compare$wins) +mae <- mean(compare$absolute) +mae +####################################################### +################# 2012 LOGIT ########################## +####################################################### +## Get 2012 results +results_2012 <- filter(results, year == 2012) +results_2012_sorted <- results_2012[order(results_2012$team),] +## Get the dataset with simulation output for 2012 +simulation <- read.csv("scripts/sim_2012_logit.csv") +sim_sort_logit <- simulation[order(simulation$X),] +## Get comparison dataframe and mean and absolute error losses +compare_2012_logit <- cbind(results_2012_sorted, sim_sort_logit) +compare_2012_logit$squared <- (compare_2012_logit$means - compare_2012_logit$wins)^2 +rmse <- sqrt(mean(compare_2012_logit$squared)) +rmse +compare_2012_logit$absolute <- abs(compare_2012_logit$means - compare_2012_logit$wins) +mae <- mean(compare_2012_logit$absolute) +mae +############################################################# +################# 2012 NAIVE BAYES ########################## +############################################################# +## Get 2012 results +results_2012 <- filter(results, year == 2012) +results_2012_sorted <- results_2012[order(results_2012$team),] +## Get the dataset with simulation output for 2012 +simulation <- read.csv("scripts/sim_2012.csv") +sim_sort <- simulation[order(simulation$X),] +## Get comparison dataframe and mean and absolute error losses +compare_2012 <- cbind(results_2012_sorted, sim_sort) +compare_2012$squared <- (compare_2012$means - compare_2012$wins)^2 +rmse <- sqrt(mean(compare_2012$squared)) +rmse +compare_2012$absolute <- abs(compare_2012$means - compare_2012$wins) +mae <- mean(compare_2012$absolute) +mae +89-57 +68/100 +68+75+94 +68+75+94/300 +68+75+94 +237/300 +257/320 diff --git a/final_paper/Thumbs.db b/final_paper/Thumbs.db new file mode 100644 index 0000000..ae807d5 Binary files /dev/null and b/final_paper/Thumbs.db differ diff --git a/final_paper/final_paper.aux b/final_paper/final_paper.aux index 404b2f7..6b9842b 100644 --- a/final_paper/final_paper.aux +++ b/final_paper/final_paper.aux @@ -44,25 +44,26 @@ \@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Training and test datasets}{4}{subsection.4.1}} \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces A look at what our training and test datasets look like. The first four columns are features and the 5th column (indicating whether or not the home team won the game) is our label.}}{4}{table.1}} \newlabel{table:matrix}{{1}{4}{A look at what our training and test datasets look like. 
The first four columns are features and the 5th column (indicating whether or not the home team won the game) is our label}{table.1}{}} -\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Left: visit team RAPM against home team RAPM; the green dots indicate visit team wins while the red dots indicate home team wins. Right: box plot of RAPM; the $y$ axis represents the difference in RAPM between the home team and the visit team.}}{5}{figure.2}} -\newlabel{feats}{{2}{5}{Left: visit team RAPM against home team RAPM; the green dots indicate visit team wins while the red dots indicate home team wins. Right: box plot of RAPM; the $y$ axis represents the difference in RAPM between the home team and the visit team}{figure.2}{}} +\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Left: Visiting team overall weighted RAPM against the same statistic for the home team; the green dots indicate that the vistiting team wins while the red dots indicate home team wins. Right: box plot of RAPM; the $y$ axis represents the difference in RAPM between the home team and the visit team.}}{5}{figure.2}} +\newlabel{feats}{{2}{5}{Left: Visiting team overall weighted RAPM against the same statistic for the home team; the green dots indicate that the vistiting team wins while the red dots indicate home team wins. Right: box plot of RAPM; the $y$ axis represents the difference in RAPM between the home team and the visit team}{figure.2}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Algorithms and feature selection}{5}{subsection.4.2}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.3}A discussion of the algorithms}{5}{subsection.4.3}} -\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Test accuracy of each algorithm }}{6}{figure.3}} -\newlabel{diffmodel}{{3}{6}{Test accuracy of each algorithm}{figure.3}{}} +\citation{nba_oracle} +\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Test Accuracy for various different algorithms classifying the outcomes of NBA games in seasons 2009-2014. }}{6}{figure.3}} +\newlabel{diffmodel}{{3}{6}{Test Accuracy for various different algorithms classifying the outcomes of NBA games in seasons 2009-2014}{figure.3}{}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Feature selection}{6}{subsection.4.4}} \citation{projections} \@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces A look at how feature selection improves the prediction accuracy.}}{7}{table.2}} \newlabel{table:matrix}{{2}{7}{A look at how feature selection improves the prediction accuracy}{table.2}{}} \newlabel{featsel}{{2}{7}{A look at how feature selection improves the prediction accuracy}{table.2}{}} \@writefile{toc}{\contentsline {section}{\numberline {5}Simulation}{7}{section.5}} -\@writefile{toc}{\contentsline {section}{\numberline {6}Conclusion}{7}{section.6}} \@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Distribution of wins for each team from simulating the 2013 NBA season 1000 times, using probabilities from our best Logistic regression model. The blue lines represent our confidence intervals whereas the Red lines represent the actual number of wins for each team. Our simulations trapped the true number of wins in 70 \% of our intervals.}}{8}{figure.4}} \newlabel{fig:simulations}{{4}{8}{Distribution of wins for each team from simulating the 2013 NBA season 1000 times, using probabilities from our best Logistic regression model. 
The blue lines represent our confidence intervals whereas the Red lines represent the actual number of wins for each team. Our simulations trapped the true number of wins in 70 \% of our intervals}{figure.4}{}} \citation{projections} \citation{sportsvu} \citation{projections} \bibcite{nba_oracle}{1} +\@writefile{toc}{\contentsline {section}{\numberline {6}Conclusion}{9}{section.6}} \bibcite{data_mining}{2} \bibcite{rpm}{3} \bibcite{bigrpm}{4} diff --git a/final_paper/final_paper.log b/final_paper/final_paper.log index 57abb52..36680a9 100644 --- a/final_paper/final_paper.log +++ b/final_paper/final_paper.log @@ -1,7 +1,10 @@ -This is pdfTeX, Version 3.1415926-2.5-1.40.14 (MiKTeX 2.9) (preloaded format=pdflatex 2014.8.20) 11 DEC 2014 16:14 +This is pdfTeX, Version 3.1415926-2.5-1.40.14 (MiKTeX 2.9) (preloaded format=pdflatex 2014.8.5) 12 DEC 2014 11:26 entering extended mode -**C:/Users/Lee/game_simulation/final_paper/final_paper.tex -(C:/Users/Lee/game_simulation/final_paper/final_paper.tex +**C:/Users/leeri_000/basketball_stats/game_simulation/final_paper/final_paper.t +ex + +(C:/Users/leeri_000/basketball_stats/game_simulation/final_paper/final_paper.te +x LaTeX2e <2011/06/27> Babel and hyphenation patterns for english, afrikaans, ancientgreek, ar abic, armenian, assamese, basque, bengali, bokmal, bulgarian, catalan, coptic, @@ -324,7 +327,8 @@ Package: geometry 2010/09/12 v5.6 Page Geometry \Gm@dimlist=\toks21 ("C:\Program Files (x86)\MiKTeX 2.9\tex\latex\geometry\geometry.cfg")) -(C:\Users\Lee\game_simulation\final_paper\final_paper.aux +(C:\Users\leeri_000\basketball_stats\game_simulation\final_paper\final_paper.au +x LaTeX Warning: Label `table:matrix' multiply defined. @@ -362,11 +366,12 @@ LaTeX Info: Redefining \ref on input line 29. LaTeX Info: Redefining \pageref on input line 29. LaTeX Info: Redefining \nameref on input line 29. -(C:\Users\Lee\game_simulation\final_paper\final_paper.out) -(C:\Users\Lee\game_simulation\final_paper\final_paper.out) +(C:\Users\leeri_000\basketball_stats\game_simulation\final_paper\final_paper.ou +t) +(C:\Users\leeri_000\basketball_stats\game_simulation\final_paper\final_paper.ou +t) \@outlinefile=\write3 - -("C:\Program Files (x86)\MiKTeX 2.9\tex\context\base\supp-pdf.mkii" + ("C:\Program Files (x86)\MiKTeX 2.9\tex\context\base\supp-pdf.mkii" [Loading MPS to PDF converter (version 2006.09.02).] \scratchcounter=\count118 \scratchdimen=\dimen130 @@ -464,7 +469,7 @@ Underfull \hbox (badness 10000) in paragraph at lines 60--64 [] - + File: sqliteERDiagram.png Graphic file (type png) Package pdftex.def Info: sqliteERDiagram.png used on input line 67. @@ -489,111 +494,132 @@ Underfull \hbox (badness 10000) in paragraph at lines 81--82 [] -[2] [3 ] +[2] [3 ] Underfull \hbox (badness 10000) in paragraph at lines 85--86 [] -[4] + +Overfull \hbox (48.26144pt too wide) in paragraph at lines 100--112 + [][] + [] + +[4] File: features.pdf Graphic file (type pdf) Package pdftex.def Info: features.pdf used on input line 120. (pdftex.def) Requested size: 505.88953pt x 234.88132pt. +LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <10> not available +(Font) Font shape `OT1/ptm/b/n' tried instead on input line 127. + + +LaTeX Warning: Reference `:' on page 5 undefined on input line 127. - + File: linear2.PNG Graphic file (type png) -Package pdftex.def Info: linear2.PNG used on input line 131. +Package pdftex.def Info: linear2.PNG used on input line 130. (pdftex.def) Requested size: 433.63559pt x 289.08781pt. LaTeX Warning: `h' float specifier changed to `ht'. 
-LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <10> not available -(Font) Font shape `OT1/ptm/b/n' tried instead on input line 140. -[5 ] -Underfull \hbox (badness 10000) in paragraph at lines 148--149 +[5 ] +Underfull \hbox (badness 10000) in paragraph at lines 147--148 [] -Underfull \hbox (badness 10000) in paragraph at lines 156--179 +Underfull \hbox (badness 10000) in paragraph at lines 151--172 + + [] + +[6 ] +Underfull \hbox (badness 10000) in paragraph at lines 173--174 [] -[6 ] - + File: season_wins.png Graphic file (type png) -Package pdftex.def Info: season_wins.png used on input line 183. +Package pdftex.def Info: season_wins.png used on input line 181. (pdftex.def) Requested size: 426.79028pt x 426.7961pt. -Underfull \hbox (badness 10000) in paragraph at lines 188--189 +Underfull \hbox (badness 10000) in paragraph at lines 186--187 [] -Underfull \hbox (badness 10000) in paragraph at lines 195--196 +Underfull \hbox (badness 10000) in paragraph at lines 193--194 [] -[7] [8 ] -Underfull \hbox (badness 10000) in paragraph at lines 198--199 +[7] [8 ] +Underfull \hbox (badness 10000) in paragraph at lines 196--197 [] -Underfull \hbox (badness 10000) in paragraph at lines 202--203 +Underfull \hbox (badness 10000) in paragraph at lines 199--200 [] -Underfull \hbox (badness 10000) in paragraph at lines 206--207 +Underfull \hbox (badness 10000) in paragraph at lines 202--203 [] -Underfull \hbox (badness 10000) in paragraph at lines 209--210 +Underfull \hbox (badness 10000) in paragraph at lines 205--206 [] +[9] Missing character: There is no Å in font ptmr7t! Missing character: There is no ¡ in font ptmr7t! -Underfull \hbox (badness 10000) in paragraph at lines 219--220 +Underfull \hbox (badness 10000) in paragraph at lines 215--216 []\OT1/ptm/m/n/10 Paul Fearn-head, Ben-jamin M. Tay-lor \OT1/ptm/m/it/10 On Es- ti-mat-ing the Abil-ity of NBA Play-ers\OT1/ptm/m/n/10 . 2010: [] -[9] -Underfull \hbox (badness 10000) in paragraph at lines 233--234 + +Underfull \hbox (badness 10000) in paragraph at lines 229--230 []\OT1/ptm/m/n/10 Sam Hinkie and the An-a-lyt-ics Rev-o-lu-tion in Bas-ket-ball . Nilka-nth Pa-tel. [] -Package atveryend Info: Empty hook `BeforeClearDocument' on input line 243. +Package atveryend Info: Empty hook `BeforeClearDocument' on input line 239. [10] -Package atveryend Info: Empty hook `AfterLastShipout' on input line 243. - (C:\Users\Lee\game_simulation\final_paper\final_paper.aux) -Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 243. -Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 243. +Package atveryend Info: Empty hook `AfterLastShipout' on input line 239. + +(C:\Users\leeri_000\basketball_stats\game_simulation\final_paper\final_paper.au +x) +Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 239. +Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 239. Package rerunfilecheck Info: File `final_paper.out' has not changed. (rerunfilecheck) Checksum: 65F4D1C3350D0BAA1F2755D99F022272;880. +LaTeX Warning: There were undefined references. + + LaTeX Warning: There were multiply-defined labels. -Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 243. +Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 239. 
) Here is how much of TeX's memory you used: - 7739 strings out of 493921 - 110795 string characters out of 3144882 - 208616 words of memory out of 3000000 - 10874 multiletter control sequences out of 15000+200000 + 7740 strings out of 493921 + 111051 string characters out of 3144865 + 208745 words of memory out of 3000000 + 10875 multiletter control sequences out of 15000+200000 27626 words of font info for 64 fonts, out of 3000000 for 9000 841 hyphenation exceptions out of 8191 - 37i,10n,39p,1672b,441s stack positions out of 5000i,500n,10000p,200000b,50000s + 37i,10n,39p,1713b,443s stack positions out of 5000i,500n,10000p,200000b,50000s {C:/Program Files (x86)/MiKTeX 2.9/fonts/enc/dvips/fontname/8r.enc} -Output written on final_paper.pdf (10 pages, 856798 bytes). +Output written on final_paper.pdf (10 pages, 858006 bytes). PDF statistics: - 330 PDF objects out of 1000 (max. 8388607) + 332 PDF objects out of 1000 (max. 8388607) 49 named destinations out of 1000 (max. 500000) 133 words of extra memory for PDF output out of 10000 (max. 10000000) diff --git a/final_paper/final_paper.pdf b/final_paper/final_paper.pdf index 306dcff..08ae3b8 100644 Binary files a/final_paper/final_paper.pdf and b/final_paper/final_paper.pdf differ diff --git a/final_paper/final_paper.synctex.gz b/final_paper/final_paper.synctex.gz index 378a954..0aa545c 100644 Binary files a/final_paper/final_paper.synctex.gz and b/final_paper/final_paper.synctex.gz differ diff --git a/final_paper/final_paper.tex b/final_paper/final_paper.tex index 1452579..797f330 100644 --- a/final_paper/final_paper.tex +++ b/final_paper/final_paper.tex @@ -31,7 +31,7 @@ \maketitle \begin{abstract} -Last year, the defending champion Miami Heat lost consecutive games against the lowly Detroit Pistons and Milwaukee Bucks. While this two game stretch was certainly unlikely, it exemplifies the inherent randomness present in NBA basketball games. In this paper, we examine how well we can predict the outcomes of individual games using various machine learning algorithms. Specifically, we see that we can accurately predict around 70\% of individual games, and that simple, interpretable classifiers such as Logistic Regression and Naive Bayes work the best. We then use these predictions to simulate full NBA seasons, which land us in the top 15 of all publicly available NBA projection systems \cite{projections}. Finally, we see that the recently developed Regularized Adjusted Plus Minus (RAPM) statistic is a better predictor than the more commonly used box score statistics. +Last year, the defending champion Miami Heat lost consecutive games against the lowly Detroit Pistons and Milwaukee Bucks. While this two game stretch was certainly unlikely, it exemplifies the inherent randomness present in NBA basketball games. In this paper, we examine how well we can predict the outcomes of individual games using various machine learning algorithms. Specifically, we see that we can accurately predict around 70\% of individual games, and that simple, interpretable classifiers such as Logistic Regression and Naive Bayes work the best. We then use these predictions to simulate full NBA seasons, which land us in the top 20 of all publicly available NBA projection systems \cite{projections}. Finally, we see that the recently developed Regularized Adjusted Plus Minus (RAPM) statistic is a better predictor than the more commonly used box score statistics. 
\end{abstract} \section{Introduction} @@ -41,7 +41,7 @@ \section{Introduction} \section{Related Works} %%% TALK ABOUT NBA ORACLE AND DATA MINING TO COMPARE PREDICTION ACCURACIES - We looked into the literature to see if anyone had worked on the same or similar problems. From this, we found a few papers, specifically \cite{nba_oracle} and \cite{data_mining}, which also attempted to predict the outcomes of NBA games. These papers used linear and logistic regression, naive Bayes, neural networks, and Support Vector Machine's as their classifiers, and used mainly box score statistics as features. They also used the same loss function (zero one loss) that we are proposing, which gave us a target test error to shoot for when implementing our algorithms. Specifically, \cite{nba_oracle} achieved the highest single season classification rate of 73\% in the 1996 season using linear regression. All of the other season/algorithms combinations had error rates from the mid 60's to low 70's. \\ + We looked into the literature to see if anyone had worked on the same or similar problems. From this, we found a few papers, specifically \cite{nba_oracle} and \cite{data_mining}, which also attempted to predict the outcomes of NBA games. These papers used linear and logistic regression, naive Bayes, neural networks, and support vector machines as their classifiers, and used mainly box score statistics as features. They also used the same loss function (zero-one loss) that we are proposing, which gave us a target accuracy to shoot for when implementing our algorithms. Specifically, \cite{nba_oracle} achieved the highest single season classification rate of 73\% in the 1996 season using linear regression. All of the other season/algorithm combinations had classification rates from the mid 60's to the low 70's. \\ %%% DELVE INTO THE RAPM PAPER TO EXPLAIN WHY IT'S SO GOOD One advantage we believe we have compared with these groups is that we have a more robust feature set; most notably, we have RAPM. RAPM has been anointed by many as the next big thing \cite{bigrpm} in the basketball statistics community, and we hope that using it as a feature will help differentiate our attempts at game classification. Taylor et al. have a great explanation of the statistic \cite{rpm}, but the basic idea is to split each game into miniature games, each one occurring in time periods when there are no substitutions. The final dataset consists of the score margin of each miniature game, and indicator variables for each player involved (+1 for offense, -1 for defense). Then a regression is run (people are now using ridge regression, which is why the R in RAPM stands for Regularized), and the coefficients represent each player's individual contribution per 100 possessions on offense and defense. RAPM adjusts for flaws in the original plus minus by correcting for the fact that a player's totals are heavily influenced by the play of his opponents and teammates, and by pooling information from previous seasons to reduce the margin of error.
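For concreteness, the ridge-regression step described above could be sketched in R along the following lines. This is only an illustrative sketch under assumed names: `stints` is a hypothetical data frame with one row per miniature game, a score-margin column `margin`, and one +1/-1 indicator column per player; it is not part of this repository.

library(glmnet)
# Design matrix of +1 (offense) / -1 (defense) player indicators; response is the stint score margin
X <- as.matrix(stints[, setdiff(names(stints), "margin")])
y <- stints$margin
# alpha = 0 gives ridge regression; cross-validation picks the penalty
fit <- cv.glmnet(X, y, alpha = 0)
rapm <- coef(fit, s = "lambda.min")   # one regularized plus-minus coefficient per player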
@@ -53,14 +53,14 @@ \subsection{Challenges for dataset preparation} \begin{itemize} \item There are currently no ready-made data sources for our experiments. What we can find are several websites with NBA game data. Since we are interested in statistics of thousands of games with hundreds of players, it's not realistic to acquire those data manually from websites. -\item Data sources from different websites have to be merged using a uniformed format. Different websites may treat player names and season labels differently. For example Roger Mason Jr. and Roger Mason are in fact same person from different websites, while uta and utah are different abbreviations for the Utah Jazz. -\item Data needs to be stored in a proper manner so that our future experiments can be implemented based on a uniformed and easily accessible API. +\item Data sources from different websites have to be merged using a unified format. Different websites may treat player names and season labels differently. For example, Roger Mason Jr. and Roger Mason are in fact the same person on different websites, while uta and utah are different abbreviations for the Utah Jazz. +\item Data needs to be stored in a proper manner so that our future experiments can be implemented based on an easily accessible API. \end{itemize} -To meet those challenges, we are writing our own web crawlers to get data from websites, then made lots efforts on ETL on ensure different data sources were properly combined. Finally we stored all data into a SQLite database. -It's worth mentioning that we are making our dataset public online at \\ +To meet those challenges, we wrote our own web crawlers to get data from the websites, then put substantial effort into ETL to ensure the different data sources were properly combined. Finally, we stored all data in a SQLite database, shown in Figure \ref{fig:database}. +We are making our dataset publicly available online at \\ \url{https://github.com/leerichardson/game_simulation/blob/master/nba_rRegression_chi/nba.db}\\ -Hopefully this will save some effort for anyone interested in research on NBA statistics in the coming years. \\ +Hopefully this will save some effort for anyone interested in this type of NBA statistics in the coming years. \\ \begin{figure} \centering @@ -70,9 +70,9 @@ \subsection{Challenges for dataset preparation} \end{figure} \subsection{Data Sources} - We did not have a processed dataset for this project, so we created our own database. The three main sources we used were ESPN's NBA website \cite{espn}, basketball reference \cite{bball_ref}, and a new website from Jeremias Engleman \cite{rpm_data}. We used the ESPN data to get information about all NBA games from 2009-2014. This includes the game score, the home and away teams, the players involved and their individual statistics. Also from ESPN, we have a player database, which has 50 individual statistics for each player in each season. There are 7139 games in this dataset. \\ + The three main sources we used were ESPN's NBA website \cite{espn}, basketball reference \cite{bball_ref}, and a new website from Jeremias Engleman \cite{rpm_data}. We used the ESPN data to get information about all NBA games from 2009-2014. This includes the game score, the home and away teams, the players involved and their individual statistics. Also from ESPN, we have a player database, which has 50 individual statistics for each player in each season. There are 7139 games in this dataset. \\ - The next data source we used was basketball reference \cite{bball_ref}. The main reason we used this site is because they have a larger individual player database, with information dating back to the 1950's and more advanced statistics, such as the widely used Player Efficiency Rating (PER), compared with just box score statistics provided by ESPN. \\ + The next data source we used was basketball reference \cite{bball_ref}. The main reason we used this site is that they have a larger individual player database, with information dating back to the 1950's.
They also use different, more advanced statistics, such as the widely used Player Efficiency Rating (PER), compared with just box score statistics provided by ESPN. \\ The final source we used was from a website put together by Jeremias Engleman \cite{rpm_data}. This site has RAPM statistics described above, dating back to the 1980's. This statistic has been widely adopted in the NBA statistics community, and it's one of the few trustworthy stats which provides an individual assessment of defense. As we see below, RAPM is a very useful feature in predicting game outcomes. \\ @@ -87,11 +87,11 @@ \subsection{Web Crawlers} \section{Experiments} %%% TWOFEATURE MATRICES/.. LARGE AND JUST RAPM/PER \subsection{Training and test datasets} - We devoted a substantial amount of time to construct features that can be used to train the model. The features are constructed using of both the RAPM dataset and the ESPN data (recall that the ESPN data provide basic statistics such as the average number of points per game or the average number of rebounds per game for a given season). To create the features, we considered each match and listed the players on each team, then we merged the players' statistics from the previous season with the results in the matches of the current season. We then add up the players' statistics to form statistics for home teams and away teams. In the computation of the offensive and defensive RAPM statistics of each team, the players' average minutes per game from the previous season were used as weights. To illustrate, suppose that there are $n$ players in each team, with the $i$-th player playing $m_i$ minutes per game and having RAPM score $r_i$ in the previous season. Then, the weight for the $i$-th player and the team offensive RAPM are determined as: + We devoted a substantial amount of time to construct features that can be used to train the model. The features are constructed using all three data sources (recall that the ESPN data provide basic statistics such as the average number of points per game or the average number of rebounds per game for a given season). To create the features, we considered each match and listed the players on each team, then we merged the players' statistics from the previous season with the results in the matches of the current season. We then add up the players' statistics to form statistics for the home team and away teams. In the computation of the offensive and defensive RAPM statistics of each team, the players' average minutes per game from the previous season were used as weights. To illustrate, suppose that there are $n$ players in each team, with the $i$-th player playing $m_i$ minutes per game and having RAPM score $r_i$ in the previous season. Then, the weight for the $i$-th player and the team offensive RAPM are determined as: \begin{align*} w_i &= \frac{m_i}{\frac{1}{n}\sum_{i=1}^{n} m_i} \\ - \text{Team Offensive RAPM} &= \frac{1}{n}\sum_{i=1}^{n} w_i \times r_i. + \text{Team RAPM} &= \frac{1}{n}\sum_{i=1}^{n} w_i \times r_i. \end{align*} %%% TABLE OF EXAMPLE MATRIX %%% @@ -99,7 +99,7 @@ \section{Experiments} \centering \begin{tabular}{rrrrrr} \hline - ORPM\_home & DRPM\_home & ORPM\_away & DRPM\_away & homeWin \\ + Home Offensive RAPM & Home Defensive RAPM & Away Offensive RAPM & Away Defensive RAPM & Home Team Wins \\ \hline -0.28 & 0.89 & 0.65 & 0.18 & 1 \\ -0.28 & 0.89 & 1.15 & 1.05 & 1 \\ @@ -112,48 +112,43 @@ \section{Experiments} \caption{A look at what our training and test datasets look like. 
The first four columns are features and the 5th column (indicating whether or not the home team won the game) is our label.} \label{table:matrix} \end{table} -Before training the model, we tried to understand how the features that we constructed related to the game results. -In Figure \ref{feats}, the left plot has the home team RAPM on the $x$ axis and the away team RAPM on the $y$ axis. The plot suggests that the game results are not linearly separable using RAPM. However, the green dots (corresponding to the losses for the home team) are close to a Gaussian distribution centered above the diagonal line of the plot, while the red dots are centered below the diagonal line. If one takes the diagonal line as the decision boundary, the majority of the data points are correctly classified. The right plot displays the teams' RAPM in wins compared with the teams' RAPM in losses. The horizontal line at the middle of each box corresponds to the sample mean. The top and bottom ends of the boxes are the empirical first and third quartiles, respectively. From the plot, one observes that RAPM is higher in wins than losses, suggesting that RAPM is a relevant feature. + +Before training the model, we tried to understand how the features that we constructed related to the game results using exploratory data analysis. In Figure \ref{feats}, the left plot has the home team RAPM on the $x$ axis and the away team RAPM on the $y$ axis. The plot suggests that the game results are not linearly separable using RAPM. However, the green dots (corresponding to the losses for the home team) are close to a Gaussian distribution centered above the diagonal line of the plot, while the red dots are centered below the diagonal line. If one takes the diagonal line as the decision boundary, the majority of the data points are correctly classified. The right plot displays the teams' RAPM in wins compared with the teams' RAPM in losses. The horizontal line at the middle of each box corresponds to the sample mean. The top and bottom ends of the boxes are the empirical first and third quartiles, respectively. From the plot, one observes that RAPM is higher in wins than losses, suggesting that RAPM is a relevant feature. \begin{figure}[h] \centerline{\epsfig{file=features.pdf, width=7in,height=3.25in}} - \caption{Left: visit team RAPM against home team RAPM; the green dots indicate visit team wins while the red dots indicate home team wins. Right: box plot of RAPM; the $y$ axis represents the difference in RAPM between the home team and the visit team.} + \caption{Left: Visiting team overall weighted RAPM against the same statistic for the home team; the green dots indicate that the visiting team wins while the red dots indicate home team wins. Right: box plot of RAPM; the $y$ axis represents the difference in RAPM between the home team and the visiting team.} \label{feats} \end{figure}
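To make the weighting scheme in this subsection concrete, here is a minimal R sketch of the team-level RAPM feature. It assumes a hypothetical data frame `roster` holding, for one team in one game, each player's previous-season minutes per game (`mpg`) and RAPM (`rapm`); the names are illustrative, not taken from the repository.

# Weighted team RAPM following the formulas above:
# w_i = m_i / mean(m), team RAPM = mean(w_i * r_i)
team_rapm <- function(mpg, rapm) {
  w <- mpg / mean(mpg)
  mean(w * rapm)
}
# e.g., team_rapm(roster$mpg, roster$rapm) gives one team's feature for one game

Applying this to the offensive and defensive RAPM columns of the home and away rosters would yield the four features shown in Table 1.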
- \subsection{Algorithms and feature selection} - We tried different models based on the features that we constructed to predict the game results. For each model, we split our dataset into a training set and a test set. - We trained our model using available data from the previous season and we used the model to predict the game results of the particular years in which we are interested. For example, to predict the game results for the 2010 season, we trained the model using data from seasons 2008 and 2009. %Chi's plot pleas + We tried different algorithms based on the feature matrix we constructed to predict the game results. For each model, we split our dataset into a training set and a test set. + We trained our model using available data from the previous seasons and we used the model to predict the game results in the 2009-2014 seasons. For example, to predict the game results for the 2010 season, we trained the model using data from seasons 2008 and 2009. Figure \ref{diffmodel} displays the resulting test accuracies. \begin{figure}[h] \centerline{\epsfig{file=linear2.PNG, width=6in,height=4in}} - \caption{Test accuracy of each algorithm } + \caption{Test accuracy of the various algorithms classifying the outcomes of NBA games in the 2009-2014 seasons. } \label{diffmodel} \end{figure}
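The season-by-season evaluation described above is what the R session history earlier in this diff carries out by hand for each year. A consolidated sketch of that scheme, using naiveBayes() from the e1071 package as the example classifier; the feature columns 9:18, the label column 19, the game_year variable, and the dplyr filter() calls are taken from the session history, while `data` stands for the assembled game-level data frame:

library(dplyr)
library(e1071)
# Train on every season before the test year, then score accuracy on the test year
accuracy_for_year <- function(data, test_year, first_year = 2008) {
  train <- filter(data, game_year %in% first_year:(test_year - 1))
  test  <- filter(data, game_year == test_year)
  model <- naiveBayes(train[, 9:18], factor(train[, 19]))
  probs <- predict(model, test[, 9:18], type = "raw")
  pred  <- ifelse(probs[, 2] > probs[, 1], 1, 0)
  mean(pred == test[, 19])
}
sapply(2009:2013, function(y) accuracy_for_year(data, y))

The session history restricts the model to two of these columns (xtrain[, c(1, 5)]); the same function applies with any feature subset.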
\subsection{A discussion of the algorithms} - To predict the game results, we used algorithms such as linear regression, logistic regression, naive Bayes, SVM, and decision trees. + To predict the game results, we used algorithms such as linear regression, logistic regression, naive Bayes, support vector machines, and decision trees. \begin{enumerate} \item \textbf{Linear Regression.} In order to fit the linear regression model, the game results are denoted as 0 and 1 (with 1's denoting home team wins). The fitted model is described in terms of a vector of estimated coefficients (one for each feature) which we denote $\hat \beta$. To predict new games results using this model, one evaluates $\hat \beta^{T} X_{\text{test}}$, where $X_{\text{test}}$ corresponds to the vector of features for the new games. We predict that the home team wins if $\hat \beta^{T} X_{\text{test}} \ge 0.5$, and that the home team loses otherwise. \item \textbf{Logistic regression.} Logistic regression is particularly useful when the outcome is binary. As before, we use 0 and 1 to denote wins and losses, and we predict that the home team wins a new game if the predicted value for that game is greater than or equal to $0.5$. Predicted values from a logistic regression are estimates of the probability that the home team wins. Thus, predictions from the logistic regression model are easily interpretable. - \item \textbf{Naive Bayes.} The naive Bayes classifier is a simple probabilistic classifier that selects the hypothesis that maximizes a posterior probability. The underlining assumption for naive Bayes classifier is that all the features are independent. Although in our case the features are not independent, the naive Bayes classifier performs relatively well compared with the other models. - \item \textbf{SVM.} A Support Vector Machine is a non-probabilistic classifier. We tried different kernels, and Figure \ref{diffmodel} displays the highest model accuracy achieved by the SVM among the kernels that we considered. Note that even though different kernels generate different decision boundaries, none allows to exactly classify all of the data points. For this reason, we use soft margin to train the model. + \item \textbf{Naive Bayes.} The naive Bayes classifier is a simple probabilistic classifier that selects the hypothesis that maximizes the posterior probability. The underlying assumption of the naive Bayes classifier is that all the features are conditionally independent. Although in our case the features are not independent, the naive Bayes classifier performs relatively well compared with the other models. + \item \textbf{SVM.} A Support Vector Machine is a non-probabilistic classifier. We tried different kernels, and Figure \ref{diffmodel} displays the highest model accuracy achieved by the SVM among the kernels that we considered. Note that even though different kernels generate different decision boundaries, none allows us to exactly classify all of the data points. For this reason, we use a soft margin to train the model. \item \textbf{Decision tree.} A decision tree is useful when one needs a good way to split a dataset with many features. In our case, there are 44 features overall. In order to avoid overfitting, we choose the subtree that minimized the regularized training error, defined as the sum of the training error and $\lambda |T|$. Here, $|T|$ denotes the size of the tree while $\lambda$ is a penalty parameter which is selected using cross-validation. \end{enumerate} - Figure \ref{diffmodel} displays the test error of different algorithm. The linear model appears to have the best accuracy. Note that, on the basis of the discussion above, the linear decision boundary were expected to perform well. The test accuracy plot confirms with this conclusion.\\ - + Figure \ref{diffmodel} displays the test accuracy of the different algorithms for seasons 2009-2013. The linear model appears to have the best accuracy, which is the same result obtained by \cite{nba_oracle}. Note that, on the basis of the discussion above, the linear decision boundary was expected to perform well. The test accuracy plot confirms our intuition. \\ - - - \subsection{Feature selection} - In this section we discuss feature selection. There are 44 features for each model. Feature selection can prevent overfitting and thus improve prediction accuracy. We discuss feature selection for the linear regression model. For the linear regression model, we perform feature selection by using lasso regularization and a stepwise AIC procedure. In the lasso regression, the effective degrees of freedom $\lambda$ are chosen to minimize the generalized cross validation error. In the stepwise AIC procedure, we set the minimum model to be the model including only the intercept term and the maximum model to be the model including all of the 44 features. At each step of the stepwise AIC procedure, a feature is included or dropped from the current model in such a way to give the highest decrease in the AIC score until no further changes in the AIC score occur. + There are 44 features for each model. Feature selection is used to prevent overfitting and thus improve prediction accuracy. We discuss feature selection in terms of the linear regression model, since it gave us the best test accuracy. For the linear regression model, we perform feature selection by using lasso regularization and a stepwise AIC procedure. In the lasso regression, the effective degrees of freedom $\lambda$ are chosen to minimize the generalized cross validation error. In the stepwise AIC procedure, we set the minimum model to be the model including only the intercept term (which represents the probability that the home team wins) and the maximum model to be the model including all of the 44 features. At each step of the stepwise AIC procedure, a feature is included or dropped from the current model in such a way to give the highest decrease in the AIC score until no further changes in the AIC score occur. \\ \begin{table}[ht] \centering @@ -174,8 +169,11 @@ \section{Experiments} \label{featsel} \end{table} \\ - According to Table \ref{featsel}, stepwise AIC found a model with higher accuracy. The absolute difference in accuracy between the original model model and the selected model are not significant.
However, the best prediction accuracy known is up to 70$\%$, while if one always predicts home team win, the accuracy is around 60$\%$. The improvement is not negligible. -\\ \\ We also note that both the lasso and the stepwise AIC procedure identify the overall team weighted RAPM as the most important predictive feature. This is confirmed by the fact that the stepwise AIC procedure starting with the null model selects home and away RAPM as the first two features that should be included in the model. Moreover, the coefficient of the Weighted RAPM didn't shrink to 0 after the lasso and this is another indication that these feature is more relevant than the others. + + According to Table \ref{featsel}, stepwise AIC found a model with higher accuracy. The absolute difference in accuracy between the original model and the selected model is not significant. However, the best prediction accuracy is known to be around 70$\%$, and when one always predicts a home team win, the accuracy is around 60$\%$. From this, we note that the improvement in predictive accuracy from feature selection is not negligible. \\ + + We also note that both the lasso and the stepwise AIC procedure identify the overall team weighted RAPM as the most important predictive feature. This is confirmed by the fact that the stepwise AIC procedure starting with the null model selects home and away RAPM as the first two features that should be included in the model. Moreover, the coefficient of the Weighted RAPM didn't shrink to 0 after the lasso, whereas all the others did, and this is another indication that this feature is more relevant than the others. +
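The two selection procedures could be sketched in R along the following lines. This is an illustrative sketch only: `train` stands for a hypothetical data frame holding the 44 feature columns plus a 0/1 label column `homeWin` (the column name is assumed, not taken from the repository), and cv.glmnet() uses K-fold cross-validation rather than the generalized cross validation mentioned above.

library(glmnet)
library(MASS)
X <- as.matrix(train[, setdiff(names(train), "homeWin")])
y <- train$homeWin
# Lasso: features whose coefficients are not shrunk to zero are retained
lasso_fit <- cv.glmnet(X, y, alpha = 1)
coef(lasso_fit, s = "lambda.min")
# Stepwise AIC between the intercept-only and full linear models
null_fit <- lm(homeWin ~ 1, data = train)
full_fit <- lm(homeWin ~ ., data = train)
step_fit <- stepAIC(null_fit, scope = list(lower = null_fit, upper = full_fit),
                    direction = "both", trace = FALSE)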
\section{Simulation} \begin{figure} @@ -185,28 +183,26 @@ \section{Simulation} \label{fig:simulations} \end{figure} -In order for the various classifier's to predict which team would win each game, they generate probabilities for each team (class), and predict the maximum. Another natural way to utilize these probabilities is to use them to simulation the entire season. Specifically, by generating Uniform(0,1) random variables, we were able to stochastically determine the outcome of each game, and simulation each game in a season. Then we simulated the season 1000 times to obtain the distribution of predicted win totals for each team. Figure \ref{fig:simulations} shows the win distributions for each team, using logistic regression for the 2013-14 season. Out of the two best classifiers (Naive Bayes and Logistic Regression), logistic regression performed much better in these simulations, using both the Root Mean Squared Error (RMSE) and Median Absolute Deviation (MAE) loss functions. This is because the game probabilities generated by Naive Bayes were much more extreme than logistic regression. While this doesn't effect classification as much since we're only selection the maximum, the exact probabilities matter in a more complicated function such as the result of simulating a complete season. To see how sensitive the simulated seasons are to these probabilities, consider the standard deviations of the number of predicted wins for each team using Naive Bayes and Logistic Regression. For Naive Bayes, they were 16.12 and 17.07, compared with 9.66 and 9.68 for logistic regression. This shows that the win totals predicted by Naive Bayes were much more extreme than logistic regression. \\ +In order for the various classifiers to predict which team would win each game, they generate probabilities for each team (class), and predict the maximum. Another natural way to utilize these probabilities is to use them to simulate the entire season. Specifically, by generating Uniform(0,1) random variables, we were able to stochastically determine the outcome of each game, and simulate each game in a season. Next, we simulated the season 1000 times to obtain the distribution of predicted win totals for each team. Figure \ref{fig:simulations} shows the win distributions for each team, using logistic regression for the 2013-14 season. Out of the two classifiers with easy-to-use probabilities (Naive Bayes and Logistic Regression), logistic regression performed much better in these simulations, using both the Root Mean Squared Error (RMSE) and Mean Absolute Error (MAE) loss functions. This is because the game probabilities generated by Naive Bayes were much more extreme than logistic regression. While this doesn't affect classification as much since we're only predicting the maximum, the exact probabilities matter in a more complicated distribution, such as simulating a complete season. To see how sensitive the simulated seasons are to these probabilities, consider the standard deviations of the number of predicted wins for each team using Naive Bayes and Logistic Regression. For Naive Bayes, they were 16.12 and 17.07, compared with 9.66 and 9.68 for logistic regression. This shows that the win totals predicted by Naive Bayes were much more extreme than logistic regression. \\ -Forecasting how many wins each time will get during a seasons turns out to be a much more popular task than computing test errors, so we were able to compare our projections to lots of other systems using our RMSE loss function (which is commonly used by in the basketball analytics community \cite{projections}). Our RMSE for 2012 was 7.54 and 9.7278 in 2013. This would be in the top 20 for the past two seasons. Had we used Naive Bayes as opposed to Logistic regression, our system would have consistently performed in the bottom five. +Forecasting how many wins each team will get during a season turns out to be a much more popular task than computing test errors, so we were able to compare our projections to lots of other systems using our RMSE loss function (which is commonly used in the basketball analytics community \cite{projections}). Our RMSE was 7.54 for 2012 and 9.7278 for 2013. This would be in the top 20 for the past two seasons. Had we used Naive Bayes as opposed to Logistic regression, our system would have consistently performed as one of the worst.
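The simulation itself is only a few lines of R. A sketch under assumed names: `games` is a hypothetical data frame with one row per scheduled game, columns `home_team`, `away_team`, and a fitted home-win probability `p_home`; `actual_wins` is a vector of true season win totals in the same team order. The season-level RMSE/MAE comparison mirrors the code in the .Rhistory portion of this diff.

# Simulate one season: draw a Uniform(0,1) per game and compare with the fitted probability
simulate_season <- function(games) {
  home_win <- runif(nrow(games)) < games$p_home
  winner <- ifelse(home_win, games$home_team, games$away_team)
  teams <- sort(unique(c(games$home_team, games$away_team)))
  table(factor(winner, levels = teams))   # win total per team, zeros included
}
# Repeat 1000 times and average the win totals for each team
sims  <- replicate(1000, simulate_season(games))
means <- rowMeans(sims)
# Season-level losses, as in the comparison scripts above (actual_wins assumed aligned by team)
rmse <- sqrt(mean((means - actual_wins)^2))
mae  <- mean(abs(means - actual_wins))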
\section {Conclusion} %% General Theory for How to clasify. Why is predicting the outcomes of seasons harder? -In the end, it seems that sticking with a simple classification algorithm yields the best accuracy in forecasting NBA games. It's clear that the two factors have the largest impact on each game are home court advantage and the quality of each team. Imagine that there's some perfect measure of the quality of each team, and that two teams are identical. In this situation, you should always pick the home team to win. The question then becomes, how much better does the away team have to be than the home team in order to choose them to win? Figuring out where to draw this line seems like the way to get the best performance out of classification algorithms. Obviously, since there's no way to know the exact quality of each team, this will be an estimate and is not an exact science. \\ +In the end, it seems that sticking with a simple classification rule using the best features yields the best accuracy in forecasting NBA games. It's clear that the two factors that have the largest impact on each game are home court advantage and the quality of each team. Imagine that there's some perfect measure of the quality of each team, and that two teams are identical. In this situation, you should always pick the home team to win. The question then becomes, how much better does the away team have to be in order to choose them to win? Figuring out where to draw this line seems like the way to get the best performance out of classification algorithms. Obviously, since there's no way to know the exact quality of each team, this is estimated from data and is not an exact science. \\ %% Discuss why other systems bear us. (Projections, more developed, more nuances built in, etc...) Hollinger won, had been doing it the longers, mainly with PER -In terms of simulating seasons, it seems that more conservative estimates, such as those yielded by logistic regression, work better than more extreme predictions. For this endeavor, there were more public predictions available to compare our results with, and we see that while our system performs well, it did not approach some of the best publicly available systems. For instance, John Hollinger, who was recently hired by the Memphis Grizzlies, had the best projection system in 2012. He has been doing this for around 5 years, and written various books on the subject, so it makes sense his system would have better performance than ours, just based on experience. As we discuss below, there's many nuances that could be built into a projection system to make it more accurate, and since we only spent a semester on this project, there wasn't enough time to incorporate everything we hoped to include. \\ - +In terms of simulating seasons, it seems that more conservative estimates, such as those yielded by logistic regression, work better than more extreme predictions. For this endeavor, there were more public predictions available to compare our results with, and we see that while our system performs well, it did not approach some of the best publicly available systems. For instance, John Hollinger, who was recently hired by the Memphis Grizzlies, had the best projection system in 2012. He has been doing this for around 5 years, and has written various books on the subject, so it makes sense his system would have better performance than ours, just based on experience. As we discuss below, there are many nuances that could be built into a projection system to make it marginally more accurate, and since we only spent a semester on this project, there wasn't enough time to incorporate everything we hoped to include. \\ %% Why was 2013 so much less predictable than other years. Especially since we had the MOST training data. This gives some evidence to the theory that randomness may impact the results of these games more than a fancy classifier. sometimes the underdog will just win more. -One interesting finding that came out of our report was how much lower our test error was in the final season, 2013. This is counter to what one would expect in a typical machine learning situation, since we had the most amount of training data for this season. However, we think the reason for this is that there was just more variability in the 2013 compared to other seasons.
This shows that no matter what algorithm/features you use, upsets happen in the NBA, and some seasons have more unexpected results than others. Other systems also showed much lower accuracy in 2013 compared with 2012 \cite{projections}. \\ %% Salary cap? Player Movement? - +One interesting finding that came out of our report was how much lower our test accuracy was in the final season, 2013. This is counter to what one would expect in a typical machine learning situation, since we had the most training data for this season. However, we think the reason for this is that there was just more variability in the 2013 season compared to other seasons. This shows that no matter what algorithm/features you use, upsets happen in the NBA, and some seasons have more unexpected results than others. Other systems also showed much lower accuracy in 2013 compared with 2012 \cite{projections}. \\ %% RAPM- Why did it do so much better? More information?? What does this say about SportsVU data's potential impact as a predictor? Another interesting result was how much more predictive the RAPM statistic was than box score statistics. We think the main reason for this is simply that it contains more information. RAPM uses possession level events to construct offensive and defensive ratings for each player, and all box score statistics can be computed with the same possession level data that calculates RAPM. What does this mean for the future of statistics in the NBA? We think it indicates that statistics like RAPM might soon become less state of the art, as even higher resolution camera data is now being collected by a company called STATS LLC \cite{sportsvu}. The data from these cameras tracks the location of each player 10 times each second, and is sure to produce some novel insights into the game of basketball. \\ %% Future developments: Using current season's data, Predicting the Spread, and building a projection of the RAPM feature as opposed to just last year (more than one year, what to do with rookies? Etc..) -There are many ways to improve upon the results displayed in this paper. One of the main flaws in our system is that we only used individual player data from the previous season in order to predict the current season. This means that if a player misses the previous season due to injury (IE: Derick Rose, Rajon Rondo, etc...), then a significant aspect of a team may be underrated by our systems. We are also treating rookies as league average players, which is a very favorable assumption. A couple ways we could combat this is by using projected season statistics as features to predict the current season. Some of the most accurate RAPM based systems used in 2013 used this strategy \cite{projections}. That way, we could account for things like player age, injury history, and expected changes due to team composition. We could also consider not just the previous season, but a weighted average of the past 3-5 seasons to get an estimate for the quality of each player. \\ +There are many ways to improve upon the results displayed in this paper. One of the main flaws in our system is that we only used individual player data from the previous season in order to predict the current season. This means that if a player misses the previous season due to injury (e.g., Derrick Rose, Rajon Rondo), then a significant aspect of a team may be left out of our systems.
We are also treating rookies as league average players, which is a very favorable assumption, since most rookies are worse than the league average in their first season. One way we could combat this is by using projected season statistics as features to predict the current season. Some of the most accurate RAPM based systems used in 2013 used this strategy \cite{projections}. That way, we could account for things like player age, injury history, and expected changes due to team composition. We could also consider not just the previous season, but a weighted average of the past 3-5 seasons to get an estimate for the quality of each player. \\ We also wanted to include data from the current season into our projections. To do this, we could divide each season into K different chunks, and use the team's winning percentage in all chunks of the season before each game to predict game outcomes. As seen in Figure \ref{fig:simulations}, some of the estimates which represented how good we thought each team was before the season were far off. Incorporating current season data into the predictions would allow us to adjust when it becomes clear a team is much better than expected. We can update our valuation of how good each team is as we get more information on them throughout the season. diff --git a/final_paper/final_paper/final_paper.log b/final_paper/final_paper/final_paper.log index 6746edf..efd9be5 100644 --- a/final_paper/final_paper/final_paper.log +++ b/final_paper/final_paper/final_paper.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.1415926-2.5-1.40.14 (MiKTeX 2.9) (preloaded format=pdflatex 2014.8.5) 8 DEC 2014 21:37 +This is pdfTeX, Version 3.1415926-2.5-1.40.14 (MiKTeX 2.9) (preloaded format=pdflatex 2014.8.5) 12 DEC 2014 10:53 entering extended mode **C:/Users/leeri_000/basketball_stats/game_simulation/final_paper/final_paper/f inal_paper.tex @@ -473,11 +473,22 @@ Underfull \hbox (badness 10000) in paragraph at lines 61--65 [] - -File: sqliteERDiagram.png Graphic file (type png) - -Package pdftex.def Info: sqliteERDiagram.png used on input line 68. -(pdftex.def) Requested size: 426.78752pt x 341.44263pt. + +LaTeX Warning: File `sqliteERDiagram.png' not found on input line 68. + + +! Package pdftex.def Error: File `sqliteERDiagram.png' not found. + +See the pdftex.def package documentation for explanation. +Type H for immediate help. + ... + +l.68 ...=150mm, height=120mm]{sqliteERDiagram.png} + +Using draft setting for this image. +Try typing to proceed. +If that doesn't work, type X to quit. + Underfull \hbox (badness 10000) in paragraph at lines 74--75 @@ -503,13 +514,22 @@ Underfull \hbox (badness 10000) in paragraph at lines 86--87 [] -[2] [3 ] - -File: features.pdf Graphic file (type pdf) - -Package pdftex.def Info: features.pdf used on input line 120. -(pdftex.def) Requested size: 505.883pt x 234.88132pt. +[2] [3] + +LaTeX Warning: File `features.pdf' not found on input line 120. + + +! Package pdftex.def Error: File `features.pdf' not found. + +See the pdftex.def package documentation for explanation. +Type H for immediate help. + ... + +l.120 ...e=features.pdf, width=7in,height=3.25in}} + +Using draft setting for this image. +Try typing to proceed. +If that doesn't work, type X to quit. LaTeX Warning: `h' float specifier changed to `ht'. @@ -520,11 +540,23 @@ Underfull \hbox (badness 10000) in paragraph at lines 127--128 [] -[4] -File: algorithms.png Graphic file (type png) - -Package pdftex.def Info: algorithms.png used on input line 130.
-(pdftex.def) Requested size: 397.48375pt x 252.94914pt. +[4] + +LaTeX Warning: File `algorithms.png' not found on input line 130. + + +! Package pdftex.def Error: File `algorithms.png' not found. + +See the pdftex.def package documentation for explanation. +Type H for immediate help. + ... + +l.130 ...gorithms.png, width=5.5in,height=3.50in}} + +Using draft setting for this image. +Try typing to proceed. +If that doesn't work, type X to quit. + Underfull \hbox (badness 10000) in paragraph at lines 135--136 @@ -535,21 +567,29 @@ Underfull \hbox (badness 10000) in paragraph at lines 159--160 [] -[5 ] -File: season_wins.png Graphic file (type png) +[5] + +LaTeX Warning: File `season_wins.png' not found on input line 167. + + +! Package pdftex.def Error: File `season_wins.png' not found. + +See the pdftex.def package documentation for explanation. +Type H for immediate help. + ... + +l.167 ...height=15cm, width=15cm]{season_wins.png} + +Using draft setting for this image. +Try typing to proceed. +If that doesn't work, type X to quit. - -Package pdftex.def Info: season_wins.png used on input line 167. -(pdftex.def) Requested size: 426.79028pt x 426.7961pt. Underfull \hbox (badness 10000) in paragraph at lines 172--173 [] -[6 ] [7 ] +[6] [7] Underfull \hbox (badness 10000) in paragraph at lines 179--180 [] @@ -606,10 +646,10 @@ LaTeX Warning: There were multiply-defined labels. Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 227. ) Here is how much of TeX's memory you used: - 7724 strings out of 493921 - 111063 string characters out of 3144865 + 7708 strings out of 493921 + 110731 string characters out of 3144865 208392 words of memory out of 3000000 - 10869 multiletter control sequences out of 15000+200000 + 10853 multiletter control sequences out of 15000+200000 24922 words of font info for 61 fonts, out of 3000000 for 9000 841 hyphenation exceptions out of 8191 37i,10n,39p,1706b,333s stack positions out of 5000i,500n,10000p,200000b,50000s @@ -624,9 +664,9 @@ KTeX 2.9/fonts/type1/urw/courier/ucrr8a.pfb> -Output written on final_paper.pdf (9 pages, 688655 bytes). +Output written on final_paper.pdf (9 pages, 146425 bytes). PDF statistics: - 315 PDF objects out of 1000 (max. 8388607) + 194 PDF objects out of 1000 (max. 8388607) 42 named destinations out of 1000 (max. 500000) - 125 words of extra memory for PDF output out of 10000 (max. 10000000) + 105 words of extra memory for PDF output out of 10000 (max. 10000000) diff --git a/final_paper/final_paper/final_paper.pdf b/final_paper/final_paper/final_paper.pdf index 297101a..a73d71c 100644 Binary files a/final_paper/final_paper/final_paper.pdf and b/final_paper/final_paper/final_paper.pdf differ diff --git a/final_paper/final_paper/final_paper.synctex.gz b/final_paper/final_paper/final_paper.synctex.gz index 1daea86..e7c8af3 100644 Binary files a/final_paper/final_paper/final_paper.synctex.gz and b/final_paper/final_paper/final_paper.synctex.gz differ