obtained the new 2013 RAPM numbers for 2014 projections

LavinLeo · Nov 28, 2014 · 49b98aa · 49b98aa
1 parent 0dd6c5a
commit 49b98aa
Show file tree

Hide file tree

Showing 8 changed files with 1,031 additions and 529 deletions.
diff --git a/.RData b/.RData
diff --git a/.Rhistory b/.Rhistory
diff --git a/data/stats_nba_appspot/data_2013.csv b/data/stats_nba_appspot/data_2013.csv
diff --git a/data/stats_nba_appspot/rapm_2014.r b/data/stats_nba_appspot/rapm_2014.r
@@ -0,0 +1,14 @@
+## Set working directory (hard-coded, machine-specific path; the CSV below is written relative to it)
+setwd("C:/Users/Lee/game_simulation/data/stats_nba_appspot")
+
+## Load the XML library (used here for readHTMLTable)
+library(XML)
+
+## Set the URL of the ratings page and read the first HTML table found on it
+url <- "http://stats-for-the-nba.appspot.com/ratings/2014.html"
+rpm_2013 <- readHTMLTable(url[1])[[1]]  # NOTE(review): page is 2014.html but object/file are named 2013 -- presumably 2013 ratings used for 2014 projections; confirm
+
+## Save the scraped table as a CSV in the working directory
+write.csv(rpm_2013, "data_2013.csv")
+
+
diff --git a/scripts/create_rapm_dataset.R b/scripts/create_rapm_dataset.R
@@ -36,7 +36,8 @@ game_train <- dbGetQuery(con, 'SELECT gameScore.match_id, gameScore.gameDate, ga
       visit_team_score + home + playerId + playerName, FUN=c(mean), data=game_train)
 
 ### REPLACE MISSING (NA) VALUES: PER -> 15, RPM/ORPM/DRPM -> 0 ####
-  game_train[is.na(game_train)] <- 0
+  game_train[c("PER.mean")][is.na(game_train[c("PER.mean")])] <- 15
+  game_train[c("RPM.mean", "DRPM.mean", "ORPM.mean")][is.na(game_train[c("RPM.mean", "DRPM.mean", "ORPM.mean")])] <- 0  
 
 ### CONSTRUCT A WEIGHTED AVERAGE OF OFFENSIVE AND DEFENSIVE RPM FOR BOTH TEAMS ###
 #   detach(package:dplyr)    
@@ -124,10 +125,10 @@ full_game_summary <- dbGetQuery(con, 'SELECT gameScore.match_id, gameScore.gameD
                                visit_team_score + home + playerId + playerName, FUN=c(mean), data=gt)
 
      ### ASSIGN MISSING (NA) VALUES TO 0 FOR RPM ####
-     game_train[is.na(game_train)] <- 0
+     game_train[c("RPM.mean", "DRPM.mean", "ORPM.mean")][is.na(game_train[c("RPM.mean", "DRPM.mean", "ORPM.mean")])] <- 0  
 
-     ## SET 0 PER's TO 15's 
-
+     ## SET MISSING PER's TO 15's 
+     game_train[c("PER.mean")][is.na(game_train[c("PER.mean")])] <- 15
 
      ### CONSTRUCT A WEIGHTED AVERAGE OF OFFENSIVE AND DEFENSIVE RPM FOR BOTH TEAMS ###
      total_mins <- ddply(game_train, .(home), summarise, total_mins = sum(avg_MIN.mean))

diff --git a/scripts/fits_using_rpm.R b/scripts/fits_using_rpm.R
@@ -15,8 +15,8 @@
 
 ## Set up datasets ##   
   years <- c(2008, 2009, 2010, 2011, 2012, 2013)
-  train = filter(data, game_year %in% c(2008))
-  test = filter(data, game_year == 2009)
+  train = filter(data, game_year %in% c(2008, 2009, 2010, 2011, 2012))
+  test = filter(data, game_year == 2013)
 
   xtest = test[,9:17]
   ytest = test[,18]
@@ -33,12 +33,32 @@
   accuracy
 
 ## Logistic Regression  
+  mylogit <- glm(homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 + 
+                   RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home, data=train,
+                 family = "binomial")  # logistic regression of homeWin on the weighted rating features, fit on train
+
+  logit_preds <- as.data.frame(predict(mylogit, newdata=xtest, type="response"))  # predicted probabilities for the test rows
+  logit_preds$class <- ifelse(logit_preds[,1] >= .5, 1, 0)  # threshold the probability at 0.5 to get a 0/1 class
+  logit_preds <- cbind(logit_preds, ytest)  # append the observed outcome as column 3
+  logit_preds$result <- abs(logit_preds[,2] - logit_preds[,3])  # 1 where class and outcome differ -- assumes ytest is coded 0/1; TODO confirm
+  logit_accurary <- 1 - sum(logit_preds$result)/length(ytest)  # accuracy = 1 - misclassification rate (NOTE(review): name is misspelled, "accurary")
+  logit_accurary
 
 ## Linear Regression
+  mylinear <- lm(homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 + 
+                   RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home, data=train)  # linear model with the same formula as the logistic fit
+  linear_preds <- as.data.frame(predict(mylinear, newdata=xtest, type="response"))  # fitted values for the test rows
+  linear_preds$class <- ifelse(linear_preds[,1] >= .5, 1, 0)  # threshold the fitted value at 0.5 to get a 0/1 class
+  linear_preds <- cbind(linear_preds, ytest)  # append the observed outcome as column 3
+  linear_preds$result <- abs(linear_preds[,2] - linear_preds[,3])  # 1 where class and outcome differ -- assumes ytest is coded 0/1; TODO confirm
+  linear_accurary <- 1 - sum(linear_preds$result)/length(ytest)  # accuracy = 1 - misclassification rate (NOTE(review): name is misspelled, "accurary")
+  linear_accurary
 
 ## Support Vector Machine
 
-## Non parametric regression 
+
+## Non parametric regression
+
 
 
 

diff --git a/scripts/simulate_season.R b/scripts/simulate_season.R
@@ -56,9 +56,7 @@ for(i in 1:1000){
 
   ## Loop through each match and probability of a season
   for(match in match_ids){
-    ## Print the match ID
-    print(match)
-
+
     # Generate uniform random number
     res <- runif(1)
 
@@ -78,5 +76,8 @@ for(i in 1:1000){
   }
 }
 
+### Summarise the simulation results held in season_df
+  means <- apply(season_df, 1, mean)  # mean of each row of season_df (presumably one row per team across simulated seasons -- confirm)
+  ses <- apply(season_df, 1, sd)  # standard deviation of each row; NOTE(review): named "ses" but this is sd, not a standard error
 
 
diff --git a/to_do.txt b/to_do.txt
@@ -4,19 +4,12 @@
 
 - Teams that show up as NA (Seattle Sonics, Vancouver Grizzlies, New Jersey Nets, New Orleans Hornets, more to find)
 
-- Check game_year in dates where the date is after January 1 of the season to avoid confusion
-
-- Decide what to do when a player shows up twice in one year. Average relevant columns?
-
-- Large dataset has season totals AND season averages. 
-
-- Also could be using the wrong players for each team 
-
-- Mention how good home does as a feature, show that we need to improve on the 60 % baseline 
-	(Could be a good plot for the poster)
+- Mention how well `home` does as a feature; show that we need to improve on the 60 % baseline (could be a good plot for the poster)
 
 - Could test best (tell all) metrics on their predictive accuracy. 
 
+- Look into Brooklyn (0 wins? Maybe the year they moved)
+
 
 ------- MIDTERM REVIEW NOTES ----------------
 
@@ -28,7 +21,11 @@ RPI section- Talk about how we need to use it to give some signals about how goo
 
 
 ----- FUTURE GOALS -------------
+
 - Scrape play by play data and calculate our own RAPM statistic
+
 - Mention "All in One" Statistic predictive accuracy 
+
 - Mention home team wins 60 % of games, maybe get a table of JUST home teams 
+
 - How to simulate games without knowing beforehand which players will play?