Skip to content

Commit

Permalink
obtained the new 2013 RAPM numbers for 2014 projections
Browse files Browse the repository at this point in the history
  • Loading branch information
leerichardson committed Nov 28, 2014
1 parent 0dd6c5a commit 49b98aa
Show file tree
Hide file tree
Showing 8 changed files with 1,031 additions and 529 deletions.
Binary file modified .RData
Binary file not shown.
1,018 changes: 509 additions & 509 deletions .Rhistory

Large diffs are not rendered by default.

469 changes: 469 additions & 0 deletions data/stats_nba_appspot/data_2013.csv

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions data/stats_nba_appspot/rapm_2014.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Download the player ratings table from stats-for-the-nba.appspot.com and
## save it as data_2013.csv in the stats_nba_appspot data directory.
## NOTE(review): the URL points at the 2014 ratings page while the object and
## the output file are labelled 2013 -- presumably the 2013 ratings used for
## 2014 projections; confirm.

## Load the XML Library
library(XML)

## Output directory. The file is written through file.path() rather than by
## changing the working directory, so running this script does not move the
## session to another folder.
out_dir <- "C:/Users/Lee/game_simulation/data/stats_nba_appspot"

## Set the url
url <- "http://stats-for-the-nba.appspot.com/ratings/2014.html"

## Parse every <table> on the page and keep the first one. Stop with a clear
## message when the page holds no table instead of failing on `[[1]]`.
tables <- readHTMLTable(url)
if (length(tables) == 0L) {
  stop("No HTML table found at ", url, call. = FALSE)
}
rpm_2013 <- tables[[1]]

## Save the data
write.csv(rpm_2013, file.path(out_dir, "data_2013.csv"))


9 changes: 5 additions & 4 deletions scripts/create_rapm_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ game_train <- dbGetQuery(con, 'SELECT gameScore.match_id, gameScore.gameDate, ga
visit_team_score + home + playerId + playerName, FUN=c(mean), data=game_train)

### REPLACE MISSING RATINGS WITH DEFAULT VALUES ###
# Missing PER becomes 15 and missing RPM / DRPM / ORPM become 0. Each fill is
# restricted to its own columns: a blanket `game_train[is.na(game_train)] <- 0`
# ahead of these lines would zero the missing PER values first, so the 15
# would never be applied. Columns not named here are left untouched.
per_cols <- c("PER.mean")
rpm_cols <- c("RPM.mean", "DRPM.mean", "ORPM.mean")
game_train[per_cols][is.na(game_train[per_cols])] <- 15
game_train[rpm_cols][is.na(game_train[rpm_cols])] <- 0

### CONSTRUCT A WEIGHTED AVERAGE OF OFFENSIVE AND DEFENSIVE RPM FOR BOTH TEAMS ###
# detach(package:dplyr)
Expand Down Expand Up @@ -124,10 +125,10 @@ full_game_summary <- dbGetQuery(con, 'SELECT gameScore.match_id, gameScore.gameD
visit_team_score + home + playerId + playerName, FUN=c(mean), data=gt)

### ASSIGN MISSING RPM VALUES TO 0 ###
# Only the three RPM columns are filled here. Filling every NA in game_train
# with 0 at this point would also zero the missing PER values, and the PER
# default below would then never be applied.
rpm_cols <- c("RPM.mean", "DRPM.mean", "ORPM.mean")
game_train[rpm_cols][is.na(game_train[rpm_cols])] <- 0

## SET MISSING PER's TO 15's
per_cols <- c("PER.mean")
game_train[per_cols][is.na(game_train[per_cols])] <- 15

### CONSTRUCT A WEIGHTED AVERAGE OF OFFENSIVE AND DEFENSIVE RPM FOR BOTH TEAMS ###
total_mins <- ddply(game_train, .(home), summarise, total_mins = sum(avg_MIN.mean))
Expand Down
26 changes: 23 additions & 3 deletions scripts/fits_using_rpm.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

## Set up datasets ##
# Hold out the most recent season in `years` (2013) as the test set and train
# on every earlier season (2008-2012).
years <- c(2008, 2009, 2010, 2011, 2012, 2013)
test_year <- max(years)
train <- filter(data, game_year %in% setdiff(years, test_year))
test <- filter(data, game_year == test_year)

# Predictors are columns 9-17 of the test set and the outcome is column 18
# (presumably homeWin -- confirm against the column order of `data`).
# `[[` returns a plain vector whether `test` is a data.frame or a tibble, so
# length(ytest) is always the number of test games.
xtest <- test[, 9:17]
ytest <- test[[18]]
Expand All @@ -33,12 +33,32 @@
accuracy

## Logistic Regression
# Binomial GLM of homeWin on the .0 and .1 weighted RPM / ORPM / DRPM / PER
# columns plus the home term, fitted on the training seasons.
mylogit <- glm(
  homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 +
    RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home,
  data = train,
  family = "binomial"
)

# Column 1 holds the predicted probability, column 2 the class obtained with
# a 0.5 cutoff, column 3 the observed outcome. `result` is the absolute gap
# between class and outcome (0 for a hit, 1 for a miss when ytest is coded
# 0/1 -- assumed, confirm).
logit_preds <- as.data.frame(
  predict(mylogit, newdata = xtest, type = "response")
)
logit_preds$class <- as.numeric(logit_preds[[1]] >= 0.5)
logit_preds <- cbind(logit_preds, ytest)
logit_preds$result <- abs(logit_preds[[2]] - logit_preds[[3]])

# One minus the miss rate over the test games
logit_accurary <- 1 - sum(logit_preds$result) / length(ytest)
logit_accurary

## Linear Regression
# Ordinary least squares fit of homeWin on the same predictors as the
# logistic model; the fitted value is thresholded at 0.5 to get a class.
mylinear <- lm(
  homeWin ~ RPM_weight.0 + ORPM_weight.0 + DRPM_weight.0 + PER_weight.0 +
    RPM_weight.1 + ORPM_weight.1 + DRPM_weight.1 + PER_weight.1 + home,
  data = train
)

# Column 1 holds the fitted value, column 2 the thresholded class, column 3
# the observed outcome; `result` is the absolute gap between class and outcome.
linear_preds <- as.data.frame(
  predict(mylinear, newdata = xtest, type = "response")
)
linear_preds$class <- as.numeric(linear_preds[[1]] >= 0.5)
linear_preds <- cbind(linear_preds, ytest)
linear_preds$result <- abs(linear_preds[[2]] - linear_preds[[3]])

# One minus the miss rate over the test games
linear_accurary <- 1 - sum(linear_preds$result) / length(ytest)
linear_accurary

## Support Vector Machine

## Non parametric regression

## Non parametric regression




Expand Down
7 changes: 4 additions & 3 deletions scripts/simulate_season.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,7 @@ for(i in 1:1000){

## Loop through each match and probability of a season
for(match in match_ids){
## Print the match ID
print(match)


# Generate uniform random number
res <- runif(1)

Expand All @@ -78,5 +76,8 @@ for(i in 1:1000){
}
}

### Check out the results
# Row-wise summaries of season_df: `means` is the mean of each row and `ses`
# is the standard deviation of each row (computed with sd()).
means <- apply(season_df, MARGIN = 1, FUN = mean)
ses <- apply(season_df, MARGIN = 1, FUN = sd)


17 changes: 7 additions & 10 deletions to_do.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,12 @@

- Teams that show up as NA (Seattle Sonics, Vancouver Grizzlies, New Jersey Nets, New Orleans Hornets, more to find)

- Check game_year in dates where the date is after January 1 of the season to avoid confusion

- Decide what to do when a player shows up twice in one year. Average relevant columns?

- Large dataset has season totals AND season averages.

- Also could be using the wrong players for each team

- Mention how good home does as a feature, show that we need to improve on the 60 % baseline
(Could be a good plot for the poster)
- Mention how good home does as a feature, show that we need to improve on the 60 % baseline (Could be a good plot for the poster)

- Could test best (tell all) metrics on their predictive accuracy.

- Look into Brooklyn (0 wins?, maybe the year they moved)


------- MIDTERM REVIEW NOTES ----------------

Expand All @@ -28,7 +21,11 @@ RPI section- Talk about how we need to use it to give some signals about how goo


----- FUTURE GOALS -------------

- Scrape play by play data and calculate our own RAPM statistic

- Mention "All in One" Statistic predictive accuracy

- Mention home team wins 60 % of games, maybe get a table of JUST home teams

- How to simulate games without knowing the players who played beforehand ???

0 comments on commit 49b98aa

Please sign in to comment.