Skip to content

Commit

Permalink
Merge branch 'master' of github.com:johnmyleswhite/ML_for_Hackers
Browse files Browse the repository at this point in the history
  • Loading branch information
drewconway committed Sep 21, 2012
2 parents 9c21fba + 298fd39 commit b58488c
Show file tree
Hide file tree
Showing 13 changed files with 74 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
.dropbox
code_check*
nohup.out
+*.pdf
4 changes: 2 additions & 2 deletions 01-Introduction/data/ufo/16154.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ infochimps_schema:
title: Infochimps Simple Schema (ICSS)
version: 0.1.1
generated_at: Wed Oct 20 14:34:20 +0000 2010
url: "http://www.infochimps.com/datasets/60000-documented-ufo-sightings-with-text-descriptions-and-metada.yaml"
url: "http://infochimps.com/datasets/d60000-documented-ufo-sightings-with-text-descriptions-and-metad.yaml"
description: >-
This is an Infochimps Simple Schema (ICSS).
Expand All @@ -24,7 +24,7 @@ dataset:
title: 60,000+ Documented UFO Sightings With Text Descriptions And Metadata
subtitle: ~
main_link: ~
url: "http://www.infochimps.com/datasets/60000-documented-ufo-sightings-with-text-descriptions-and-metada"
url: "http://infochimps.com/datasets/d60000-documented-ufo-sightings-with-text-descriptions-and-metad"
created_at: "Wed Oct 20 04:49:06 UTC 2010"
updated_at: "Wed Oct 20 04:50:01 UTC 2010"
tags: [ alien,aliens,awesome,database,ET,extra-terrestrial,flying,ship,sighting,space,tsv,ufo,unidentified-flying-objec ]
Expand Down
4 changes: 2 additions & 2 deletions 02-Exploration/chapter02.R
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ ggplot(heights.weights[1:2000, ], aes(x = Height, y = Weight)) +
# Visualize how gender depends on height and weight.
ggplot(heights.weights, aes(x = Height, y = Weight)) +
geom_point(aes(color = Gender, alpha = 0.25)) +
scale_alpha(legend = FALSE) +
scale_alpha(guide = "none") +
scale_color_manual(values = c("Male" = "black", "Female" = "gray")) +
theme_bw()

Expand All @@ -393,7 +393,7 @@ logit.model <- glm(Male ~ Weight + Height,

ggplot(heights.weights, aes(x = Height, y = Weight)) +
geom_point(aes(color = Gender, alpha = 0.25)) +
scale_alpha(legend = FALSE) +
scale_alpha(guide = "none") +
scale_color_manual(values = c("Male" = "black", "Female" = "gray")) +
theme_bw() +
stat_abline(intercept = -coef(logit.model)[1] / coef(logit.model)[2],
Expand Down
10 changes: 4 additions & 6 deletions 03-Classification/email_classify.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ val <- data.frame(cbind(x, rbind(y1, y2, y3)),
ex1 <- ggplot(val, aes(x, V2)) +
geom_jitter(aes(shape = as.factor(V3)),
position = position_jitter(height = 2)) +
scale_shape_discrete(legend = FALSE, solid = FALSE) +
scale_shape_discrete(guide = "none", solid = FALSE) +
geom_hline(aes(yintercept = c(10,30), linetype = 2)) +
theme_bw() +
xlab("X") +
Expand Down Expand Up @@ -302,20 +302,18 @@ class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)

# Create final plot of results
class.plot <- ggplot(class.df, aes(x = Pr.HAM, Pr.SPAM)) +
class.plot <- ggplot(class.df, aes(x = log(Pr.HAM), log(Pr.SPAM))) +
geom_point(aes(shape = Type, alpha = 0.5)) +
stat_abline(yintercept = 0, slope = 1) +
scale_x_log10() +
scale_y_log10() +
scale_shape_manual(values = c("EASYHAM" = 1,
"HARDHAM" = 2,
"SPAM" = 3),
name = "Email Type") +
scale_alpha(legend = FALSE) +
scale_alpha(guide = "none") +
xlab("log[Pr(HAM)]") +
ylab("log[Pr(SPAM)]") +
theme_bw() +
opts(axis.text.x = theme_blank(), axis.text.y = theme_blank())
theme(axis.text.x = element_blank(), axis.text.y = element_blank())
ggsave(plot = class.plot,
filename = file.path("images", "03_final_classification.pdf"),
height = 10,
Expand Down
15 changes: 8 additions & 7 deletions 04-Ranking/priority_inbox.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# the simple word counts used in Chapter 3.
# Data Used: Email messages contained in ../../03-Classification/code/data/
# source: http://spamassassin.apache.org/publiccorpus/
# Packages Used: tm, ggplot2
# Packages Used: tm, ggplot2, plyr

# All source code is copyright (c) 2012, under the Simplified BSD License.
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
Expand All @@ -24,6 +24,7 @@
# Load libraries
library('tm')
library('ggplot2')
library('plyr')

# Set the global paths
data.path <- file.path("..", "03-Classification", "data")
Expand Down Expand Up @@ -161,12 +162,12 @@ from.scales <- ggplot(from.ex) +
color = "darkblue")) +
scale_x_continuous(breaks = 1:nrow(from.ex), labels = from.ex$From.EMail) +
coord_flip() +
scale_fill_manual(values = c("lightgrey" = "lightgrey"), legend = FALSE) +
scale_color_manual(values = c("darkblue" = "darkblue"), legend = FALSE) +
scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
ylab("Number of Emails Received (truncated at 6)") +
xlab("Sender Address") +
theme_bw() +
opts(axis.text.y = theme_text(size = 5, hjust = 1))
theme(axis.text.y = element_text(size = 5, hjust = 1))
ggsave(plot = from.scales,
filename = file.path("images", "0011_from_scales.pdf"),
height = 4.8,
Expand All @@ -188,7 +189,7 @@ from.rescaled <- ggplot(from.weight, aes(x = 1:nrow(from.weight))) +
xlab("") +
ylab("Number of emails Receieved") +
theme_bw() +
opts(axis.text.y = theme_blank(), axis.text.x = theme_blank())
theme(axis.text.y = element_blank(), axis.text.x = element_blank())
ggsave(plot = from.rescaled,
filename = file.path("images", "0012_from_rescaled.pdf"),
height = 4.8,
Expand Down Expand Up @@ -423,7 +424,7 @@ priority.threshold <- median(train.ranks.df$Rank)
threshold.plot <- ggplot(train.ranks.df, aes(x = Rank)) +
stat_density(aes(fill="darkred")) +
geom_vline(xintercept = priority.threshold, linetype = 2) +
scale_fill_manual(values = c("darkred" = "darkred"), legend = FALSE) +
scale_fill_manual(values = c("darkred" = "darkred"), guide = "none") +
theme_bw()
ggsave(plot = threshold.plot,
filename = file.path("images", "01_threshold_plot.pdf"),
Expand Down Expand Up @@ -455,7 +456,7 @@ testing.plot <- ggplot(subset(final.df, Type == "TRAINING"), aes(x = Rank)) +
stat_density(data = subset(final.df, Type == "TESTING"),
aes(fill = Type, alpha = 0.65)) +
geom_vline(xintercept = priority.threshold, linetype = 2) +
scale_alpha(legend = FALSE) +
scale_alpha(guide = "none") +
scale_fill_manual(values = c("TRAINING" = "darkred", "TESTING" = "darkblue")) +
theme_bw()
ggsave(plot = testing.plot,
Expand Down
3 changes: 3 additions & 0 deletions 05-Regression/chapter05.R
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ top.1000.sites <- read.csv(file.path('data', 'top_1000_sites.tsv'),

ggplot(top.1000.sites, aes(x = PageViews, y = UniqueVisitors)) +
geom_point()
ggsave(file.path("images", "page_views_vs_visitors.pdf"))

# Eighteenth snippet
ggplot(top.1000.sites, aes(x = PageViews)) +
Expand All @@ -185,11 +186,13 @@ ggplot(top.1000.sites, aes(x = log(PageViews))) +
# Twentieth snippet
ggplot(top.1000.sites, aes(x = log(PageViews), y = log(UniqueVisitors))) +
geom_point()
ggsave(file.path("images", "log_page_views_vs_log_visitors.pdf"))

# Twenty-first snippet
ggplot(top.1000.sites, aes(x = log(PageViews), y = log(UniqueVisitors))) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE)
ggsave(file.path("images", "log_page_views_vs_log_visitors_with_lm.pdf"))

# Twenty-second snippet
lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors),
Expand Down
Binary file not shown.
Binary file not shown.
Binary file added 05-Regression/images/page_views_vs_visitors.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions 08-PCA/chapter08.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ correlations <- as.numeric(cor.matrix)
ggplot(data.frame(Correlation = correlations),
aes(x = Correlation, fill = 1)) +
geom_density() +
opts(legend.position = 'none')
theme(legend.position = 'none')

# Sixth code snippet
pca <- princomp(date.stock.matrix[,2:ncol(date.stock.matrix)])
Expand Down Expand Up @@ -77,7 +77,7 @@ loadings <- as.numeric(principal.component)
ggplot(data.frame(Loading = loadings),
aes(x = Loading, fill = 1)) +
geom_density() +
opts(legend.position = 'none')
theme(legend.position = 'none')

# Tenth code snippet
market.index <- predict(pca)[, 1]
Expand Down
1 change: 1 addition & 0 deletions 12-Model_Comparison/chapter12.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ mean(with(df, svm.predictions == Label))
#[1] 0.7204

# Third code snippet
library("reshape")
df <- cbind(df,
data.frame(Logit = ifelse(predict(logit.fit) > 0, 1, 0),
SVM = ifelse(predict(svm.fit) > 0, 1, 0)))
Expand Down
47 changes: 47 additions & 0 deletions fast_check.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Smoke test: run every chapter's main script, one after another.
#
# Run this file from the repository root. Each script is sourced with
# `chdir = TRUE`, so R temporarily switches the working directory to the
# chapter folder while that script runs (the scripts read and write paths
# such as "data" and "images" relative to their own folder) and switches
# back afterwards. Unlike a manual setwd(dir) / setwd('..') pair, the
# original working directory is also restored when a script stops with an
# error, so a failed run does not leave the session stranded in a chapter
# folder.

source(file.path("01-Introduction", "ufo_sightings.R"), chdir = TRUE)

source(file.path("02-Exploration", "chapter02.R"), chdir = TRUE)

source(file.path("03-Classification", "email_classify.R"), chdir = TRUE)

source(file.path("04-Ranking", "priority_inbox.R"), chdir = TRUE)

source(file.path("05-Regression", "chapter05.R"), chdir = TRUE)

source(file.path("06-Regularization", "chapter06.R"), chdir = TRUE)

source(file.path("07-Optimization", "chapter07.R"), chdir = TRUE)

source(file.path("08-PCA", "chapter08.R"), chdir = TRUE)

source(file.path("09-MDS", "chapter09.R"), chdir = TRUE)

source(file.path("10-Recommendations", "chapter10.R"), chdir = TRUE)

# Chapter 11 is intentionally left disabled, as in the original check.
# NOTE(review): the script name below repeats chapter 9's file name and may
# be a copy/paste slip -- confirm the real file name in 11-SNA before
# re-enabling.
# source(file.path("11-SNA", "chapter09.R"), chdir = TRUE)

source(file.path("12-Model_Comparison", "chapter12.R"), chdir = TRUE)
5 changes: 4 additions & 1 deletion package_installer.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@
# Create a vector containing all of the packages that will be used in the case studies
# (in no particular order)

cran.packages <- c("ggplot2",
cran.packages <- c("e1071",
"ggplot2",
"glmnet",
"Hmisc",
"igraph",
"lme4",
"lubridate",
"plyr",
"RCurl",
"reshape",
"RJSONIO",
"scales",
"tm",
Expand Down

0 comments on commit b58488c

Please sign in to comment.