-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathggplot2_Densityplots.Rmd
191 lines (152 loc) · 6.41 KB
/
ggplot2_Densityplots.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
---
output: html_document
---
```{r global_options, include=FALSE}
# Document-wide chunk defaults: suppress warnings and messages in the rendered
# output and give every figure the same size and centred alignment.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  fig.width = 5,
  fig.height = 3,
  fig.align = "center"
)
```
### ggplot2 Series 2 - Histogram and Density plot
```{r, warning = FALSE}
# Load libraries
library(ggplot2)
library(ggfortify)
library(dplyr)
library(knitr)
library(kableExtra)
```
The dataset we will use for this tutorial is from Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
```{r}
# Read the house-price data used throughout the tutorial
house_data <- read.csv("house_price.csv", header = TRUE)
options(scipen = 999)  # print large numbers in full instead of e.g. 1e+06

# Preview the first three rows of the four columns named below
tutorial_cols <- c("GrLivArea", "GarageType", "SalePrice", "CentralAir")
preview_rows <- head(house_data[tutorial_cols], 3)
preview_rows %>%
  kable(align = rep("c", times = 4)) %>%
  kable_styling(bootstrap_options = "condensed", full_width = FALSE)
```
Columns we will use in this tutorial:
- GrLivArea: Above grade (ground) living area square feet
- GarageType: Type of Garage location
- SalePrice: the property's sale price in dollars
- CentralAir: Central air conditioning (binary; Y = yes, N = no)
#### Basic Syntax
```{r}
# Histogram of SalePrice with ggplot2's default binning, stored in gg_hist
gg_hist <- ggplot(data = house_data, mapping = aes(x = SalePrice)) +
  geom_histogram()
print(gg_hist)
```
#### Customize histogram
```{r}
# Same histogram with an explicit bin width, colours and transparency
ggplot(data = house_data, mapping = aes(x = SalePrice)) +
  geom_histogram(
    binwidth = 50000,            # width of each bin on the x axis
    fill = "lightsteelblue",     # colour of the bars
    colour = "lightsteelblue3",  # colour of the bar outline
    alpha = 0.3                  # transparency of the bars
  )
```
#### Add lines to the histogram
```{r}
# Histogram of SalePrice with one vertical and one horizontal reference line
ggplot(data = house_data, mapping = aes(x = SalePrice)) +
  geom_histogram(
    fill = "lightsteelblue",
    colour = "lightsteelblue3",
    alpha = 0.3
  ) +
  # vertical line at x = 300000
  geom_vline(
    xintercept = 300000,
    color = "lightsteelblue4",
    linetype = "longdash",
    size = 0.5
  ) +
  # horizontal line at y = 100
  geom_hline(
    yintercept = 100,
    color = "lightsteelblue4",
    linetype = "dotdash",
    size = 0.5
  )
```
Some of the available line types:
```{r pressure, echo=FALSE, fig.cap="Line types", out.width = '60%'}
# Embed the line-type reference image; its caption and display width are set
# in the chunk header.
knitr::include_graphics("line_type.JPG")
```
#### Add annotation to the histogram
```{r}
# Annotate the histogram with the mean and standard deviation of SalePrice
avg <- round(mean(house_data$SalePrice), 2)
std <- round(sd(house_data$SalePrice), 2)

# Method 1: one annotate() call per label
ggplot(house_data, aes(x = SalePrice)) +
  geom_histogram(
    alpha = 0.3,
    colour = "lightsteelblue3",
    fill = "lightsteelblue"
  ) +
  annotate(
    "text", x = 350000, y = 100,
    color = "lightsteelblue4",
    label = paste("Mean:", avg)
  ) +
  annotate(
    "text", x = 350000, y = 150,
    color = "lightsteelblue4",
    label = paste("Standard Deviation:", std)
  )

# Method 2: collect every label in a data frame and draw them with a single
# geom_text(). The labels are listed in the same order as Method 1 (mean at
# y = 100, standard deviation at y = 150) so both methods place the text
# identically.
annotation <- data.frame(
  x = rep(350000, 2),
  y = c(100, 150),
  label = c(
    paste("Mean:", avg),
    paste("Standard Deviation:", std)
  )
)
ggplot(house_data, aes(x = SalePrice)) +
  geom_histogram(
    alpha = 0.3,
    colour = "lightsteelblue3",
    fill = "lightsteelblue"
  ) +
  geom_text(
    data = annotation,
    aes(x = x, y = y, label = label),
    color = "lightsteelblue4",
    size = 4, angle = 0, fontface = "italic"
  )
```
#### Add kernel density line on to histogram
```{r}
# Overlay a kernel density estimate on the SalePrice histogram
ggplot(data = house_data, mapping = aes(x = SalePrice)) +
  # bar heights are mapped to density so they share the curve's y scale
  geom_histogram(
    mapping = aes(y = ..density..),
    fill = "lightsteelblue",
    colour = "lightsteelblue3",
    alpha = 0.3
  ) +
  # filled density curve, drawn without an outline
  geom_density(
    fill = "lightsteelblue4",
    color = NA,
    alpha = 0.3
  )
```
#### Overlaid histograms
```{r}
# Overlaid histograms of SalePrice, one per GarageType level
ggplot(house_data, aes(x = SalePrice, fill = GarageType)) +
  geom_histogram(
    # Draw every group from the baseline so the histograms overlap; the
    # default position ("stack") would pile the groups on top of each other.
    position = "identity",
    alpha = 0.3,
    colour = "lightsteelblue3"
  ) +
  scale_fill_brewer(palette = "RdBu")  # change colour scheme
```
What if I want to overlay a few density plots? For example, I want to plot the densities of TotalBsmtSF and GrLivArea at the same time. We can use the `melt` function from the reshape2 package.
```{r}
library(reshape2)
# Keep only the two columns whose distributions will be compared
tmp <- house_data[, c("TotalBsmtSF", "GrLivArea")]
```
The `melt` function stacks a set of columns into a single column of data.
It generates a new data frame with two columns: `variable` and `value`.
```{r}
# Reshape once and reuse the result: melt() stacks the columns of tmp into a
# long data frame with the columns `variable` and `value`. No id variables
# are supplied, so every column of tmp is treated as a measure variable.
long_tmp <- melt(tmp)

# Show the first and the last three rows of the long data
kable(head(long_tmp, 3), row.names = FALSE) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )
kable(tail(long_tmp, 3), row.names = FALSE) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

# Density-scaled histogram and density curve for each original column
ggplot(long_tmp, aes(x = value, fill = variable)) +
  geom_histogram(
    aes(y = ..density..),
    # Overlay instead of the default stacking so that the bars of each
    # variable line up with that variable's (unstacked) density curve.
    position = "identity",
    alpha = 0.3,
    colour = "lightsteelblue1"
  ) +
  geom_density(
    alpha = 0.5,
    color = "lightsteelblue3"
  ) +
  scale_fill_brewer(palette = "Pastel1")
```
#### Add vertical lines to each of the histograms in the same plot
```{r}
# 1. Find the mean SalePrice of each CentralAir group.
# dplyr (attached at the top of the document) is used instead of attaching
# plyr here: loading plyr after dplyr masks dplyr verbs such as summarise()
# and mutate() for everything that runs afterwards.
mean_Ggroup <- house_data %>%
  dplyr::group_by(CentralAir) %>%
  dplyr::summarise(meanPrice = mean(SalePrice)) %>%
  as.data.frame()  # plain, ungrouped data frame, as ddply() returned
kable(mean_Ggroup, align = rep("c", times = 2)) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

# 2. Add vertical lines using the group means as xintercept
ggplot(house_data, aes(x = SalePrice, fill = CentralAir)) +  # split by CentralAir
  geom_histogram(
    alpha = 0.3,
    colour = "lightsteelblue3"
  ) +
  geom_vline(
    data = mean_Ggroup,
    aes(xintercept = meanPrice, colour = CentralAir),
    linetype = "dotdash", size = 0.5
  ) +
  scale_fill_brewer(palette = "Pastel1")  # change colour scheme
```