forked from KimmoVehkalahti/IODS-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_alc.R
100 lines (70 loc) · 2.88 KB
/
create_alc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Mohan, 15.11.2019, Alc consumption and student performance (logistic regression analysis)
# Reference to the source data
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets"
# Location of math class data
url_math <- paste("~/IODS-project/data/student/student-mat.csv", sep = "/")
# Print out url_math
url_math
# read the math class questionaire data into memory
math <- read.table(url_math, sep = ";" , header=TRUE)
# Location for portuguese class data
url_por <- paste("~/IODS-project/data/student/student-por.csv", sep = "/")
# Print out url_por
url_por
# read the portuguese class questionaire data into memory
por <- read.table(url_por, sep = ";", header = TRUE)
# str of math data
str(math)
#There are 395 observations and 33 variables in math data
# dimensions of math data
dim(math)
# str of Por data
str(por)
# There are 649 observations and 33 variables in por data
# dimensions of por data
dim(por)
# math and por are available
# access the dplyr library
library(dplyr)
# joining the math and por data sets with identifiers
join_by <- c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet")
# join the two datasets by the selected identifiers
math_por <- inner_join(math, por, by = join_by, suffix = c(".math", ".por"))
# see the new column names
math_por
# glimpse at the data
glimpse(math_por)
# print out the column names of 'math_por'
colnames(math_por)
# create a new data frame with only the joined columns
alc <- select(math_por, one_of(join_by))
# columns that were not used for joining the data
notjoined_columns <- colnames(math)[!colnames(math) %in% join_by]
# print out the columns not used for joining
notjoined_columns
# for every column name not used for joining...
for(column_name in notjoined_columns) {
# select two columns from 'math_por' with the same original name
two_columns <- select(math_por, starts_with(column_name))
# select the first column vector of those two columns
first_column <- select(two_columns, 1)[[1]]
# if that first column vector is numeric...
if(is.numeric(first_column)) {
# take a rounded average of each row of the two columns and
# add the resulting vector to the alc data frame
alc[column_name] <- round(rowMeans(two_columns))
} else { # else if it's not numeric...
# add the first column vector to the alc data frame
alc[column_name] <- first_column
}
}
# glimpse at the new combined data
glimpse(alc)
# defining a new column alc_use by combining weekday and weekend alcohol use
alc <- mutate(alc, alc_use = (Dalc + Walc) / 2)
# defining a new logical column 'high_use'
alc <- mutate(alc, high_use = alc_use > 2)
# Glimpse of the joined and modified data
glimpse(alc)
setwd(("~/IODS-project/Data/student"))
write.csv(alc, 'alc.csv')