Skip to content

Commit

Permalink
datasummary_ch01_06
Browse files Browse the repository at this point in the history
Update R codes with datasummary
  • Loading branch information
regulyagoston committed Oct 1, 2021
1 parent 47b5b6f commit 72fbb60
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 50 deletions.
8 changes: 8 additions & 0 deletions ch01-hotels-data-collect/ch01-hotels-data-collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# CLEAR MEMORY
rm(list=ls())

# install.packages("tidyverse")
library(tidyverse)
#----------------------------------------------------------------------------------------------------

Expand All @@ -29,8 +30,12 @@ library(tidyverse)
# example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/")

# set data dir, load theme and functions

source("ch00-tech-prep/theme_bg.R")
source("ch00-tech-prep/da_helper_functions.R")
# If source code does not run, install the following packages:
# install.packages("urca")
# install.packages("stargazer")

# data used
source("set-data-directory.R") #data_dir must be first defined #
Expand All @@ -45,6 +50,8 @@ create_output_if_doesnt_exist(output)

# load in clean and tidy data and create workfile
df <- read.csv(paste0(data_in,"hotels-vienna.csv"))
# or from the website
# df <- read_csv("https://osf.io/y6jvb/download")

############################################
# First look
Expand All @@ -55,6 +62,7 @@ df <- df%>%
weekend, holiday, nnights, price, scarce_room, offer, offer_cat)

summary(df)
glimpse(df)

# export list
df <- subset(df, select = c(hotel_id, accommodation_type, country, city, city_actual, center1label, distance, stars, rating, price))
Expand Down
14 changes: 7 additions & 7 deletions ch02-hotels-data-prep/ch02-hotels-data-prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ library(haven)
library(Hmisc)
library(desc)
library(reshape2)
library(modelsummary)

# set working directory
# option A: open material as project
Expand All @@ -47,6 +48,8 @@ create_output_if_doesnt_exist(output)

# load in clean and tidy data and create workfile
# paste0() has no sep argument (it was being pasted as an extra empty string),
# so concatenate the directory and file name directly
data <- read_csv(paste0(data_in_clean,"/hotels-vienna.csv"))
# Can load from website as well
# data <- read_csv("https://osf.io/y6jvb/download")
data <- data %>% select(hotel_id, accommodation_type ,distance, stars,rating,rating_count,price)

# look at accommodation types
Expand Down Expand Up @@ -86,6 +89,8 @@ data %>% select(hotel_id,price,distance) %>% slice(1:3)
# need to transform then to numbers that we can use

# paste0() has no sep argument (it was being pasted as an extra empty string),
# so concatenate the directory and file name directly
data <- read_csv(paste0(data_in_raw,"/hotelbookingdata-vienna.csv"))
# Can load from website as well
# data <- read_csv( "https://osf.io/g5dmw/download" )

# distance to center entered as string in miles with one decimal
# generate numerical variable of rating variable from string variable
Expand All @@ -97,7 +102,7 @@ data <- data %>% separate(center1distance,c("distance",NA),sep = " ") %>%
separate(guestreviewsrating,c("rating",NA),sep = " ")


# check: frequency table of all values incl. missing varlues
# check: frequency table of all values incl. missing values

tab_rating <- data %>%
group_by(rating) %>%
Expand Down Expand Up @@ -162,12 +167,7 @@ data <- data %>% distinct()
# Missing values in text
#**********************************************

summary(data)


summary_df <- t(stat.desc(data))

View(summary_df)
# skim-style summary of data with the inline histograms switched off;
# spell out FALSE because F is an ordinary, reassignable variable
datasummary_skim(data = data, histogram = FALSE)

data <- data %>% mutate(misrating = ifelse(is.na(rating),1,0))

Expand Down
7 changes: 6 additions & 1 deletion ch03-distributions-height-income/ch03-height-income.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ rm(list=ls())
# Import libraries
library(tidyverse)
library(scales)
#!library(modelsummary)


# set working directory
Expand Down Expand Up @@ -50,20 +51,24 @@ create_output_if_doesnt_exist(output)
#-----------------------------------------------------------------------------------------
# load in clean and tidy data and create workfile
hrs <- read.csv(paste(data_in,"hrs_height_income.csv", sep = "/"))
# or load from the web
# hrs <- read_csv("https://osf.io/rnuh2/download")

#------------------------------------------------------------------------------------------------------

hrs$height <- as.numeric(as.character(hrs$height))

# NORMAL: height of women age 55-59
Hmisc::describe(hrs$height)
#! datasummary_skim(hrs$height)
filtered_women <- hrs %>%
filter(age >= 55 & age < 60 & female == 1)
Hmisc::describe(hrs$height)
#! datasummary_skim(filtered_women$height)
filtered_women_height <- hrs %>%
filter(age >= 55 & age < 60 & female == 1 & height > 1.3 & height < 2.1)
Hmisc::describe(filtered_women_height$height)

#! datasummary_skim(filtered_women_height$height)

# graph --height
ch03_normal_height <- ggplot(filtered_women_height, aes(x = height)) +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ create_output_if_doesnt_exist(output)
# Import dataset
df <- read.csv(paste0(data_in,"epl_games.csv"),
stringsAsFactors = F)
# Or can load from web
#! df <- read_csv( "https://osf.io/bdjt5/download" )

# look at 2016/17 season only
df <- subset(df, season==2016)
Expand All @@ -69,7 +71,6 @@ df <- df %>%
summary(df$home_goaladv)
describe(df$home_goaladv)


# Histogram
p1<-ggplot(data = df, aes (x = home_goaladv, y = (..count..)/sum(..count..))) +
geom_histogram(color = color.outline, fill = theme_colors[1],
Expand Down
11 changes: 8 additions & 3 deletions ch03-hotels-vienna-explore/ch03-hotels-vienna-explore.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ rm(list=ls())
library(tidyverse)
library(scales)
library(xtable)
library(modelsummary)


# set working directory
Expand Down Expand Up @@ -51,12 +52,16 @@ create_output_if_doesnt_exist(output)

# load vienna
vienna <- read_csv(paste0(data_in,"hotels-vienna.csv"))
# or load from the web
# vienna <- read_csv("https://osf.io/y6jvb/download" )


####################################################################################
# Figures 1a and 1b
####################################################################################
# apply filters: Hotels
datasummary( accommodation_type ~ N , data = vienna )
# alternatively one can use the table function to tabulate
table(vienna$accommodation_type)

vienna_cut <- vienna %>% filter(accommodation_type=="Hotel")
Expand Down Expand Up @@ -99,8 +104,8 @@ vienna_cut <- vienna %>% filter(accommodation_type=="Hotel") %>%


# brief look at data
table(vienna_cut$city)
table(vienna_cut$stars)
vienna_cut$stars <- factor( vienna_cut$stars )
datasummary( city + stars ~ N , data = vienna_cut )

####################################################################################
# Figure 3.2 a) and b)
Expand Down Expand Up @@ -200,7 +205,7 @@ save_fig("ch03-figure-5-hist-dist-annot-large", output, "large")


# look at actual city
table(vienna_cut$city_actual)
datasummary( city_actual ~ N , data = vienna_cut )



Expand Down
49 changes: 14 additions & 35 deletions ch04-management-firm-size/ch04-wms-management-size.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ library(haven)
library(Hmisc)
library(binsreg)
library(xtable)
library(modelsummary)

# set working directory
# option A: open material as project
Expand All @@ -57,13 +58,15 @@ create_output_if_doesnt_exist(output)
########################################################################
# Import data
df <- read_csv(paste0(data_in,"wms_da_textbook.csv"))
# Can load from the web as well
# df <- read_csv( "https://osf.io/uzpce/download" )

# Sample selection
df <- df %>%
filter(country=="Mexico" & wave==2013 & emp_firm>=100 & emp_firm<=5000)

# Summary
summary(df$emp_firm)
# Summary in two steps
# datasummary_skim() is documented to take a data frame, so pass a one-column
# subset (which also keeps the variable name in the output) instead of a bare vector
datasummary_skim( df["emp_firm"] )
describe(df$emp_firm)

# Save workfile
Expand All @@ -72,6 +75,8 @@ write.csv(data, paste0(data_out, "ch04-wms-work.csv"), row.names = F)
########################################################################

# Summary
# Use modelsummary's Mean (drops NAs by default, like Median/SD/Min/Max)
# rather than base mean, so all statistics treat missing values the same way
datasummary( management + emp_firm ~ Mean + Median + SD + Min + Max + N , data = df )
# Somewhat more cumbersome to use dplyr:
df %>%
dplyr::select(management, emp_firm) %>%
summarise_all(tibble::lst(min, max, mean, median, sd, length))
Expand Down Expand Up @@ -173,20 +178,14 @@ save_fig("ch04-figure-3b-wms-mex-perf2-emp3bins",output , "small")
# Option 1: create 3 bins as defined by thresholds

# Summary
df %>%
select(emp_firm, emp3bins) %>%
group_by(emp3bins) %>%
dplyr::summarise_all(tibble::lst(min, max, mean, median, sd, length))
# emp3bins holds numeric bin codes (1/2/3), so wrap it in Factor() to use it as
# a grouping variable: one row of emp_firm statistics per bin
datasummary( emp_firm * Factor( emp3bins ) ~ Mean + Median + SD + Min + Max + N , data = df )

# Recode employee bins
df$emp3bins <- ifelse(df$emp3bins == 1 , 150,
ifelse(df$emp3bins == 2, 600,
ifelse(df$emp3bins == 3, 3000, NA)))
# Summary
df %>%
select(emp_firm, emp3bins) %>%
group_by(emp3bins) %>%
summarise_all(tibble::lst(min, max, mean, median, sd, length))
datasummary( emp_firm * Factor( emp3bins ) ~ Mean + Median + SD + Min + Max + N , data = df )

# Generate variables by mean
df1<-df %>% group_by(emp3bins) %>%
Expand Down Expand Up @@ -217,7 +216,7 @@ df$emp10bins <- df$emp_firm %>% cut_number(10)
df_summary<-df %>%
select(emp_firm, emp10bins) %>%
group_by(emp10bins) %>%
summarise_all(ftibble::lst(min, max, mean, median, sd, length))
summarise_all(tibble::lst(min, max, mean, median, sd, length))
df_summary

# Recode
Expand Down Expand Up @@ -312,7 +311,7 @@ save_fig("ch04-figure-6b-wms-mex-violin-mgmt-emp3bins", output, "small")
# Correlation
cor(df$management, df$emp_firm, use = "complete.obs")

table(df$sic)
datasummary( Factor( sic ) ~ N + Percent() , data = df )

# by industry
df$industry_broad[df$sic<=21] <- 'food_drinks_tobacco'
Expand All @@ -324,41 +323,21 @@ df$industry_broad[df$sic>=35 & df$sic<37] <- 'electronics'
df$industry_broad[df$sic==37 ] <- 'auto'
df$industry_broad[df$sic>=38] <- 'other'

table(df$industry_broad)
datasummary( industry_broad ~ N , data = df )

# Correlation
df %>%
group_by(industry_broad) %>%
dplyr::summarize(COR=cor(management, emp_firm))

# Summary
df %>%
select(management, industry_broad) %>%
filter(!is.na(industry_broad)) %>%
group_by(industry_broad) %>%
dplyr::summarise(Min = min(management),
Max= max(management),
SD = sd(management),
Median = median(management),
n())

df %>%
select(emp_firm, industry_broad) %>%
filter(!is.na(industry_broad)) %>%
group_by(industry_broad) %>%
dplyr::summarise(Min = min(emp_firm),
Max= max(emp_firm),
SD = sd(emp_firm),
Median = median(emp_firm),
n())

# Summarize along industries for both emp_firm and management
datasummary( Median + SD + Min + Max + N ~ Factor( industry_broad ) * ( emp_firm + management ) , data = df )

cor<-df %>%
group_by(industry_broad) %>%
dplyr::summarize(COR=cor(management, emp_firm))



table41 <-df %>%
select(emp_firm, industry_broad,management) %>%
# filter(!is.na(industry_broad)) %>%
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ rm(list=ls())
library(tidyverse)
library(xtable)
library(broom)
library(modelsummary)

# set working directory
# option A: open material as project
Expand Down Expand Up @@ -51,6 +52,8 @@ create_output_if_doesnt_exist(output)

# load data
pd <- read.csv(paste0(data_in,"online_offline_ALL_clean.csv"))
# Load from the web
# pd <- read_csv( "https://osf.io/yhbr5/download" )


# FILTER DATA
Expand All @@ -67,9 +70,8 @@ pd <- pd %>% filter(price<1000)
# Compare variables
pd<-pd %>% mutate(diff = price_online-price)

descr <- pd %>% summarise(mean = mean(diff,na.rm=T), sd = sd(diff,na.rm=T), min=min(diff,na.rm=T),
median=median(diff,na.rm=T), max=max(diff,na.rm=T))
descr
# Check the main descriptives
# one column per statistic of the online-offline price difference
datasummary( diff ~ Mean + SD + Min + Median + Max , data = pd )

hist1<- ggplot(data=pd, aes(diff))+
geom_histogram(binwidth = 5, boundary=0, closed="left",
Expand Down

0 comments on commit 72fbb60

Please sign in to comment.