datasummary_ch01_06

Update R codes with datasummary
adammia · Oct 1, 2021 · 72fbb60 · 72fbb60
1 parent 47b5b6f
commit 72fbb60
Show file tree

Hide file tree

Showing 7 changed files with 50 additions and 50 deletions.
diff --git a/ch01-hotels-data-collect/ch01-hotels-data-collect.R b/ch01-hotels-data-collect/ch01-hotels-data-collect.R
@@ -19,6 +19,7 @@
 # CLEAR MEMORY
 rm(list=ls())
 
+# install.packages("tidyverse")
 library(tidyverse)
 #----------------------------------------------------------------------------------------------------
 
@@ -29,8 +30,12 @@ library(tidyverse)
 #           example: setwd("C:/Users/bekes.gabor/Documents/github/da_case_studies/")
 
 # set data dir, load theme and functions
+
 source("ch00-tech-prep/theme_bg.R")
 source("ch00-tech-prep/da_helper_functions.R")
+# If source code does not run, install the following packages:
+# install.packages("urca")
+# install.packages("stargazer")
 
 # data used
 source("set-data-directory.R") #data_dir must be first defined #
@@ -45,6 +50,8 @@ create_output_if_doesnt_exist(output)
 
 # load in clean and tidy data and create workfile
 df <-  read.csv(paste0(data_in,"hotels-vienna.csv"))
+# or from the website
+# df <- read_csv("https://osf.io/y6jvb/download")
 
 ############################################
 # First look
@@ -55,6 +62,7 @@ df <- df%>%
           weekend, holiday, nnights, price, scarce_room, offer, offer_cat)
 
 summary(df)
+glimpse(df)
 
 # export list
 df <- subset(df, select = c(hotel_id, accommodation_type, country, city, city_actual, center1label, distance, stars, rating, price))

diff --git a/ch02-hotels-data-prep/ch02-hotels-data-prep.R b/ch02-hotels-data-prep/ch02-hotels-data-prep.R
@@ -25,6 +25,7 @@ library(haven)
 library(Hmisc)
 library(desc)
 library(reshape2)
+library(modelsummary)
 
 # set working directory
 # option A: open material as project
@@ -47,6 +48,8 @@ create_output_if_doesnt_exist(output)
 
 # load in clean and tidy data and create workfile
 data <- read_csv(paste0(data_in_clean,"/hotels-vienna.csv",sep=""))
+# Can load from website as well
+# data <- read_csv("https://osf.io/y6jvb/download")
 data <- data %>% select(hotel_id, accommodation_type ,distance, stars,rating,rating_count,price)
 
 # look at accomodation types
@@ -86,6 +89,8 @@ data %>% select(hotel_id,price,distance) %>% slice(1:3)
 # need to transform then to numbers that we can use
 
 data <- read_csv(paste0(data_in_raw,"/hotelbookingdata-vienna.csv",sep=""))
+# Can load from website as well
+# data <- read_csv( "https://osf.io/g5dmw/download" )
 
 # distance to center entered as string in miles with one decimal
 # generate numerical variable of rating variable from string variable
@@ -97,7 +102,7 @@ data <- data %>% separate(center1distance,c("distance",NA),sep = " ") %>%
   separate(guestreviewsrating,c("rating",NA),sep = " ")
 
 
-# check: frequency table of all values incl. missing varlues
+# check: frequency table of all values incl. missing values
 
 tab_rating <- data %>%
   group_by(rating) %>%
@@ -162,12 +167,7 @@ data <- data %>% distinct()
 #     Missing values in text
 #**********************************************
 
-summary(data)
-
-
-summary_df <- t(stat.desc(data))
-
-View(summary_df)
+datasummary_skim(data = data, histogram = FALSE)
 
 data <-  data %>% mutate(misrating = ifelse(is.na(rating),1,0))
 

diff --git a/ch03-distributions-height-income/ch03-height-income.R b/ch03-distributions-height-income/ch03-height-income.R
@@ -23,6 +23,7 @@ rm(list=ls())
 # Import libraries
 library(tidyverse)
 library(scales)
+#!library(modelsummary)
 
 
 # set working directory
@@ -50,20 +51,24 @@ create_output_if_doesnt_exist(output)
 #-----------------------------------------------------------------------------------------
 # load in clean and tidy data and create workfile
 hrs <-  read.csv(paste(data_in,"hrs_height_income.csv", sep = "/"))
+# or load from the web
+# hrs <- read_csv("https://osf.io/rnuh2/download")
 
 #------------------------------------------------------------------------------------------------------
 
 hrs$height <- as.numeric(as.character(hrs$height))
 
 # NORMAL: height of women age 55-59 
 Hmisc::describe(hrs$height)
+#! datasummary_skim(hrs$height)
 filtered_women <-  hrs %>%
   filter(age >= 55 & age < 60 & female == 1)
 Hmisc::describe(hrs$height)
+#! datasummary_skim(filtered_women$height)
 filtered_women_height <-  hrs %>%
   filter(age >= 55 & age < 60 & female == 1 & height > 1.3 & height < 2.1)
 Hmisc::describe(filtered_women_height$height)
-
+#! datasummary_skim(filtered_women_height$height)
 
 # graph --height  
 ch03_normal_height <- ggplot(filtered_women_height, aes(x = height)) +

diff --git a/ch03-football-home-advantage/ch03-football-home-advantage-describe.R b/ch03-football-home-advantage/ch03-football-home-advantage-describe.R
@@ -56,6 +56,8 @@ create_output_if_doesnt_exist(output)
 # Import dataset
 df <- read.csv(paste0(data_in,"epl_games.csv"),
                 stringsAsFactors = F)
+# Or can load from web
+#! df <- read_csv( "https://osf.io/bdjt5/download" )
 
 # look at 2016/17 season only
 df <- subset(df, season==2016)
@@ -69,7 +71,6 @@ df <-  df %>%
 summary(df$home_goaladv)
 describe(df$home_goaladv)
 
-
 # Histogram
 p1<-ggplot(data = df, aes (x = home_goaladv, y = (..count..)/sum(..count..))) +
   geom_histogram(color = color.outline, fill = theme_colors[1],

diff --git a/ch03-hotels-vienna-explore/ch03-hotels-vienna-explore.R b/ch03-hotels-vienna-explore/ch03-hotels-vienna-explore.R
@@ -24,6 +24,7 @@ rm(list=ls())
 library(tidyverse)
 library(scales)
 library(xtable)
+library(modelsummary)
 
 
 # set working directory
@@ -51,12 +52,16 @@ create_output_if_doesnt_exist(output)
 
 # load vienna
 vienna <- read_csv(paste0(data_in,"hotels-vienna.csv"))
+# or load from the web
+# vienna <- read_csv("https://osf.io/y6jvb/download" )
 
 
 ####################################################################################
 # Figures 1a and 1b
 ####################################################################################
 # apply filters: Hotels
+datasummary( accommodation_type ~ N , data = vienna )
+# alternatively one can use the table function to tabulate
 table(vienna$accommodation_type)
 
 vienna_cut <- vienna %>% filter(accommodation_type=="Hotel")
@@ -99,8 +104,8 @@ vienna_cut <- vienna %>% filter(accommodation_type=="Hotel") %>%
 
 
 # brief look at data
-table(vienna_cut$city)
-table(vienna_cut$stars)
+# tabulate stars as categories inline, so the stars column itself keeps its original type
+datasummary( city + Factor( stars ) ~ N , data = vienna_cut )
 
 ####################################################################################
 # Figure 3.2 a) and b)
@@ -200,7 +205,7 @@ save_fig("ch03-figure-5-hist-dist-annot-large", output, "large")
 
 
 # look at actual city
-table(vienna_cut$city_actual)
+datasummary( city_actual ~ N , data = vienna_cut )
 
 
 

diff --git a/ch04-management-firm-size/ch04-wms-management-size.R b/ch04-management-firm-size/ch04-wms-management-size.R
@@ -33,6 +33,7 @@ library(haven)
 library(Hmisc)
 library(binsreg)
 library(xtable)
+library(modelsummary)
 
 # set working directory
 # option A: open material as project
@@ -57,13 +58,15 @@ create_output_if_doesnt_exist(output)
 ########################################################################
 # Import data
 df <- read_csv(paste0(data_in,"wms_da_textbook.csv"))
+# Can load from the web as well
+# df <- read_csv( "https://osf.io/uzpce/download" )
 
 # Sample selection
 df <- df %>%
   filter(country=="Mexico" & wave==2013 & emp_firm>=100  & emp_firm<=5000)
 
-# Summary
-summary(df$emp_firm)
+# Summary in two steps (datasummary_skim takes a data frame, so pass a one-column one)
+datasummary_skim( df["emp_firm"] )
 describe(df$emp_firm)
 
 # Save workfile
@@ -72,6 +75,8 @@ write.csv(data, paste0(data_out, "ch04-wms-work.csv"), row.names = F)
 ########################################################################
 
 # Summary
+datasummary( management + emp_firm ~ Mean + Median + SD + Min + Max + N , data = df )
+# Somewhat more cumbersome to use dplyr:
 df %>%
   dplyr::select(management, emp_firm) %>% 
   summarise_all(tibble::lst(min, max, mean, median, sd, length))
@@ -173,20 +178,14 @@ save_fig("ch04-figure-3b-wms-mex-perf2-emp3bins",output , "small")
 # Option 1: create 3 bins as defined by thresholds
 
 # Summary
-df %>%
-  select(emp_firm, emp3bins) %>% 
-  group_by(emp3bins) %>% 
-  dplyr::summarise_all(tibble::lst(min, max, mean, median, sd, length))
+datasummary( emp_firm * Factor( emp3bins ) ~ Mean + Median + SD + Min + Max + N , data = df )
 
 # Recode employee bins
 df$emp3bins <- ifelse(df$emp3bins == 1 , 150, 
                       ifelse(df$emp3bins == 2, 600,
                              ifelse(df$emp3bins == 3, 3000, NA)))
 # Summary
-df %>%
-  select(emp_firm, emp3bins) %>% 
-  group_by(emp3bins) %>%
-  summarise_all(tibble::lst(min, max, mean, median, sd, length))
+datasummary( emp_firm * Factor( emp3bins ) ~ Mean + Median + SD + Min + Max + N , data = df )
 
 # Generate variables by mean
 df1<-df %>% group_by(emp3bins) %>%
@@ -217,7 +216,7 @@ df$emp10bins <- df$emp_firm %>% cut_number(10)
 df_summary<-df %>%
   select(emp_firm, emp10bins) %>% 
   group_by(emp10bins) %>%
-  summarise_all(ftibble::lst(min, max, mean, median, sd, length))
+  summarise_all(tibble::lst(min, max, mean, median, sd, length))
 df_summary
 
 # Recode
@@ -312,7 +311,7 @@ save_fig("ch04-figure-6b-wms-mex-violin-mgmt-emp3bins", output, "small")
 # Correlation
 cor(df$management, df$emp_firm, use = "complete.obs")
 
-table(df$sic)
+datasummary( Factor( sic ) ~ N + Percent() , data = df )
 
 # by industry
 df$industry_broad[df$sic<=21] <- 'food_drinks_tobacco'
@@ -324,41 +323,21 @@ df$industry_broad[df$sic>=35 & df$sic<37] <- 'electronics'
 df$industry_broad[df$sic==37 ] <- 'auto'
 df$industry_broad[df$sic>=38]             <- 'other'
 
-table(df$industry_broad)
+datasummary( industry_broad ~ N , data = df )
 
 # Correlation
 df %>%
   group_by(industry_broad) %>%
   dplyr::summarize(COR=cor(management, emp_firm))
 
-# Summary
-df %>%
-  select(management, industry_broad) %>% 
-  filter(!is.na(industry_broad)) %>% 
-  group_by(industry_broad) %>%
-  dplyr::summarise(Min = min(management), 
-                   Max= max(management),
-                   SD = sd(management),
-                   Median = median(management),
-                   n())
-
-df %>%
-  select(emp_firm, industry_broad) %>% 
-  filter(!is.na(industry_broad)) %>% 
-  group_by(industry_broad) %>%
-  dplyr::summarise(Min = min(emp_firm), 
-                   Max= max(emp_firm),
-                   SD = sd(emp_firm),
-                   Median = median(emp_firm),
-                   n())
-
+# Summarize along industries for both emp_firm and management
+datasummary( Median + SD + Min + Max + N ~ Factor( industry_broad ) * ( emp_firm + management ) , data = df )
 
 cor<-df %>%
   group_by(industry_broad) %>%
   dplyr::summarize(COR=cor(management, emp_firm))
 
 
-
 table41 <-df %>%
   select(emp_firm, industry_broad,management) %>% 
   # filter(!is.na(industry_broad)) %>% 

diff --git a/ch06-online-offline-price-test/ch06-online-offline-price-test.R b/ch06-online-offline-price-test/ch06-online-offline-price-test.R
@@ -24,6 +24,7 @@ rm(list=ls())
 library(tidyverse)
 library(xtable)
 library(broom)
+library(modelsummary)
 
 # set working directory
 # option A: open material as project
@@ -51,6 +52,8 @@ create_output_if_doesnt_exist(output)
 
 # load data
 pd <- read.csv(paste0(data_in,"online_offline_ALL_clean.csv"))
+# Load from the web
+# pd <- read_csv( "https://osf.io/yhbr5/download" )
 
 
 # FILTER DATA
@@ -67,9 +70,8 @@ pd <- pd %>% filter(price<1000)
 # Compare variables
 pd<-pd %>% mutate(diff = price_online-price)
 
-descr <- pd %>% summarise(mean = mean(diff,na.rm=T), sd = sd(diff,na.rm=T), min=min(diff,na.rm=T),
-                          median=median(diff,na.rm=T), max=max(diff,na.rm=T))
-descr
+# Check the main descriptives
+datasummary( diff ~ Mean + SD + Min + Median + Max , data = pd )
 
 hist1<- ggplot(data=pd, aes(diff))+
   geom_histogram(binwidth = 5, boundary=0, closed="left",