diff --git a/.gitignore b/.gitignore index 57cae10..4674983 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ /data /reference /graphs -/output/*.rds +/output/* diff --git a/R/collecting_metrics_for_models.R b/R/collecting_metrics_for_models.R new file mode 100644 index 0000000..937ee39 --- /dev/null +++ b/R/collecting_metrics_for_models.R @@ -0,0 +1,30 @@ +library(doParallel) + +cores <- parallel::detectCores() +cl <- parallel::makePSOCKcluster(50) + +registerDoParallel(cl) + + +cv_class_log <- fit_resamples(wf_class_log, mc_train_set, control = control_resamples(save_pred = TRUE, + parallel_over = "everything", + verbose = TRUE), + metrics = metric_set(sens, spec, ppv, npv, roc_auc)) + + +cv_class_xg <- fit_resamples(recipe_class_xg, mc_train_set, control = control_resamples(save_pred = TRUE, + parallel_over = "everything", + verbose = TRUE), + metrics = metric_set(sens, spec, ppv, npv, roc_auc)) + +stopCluster(cl) + + +cv_class_log |> + collect_metrics() + +cv_class_xg |> + collect_metrics() + +saveRDS(cv_class_log, "output/cv_class_log_no_hh.rds") +saveRDS(cv_class_xg, "output/cv_class_xg_no_hh.rds") diff --git a/R/creating_extractable_models.R b/R/creating_extractable_models.R new file mode 100644 index 0000000..f35f534 --- /dev/null +++ b/R/creating_extractable_models.R @@ -0,0 +1,163 @@ +# importing and preparing dataset ---------------------------------------------- +library(future) +plan(multisession, workers = 4) +source("R/prediction_data_import.R") + +ukmod_tidy <- import_ukmod_data() + +plan(sequential) + +# pre-processing manually ------------------------------------------------- + +tidy_stage1 <- ukmod_tidy |> + group_by(year, idhh) |> + mutate(lba_income = sum(lba_income), + uc_income = max(uc_income), + uc_receipt = max(uc_receipt), + n_hh_emp = sum(employment == "Employed"), + n_hh_unemp = sum(employment == "Unemployed"), + n_hh_inact = sum(employment == "Inactive")) |> + ungroup() + +model_data <- tidy_stage1 |> + filter(age > 17 & age < 66) |> + select(-idhh, -i_0, -i_m, -i_l, - income, -uc_income, -lba_income, -employment) |> + mutate( + year = as.integer(year), + # p_hh_emp = if_else(employment == "Employed" & n_hh_emp > 0, 1, 0), + n_hh_emp = fct_other(factor(n_hh_emp), c("0", "1"), other_level = "2+"), + n_hh_unemp = fct_other(factor(n_hh_unemp), c("0", "1"), other_level = "2+"), + n_hh_inact = fct_other(factor(n_hh_inact), c("0", "1"), other_level = "2+") + ) |> + fastDummies::dummy_cols(remove_first_dummy = TRUE, remove_selected_columns = TRUE) |> + janitor::clean_names() |> + select(-starts_with("n_hh")) |> + mutate(uc_receipt = factor(uc_receipt, levels = 1:0, labels = c("Yes", "No"))) + + +# predict data from UKMOD ------------------------------------------------- + +data_split <- initial_split(model_data, prop = 0.8, strata = year) + +train_data <- training(data_split) |> select(-year) +test_data <- testing(data_split) |> select(-year) + +mc_train_set <- mc_cv(train_data) +cv_train_set <- vfold_cv(train_data, v = 10) + +# Running logistic model ------------------------------------------------------- + +mod_class_log <- logistic_reg() |> + set_engine("glm") |> + set_mode("classification") + + +recipe_class_log <- recipe( + uc_receipt ~ ., + data = train_data +) |> + step_interact( + # ~ starts_with('gender_'):starts_with('children_') + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + starts_with('children_'):starts_with('emp_len_') + student:starts_with('children_') + student:starts_with('caring_') + starts_with('n_hh_emp_'):starts_with('children_') + starts_with('n_hh_unemp_'):starts_with('children_') + starts_with('n_hh_inact_'):starts_with('children_') + starts_with('n_hh_emp_'):starts_with('caring_') + starts_with('n_hh_unemp_'):starts_with('caring_') + starts_with('n_hh_inact_'):starts_with('caring_') + starts_with('marsta_')*starts_with('gender_')*starts_with('children_') + ~ starts_with('gender_'):starts_with('children_') + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + starts_with('children_'):starts_with('emp_len_') + student:starts_with('children_') + student:starts_with('caring_') + starts_with('marsta_')*starts_with('gender_')*starts_with('children_') + ) + + +wf_class_log <- workflow() |> + add_model(mod_class_log) |> + add_recipe(recipe_class_log) + +fit_class_log <- + fit( + wf_class_log, + data = train_data + ) + +pred_class_log <- test_data |> + bind_cols(predict(fit_class_log, new_data = test_data, type = "prob")) |> + mutate(.pred_class = as.numeric(.pred_Yes > 0.5)) + +pred_class_log |> + (\(x) {print(roc_auc(x, uc_receipt, .pred_Yes)); x})() |> + roc_curve(uc_receipt, .pred_Yes) |> + autoplot() + +tidy(fit_class_log) |> + filter(is.na(estimate)) + +intercept <- tidy(fit_class_log) |> head(1) |> pull(estimate) + +library(tidypredict) +library(yaml) + +# fit_class_log |> extract_fit_engine() |> parse_model() |> write_yaml("output/log_pred.yml") +fit_class_log |> extract_fit_engine() |> parse_model() |> write_yaml("output/log_pred_no_hh.yml") + +loaded_model <- as_parsed_model(read_yaml("output/log_pred.yml")) + +tidypredict_fit(loaded_model) +# testing calculation ----------------------------------------------------- + +pred_class_log |> + mutate(id = row_number()) |> + select(- uc_receipt, -starts_with(".pred")) |> + pivot_longer(- id, names_to = "term", values_to = "val") |> + left_join(tidy(fit_class_log), by = "term") |> + mutate(pred_val = val * estimate) |> + group_by(id) |> + summarise(pred_val = sum(pred_val, na.rm = TRUE) + intercept) |> + mutate(prob = 1 - (exp(pred_val))/(1 + exp(pred_val))) |> + bind_cols(.pred_1 = pred_class_log$.pred_1, uc_receipt = pred_class_log$uc_receipt) |> + roc_auc(uc_receipt, prob) + +recipe_class_log |> + prep() |> + bake(test_data) |> + mutate(prob = 1 - eval(tidypredict_fit(loaded_model))) |> + bind_cols(predict(fit_class_log, new_data = test_data, type = "prob")) |> + roc_auc(uc_receipt, .pred_Yes) + + mutate(id = row_number()) |> + pivot_longer(- c(id, uc_receipt), names_to = "term", values_to = "val") |> + left_join(tidy(fit_class_log), by = "term") |> + mutate(pred_val = val * estimate) |> + group_by(id, uc_receipt) |> + summarise(pred_val = sum(pred_val, na.rm = TRUE) + intercept, .groups = "drop") |> + mutate(prob = 1 - (exp(pred_val))/(1 + exp(pred_val))) |> + roc_auc(uc_receipt, prob) + +pred_class_log |> + ggplot(aes(age, fill = factor(.pred_class))) + + geom_bar(position = "fill") + +fit_class_log |> tidy() |> + # write_csv(file = "output/log_pred_coefs.csv") + write_csv(file = "output/log_pred_coefs_no_hh.csv") + +# xg_boost out ------------------------------------------------------------ + +mod_class_xg <- boost_tree(tree_depth = 10, trees = 1000, min_n = 40, + learn_rate = 0.0178, loss_reduction = 0.0000562) |> + set_engine("xgboost") |> + set_mode("classification") + +recipe_class_xg <- workflow() |> + add_model(mod_class_xg) |> + add_formula( + uc_receipt ~ . + ) + +fit_class_xg <- recipe_class_xg |> + fit(data = train_data) + +pred_class_xg <- test_data |> + bind_cols(predict(fit_class_xg, new_data = test_data, type = "prob")) |> + mutate(.pred_class = factor(as.numeric(.pred_Yes > 0.5), levels = c("1", "0"))) +# mutate(predict(fit_class_xg, new_data = test_data)) + +pred_class_xg |> + (\(x) {print(roc_auc(x, uc_receipt, .pred_Yes)); x})() |> + roc_curve(uc_receipt, .pred_Yes) |> + autoplot() + +# fit_class_xg |> extract_fit_engine() |> parse_model() |> write_yaml("output/boost_tree.yml") +fit_class_xg |> extract_fit_engine() |> parse_model() |> write_yaml("output/boost_tree_no_hh.yml") diff --git a/R/graphing_respondents_conditions.R b/R/graphing_respondents_conditions.R index 2bd3f29..a090957 100644 --- a/R/graphing_respondents_conditions.R +++ b/R/graphing_respondents_conditions.R @@ -30,27 +30,44 @@ apr14_mar21 <- c( "tpbn1310", "refdte")) +wts1 <- c( + "data/fst_files/apr14_mar15.fst", + "data/fst_files/apr15_mar16.fst", + "data/fst_files/apr16_mar17.fst", + "data/fst_files/apr17_mar18.fst", + "data/fst_files/apr18_mar19.fst" + # "data/fst_files/apr19_mar20.fst" + # "data/fst_files/apr20_mar21.fst" +) |> map(read_fst, "pwta18") |> + map(`colnames<-`, "weight") + +wts2 <- c("data/fst_files/apr19_mar20.fst", + "data/fst_files/apr20_mar21.fst") |> + map(read_fst, "pwta20") |> + map(`colnames<-`, "weight") n_uc <- apr14_mar21 |> + map2(append(wts1, wts2), cbind) |> reduce(bind_rows) |> mutate(date = floor_date(dmy(refdte), "months"), across(tpbn1301:tpbn1310, ~.x == 1)) |> rowwise() |> - mutate(uc = sum(tpbn1301:tpbn1310)) + mutate(uc = sum(tpbn1301:tpbn1310)*weight) n_uc |> group_by(date) |> - select(date, uc) |> - summarise(prop_uc = sum(uc)/n()) |> + select(date, uc, weight) |> + summarise(prop_uc = sum(uc)/sum(weight)) |> ggplot(aes(date, prop_uc)) + - geom_col(fill = sphsu_cols("University Blue"), width = 31) + + geom_col(fill = sphsu_cols("University Blue"), width = 31, position = "identity") + scale_x_date("Year") + scale_y_continuous("Percentage of respondents on UC", labels = scales::percent, expand = expansion(mult = c(0, 0.05))) apr14_mar21_dt <- apr14_mar21 |> + map2(append(wts1, wts2), cbind) |> reduce(bind_rows) |> mutate(date = floor_date(dmy(refdte), "months")) |> data.table() @@ -63,15 +80,15 @@ apr14_mar21_dt[,housing := rowSums(.SD == 2), .SD = 2:11] apr14_mar21_dt[,income_s := rowSums(.SD == 4), .SD = 2:11] apr14_mar21_dt[,jsa := rowSums(.SD == 5), .SD = 2:11] apr14_mar21_dt[,disab := rowSums(.SD == 6), .SD = 2:11] -apr14_mar21_dt[,other := rowSums(.SD), .SD = tax_cr:jsa] +apr14_mar21_dt[,other := as.numeric(rowSums(.SD)>0), .SD = tax_cr:jsa] (perc_other <- apr14_mar21_dt |> as_tibble() |> group_by(date) |> - select(date, other) |> - summarise(prop_other = sum(other)/n()) |> + select(date, other, weight) |> + summarise(prop_other = sum(other * weight)/sum(weight)) |> ggplot(aes(date, prop_other)) + - geom_col(fill = sphsu_cols("Pumpkin"), width = 31) + + geom_col(fill = sphsu_cols("Pumpkin"), width = 32) + scale_x_date("Year") + scale_y_continuous("Respondents on\nLegacy Benefits", labels = scales::percent, expand = expansion(mult = c(0, 0.05)))) @@ -79,17 +96,17 @@ apr14_mar21_dt[,other := rowSums(.SD), .SD = tax_cr:jsa] (perc_all <- apr14_mar21_dt |> as_tibble() |> group_by(date) |> - select(date, uc) |> - summarise(prop_uc = sum(uc)/n()) |> + select(date, uc, weight) |> + summarise(prop_uc = sum(uc * weight)/sum(weight)) |> ggplot(aes(date, prop_uc)) + - geom_col(fill = sphsu_cols("University Blue"), width = 31) + + geom_col(fill = sphsu_cols("University Blue"), width = 32, position = "identity") + scale_x_date("Year") + scale_y_continuous("Respondents on UC", labels = scales::percent, expand = expansion(mult = c(0, 0.05)))) (perc_claim <- apr14_mar21_dt |> group_by(date) |> - select(date, uc, other) |> + select(date, uc, other, weight) |> filter(uc == 1 | other == 1) |> mutate(benefits = case_when( uc == 1 & other == 1 ~ "Combination", @@ -98,11 +115,17 @@ apr14_mar21_dt[,other := rowSums(.SD), .SD = tax_cr:jsa] ), benefits = factor(benefits, levels = c("Legacy Benefits", "Combination", "Universal Credit"))) |> as_tibble() |> - ggplot(aes(date, fill = benefits)) + - stat_count(position = "fill", geom = "bar", aes(y = after_stat(count)), width = 31) + + # group_by(date, benefits) |> + # summarise(n = sum(weight)) |> + ggplot(aes(date, weight, fill = benefits)) + + # geom_col(aes(y = weight), width = 32, position = "fill") + + stat_summary(fun = sum, position = "fill", geom = "bar", width = 32) + + # stat_sum(aes(y = weight), position = "fill", geom = "bar", width = 31) + + # stat_count(position = "fill", geom = "bar", aes(y = after_stat(count)), width = 31, position = "identity") + scale_fill_manual(name = "Benefits claimed", values = sphsu_cols("Pumpkin", "Leaf", "University Blue", names = FALSE)) + scale_x_date("Year") + theme(legend.position = "bottom") + + guides(size = guide_none()) + scale_y_continuous("Claimant by benefit types", labels = scales::percent, expand = expansion(mult = c(0, 0.05)))) @@ -114,7 +137,7 @@ patch + subtitle = "April 2014 - March 2021") -ggsave(filename = "graphs/proporrions_benefit_type.png", width = 20, height = 24, +ggsave(filename = "graphs/proporrions_benefit_type_weighted.png", width = 20, height = 24, units = "cm", dpi = 400) @@ -136,3 +159,59 @@ ggsave(filename = "graphs/proporrions_benefit_type.png", width = 20, height = 24 theme(legend.position = "bottom") + scale_y_continuous("Claimant by benefit types", expand = expansion(mult = c(0, 0.05)))) + + + +# compare with proportion on UC as reported by statxplore ----------------- + + +pop_uc <- read_rds("data/pop_on_uc.rds") + +(perc_all <- apr14_mar21_dt |> + as_tibble() |> + group_by(date) |> + select(date, uc, weight) |> + summarise(prop_uc = sum(uc * weight)/sum(weight)) |> + ggplot(aes(date, prop_uc)) + + geom_col(fill = sphsu_cols("University Blue"), width = 32, position = "identity") + + scale_x_date("Year") + + scale_y_continuous("Respondents on UC", labels = scales::percent, + expand = expansion(mult = c(0, 0.05)))) + +(perc_statxplore <- pop_uc |> + mutate(prop_uc = n_uc / pop) |> + ggplot(aes(date, prop_uc)) + + geom_col(fill = sphsu_cols("University Blue"), width = 32, position = "identity") + + scale_x_date("Year") + + scale_y_continuous("Respondents on UC", labels = scales::percent, + expand = expansion(mult = c(0, 0.05))) +) + +perc_all / perc_statxplore + +pop_uc |> + mutate(prop_uc_obs = n_uc / pop) + +comb_uc_rec <- apr14_mar21_dt |> + as_tibble() |> + group_by(date) |> + select(date, uc, weight) |> + summarise(prop_uc_rep = sum(uc * weight)/sum(weight)) |> + inner_join(pop_uc |> + mutate(prop_uc_obs = n_uc / pop), + by = "date") + +comb_uc_rec |> + ggplot(aes(date, prop_uc_rep)) + + geom_col(aes(linetype = "Reported"), fill = sphsu_cols("University Blue"), width = 32, position = "identity") + + geom_step(aes(y = prop_uc_obs, linetype = "Observed"), colour = sphsu_cols("University Blue"), linewidth = 0.5) + + scale_x_date("Year") + + guides(linetype = guide_legend(title = "Data source", + override.aes = list(fill = c(NA, sphsu_cols("University Blue")), + linetype = c(1, 0)))) + + scale_y_continuous("Respondents on UC", labels = scales::percent, + expand = expansion(mult = c(0, 0.05))) + + plot_annotation(title = "Proportions of working-age adults reporting receiving UC vs observed rates", + subtitle = "April 2014 - March 2021") + +ggsave(filename = "graphs/comparison_uc_reporting.png", width = 20, height = 10, dpi = 400, units = "cm") diff --git a/R/household_by_govof.R b/R/household_by_govof.R new file mode 100644 index 0000000..9971e79 --- /dev/null +++ b/R/household_by_govof.R @@ -0,0 +1,93 @@ +library(fst) +library(SPHSUgraphs) +library(tidyverse) +library(haven) + +make_perc <- function(p, dp = 1) { + sprintf(glue::glue("%.{dp}f%%"), p * 100) +} + +library(tidyverse) + +lfs_hh <- haven::read_sav("data/UKDA-7101-spss/spss/spss19/lfsh_aj11_eul.sav") + +lfs_hh |> + select(starts_with("hhtype"), + starts_with("hserial"), + starts_with("hrp")) + +lfs_hh |> + select(starts_with("HSERIAL")) + +hh_by_region <- lfs_hh |> + rename_all(str_to_upper) |> + group_by(HSERIALP) |> + slice_head(n = 1) |> + group_by(GOVTOF, HHTYPE6) |> + summarise(n = sum(PHHWT14), .groups = "drop_last") |> + mutate(perc_in_area = make_perc(n/sum(n))) |> + ungroup() |> + mutate(across(where(haven::is.labelled), .fns = as_factor)) + +lfs_hh |> + filter(PERSNO == 1) |> + select(HSERIALP, CASENOP, PERSNO) + +lfs_hh |> + group_by(HSERIALP) |> + arrange(PERSNO) |> + slice_head(n = 1) |> + select(CASENOP, PERSNO) |> + ggplot(aes(as_factor(PERSNO))) + + stat_count() + + +hh_by_region |> + ggplot(aes(n, GOVTOF, fill = HHTYPE6)) + + geom_col(position = "fill") + + scale_x_continuous(labels = scales::percent_format())+ + scale_fill_sphsu() + + +lfs_hh |> + group_by(HSERIALP) |> + slice_head(n = 1) + count(THISWV) + + + +aps_2020_ind <- read_sav("data/UKDA-8789-spss/spss/spss25/apsp_jd20_eul_pwta22.sav") + +aps_2020_ind |> select(contains("hrp")) + +aps_2020_hh <- haven::read_sav("data/UKDA-8861-spss/spss/spss25/apsh_jd20_eul_phhwta22.sav") + +aps_2020_hh |> + select(starts_with("hhtype"), + starts_with("he"), + starts_with("hserial"), + starts_with("hrp")) + +aps_2020_hh |> + filter(HRP == 1) |> + group_by(GOVTOF, HHTYPE6) |> + summarise(n = sum(PHHWTA22), .groups = "drop_last") |> + mutate(perc_in_area = make_perc(n/sum(n))) |> + ungroup() |> + mutate(across(where(haven::is.labelled), .fns = as_factor)) + + + # count(HSERIALP) |> + # arrange(desc(n)) + # + +# Checking all files ------------------------------------------------------ + +hh_set_path <- "data/hh_set_test/sav_files/" + +all_savs <- dir(hh_set_path) |> + paste0(hh_set_path, x = _) |> + map(haven::read_sav, col_select = starts_with("H")) + +all_savs |> + map(select, starts_with("HS")) diff --git a/R/optimise_predictions.R b/R/optimise_predictions.R index 88d6532..b36a3d6 100644 --- a/R/optimise_predictions.R +++ b/R/optimise_predictions.R @@ -1,156 +1,52 @@ -library(tidyverse) -library(tidymodels) -library(furrr) -library(data.table) - -# importing ukmod data ---------------------------------------------------------- - -input_parts <- - tibble( - file = file.path("data/ukmod_out", dir("data/ukmod_out")), - year_policy = str_extract(file, "(?<=uk_)\\w*(?=\\.txt)") - ) |> - mutate(data = future_map(file, fread)) - -full_pred_data <- input_parts |> - select(-file) |> - separate(year_policy, - into = c("year", "policy"), - sep = "_") |> - pivot_wider(names_from = policy, values_from = data) |> - mutate( - UCAon = map( - UCAon, - transmute, - uc_income = bho_s + bmu_s + boamt_s + boamtmm_s + boamtxp_s + bsauc_s - brduc_s, - uc_receipt = as.numeric(bsauc_s - brduc_s > 0), - idperson = idperson - ), - LBAon = map( - LBAon, - mutate, - lba_income = bho_s + bmu_s + boamt_s + boamtmm_s + boamtxp_s + bfamt_s + bsadi_s + bsa_s + bwkmt_s - brd_s - ) - ) |> - mutate(comb_data = map2(LBAon, UCAon, left_join, by = "idperson"), - .keep = "unused") |> - unnest(comb_data) |> - arrange(year, idperson) - -rm(input_parts) - -full_pred_data <- full_pred_data |> - group_by(year, idhh) |> - mutate(children = sum(dag < 16)) |> - ungroup() +library(future) +# importing and preparing dataset ---------------------------------------------- +plan(multisession, workers = 4) +source("R/prediction_data_import.R") -ukmod_tidy <- full_pred_data |> - mutate( - age = dag, - cit = if_else(dcz == 1, "UK", "Other"), # leave out as odd in UKMOD? - disab = factor( - ddi, - levels = c("0", "1"), - labels = c("Not disabled", "Disabled") - ), - employment = case_when( - les %in% 1:3 ~ "Employed", - les == 5 ~ "Unemployed", - les %in% 6:7 ~ "Inactive", - les == 4 ~ "Retired", - les == 8 ~ "Sick or disabled", - TRUE ~ "Other" - ), - student = if_else(les == 6, 1L, 0L), - educ = case_when( - dec == 4 ~ "Degree or College", - dec == 3 ~ "Secondary", - dec == 2 ~ "Secondary", - dec == 5 ~ "Tertiary", - TRUE ~ "None" - ), - gender = factor( - dgn, - levels = c("0", "1"), - labels = c("Female", "Male") - ), - marsta = factor( - if_else(dms == 0, 1L, dms), - levels = 1:5, - labels = c("Single", "Married", "Separated", "Divorced", "Widowed") - ), - region = if_else(drgn1 < 3, LETTERS[drgn1 + 2], LETTERS[drgn1 + 1]), - emp_len = case_when( - les %in% c(4:8) ~ "Not in employment", - liwwh < 12 ~ "Less than 12 months", - liwwh < 24 ~ "Between 1 and 2 years", - liwwh < 60 ~ "Between 2 and 5 years", - liwwh < 120 ~ "Between 5 and 10 years", - liwwh < 240 ~ "Between 10 and 20 years", - liwwh >= 240 ~ "20 years or more", - ), - seeking = factor( - lowas, - levels = c("0", "1"), - labels = c("No", "Yes") - ), - children = fct_other(factor(children), c("0", "1"), other_level = "2+"), - income = if_else(yem > 3415, 3415, yem), - i_0 = as.numeric(income == 0), - i_m = as.numeric(income == 3415), - i_l = income * (1 - i_m), - # i.e. if income is max then turn off the continuous - i_c = cut( - income, - c(0, 1, 500, 1000, 2000, 3000, 3500), - right = FALSE, - include.lowest = TRUE, - labels = c("0", - "1-499", - "500-999", - "1000-1999", - "2000-2999", - "3000+") - ), - house_ten = fct_collapse(factor(amrtn), Mortgaged = "1", Outright = "2", Rented = c("3", "4", "5"), Free = "6", Other = "7"), - house_resp = factor(dhr, labels = c("No", "Yes")), - caring = factor(full_pred_data$lcr01 == 0, levels = c(TRUE, FALSE), labels = c("No", "Yes")) - ) |> - select( - year, idhh, uc_income, lba_income, uc_receipt, - age, cit, disab, employment, educ, gender, marsta, region, emp_len, seeking, - student, - children, income, i_0, i_m, i_l, i_c, house_ten, house_resp, caring - ) +ukmod_tidy <- import_ukmod_data() +plan(sequential) -# predict data from UKMOD ------------------------------------------------- +# pre-processing manually ------------------------------------------------- -model_data <- ukmod_tidy |> +tidy_stage1 <- ukmod_tidy |> group_by(year, idhh) |> mutate(lba_income = sum(lba_income), uc_income = max(uc_income), - uc_receipt = factor(max(uc_receipt)), + uc_receipt = max(uc_receipt), n_hh_emp = sum(employment == "Employed"), n_hh_unemp = sum(employment == "Unemployed"), n_hh_inact = sum(employment == "Inactive")) |> - ungroup() |> + ungroup() + +model_data <- tidy_stage1 |> filter(age > 17 & age < 66) |> + select(-idhh, -i_0, -i_m, -i_l, - income, -uc_income, -lba_income, -employment) |> mutate( - p_hh_emp = if_else(employment == "Employed" & n_hh_emp > 0, 1L, 0L), + year = as.integer(year), + # p_hh_emp = if_else(employment == "Employed" & n_hh_emp > 0, 1, 0), n_hh_emp = fct_other(factor(n_hh_emp), c("0", "1"), other_level = "2+"), n_hh_unemp = fct_other(factor(n_hh_unemp), c("0", "1"), other_level = "2+"), - n_hh_inact = fct_other(factor(n_hh_inact), c("0", "1"), other_level = "2+")) + n_hh_inact = fct_other(factor(n_hh_inact), c("0", "1"), other_level = "2+") + ) |> + fastDummies::dummy_cols(remove_first_dummy = TRUE, remove_selected_columns = TRUE) |> + janitor::clean_names() |> + # select(-starts_with("n_hh")) |> + mutate(uc_receipt = factor(uc_receipt, levels = 1:0, labels = c("Yes", "No"))) + + +# predict data from UKMOD ------------------------------------------------- data_split <- initial_split(model_data, prop = 0.8, strata = year) -train_data <- training(data_split) -test_data <- testing(data_split) +train_data <- training(data_split) |> select(-year) +test_data <- testing(data_split) |> select(-year) mc_train_set <- mc_cv(train_data) -cv_train_set <- vfold_cv(train_data, v = 5) +cv_train_set <- vfold_cv(train_data, v = 10) + # tuning grid ------------------------------------------------------------- @@ -158,28 +54,65 @@ cv_train_set <- vfold_cv(train_data, v = 5) library(doParallel) cores <- parallel::detectCores() -cl <- parallel::makePSOCKcluster(floor(0.98*cores)) +cl <- parallel::makePSOCKcluster(floor(0.95*cores)) registerDoParallel(cl) + + +# random forest ----------------------------------------------------------- +# Not working - huuuuuuuge RAM increases! +# tune_class_forest <- +# rand_forest( +# engine = "ranger", +# mode = "classification", +# trees = tune(), +# min_n = tune() +# ) +# +# recipe_tune_class_forest <- workflow() |> +# add_model(tune_class_forest) |> +# add_formula( +# uc_receipt ~ . +# ) +# +# +# +# forest_tune_grid <- tune_class_forest |> +# extract_parameter_set_dials() |> +# grid_regular(levels = 4) +# +# tune_out_class_forest <- +# tune_grid( +# recipe_tune_class_forest, +# grid = forest_tune_grid, +# resamples = cv_train_set, +# metrics = metric_set(sens, spec, ppv, npv, roc_auc), +# control = control_grid(parallel_over = "everything", +# verbose = TRUE) +# ) +# +# +# saveRDS(tune_out_class_forest, "output/tune_out_class_forest.rds") + +# ## xgboost --------------------------------------------------------------- +# +# +# tune_class_xg <- boost_tree(trees = 1000, tree_depth = tune(), min_n = tune(), learn_rate = tune(), loss_reduction = tune()) |> set_engine("xgboost") |> set_mode("classification") recipie_tune_class_xg <- workflow() |> add_model(tune_class_xg) |> - add_formula(uc_receipt ~ age + - i_c + - region + disab + educ + gender + emp_len + seeking + student + - house_ten + house_resp + caring + n_hh_emp + n_hh_unemp + n_hh_inact + - children + employment + marsta) + add_formula(uc_receipt ~ .) xg_tune_grid <- tune_class_xg |> extract_parameter_set_dials() |> grid_regular(levels = 3) -xg_tune_grid$min_n <- rep(c(40, 50, 60), 27) -xg_tune_grid$tree_depth <- rep(rep(c(5, 10, 15), each = 3), 9) +xg_tune_grid$min_n <- rep(c(40, 45, 50), 27) +xg_tune_grid$tree_depth <- rep(rep(c(10, 15, 20), each = 3), 9) start <- Sys.time() diff --git a/R/predicting_benefit_change.R b/R/predicting_benefit_change.R new file mode 100644 index 0000000..138a7dc --- /dev/null +++ b/R/predicting_benefit_change.R @@ -0,0 +1,103 @@ +library(future) + +# importing and preparing dataset ---------------------------------------------- +plan(multisession, workers = 4) +source("R/prediction_data_import.R") + +ukmod_tidy <- import_ukmod_data() + +plan(sequential) + + + +# linear regression on benefit receipt difference ------------------------- + +tidy_stage1 <- ukmod_tidy |> + group_by(year, idhh) |> + mutate(lba_income = sum(lba_income), + uc_income = max(uc_income), + uc_receipt = max(uc_receipt), + benefit_change = uc_income - lba_income, + n_hh_emp = sum(employment == "Employed"), + n_hh_unemp = sum(employment == "Unemployed"), + n_hh_inact = sum(employment == "Inactive")) |> + ungroup() + +data_ben_diff <- tidy_stage1 |> + filter(age > 17 & age < 66) |> + select(-idhh, -i_0, -i_m, -i_l, - income, -uc_income, -lba_income, -employment) |> + mutate( + year = as.integer(year), + # p_hh_emp = if_else(employment == "Employed" & n_hh_emp > 0, 1, 0), + n_hh_emp = fct_other(factor(n_hh_emp), c("0", "1"), other_level = "2+"), + n_hh_unemp = fct_other(factor(n_hh_unemp), c("0", "1"), other_level = "2+"), + n_hh_inact = fct_other(factor(n_hh_inact), c("0", "1"), other_level = "2+") + ) |> + fastDummies::dummy_cols(remove_first_dummy = TRUE, remove_selected_columns = TRUE) |> + janitor::clean_names() |> + # select(-starts_with("n_hh")) |> + mutate(uc_receipt = factor(uc_receipt, levels = 1:0, labels = c("Yes", "No"))) + + +data_split <- initial_split(data_ben_diff, prop = 0.8, strata = year) + +train_data <- training(data_split) |> select(-year) +test_data <- testing(data_split) |> select(-year) + +mc_train_set <- mc_cv(train_data) +cv_train_set <- vfold_cv(train_data, v = 10) + + +rec_bd <- recipe(benefit_change ~ ., + data = train_data) |> + step_interact( + ~ starts_with('gender_'):starts_with('children_') + + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + + starts_with('children_'):starts_with('emp_len_') + + student:starts_with('children_') + student:starts_with('caring_') + + starts_with('marsta_') * starts_with('gender_') * starts_with('children_') + ) + +mod_diff_knn <- nearest_neighbor() |> + set_mode("regression") + +wf_diff_knn <- workflow() |> + add_recipe(rec_bd) |> + add_model(mod_diff_knn) + +fit_diff_knn <- fit(wf_diff_knn, + data = train_data) + +# nearest_neighbor("regression") |> +# fit( +# benefit_change ~ ., +# data = train_data +# ) + +pred_out <- test_data |> + bind_cols(predict(fit_diff_knn, new_data = test_data)) + +pred_out |> + (\(x) {print(rmse(x, benefit_change, .pred)); x})() |> + ggplot(aes(benefit_change, .pred)) + + geom_point() + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + +# linear mod -------------------------------------------------------------- + +wf_diff_ln <- workflow() |> + add_model(linear_reg()) |> + add_recipe(rec_bd) + +fit_diff_ln <- fit(wf_diff_ln, + data = train_data) + +pred_out <- test_data |> + bind_cols(predict(fit_diff_ln, new_data = test_data)) + +pred_out |> + (\(x) {print(rsq(x, benefit_change, .pred)); x})() |> + ggplot(aes(benefit_change, .pred)) + + geom_point() + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + diff --git a/R/predicting_benefit_receipt.R b/R/predicting_benefit_receipt.R index ee6b4ef..b765de3 100644 --- a/R/predicting_benefit_receipt.R +++ b/R/predicting_benefit_receipt.R @@ -1,302 +1,16 @@ library(tidyverse) library(tidymodels) -# library(SPHSUgraphs) +library(SPHSUgraphs) library(furrr) library(data.table) -# theme_set(theme_sphsu_light()) +source("R/prediction_data_import.R") -# practice data from APS -------------------------------------------------- - -# library(haven) -# aps18_19 <- -# read_sav("data/UKDA-8510-spss/spss/spss24/apsp_a18m19_eul_pwta18.sav") -# aps18_19_dta <- -# read_dta("data/UKDA-8197-stata/stata/stata11/apsp_a16m17_eul.dta") -# aps_16_hh_spss <- -# read_sav("data/UKDA-8216-spss/spss/spss24/apsh_jd16_eul.sav") - - -# importing ukmod data ---------------------------------------------------------- - -input_parts <- - tibble( - file = file.path("data/ukmod_out", dir("data/ukmod_out")), - year_policy = str_extract(file, "(?<=uk_)\\w*(?=\\.txt)") - ) |> - mutate(data = future_map(file, fread)) - - -input_parts |> - select(-file) |> - separate(year_policy, - into = c("year", "policy"), - sep = "_") |> - mutate( - data = map(data, summarise, across(starts_with("b") & - ends_with("_s"), sum)), - data = map( - data, - pivot_longer, - everything(), - names_to = "benefit", - values_to = "val" - ) - ) |> - unnest(data) |> - group_by(benefit, policy) |> - summarise(val = sum(val), .groups = "drop_last") |> - mutate(diff_val = diff(val)) |> - filter(abs(diff_val) > 1000) |> - arrange(abs(diff_val)) |> - ggplot(aes(val, fct_inorder(benefit))) + - geom_bar(stat = "identity") + - facet_wrap(~ policy) - -full_pred_data <- input_parts |> - select(-file) |> - separate(year_policy, - into = c("year", "policy"), - sep = "_") |> - pivot_wider(names_from = policy, values_from = data) |> - mutate( - UCAon = map( - UCAon, - transmute, - uc_income = bho_s + bmu_s + boamt_s + boamtmm_s + boamtxp_s + bsauc_s - brduc_s, - uc_receipt = as.numeric(bsauc_s - brduc_s > 0), - idperson = idperson - ), - LBAon = map( - LBAon, - mutate, - lba_income = bho_s + bmu_s + boamt_s + boamtmm_s + boamtxp_s + bfamt_s + bsadi_s + bsa_s + bwkmt_s - brd_s - ) - ) |> - mutate(comb_data = map2(LBAon, UCAon, left_join, by = "idperson"), - .keep = "unused") |> - unnest(comb_data) |> - arrange(year, idperson) - -rm(input_parts) - -full_pred_data <- full_pred_data |> - group_by(year, idhh) |> - mutate(children = sum(dag < 16)) |> - ungroup() - -# processing and matching data -------------------------------------------- -# # code in progress for processing within Safe Lab -# aps_tidy <- aps18_19_dta |> -# mutate( -# age = AGE, -# cit = if_else(NTNLTY12 == 926, "UK", "Other"), # leave out as odd in UKMOD? -# disab = fct_collapse( -# as_factor(DISEA), -# `Disabled` = "Equality Act Disabled", -# `Not disabled` = c("Not Equality Act Disabled", "Does not apply", "No answer") -# ), -# employment = case_when( -# INECAC05 %in% 1:4 ~ "Employed", -# INECAC05 == 5 ~ "Unemployed", -# INECAC05 %in% c(20, 31) ~ "Retired", -# INECAC05 %in% c(15:16, 26:27) ~ "Sick or disabled", -# INECAC05 %in% 12:33 ~ "Inactive", -# TRUE ~ "Other" -# ), -# educ = case_when( -# if_any(matches("QUAL_[1-9]$"), ~ .x == 1) ~ "Degree or College", -# if_any(matches("QUAL_1[0-7]$"), ~ .x == 1) | -# QUAL_23 == 1 | QUAL_29 == 1 | QUAL_30 == 1 ~ "Secondary", -# if_any(matches("QUAL_(18|19|2[0-4])$"), ~ .x == 1) ~ "Secondary", -# if_any(matches("QUAL_(2[5-9]|3[0-5])$"), ~ .x == 1) ~ "Tertiary", -# TRUE ~ "None" -# ), -# gender = as_factor(SEX), -# marsta = case_when( -# MARSTA == 1 ~ "Single", -# MARSTA %in% c(2, 6) ~ "Married", -# MARSTA %in% c(3, 7) ~ "Separated", -# MARSTA %in% c(4, 8) ~ "Divorced", -# MARSTA %in% c(5, 9) ~ "Widowed", -# TRUE ~ "Single" -# ), -# emp_len = case_when( -# INECAC05 > 4 ~ "Not in employment", -# EMPMON < 12 ~ "Less than 12 months", -# EMPMON < 24 ~ "Between 1 and 2 years", -# EMPMON < 60 ~ "Between 2 and 5 years", -# EMPMON < 120 ~ "Between 5 and 10 years", -# EMPMON < 240 ~ "Between 10 and 20 years", -# EMPMON >= 240 ~ "20 years or more", -# ), -# seeking = if_else(ILODEFR == 2, "Yes", "No"), -# GRSSWK = if_else(GRSSWK < 0, 0, GRSSWK), -# GRSSWK2 = if_else(GRSSWK2 < 0, 0, GRSSWK2), -# income = 52 * (GRSSWK + GRSSWK2) / 12, -# income = if_else(income > 3415, 3415, income), -# income = if_else(is.na(income), 0, income), -# i_0 = as.numeric(income == 0), -# i_m = as.numeric(income == 3415), -# i_l = income * (1 - i_m), -# # i.e. if income is max then turn off the continuous -# i_c = cut( -# income, -# c(0, 1, 500, 1000, 2000, 3000, 3500), -# right = FALSE, -# include.lowest = TRUE, -# labels = c("0", -# "1-499", -# "500-999", -# "1000-1999", -# "2000-2999", -# "3000+") -# ), -# house_ten = fct_collapse(factor(TEN1), Mortgaged = "2", Outright = "1", Rented = c("3", "4"), Free = "5", Other = c("6", "-9", "-8")), -# house_resp = fct_other(as_factor(HRPID), "Yes", other_level = "No"), -# # Missing variable outside safe lab -# # children = fct_other(factor(FDPCH16), c("0", "1"), other_level = "2+"), -# # region = str_extract(NUTS102, "(?<=^UK)\\w"), -# # Totally missing variable? -# # caring = factor(as.numeric(NOLWM == 3), labels = c("No", "Yes")) -# ) |> -# select( -# # uc_income, lba_income, uc_receipt, -# age, cit, disab, employment, educ, gender, marsta, emp_len, seeking, -# income, i_0, i_m, i_l, i_c, house_ten, house_resp -# # idhh, year -# # children, region, caring -# ) - - -ukmod_tidy <- full_pred_data |> - mutate( - age = dag, - cit = if_else(dcz == 1, "UK", "Other"), # leave out as odd in UKMOD? - disab = factor( - ddi, - levels = c("0", "1"), - labels = c("Not disabled", "Disabled") - ), - employment = case_when( - les %in% 1:3 ~ "Employed", - les == 5 ~ "Unemployed", - les %in% 6:7 ~ "Inactive", - les == 4 ~ "Retired", - les == 8 ~ "Sick or disabled", - TRUE ~ "Other" - ), - student = if_else(les == 6, 1L, 0L), - educ = case_when( - deh == 4 ~ "Degree or College", - deh == 3 ~ "Secondary", - deh == 2 ~ "Lower Secondary", - deh == 5 ~ "Tertiary", - TRUE ~ "None" - ), - gender = factor( - dgn, - levels = c("0", "1"), - labels = c("Female", "Male") - ), - marsta = factor( - if_else(dms == 0, 1, dms), - levels = 1:5, - labels = c("Single", "Married", "Separated", "Divorced", "Widowed") - ), - region = if_else(drgn1 < 3, LETTERS[drgn1 + 2], LETTERS[drgn1 + 1]), - emp_len = case_when( - les %in% c(4:8) ~ "Not in employment", - liwwh < 12 ~ "Less than 12 months", - liwwh < 24 ~ "Between 1 and 2 years", - liwwh < 60 ~ "Between 2 and 5 years", - liwwh < 120 ~ "Between 5 and 10 years", - liwwh < 240 ~ "Between 10 and 20 years", - liwwh >= 240 ~ "20 years or more", - ), - seeking = factor( - lowas, - levels = c("0", "1"), - labels = c("No", "Yes") - ), - children = fct_other(factor(children), c("0", "1"), other_level = "2+"), - income = if_else(yem > 3415, 3415, yem), - i_0 = as.numeric(income == 0), - i_m = as.numeric(income == 3415), - i_l = income * (1 - i_m), - # i.e. if income is max then turn off the continuous - i_c = cut( - income, - c(0, 1, 500, 1000, 2000, 3000, 3500), - right = FALSE, - include.lowest = TRUE, - labels = c("0", - "1-499", - "500-999", - "1000-1999", - "2000-2999", - "3000+") - ), - house_ten = fct_collapse(factor(amrtn), Mortgaged = "1", Outright = "2", Rented = c("3", "4", "5"), Free = "6", Other = "7"), - house_resp = factor(dhr, labels = c("No", "Yes")), - caring = factor(full_pred_data$lcr01 == 0, levels = c(TRUE, FALSE), labels = c("No", "Yes")) - ) |> - select( - year, idhh, uc_income, lba_income, uc_receipt, - age, cit, disab, employment, educ, gender, marsta, region, emp_len, seeking, student, - children, income, i_0, i_m, i_l, i_c, house_ten, house_resp, caring - ) - - -# testing data ------------------------------------------------------------ - -# # testing for comparable income distributions -# -# aps_inc |> select(i_c) |> mutate(source = "aps") |> -# bind_rows(ukmod_tidy |> select(i_c) |> mutate(source = "ukmod")) |> -# ggplot(aes(y = i_c)) + -# geom_bar() + -# facet_wrap(~source, scale = "free_x") -# -# # testing for aps income dist -# -# aps_inc |> -# ggplot(aes(GROSS99)) + geom_histogram() - -# testing for linear effects of income on uc_income - -# ukmod_tidy |> -# filter(i_l !=0) |> -# ggplot(aes(i_l, uc_income)) + geom_point() + -# geom_smooth() -# -# ukmod_tidy |> -# ggplot(aes(yem, uc_income)) + #geom_point() -# geom_smooth() - -# propose income be categorised: -# - 0-500 -# - 500-1000 -# - 1000-2000 -# - 2000-3000 -# - 3000+ - -# ukmod_tidy |> -# ggplot(aes(i_c)) + -# geom_bar() -# -# ukmod_tidy |> -# filter(i_l !=0) |> -# ggplot(aes(i_l)) + -# geom_histogram() +ukmod_tidy <- import_ukmod_data() # predict data from UKMOD ------------------------------------------------- - - - - model_data <- ukmod_tidy |> group_by(year, idhh) |> mutate(lba_income = sum(lba_income), @@ -347,14 +61,14 @@ fit_class_xg <- recipe_class_xg |> pred_class_xg <- test_data |> bind_cols(predict(fit_class_xg, new_data = test_data, type = "prob")) |> mutate(.pred_class = factor(as.numeric(.pred_1 > 0.5), levels = c("1", "0"))) - # mutate(predict(fit_class_xg, new_data = test_data)) +# mutate(predict(fit_class_xg, new_data = test_data)) pred_class_xg |> (\(x) {print(roc_auc(x, uc_receipt, .pred_1)); x})() |> roc_curve(uc_receipt, .pred_1) |> autoplot() - + pred_class_xg |> (\(x) {print(conf_mat(x, uc_receipt, .pred_class)); x})() |> ggplot(aes(uc_receipt, fill = .pred_class)) + @@ -405,24 +119,40 @@ mod_class_log <- logistic_reg() |> set_engine("glm") |> set_mode("classification") -recipe_class_log <- workflow() |> - add_model(mod_class_log) |> - add_formula( - uc_receipt ~ poly(age, 2) + i_c + - # # poly(i_l, 4) + i_0 + i_m + - region + disab + educ + emp_len + - house_ten + house_resp + seeking + student + - n_hh_emp * n_hh_unemp * n_hh_inact + - caring * employment + gender:children + gender:children:employment + - children:employment + student:children + student:caring + marsta:employment + - n_hh_emp:children + n_hh_unemp:children + n_hh_inact:children + - n_hh_emp:caring + n_hh_unemp:caring + n_hh_inact:caring + - gender*marsta*children +recipe_class_log <- recipe( + uc_receipt ~ age + i_c + + region + disab + educ + emp_len + + house_ten + house_resp + seeking + student + + n_hh_emp + n_hh_unemp + n_hh_inact + + caring + employment + gender + marsta + children, + data = train_data +) |> + step_dummy(all_nominal_predictors()) |> + step_interact( + ~ starts_with('gender_'):starts_with('children_') + starts_with('gender_'):starts_with('children_'):starts_with('employment_') + starts_with('children_'):starts_with('employment_') + student:starts_with('children_') + student:starts_with('caring_') + starts_with('marsta_'):starts_with('employment_') + starts_with('n_hh_emp_'):starts_with('children_') + starts_with('n_hh_unemp_'):starts_with('children_') + starts_with('n_hh_inact_'):starts_with('children_') + starts_with('n_hh_emp_'):starts_with('caring_') + starts_with('n_hh_unemp_'):starts_with('caring_') + starts_with('n_hh_inact_'):starts_with('caring_') + starts_with('marsta_')*starts_with('gender_')*starts_with('children_') + ) + +wf_class_log <- workflow() |> + add_model(mod_class_log) |> + add_recipe(recipe_class_log) +# add_formula( +# uc_receipt ~ poly(age, 2) + i_c + +# # # poly(i_l, 4) + i_0 + i_m + +# region + disab + educ + emp_len + +# house_ten + house_resp + seeking + student + +# n_hh_emp * n_hh_unemp * n_hh_inact + +# caring * employment + gender:children + gender:children:employment + +# children:employment + student:children + student:caring + marsta:employment + +# n_hh_emp:children + n_hh_unemp:children + n_hh_inact:children + +# n_hh_emp:caring + n_hh_unemp:caring + n_hh_inact:caring + +# gender*marsta*children +# ) + fit_class_log <- fit( - recipe_class_log, + wf_class_log, data = train_data ) @@ -459,6 +189,23 @@ pred_class_log |> ## practice exports ---------------------------------------------- library(butcher) +library(tidypredict) + +log_mod_exp <- fit_class_log |> extract_fit_engine() + +object.size(log_mod_exp) + +log_mod_sm <- axe_data(log_mod_exp) + +object.size(log_mod_sm) + +log_mod_tiny <- butcher(log_mod_exp) + +weigh(log_mod_tiny) +parse_model(log_mod_exp) +parse_model(log_mod_tiny) + +fit_class_log |> extract_preprocessor() ## mixed effects classification ---------------------------------- @@ -494,10 +241,10 @@ recipe_class_me <- workflow() |> recipe_class_me <- recipe(uc_receipt ~ age + i_c + - disab + educ + gender + emp_len + seeking + student + - house_ten + house_resp + n_hh_emp + n_hh_unemp + n_hh_inact + - children + employment + marsta + caring + region, - data = train_data) |> + disab + educ + gender + emp_len + seeking + student + + house_ten + house_resp + n_hh_emp + n_hh_unemp + n_hh_inact + + children + employment + marsta + caring + region, + data = train_data) |> add_role(region, new_role = "exp_unit") |> step_interact( ~ starts_with("children_"):starts_with("employment_"):starts_with("marsta_"):starts_with("caring_") + @@ -543,6 +290,15 @@ pred_class_me |> scale_y_continuous("Perc predicted", labels = scales::label_percent()) + facet_wrap(~ prob, nrow = 1) +## knn --------------------------------------------------------------------- + +mod_class_knn <- nearest_neighbor(mode = "classification", neighbors = 10) + +recipe_class_knn <- recipe(uc_receipt ~ ., + data = train_data) + +fit_class_knn <- fit(mod_class_knn, recipe_class_knn, data = train_data) + ## with Monte Carlo resampling for cross-validation ---------------------- library(doParallel) @@ -590,9 +346,9 @@ mod_lin_lm <- linear_reg() |> region + disab + educ + gender + emp_len + seeking + house_ten + house_resp + caring + n_hh_emp + n_hh_unemp + n_hh_inact + children * employment * marsta, - # employment * children * seeking * marsta, - # employment * children * seeking * marsta, - # employment * children * seeking * marsta, + # employment * children * seeking * marsta, + # employment * children * seeking * marsta, + # employment * children * seeking * marsta, data = train_data ) @@ -608,6 +364,9 @@ test_data |> geom_point() + geom_abline(slope = 1, intercept = 0, linetype = "dashed") +test_data |> + bind_cols(predict(mod_lin_lm, new_data = test_data)) |> + rsq(uc_income, .pred) mod_lin_xg <- boost_tree(min_n = 3) |> set_mode("regression") |> @@ -629,6 +388,24 @@ test_data |> geom_point() + geom_abline(slope = 1, intercept = 0, linetype = "dashed") +mod_lin_knn <- workflow() |> + add_model(nearest_neighbor(mode = "regression")) |> + add_formula(uc_income ~ poly(age, 2) + + i_c + + # poly(i_l, 4) + i_0 + i_m + + region + disab + educ + gender + emp_len + seeking + + house_ten + house_resp + caring + n_hh_emp + n_hh_unemp + n_hh_inact + + children + employment + marsta) |> + fit(train_data) + +test_data |> + bind_cols(predict(mod_lin_knn, new_data = test_data)) |> + # mutate(.pred = if_else(.pred < 0, 0, .pred)) |> + (\(x) {print(rmse(x, uc_income, .pred)); x})() |> + ggplot(aes(uc_income, .pred)) + + geom_point() + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + mod_lin_forest <- rand_forest(trees = 1000) |> set_engine("ranger") |> @@ -641,12 +418,11 @@ mod_lin_forest <- rand_forest(trees = 1000) |> children + employment + marsta, data = train_data ) - + test_data |> bind_cols(predict(mod_lin_forest, new_data = test_data)) |> mutate(.pred = if_else(.pred < 0, 0, .pred)) |> (\(x) {print(rmse(x, uc_income, .pred)); x})() |> ggplot(aes(uc_income, .pred)) + geom_point() + - geom_abline(slope = 1, intercept = 0, linetype = "dashed") - + geom_abline(slope = 1, intercept = 0, linetype = "dashed") \ No newline at end of file diff --git a/R/prediction_data_import.R b/R/prediction_data_import.R new file mode 100644 index 0000000..055267a --- /dev/null +++ b/R/prediction_data_import.R @@ -0,0 +1,324 @@ +library(tidyverse) +library(tidymodels) +library(furrr) +library(data.table) + +# importing ukmod data --------------------------------------------------------- +import_ukmod_data <- function() { + input_parts <- + tibble( + file = file.path("data/ukmod_out", dir("data/ukmod_out")), + year_policy = str_extract(file, "(?<=uk_)\\w*(?=\\.txt)") + ) |> + mutate(data = future_map(file, fread)) + + + # input_parts |> + # select(-file) |> + # separate(year_policy, + # into = c("year", "policy"), + # sep = "_") |> + # mutate( + # data = map(data, summarise, across(starts_with("b") & + # ends_with("_s"), sum)), + # data = map( + # data, + # pivot_longer, + # everything(), + # names_to = "benefit", + # values_to = "val" + # ) + # ) |> + # unnest(data) |> + # group_by(benefit, policy) |> + # summarise(val = sum(val), .groups = "drop_last") |> + # mutate(diff_val = diff(val)) |> + # filter(abs(diff_val) > 1000) |> + # arrange(abs(diff_val)) |> + # ggplot(aes(val, fct_inorder(benefit))) + + # geom_bar(stat = "identity") + + # facet_wrap(~ policy) + + full_pred_data <- input_parts |> + select(-file) |> + separate(year_policy, + into = c("year", "policy"), + sep = "_") |> + pivot_wider(names_from = policy, values_from = data) |> + mutate( + UCAon = map( + UCAon, + transmute, + uc_income = bho_s + bmu_s + boamt_s + boamtmm_s + boamtxp_s + bsauc_s - brduc_s, + uc_receipt = as.numeric(bsauc_s - brduc_s > 0), + idperson = idperson + ), + LBAon = map( + LBAon, + mutate, + lba_income = bho_s + bmu_s + boamt_s + boamtmm_s + boamtxp_s + bfamt_s + bsadi_s + bsa_s + bwkmt_s - brd_s + ) + ) |> + mutate(comb_data = map2(LBAon, UCAon, left_join, by = "idperson"), + .keep = "unused") |> + unnest(comb_data) |> + arrange(year, idperson) + + + full_pred_data <- full_pred_data |> + group_by(year, idhh) |> + mutate(children = sum(dag < 16)) |> + ungroup() + + ukmod_tidy <- full_pred_data |> + mutate( + age = dag, + age_2 = age ^ 2, + cit = if_else(dcz == 1, "UK", "Other"), + # leave out as odd in UKMOD? + disab = factor( + ddi, + levels = c("0", "1"), + labels = c("Not disabled", "Disabled") + ), + employment = case_when( + les %in% 1:3 ~ "Employed", + les == 5 ~ "Unemployed", + les %in% 6:7 ~ "Inactive", + les == 4 ~ "Retired", + les == 8 ~ "Sick or disabled", + TRUE ~ "Other"), + emp_len = case_when( + les == 5 ~ "Unemployed", + les %in% 6:7 ~ "Inactive", + les == 4 ~ "Retired", + les == 8 ~ "Sick or disabled", + liwwh < 12 ~ "Less than 12 months", + liwwh < 24 ~ "Between 1 and 2 years", + liwwh < 60 ~ "Between 2 and 5 years", + liwwh < 120 ~ "Between 5 and 10 years", + liwwh < 240 ~ "Between 10 and 20 years", + liwwh >= 240 ~ "20 years or more", + ), + student = if_else(les == 6, 1L, 0L), + educ = case_when( + deh == 4 ~ "Degree or College", + deh == 3 ~ "Secondary", + deh == 2 ~ "Lower Secondary", + deh == 5 ~ "Tertiary", + TRUE ~ "None" + ), + gender = factor( + dgn, + levels = c("0", "1"), + labels = c("Female", "Male") + ), + marsta = factor( + if_else(dms == 0, 1L, dms), + levels = 1:5, + labels = c("Single", "Married", "Separated", "Divorced", "Widowed") + ), + region = if_else(drgn1 < 3, LETTERS[drgn1 + 2], LETTERS[drgn1 + 1]), + seeking = factor( + lowas, + levels = c("0", "1"), + labels = c("No", "Yes") + ), + children = fct_other(factor(children), c("0", "1"), other_level = "2+"), + income = if_else(yem > 3415, 3415, yem), + i_0 = as.numeric(income == 0), + i_m = as.numeric(income == 3415), + i_l = income * (1 - i_m), + hours_worked_wk = lhw, + # i.e. if income is max then turn off the continuous + i_c = cut( + income, + c(0, 1, 500, 1000, 2000, 3000, 3500), + right = FALSE, + include.lowest = TRUE, + labels = c("0", + "1-499", + "500-999", + "1000-1999", + "2000-2999", + "3000+") + ), + house_ten = fct_collapse( + factor(amrtn), + Mortgaged = "1", + Outright = "2", + Rented = c("3", "4", "5"), + Free = "6", + Other = "7" + ), + house_resp = factor(dhr, labels = c("No", "Yes")), + caring = factor( + full_pred_data$lcr01 == 0, + levels = c(TRUE, FALSE), + labels = c("No", "Yes") + ) + ) |> + select( + year, + idhh, + uc_income, + lba_income, + uc_receipt, + age, + cit, + disab, + employment, + educ, + gender, + marsta, + region, + emp_len, + seeking, + student, + children, + income, + i_0, + i_m, + i_l, + i_c, + hours_worked_wk, + house_ten, + house_resp, + caring + ) + + ukmod_tidy +} + + + + +# importing aps data ------------------------------------------------------ + +import_aps_data <- function(aps_data) { + aps_tidy <- aps_data |> + mutate( + age = AGE, + age_2 = age ^ 2, + cit = if_else(NTNLTY12 == 926, "UK", "Other"), + # leave out as odd in UKMOD? + disab = fct_collapse( + as_factor(DISEA), + `Disabled` = "Equality Act Disabled", + `Not disabled` = c("Not Equality Act Disabled", "Does not apply", "No answer") + ), + employment = case_when( + INECAC05 %in% 1:4 ~ "Employed", + INECAC05 == 5 ~ "Unemployed", + INECAC05 %in% 12:33 ~ "Inactive", + INECAC05 %in% c(20, 31) ~ "Retired", + INECAC05 %in% c(15:16, 26:27) ~ "Sick or disabled", + TRUE ~ "Other" + ), + student = if_else(STUCUR == 1, 1L, 0L), + educ = case_when( + if_any(matches("QUAL_[1-9]$"), ~ .x == 1) ~ "Degree or College", + if_any(matches("QUAL_1[0-7]$"), ~ .x == 1) | + QUAL_23 == 1 | QUAL_29 == 1 | QUAL_30 == 1 ~ "Secondary", + if_any(matches("QUAL_(18|19|2[0-4])$"), ~ .x == 1) ~ "Secondary", + if_any(matches("QUAL_(2[5-9]|3[0-5])$"), ~ .x == 1) ~ "Tertiary", + TRUE ~ "None" + ), + gender = as_factor(SEX), + marsta = case_when( + MARSTA == 1 ~ "Single", + MARSTA %in% c(2, 6) ~ "Married", + MARSTA %in% c(3, 7) ~ "Separated", + MARSTA %in% c(4, 8) ~ "Divorced", + MARSTA %in% c(5, 9) ~ "Widowed", + TRUE ~ "Single" + ), + emp_len = case_when( + INECAC05 == 5 ~ "Unemployed", + INECAC05 %in% 12:33 ~ "Inactive", + INECAC05 %in% c(20, 31) ~ "Retired", + INECAC05 %in% c(15:16, 26:27) ~ "Sick or disabled", + EMPMON < 12 ~ "Less than 12 months", + EMPMON < 24 ~ "Between 1 and 2 years", + EMPMON < 60 ~ "Between 2 and 5 years", + EMPMON < 120 ~ "Between 5 and 10 years", + EMPMON < 240 ~ "Between 10 and 20 years", + EMPMON >= 240 ~ "20 years or more", + ), + seeking = if_else(ILODEFR == 2, "Yes", "No"), + GRSSWK = if_else(GRSSWK < 0, 0, GRSSWK), + GRSSWK2 = if_else(GRSSWK2 < 0, 0, GRSSWK2), + income = 52 * (GRSSWK + GRSSWK2) / 12, + income = if_else(income > 3415, 3415, income), + income = if_else(is.na(income), 0, income), + i_0 = as.numeric(income == 0), + i_m = as.numeric(income == 3415), + i_l = income * (1 - i_m), + # i.e. if income is max then turn off the continuous + i_c = cut( + income, + c(0, 1, 500, 1000, 2000, 3000, 3500), + right = FALSE, + include.lowest = TRUE, + labels = c("0", + "1-499", + "500-999", + "1000-1999", + "2000-2999", + "3000+") + ), + hours_worked_wk = if_else(lhw %in% -9:-8, 0, lhw), + house_ten = fct_collapse( + factor(TEN1), + Mortgaged = "2", + Outright = "1", + Rented = c("3", "4"), + Free = "5", + Other = c("6", "-9", "-8") + ), + house_resp = fct_other(as_factor(HRPID), "Yes", other_level = "No"), + # Missing variable outside safe lab + # children = fct_other(factor(FDPCH16), c("0", "1"), other_level = "2+"), + # region = str_extract(NUTS132, "(?<=^UK)\\w"), + # caring = factor(as.numeric(NOLWM == 3), labels = c("No", "Yes")) + ) |> + select( + # uc_income, lba_income, uc_receipt, + age, + cit, + disab, + employment, + educ, + gender, + marsta, + emp_len, + seeking, + income, + i_0, + i_m, + i_l, + i_c, + hours_worked_wk, + house_ten, + house_resp + # idhh, year + # children, region, caring + ) + + aps_tidy +} + +process_aps_data <- function(aps_data) { + recipe_class_log <- recipe( + uc_receipt ~ ., + data = aps_data + ) |> + step_interact( + # ~ starts_with('gender_'):starts_with('children_') + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + starts_with('children_'):starts_with('emp_len_') + student:starts_with('children_') + student:starts_with('caring_') + starts_with('n_hh_emp_'):starts_with('children_') + starts_with('n_hh_unemp_'):starts_with('children_') + starts_with('n_hh_inact_'):starts_with('children_') + starts_with('n_hh_emp_'):starts_with('caring_') + starts_with('n_hh_unemp_'):starts_with('caring_') + starts_with('n_hh_inact_'):starts_with('caring_') + starts_with('marsta_')*starts_with('gender_')*starts_with('children_') + ~ starts_with('gender_'):starts_with('children_') + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + starts_with('children_'):starts_with('emp_len_') + student:starts_with('children_') + student:starts_with('caring_') + starts_with('marsta_')*starts_with('gender_')*starts_with('children_') + ) + + recipe_class_log |> + prep() |> + bake(aps_data) +} diff --git a/R/temp_testing_ddd.R b/R/temp_testing_ddd.R new file mode 100644 index 0000000..b5d72ab --- /dev/null +++ b/R/temp_testing_ddd.R @@ -0,0 +1,39 @@ +library(tidyverse) +library(did2s) + +df_het |> + ggplot(aes(year, dep_var, colour= group)) + + stat_summary(geom = "line") + + stat_summary() + +mod_df <- tibble(state = 1:40, + l = if_else(state %in% sample(40, 20), 1, 0)) |> + left_join(df_het, by = "state") |> + mutate(dep_var = if_else(l == 1 & treat, dep_var +0.5 , dep_var)) + +dids <- did2s(mod_df, + yname = "dep_var", + treatment = "treat", + first_stage = ~ 0 | unit + year, + second_stage = ~ i(rel_year, ref = c(-1, Inf)), + cluster_var = "unit") + +iplot(dids) + +feols(dep_var ~ i(rel_year, ref = c(-1, Inf))| unit + year, fsplit = ~ l , data = mod_df) + +lm(dep_var ~ i(rel_year, ref = c(-1, Inf)) * l + unit + year, data = mod_df) + +dids |> + ggplot(aes(rel)) + +dd1 <- event_study(mod_df |> filter(l == 1), "dep_var", idname = "unit", + tname = "year", gname = "g", estimator = "did") + +dd0 <- event_study(mod_df |> filter(l == 0), "dep_var", idname = "unit", + tname = "year", gname = "g", estimator = "did") + +library(patchwork) +library(bacon) + +plot_event_study(dd1) + plot_event_study(dd0) diff --git a/R/testing_for_missing_data.R b/R/testing_for_missing_data.R index e6699e0..b81d060 100644 --- a/R/testing_for_missing_data.R +++ b/R/testing_for_missing_data.R @@ -29,6 +29,14 @@ full_dataset <- "anxious", "happy", "worth", + "grsswk", + "grsswk2", + "empmon", + # "nolwm", + "ioutcome", + "caind", + "sumhrs", + "inecac05", # "ethuk11", # "fdpch19", # "oycirc", @@ -42,14 +50,23 @@ full_dataset <- # "limitk", # "pwta18", # "pwta20", + # "stat", "benfts"), as.data.table = TRUE ) full_dataset |> - reduce(bind_rows) |> - summarise(across(.fns = ~sum(.x == -8)/n())) - + reduce(bind_rows) |> + # ggplot(aes(x = sumhrs)) + geom_histogram() + # count(ioutcome, satis) |> + # pivot_wider(names_from = satis, values_from = n) + filter(ioutcome == 1, caind == 1) |> + summarise(across(.fns = ~sum(.x == -8|.x == -9)/n())) +full_dataset |> + reduce(bind_rows) |> + select(sumhrs, inecac05) |> + mutate(sumhrsna = if_else(sumhrs == -8, 1, 0)) |> + summarise(sum(sumhrsna)/n()) diff --git a/R/testing_relationship_uc_work_pay.R b/R/testing_relationship_uc_work_pay.R new file mode 100644 index 0000000..d97288c --- /dev/null +++ b/R/testing_relationship_uc_work_pay.R @@ -0,0 +1,80 @@ +library(fst) +library(data.table) +library(tidyverse) +library(dtplyr) +library(lubridate) +library(SPHSUgraphs) +library(patchwork) + +theme_set(theme_sphsu_light()) + +apr14_mar21 <- c( + "data/fst_files/apr14_mar15.fst", + "data/fst_files/apr15_mar16.fst", + "data/fst_files/apr16_mar17.fst", + "data/fst_files/apr17_mar18.fst", + "data/fst_files/apr18_mar19.fst", + "data/fst_files/apr19_mar20.fst", + "data/fst_files/apr20_mar21.fst" +) |> map(read_fst, c( + "age", + "tpbn1301", + "tpbn1302", + "tpbn1303", + "tpbn1304", + "tpbn1305", + "tpbn1306", + "tpbn1307", + "tpbn1308", + "tpbn1309", + "tpbn1310", + "grsswk", + "sumhrs", + "refdte")) + +wts1 <- c( + "data/fst_files/apr14_mar15.fst", + "data/fst_files/apr15_mar16.fst", + "data/fst_files/apr16_mar17.fst", + "data/fst_files/apr17_mar18.fst", + "data/fst_files/apr18_mar19.fst" + # "data/fst_files/apr19_mar20.fst" + # "data/fst_files/apr20_mar21.fst" +) |> map(read_fst, "pwta18") |> + map(`colnames<-`, "weight") + +wts2 <- c("data/fst_files/apr19_mar20.fst", + "data/fst_files/apr20_mar21.fst") |> + map(read_fst, "pwta20") |> + map(`colnames<-`, "weight") + + +# combined_dt <- apr14_mar21 |> +# map2(append(wts1, wts2), cbind) |> +# reduce(bind_rows) |> +# mutate(date = floor_date(dmy(refdte), "months")) |> +# as.data.table() + +apr14_mar21_dt <- apr14_mar21 |> + map2(append(wts1, wts2), cbind) |> + reduce(bind_rows) |> + mutate(date = floor_date(dmy(refdte), "months")) |> + as.data.table() + +apr14_mar21_dt <- apr14_mar21_dt[age > 17 & age < 66] + +apr14_mar21_dt[,uc := rowSums(.SD == 1), .SD = 2:11] +apr14_mar21_dt[,tax_cr := rowSums(.SD == 3), .SD = 2:11] +apr14_mar21_dt[,housing := rowSums(.SD == 2), .SD = 2:11] +apr14_mar21_dt[,income_s := rowSums(.SD == 4), .SD = 2:11] +apr14_mar21_dt[,jsa := rowSums(.SD == 5), .SD = 2:11] +apr14_mar21_dt[,disab := rowSums(.SD == 6), .SD = 2:11] +apr14_mar21_dt[,other := as.numeric(rowSums(.SD)>0), .SD = tax_cr:jsa] + +# linear model testing uc effect on earned income not mediated through hours worked +library(magrittr) + +apr14_mar21_dt |> + filter((uc == 1 | other == 1) & sumhrs > 0) %$% + lm(grsswk ~ uc + sumhrs) |> + summary() diff --git a/pred_models.Rmd b/pred_models.Rmd new file mode 100644 index 0000000..e7d3a88 --- /dev/null +++ b/pred_models.Rmd @@ -0,0 +1,330 @@ +--- +title: "Predicting benefit receipt - 3 methods" +output: + html_document: + df_print: paged + toc: true + number_sections: true +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(tidyverse) +library(tidymodels) +library(SPHSUgraphs) +library(furrr) +library(janitor) +library(data.table) +library(themis) +library(magrittr) + +source("R/prediction_data_import.R") + +ukmod_tidy <- import_ukmod_data() + +print_perc <- function(x) paste0(sprintf("%.1f", 100 * x), "\\%") + +``` + +```{r data_import} + +model_data <- ukmod_tidy |> + group_by(year, idhh) |> + mutate(lba_income = sum(lba_income), + uc_income = max(uc_income), + uc_receipt = max(uc_receipt), + n_hh_emp = sum(employment == "Employed"), + n_hh_unemp = sum(employment == "Unemployed"), + n_hh_inact = sum(employment == "Inactive")) |> + ungroup() |> + filter(age > 17 & age < 66) |> + select(-idhh, -i_0, -i_m, -i_l, - income, -employment) |> + mutate( + year = as.integer(year), + # p_hh_emp = if_else(employment == "Employed" & n_hh_emp > 0, 1, 0), + n_hh_emp = fct_other(factor(n_hh_emp), c("0", "1"), other_level = "2+"), + n_hh_unemp = fct_other(factor(n_hh_unemp), c("0", "1"), other_level = "2+"), + n_hh_inact = fct_other(factor(n_hh_inact), c("0", "1"), other_level = "2+"), + benefit_change = uc_income - lba_income, + ) |> + fastDummies::dummy_cols(remove_first_dummy = TRUE, remove_selected_columns = TRUE) |> + janitor::clean_names() |> + select(-starts_with("n_hh")) |> + mutate(uc_receipt = factor(uc_receipt, levels = 1:0, labels = c("Yes", "No"))) + +set.seed(123) + +data_split <- initial_split(model_data, prop = 0.8, strata = year) + +train_data <- training(data_split) |> select(-year) +test_data <- testing(data_split) |> select(-year) + +DT::datatable(head(model_data, 10)) + +``` + + +# Predicting by individual observation (no household data) + +## Predicting benefit types separately + +### UC receipt amount + +```{r} + +train_data_uc_income <- train_data |> select(-uc_receipt, -lba_income, -benefit_change) +test_data_uc_income <- test_data |> select(-uc_receipt, -lba_income, -benefit_change) + +rc_income <- recipe(uc_income ~ ., + data = train_data_uc_income) |> + step_interact( + ~ starts_with('gender_'):starts_with('children_') + + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + + starts_with('children_'):starts_with('emp_len_') + + student:starts_with('children_') + student:starts_with('caring_') + + starts_with('marsta_') * starts_with('gender_') * starts_with('children_') + ) + +mod_ln <- linear_reg(mode = "regression") + +wf_ln <- workflow() |> + add_recipe(rc_income) |> + add_model(mod_ln) + +mod_ln_uc <- fit(wf_ln, data = train_data_uc_income) + +pred_uc_income <- + test_data_uc_income |> + bind_cols(predict(mod_ln_uc, new_data = test_data_uc_income)) + +uc_rsq <- rsq_vec(pred_uc_income$uc_income, pred_uc_income$.pred) +uc_rmse <- rmse_vec(pred_uc_income$uc_income, pred_uc_income$.pred) + +pred_uc_income |> + select(observed = uc_income, predicted = .pred) |> + ggplot(aes(observed, predicted)) + + geom_point(colour = sphsu_cols("Pumpkin"), names = FALSE) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_hline(yintercept = 0) + + scale_colour_sphsu() + +``` + +```{r, echo=FALSE, results='asis'} + +cat(glue::glue("For this model, $R^2 = {round(uc_rsq, 3)}$ and root mean squared error $RMSE = {round(uc_rmse, 1)}$")) + + +``` + +### Legacy benefit receipt amount + +```{r} +train_data_lba_income <- train_data |> select(-uc_receipt, -uc_income, -benefit_change) +test_data_lba_income <- test_data |> select(-uc_receipt, -uc_income, -benefit_change) + +rc_income <- recipe(lba_income ~ ., + data = train_data_lba_income) |> + step_interact( + ~ starts_with('gender_'):starts_with('children_') + + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + + starts_with('children_'):starts_with('emp_len_') + + student:starts_with('children_') + student:starts_with('caring_') + + starts_with('marsta_') * starts_with('gender_') * starts_with('children_') + ) + +wf_ln <- workflow() |> + add_recipe(rc_income) |> + add_model(mod_ln) + +mod_ln_lb <- fit(wf_ln, data = train_data_lba_income) + +pred_lb_income <- + test_data_lba_income |> + bind_cols(predict(mod_ln_lb, new_data = test_data_lba_income)) + +lb_rsq <- rsq_vec(pred_lb_income$lba_income, pred_lb_income$.pred) +lb_rmse <- rmse_vec(pred_lb_income$lba_income, pred_lb_income$.pred) + +pred_lb_income |> + select(observed = lba_income, predicted = .pred) |> + ggplot(aes(observed, predicted)) + + geom_point(colour = sphsu_cols("University Blue", names = FALSE)) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_hline(yintercept = 0) + + scale_colour_sphsu() + +``` + + +```{r, echo=FALSE, results='asis'} + +cat(glue::glue("For this model, $R^2 = {round(lb_rsq, 3)}$ and root mean squared error $RMSE = {round(lb_rmse, 1)}$")) + + +``` + +### Plotting both income types + +```{r} + +pred_uc_income |> select(observed = uc_income, predicted = .pred) |> + mutate(source = "Universal Credit") |> + bind_rows( + pred_lb_income |> select(observed = lba_income, predicted = .pred) |> + mutate(source = "Legacy Benefits") + ) |> + ggplot(aes(observed, predicted, colour = source)) + + geom_point() + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_hline(yintercept = 0) + + facet_wrap(~source) + + scale_colour_sphsu() + +``` + +## Predicting difference + +```{r} + +train_data_benefit_change <- train_data |> select(-uc_receipt, -uc_income, -lba_income) +test_data_benefit_change <- test_data |> select(-uc_receipt, -uc_income, -lba_income) + + +rc_income <- recipe(benefit_change ~ ., + data = train_data_benefit_change) |> + step_interact( + ~ starts_with('gender_'):starts_with('children_') + + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + + starts_with('children_'):starts_with('emp_len_') + + student:starts_with('children_') + student:starts_with('caring_') + + starts_with('marsta_') * starts_with('gender_') * starts_with('children_') + ) + +wf_ln <- workflow() |> + add_recipe(rc_income) |> + add_model(mod_ln) + +mod_ln_lb <- fit(wf_ln, data = train_data_benefit_change) + +pred_benefit_change <- + test_data_benefit_change |> + bind_cols(predict(mod_ln_lb, new_data = test_data_benefit_change)) + +bc_rsq <- rsq_vec(pred_benefit_change$benefit_change, pred_benefit_change$.pred) +bc_rmse <- rmse_vec(pred_benefit_change$benefit_change, pred_benefit_change$.pred) +``` + +```{r, echo=FALSE, results='asis'} + +cat(glue::glue("For this model, $R^2 = {round(bc_rsq, 3)}$ and root mean squared error $RMSE = {round(bc_rmse, 1)}$")) + + +``` + + +### Plotting + +```{r} +pred_benefit_change |> + select(observed = benefit_change, predicted = .pred) |> + ggplot(aes(observed, predicted)) + + geom_point() + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_hline(yintercept = 0) + + scale_colour_sphsu() +``` + +## Trichotomise + +```{r} +model_data_tri <- model_data |> + mutate(benefit_change = case_when( + benefit_change/lba_income >= 0.02 ~ "Increase", + abs(benefit_change/lba_income) < 0.02 ~ "No change", + TRUE ~ "Decrease" + ), + benefit_change = factor(benefit_change, levels = c("Decrease", "No change", "Increase"))) |> + select(-uc_income, -lba_income, -uc_receipt) + +data_split <- initial_split(model_data_tri, prop = 0.8, strata = year) + +train_data <- training(data_split) |> select(-year) +test_data <- testing(data_split) |> select(-year) + + +rc_income <- recipe(benefit_change ~ ., + data = train_data) |> + step_interact( + ~ starts_with('gender_'):starts_with('children_') + + starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') + + starts_with('children_'):starts_with('emp_len_') + + student:starts_with('children_') + student:starts_with('caring_') + + starts_with('marsta_') * starts_with('gender_') * starts_with('children_') + ) + +mod_mlogit <- multinom_reg(penalty = double(1), mixture = double(1)) |> + set_engine("glmnet") + + +imbal_rec <- rc_income |> + step_smote(benefit_change, over_ratio = 0.75) + +wf_ln <- workflow() |> + add_recipe(imbal_rec) |> + add_model(mod_mlogit) + +mod_ml_tri <- fit(wf_ln, data = train_data) + +pred_benefit_change <- + test_data |> + bind_cols(predict(mod_ml_tri, new_data = test_data)) + +pred_benefit_change_prob <- + test_data |> + bind_cols(predict(mod_ml_tri, new_data = test_data, type = "prob")) + +acc_bc <- pred_benefit_change %$% accuracy_vec(benefit_change, .pred_class) +roc_inc <- pred_benefit_change_prob %$% roc_auc_vec(benefit_change, as.matrix(data.frame(.pred_Decrease, `.pred_No change`, .pred_Increase))) + +sens_dec <- pred_benefit_change |> + mutate(ob_decrease = fct_other(benefit_change, keep = "Decrease"), + pred_decrease = fct_other(.pred_class, keep = "Decrease")) %$% + sens_vec(ob_decrease, pred_decrease) + +sens_inc <- pred_benefit_change |> + mutate(ob_increase = fct_other(benefit_change, keep = "Increase"), + pred_increase = fct_other(.pred_class, keep = "Increase")) %$% + sens_vec(ob_increase, pred_increase) +``` + +```{r} +pred_benefit_change |> + ggplot(aes(benefit_change, .pred_class)) + + geom_count(shape = 15) + + ylab("Predicted change") + + xlab("Observed change") + + scale_size_continuous(range = c(1, 40)) +``` + + +```{r, echo = FALSE, results = 'asis'} + +cat(glue::glue("This model has an accuracy of ${print_perc(acc_bc)}$, a sensitivity for detecting decreasing benefit changes of ${print_perc(sens_dec)}$ and a sensitivity of detecting increasing benefits of ${print_perc(sens_inc)}$ (area under ROC-curve = ${round(roc_inc, 3)}$).")) + +``` + + +### ROC curves + +```{r} +pred_benefit_change_prob |> roc_curve(benefit_change, .pred_Decrease, `.pred_No change`, .pred_Increase) |> autoplot() +``` + +Proportions of predictions within each category of observed benefit change: + +```{r} +pred_benefit_change %$% table(benefit_change, .pred_class) |> prop.table(margin = 1) +``` + + diff --git a/pred_models.html b/pred_models.html new file mode 100644 index 0000000..0ab10d7 --- /dev/null +++ b/pred_models.html @@ -0,0 +1,4619 @@ + + + + +
+ + + + + + + + +model_data <- ukmod_tidy |>
+ group_by(year, idhh) |>
+ mutate(lba_income = sum(lba_income),
+ uc_income = max(uc_income),
+ uc_receipt = max(uc_receipt),
+ n_hh_emp = sum(employment == "Employed"),
+ n_hh_unemp = sum(employment == "Unemployed"),
+ n_hh_inact = sum(employment == "Inactive")) |>
+ ungroup() |>
+ filter(age > 17 & age < 66) |>
+ select(-idhh, -i_0, -i_m, -i_l, - income, -employment) |>
+ mutate(
+ year = as.integer(year),
+ # p_hh_emp = if_else(employment == "Employed" & n_hh_emp > 0, 1, 0),
+ n_hh_emp = fct_other(factor(n_hh_emp), c("0", "1"), other_level = "2+"),
+ n_hh_unemp = fct_other(factor(n_hh_unemp), c("0", "1"), other_level = "2+"),
+ n_hh_inact = fct_other(factor(n_hh_inact), c("0", "1"), other_level = "2+"),
+ benefit_change = uc_income - lba_income,
+ ) |>
+ fastDummies::dummy_cols(remove_first_dummy = TRUE, remove_selected_columns = TRUE) |>
+ janitor::clean_names() |>
+ select(-starts_with("n_hh")) |>
+ mutate(uc_receipt = factor(uc_receipt, levels = 1:0, labels = c("Yes", "No")))
+
+set.seed(123)
+
+data_split <- initial_split(model_data, prop = 0.8, strata = year)
+
+train_data <- training(data_split) |> select(-year)
+test_data <- testing(data_split) |> select(-year)
+
+DT::datatable(head(model_data, 10))
+
+
+train_data_uc_income <- train_data |> select(-uc_receipt, -lba_income, -benefit_change)
+test_data_uc_income <- test_data |> select(-uc_receipt, -lba_income, -benefit_change)
+
+rc_income <- recipe(uc_income ~ .,
+ data = train_data_uc_income) |>
+ step_interact(
+ ~ starts_with('gender_'):starts_with('children_') +
+ starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') +
+ starts_with('children_'):starts_with('emp_len_') +
+ student:starts_with('children_') + student:starts_with('caring_') +
+ starts_with('marsta_') * starts_with('gender_') * starts_with('children_')
+ )
+
+mod_ln <- linear_reg(mode = "regression")
+
+wf_ln <- workflow() |>
+ add_recipe(rc_income) |>
+ add_model(mod_ln)
+
+mod_ln_uc <- fit(wf_ln, data = train_data_uc_income)
+
+pred_uc_income <-
+ test_data_uc_income |>
+ bind_cols(predict(mod_ln_uc, new_data = test_data_uc_income))
+
+uc_rsq <- rsq_vec(pred_uc_income$uc_income, pred_uc_income$.pred)
+uc_rmse <- rmse_vec(pred_uc_income$uc_income, pred_uc_income$.pred)
+
+pred_uc_income |>
+ select(observed = uc_income, predicted = .pred) |>
+ ggplot(aes(observed, predicted)) +
+ geom_point(colour = sphsu_cols("Pumpkin"), names = FALSE) +
+ geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
+ geom_hline(yintercept = 0) +
+ scale_colour_sphsu()
+## Warning in geom_point(colour = sphsu_cols("Pumpkin"), names = FALSE): Ignoring
+## unknown parameters: `names`
+
+For this model, \(R^2 = 0.475\) and +root mean squared error \(RMSE = +313.6\)
+train_data_lba_income <- train_data |> select(-uc_receipt, -uc_income, -benefit_change)
+test_data_lba_income <- test_data |> select(-uc_receipt, -uc_income, -benefit_change)
+
+rc_income <- recipe(lba_income ~ .,
+ data = train_data_lba_income) |>
+ step_interact(
+ ~ starts_with('gender_'):starts_with('children_') +
+ starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') +
+ starts_with('children_'):starts_with('emp_len_') +
+ student:starts_with('children_') + student:starts_with('caring_') +
+ starts_with('marsta_') * starts_with('gender_') * starts_with('children_')
+ )
+
+wf_ln <- workflow() |>
+ add_recipe(rc_income) |>
+ add_model(mod_ln)
+
+mod_ln_lb <- fit(wf_ln, data = train_data_lba_income)
+
+pred_lb_income <-
+ test_data_lba_income |>
+ bind_cols(predict(mod_ln_lb, new_data = test_data_lba_income))
+
+lb_rsq <- rsq_vec(pred_lb_income$lba_income, pred_lb_income$.pred)
+lb_rmse <- rmse_vec(pred_lb_income$lba_income, pred_lb_income$.pred)
+
+pred_lb_income |>
+ select(observed = lba_income, predicted = .pred) |>
+ ggplot(aes(observed, predicted)) +
+ geom_point(colour = sphsu_cols("University Blue", names = FALSE)) +
+ geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
+ geom_hline(yintercept = 0) +
+ scale_colour_sphsu()
+
+For this model, \(R^2 = 0.437\) and +root mean squared error \(RMSE = +342.1\)
+pred_uc_income |> select(observed = uc_income, predicted = .pred) |>
+ mutate(source = "Universal Credit") |>
+ bind_rows(
+ pred_lb_income |> select(observed = lba_income, predicted = .pred) |>
+ mutate(source = "Legacy Benefits")
+ ) |>
+ ggplot(aes(observed, predicted, colour = source)) +
+ geom_point() +
+ geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
+ geom_hline(yintercept = 0) +
+ facet_wrap(~source) +
+ scale_colour_sphsu()
+
+train_data_benefit_change <- train_data |> select(-uc_receipt, -uc_income, -lba_income)
+test_data_benefit_change <- test_data |> select(-uc_receipt, -uc_income, -lba_income)
+
+
+rc_income <- recipe(benefit_change ~ .,
+ data = train_data_benefit_change) |>
+ step_interact(
+ ~ starts_with('gender_'):starts_with('children_') +
+ starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') +
+ starts_with('children_'):starts_with('emp_len_') +
+ student:starts_with('children_') + student:starts_with('caring_') +
+ starts_with('marsta_') * starts_with('gender_') * starts_with('children_')
+ )
+
+wf_ln <- workflow() |>
+ add_recipe(rc_income) |>
+ add_model(mod_ln)
+
+mod_ln_lb <- fit(wf_ln, data = train_data_benefit_change)
+
+pred_benefit_change <-
+ test_data_benefit_change |>
+ bind_cols(predict(mod_ln_lb, new_data = test_data_benefit_change))
+
+bc_rsq <- rsq_vec(pred_benefit_change$benefit_change, pred_benefit_change$.pred)
+bc_rmse <- rmse_vec(pred_benefit_change$benefit_change, pred_benefit_change$.pred)
+For this model, \(R^2 = 0.023\) and +root mean squared error \(RMSE = +169.1\)
+pred_benefit_change |>
+ select(observed = benefit_change, predicted = .pred) |>
+ ggplot(aes(observed, predicted)) +
+ geom_point() +
+ geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
+ geom_hline(yintercept = 0) +
+ scale_colour_sphsu()
+
+model_data_tri <- model_data |>
+ mutate(benefit_change = case_when(
+ benefit_change/lba_income >= 0.02 ~ "Increase",
+ abs(benefit_change/lba_income) < 0.02 ~ "No change",
+ TRUE ~ "Decrease"
+ ),
+ benefit_change = factor(benefit_change, levels = c("Decrease", "No change", "Increase"))) |>
+ select(-uc_income, -lba_income, -uc_receipt)
+
+data_split <- initial_split(model_data_tri, prop = 0.8, strata = year)
+
+train_data <- training(data_split) |> select(-year)
+test_data <- testing(data_split) |> select(-year)
+
+
+rc_income <- recipe(benefit_change ~ .,
+ data = train_data) |>
+ step_interact(
+ ~ starts_with('gender_'):starts_with('children_') +
+ starts_with('gender_'):starts_with('children_'):starts_with('emp_len_') +
+ starts_with('children_'):starts_with('emp_len_') +
+ student:starts_with('children_') + student:starts_with('caring_') +
+ starts_with('marsta_') * starts_with('gender_') * starts_with('children_')
+ )
+
+mod_mlogit <- multinom_reg(penalty = double(1), mixture = double(1)) |>
+ set_engine("glmnet")
+
+
+imbal_rec <- rc_income |>
+ step_smote(benefit_change, over_ratio = 0.75)
+
+wf_ln <- workflow() |>
+ add_recipe(imbal_rec) |>
+ add_model(mod_mlogit)
+
+mod_ml_tri <- fit(wf_ln, data = train_data)
+
+pred_benefit_change <-
+ test_data |>
+ bind_cols(predict(mod_ml_tri, new_data = test_data))
+
+pred_benefit_change_prob <-
+ test_data |>
+ bind_cols(predict(mod_ml_tri, new_data = test_data, type = "prob"))
+
+acc_bc <- pred_benefit_change %$% accuracy_vec(benefit_change, .pred_class)
+roc_inc <- pred_benefit_change_prob %$% roc_auc_vec(benefit_change, as.matrix(data.frame(.pred_Decrease, `.pred_No change`, .pred_Increase)))
+
+sens_dec <- pred_benefit_change |>
+ mutate(ob_decrease = fct_other(benefit_change, keep = "Decrease"),
+ pred_decrease = fct_other(.pred_class, keep = "Decrease")) %$%
+ sens_vec(ob_decrease, pred_decrease)
+
+sens_inc <- pred_benefit_change |>
+ mutate(ob_increase = fct_other(benefit_change, keep = "Increase"),
+ pred_increase = fct_other(.pred_class, keep = "Increase")) %$%
+ sens_vec(ob_increase, pred_increase)
+pred_benefit_change |>
+ ggplot(aes(benefit_change, .pred_class)) +
+ geom_count(shape = 15) +
+ ylab("Predicted change") +
+ xlab("Observed change") +
+ scale_size_continuous(range = c(1, 40))
+
+This model has an accuracy of \(69.7\%\), a sensitivity for detecting +decreasing benefit changes of \(72.0\%\) and a sensitivity of detecting +increasing benefits of \(65.7\%\) (area +under ROC-curve = \(0.78\)).
+pred_benefit_change_prob |> roc_curve(benefit_change, .pred_Decrease, `.pred_No change`, .pred_Increase) |> autoplot()
+
+Proportions of predictions within each category of observed benefit +change:
+pred_benefit_change %$% table(benefit_change, .pred_class) |> prop.table(margin = 1)
+## .pred_class
+## benefit_change Decrease No change Increase
+## Decrease 0.7203836 0.1154313 0.1641851
+## No change 0.2739593 0.4811229 0.2449177
+## Increase 0.2074653 0.1351273 0.6574074
+