Cummulative antihypertensive dosing in children with CKD (CKiD research project) .Rmd

---
title: "CKiD research project"
author: "Benjamin Matta, MD"
date: "December 22, 2018"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# CKiD research project

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this...

Importing data from CKiD database:

```{r importing data, echo=FALSE}
#loading libraries
library(readr)
library(corrplot)
library(dplyr)
library(reshape2)
library(histogram)
library(gplots)
library(tidyr)
library(stats)
library(tibble)
library(ggplot2)
# importing data
wd<-if_else(Sys.info()["nodename"]=='matta', "~/Documents/Research/CKiDgit", "C:/Users/Orit/Downloads/CKiD/CKiDgit")
setwd(wd)
# importing data
medsum_full <- read.csv("data/medsum_full.csv")
cardio<-read.csv("data/cardio.csv")
echo<-read.csv("data/echo.csv")
f13 <- read.csv("data/f13.csv")
f14 <- read.csv("data/f14.csv")
f15 <- read.csv("data/f15.csv")
socdem <- read.csv("data/socdem.csv")
growth <- read.csv("data/growth.csv")
pe <- read.csv("data/pe.csv")
gfr <- read.csv("data/gfrcalibratedsummary.csv")
advr <- read.csv("data/advr.csv")
kidhist <- read.csv("data/kidhist.csv")
l05 <- read.csv("data/l05.csv")
l02 <- read.csv("data/l02.csv")
l51 <- read.csv("data/l51.csv")
l31 <- read.csv("data/l31.csv")
gh <- read.csv("data/gh.csv")
```

# Rearranging data

The following section combines all the data into a single dataframe in preparation for analysis

```{r rearranging_data}
test<-f13 %>% select(CASEID,VISIT,GHVISDAT,GHHGRADM,GHHGRADF,GHTOTINC,GHBFAMHB,contains("HBP")) %>% tbl_df
test<-full_join(socdem %>% select(-VERSION),test)
test<-full_join(f14 %>% select(CASEID,VISIT,MHCHRLD,MH_BPD, MHFEMALE),test)
test<-full_join(f15 %>% select(CASEID,VISIT,contains("NSSTER"),NSSTPYBP,NSSTSEBP), test)
test<-full_join(growth %>% select(CASEID,VISIT,LBW,PREMATURE,AVHEIGHT,HTPCTAG,AVWEIGHT,WTPCTAG,BMI,BMIPCTAG),test)
test<-full_join(pe %>% select(CASEID,VISIT,PEPRBLPM,PEPRWEIT),test)
test<-full_join(gfr %>% select(CASEID,VISIT,MALE1FE0,SCR,BUN,CYC_DB,BEDGFR, BSA),test)
test<-full_join(advr %>% select(CASEID,VISIT,AENSHEAD,AENSDIZZ,AENSLWBP),test)
test<-full_join(gh %>% select(CASEID,VISIT,GHGENDER,GHWKSBDD,GHPREMIE),test)
test<-full_join(kidhist %>% select(CASEID,DOB,BSDATE, CKDONST,PRIMDX,GNGDIAG),test)
test<-full_join(l05 %>% select(CASEID,VISIT,RLSERCRE,RLURPROT,RLURMALB,RLURCREA),test)
test<-full_join(echo %>% select(CASEID,VISIT,LVHF,LVHE,LVMI,ECHODATEY,CAUTION),test)
test<-full_join(cardio %>% select(CASEID,VISIT,DB_DATE,SBP,SBPINDXAGH,SBPPCTAGH,SBPZAGH,DBP,DBPINDXAGH,DBPPCTAGH,DBPZAGH,ABPM_DATE,ABPMSUCCESS,SHYPAGH,DHYPAGH),test)


# add new columns for age, Upc, Ualbumin%
# due to DOB provided in integer form, the true age may be off by up to 1 year (eg, born 1/1/1994 = DOB 1994, baseline date 12/31/2000 = BSDATE 2000, calculated age = 6yrs, true age=7yrs)
test<-test %>% mutate(age=(BSDATE-DOB)+DB_DATE,Upc=RLURPROT/RLURCREA, Ualb_pct=RLURMALB/RLURPROT)

# gender: since some values of MALE1FE0 are missing, will solve missing values by taking the mean of MALE1FE0 for each case (no change in gender over time)
test<-test %>% group_by(CASEID) %>% mutate(MALE1FE0=mean(MALE1FE0, na.rm=TRUE))

source('gfr.R')
# gfr will be based on ckidfull equation if the data is available, otherwise bedside schartz equation is used
test$gfr<-mapply(GFR,test$AVHEIGHT,test$SCR,test$CYC_DB,test$BUN,test$MALE1FE0, permissive=TRUE)

# add a column for CKD stage using ckidfull
test$CKD_stage <- cut(test$gfr, 
                      breaks = c(Inf, 90, 60, 30, 15, -Inf), 
                      labels=c(1:5),
                      ordered_result=TRUE, 
                      right = T)

# create categories for proteinuria based on cutoffs noted
test$Upc.factor<-cut(test$Upc,
                     breaks=c(-Inf,0.5,1,2,Inf),
                     labels=c("normal","mild","moderate","severe"),
                     ordered_result = T,
                     right=F)

# loads the correct BP medication names into BP_medlist
# and corrects BP medication names for brand names and misspellings
# corrected names are assigned a new variable called med.corrected
# display information about timing of echocardiogram data, grouped by visits
# note: BP_medlist will display the misspelled/brand names of BP meds, frequency at visit 10?, and corrected names
# categorize the BP meds into groups based on BP_medgroups.csv
load("BP_medlist.RData",verbose = TRUE)
BP_medgroups <- read.csv("BP_medgroups.csv")
medsum_full$med.corrected<-BP_medlist$Corrected.Name[match(medsum_full$MSMEDICA,BP_medlist$Var1)]
medsum_full$BPmedgroup<-BP_medgroups$bp_group[match(medsum_full$med.corrected,BP_medgroups$x)]
# remove all unintelligible medications from the data (may be viewed using BP_medlist[which(BP_medlist$Corrected.Name=="*invalid*"),c(2:3)])
medsum_full<-medsum_full %>% filter(med.corrected!="*invalid*")
# combine split dosing medications (eg, some patients taking different doses of same medication at different times of day, eg, labetalol)
medsum_full<-medsum_full %>% group_by(CASEID,VISIT,MSVISDAT, BPmedgroup,med.corrected) %>% summarise_at(vars(DLYDOSE,DLYFREQ,MSMISS7D), sum) %>% ungroup

# determine compliance as percent of doses taken, based on reported missed doses per week and daily frequency of medication
medcompliance<-function(miss7d,dlyfreq) {
  if (miss7d<0) miss7d<-0
  if (dlyfreq<0) dlyfreq<-NA
  return((dlyfreq*7-miss7d)/(dlyfreq*7))
}

medsum_full$compliance<-mapply(medcompliance,medsum_full$MSMISS7D,medsum_full$DLYFREQ)
medsum_full<-medsum_full %>% group_by(CASEID,VISIT) %>%  mutate(mean_compliance=mean(compliance,na.rm=TRUE)) %>% ungroup
medsum_full<-left_join(test %>% select(CASEID,VISIT,AVWEIGHT,age,gfr),medsum_full)
medsum_full<-medsum_full %>% mutate(std_dose=ifelse(DLYDOSE>0,DLYDOSE/AVWEIGHT,NA))
source("DDI.R")
medsum_full$ddi<-mapply(DDI,medsum_full$DLYDOSE,medsum_full$med.corrected,medsum_full$age,medsum_full$AVWEIGHT,medsum_full$gfr)

# identify duplicates (cases with visit codes that have coresponding visit dates that are not the same) and eliminate all the corresponding medication data
# justifications 1) unable to determine whether duplicate visits represent 'addition' of medications or 'switching' between meds
# 2) some cases have same visit code and corresponding visit dates separated in time by up to 4.28 years (obviously an error in recording data)
medsum_full<-medsum_full %>% group_by(CASEID,VISIT) %>% mutate(ndup=n_distinct(MSVISDAT))
medsum_full<-medsum_full %>%filter(ndup==1) # remove all duplicates from the data
medsum_full<-medsum_full %>% group_by(CASEID,VISIT) %>% mutate(n_agents=n_distinct(med.corrected, na.rm=TRUE))
medsum_full<-medsum_full %>% group_by(CASEID,VISIT) %>% mutate(sum_DDI=sum(ddi))
medsum_full.old<-medsum_full
medsum_full<-medsum_full %>% ungroup %>% nest(.,BPmedgroup,DLYFREQ,DLYDOSE,std_dose,ddi,compliance, .key="rx_info")
medsum_full<-dcast(medsum_full,CASEID+VISIT+MSVISDAT+n_agents+mean_compliance+sum_DDI~med.corrected, value.var="rx_info")
test<-full_join(medsum_full,test)

# replace all NULL values of drugs with a tibble containing NA's using change_null_to_list function above
# note: the line below takes approx 5 min to complete, so will save results to medsum_full4.RData file and read it into memory
# the line will be commented to save time
change_null_to_list <- function(x) {
  # If x is a data frame, do nothing and return x
  # Otherwise, return a data frame with 1 row of NAs
  if (!is.null(x)) {return(x)}
  else {return(as_tibble(t(c(BPmedgroup=as.factor(NA), DLYFREQ=as.numeric(NA), DLYDOSE=as.numeric(NA),std_dose=as.numeric(NA),ddi=as.numeric(NA),compliance=as.numeric(NA)))))}
}

for (i in which(names(test)=="ACETAZOLAMIDE"):which(names(test)=="VERAPAMIL")) {test[[names(test)[i]]]<-lapply(test[[names(test)[i]]],change_null_to_list)}

# parse drug info stored in test as separate variables for analysis
source("get_drug_info.R")
drugname<-names(test)[which(names(test)=="ACETAZOLAMIDE"):which(names(test)=="VERAPAMIL")]
# exclude combo drugs (contain '/', eg "AMLODIPINE/HCTZ")
drugname<-drugname[-grep("\\/",drugname)]
temp <- paste(drugname[1:43], "<- get_drug_info(test,'",drugname,"', c(1:6))", sep="")
eval(parse(text=temp))

temp.1<-paste("test$",drugname[1:43],"_ddi<-",drugname[1:43],"$ddi", sep="")
temp.2<-paste("test$",drugname[1:43],"_std_dose<-",drugname[1:43],"$std_dose", sep="")
eval(parse(text=temp.1))
eval(parse(text=temp.2))


# add columns for new 2017 AAP BP percentiles and z-scores (this takes about 2 minutes to calculate)
source('bpp4.R')
source('BPz/bpzv2.R')
source('BPstatus4th.R')
source('BPstatus2017.R')
source('bpclass.R')
source('bpclass2.R')
test$SBPPCTAGH2017<-mapply(bpp,test$age,test$AVHEIGHT,test$MALE1FE0,1,test$SBP, z=F)
test$DBPPCTAGH2017<-mapply(bpp,test$age,test$AVHEIGHT,test$MALE1FE0,2,test$DBP, z=F)
test$SBPZAGH2017<-mapply(bpp,test$age,test$AVHEIGHT,test$MALE1FE0,1,test$SBP, z=T)
test$DBPZAGH2017<-mapply(bpp,test$age,test$AVHEIGHT,test$MALE1FE0,2,test$DBP, z=T)

# classification of BP status based on 4th report and 2017 guidelines
# note: BP status is NA when BP was unavailable
test$BPstatus2017<-mapply(BPstatus2017,test$SBP,test$DBP,test$age,test$MALE1FE0,test$AVHEIGHT)
# fill in missing height percentiles to reduce number of missing BP status
missing_htpct<-which(is.na(test$HTPCTAG) & !is.na(test$AVHEIGHT))
test$HTPCTAG[missing_htpct]<- mapply(htz,test$AVHEIGHT[missing_htpct], test$age[missing_htpct], test$MALE1FE0[missing_htpct],p=T)
test$BPstatus4th<-mapply(bpstatus4th,test$SBP,test$DBP,test$age,test$MALE1FE0,test$HTPCTAG,test$SBPPCTAGH,test$DBPPCTAGH)

# cardio data organization
# analysis of BP status
# based on 2009 AHA consensus guidelines (old)
# BP status is coded as a variable called BPstatus in cardio
# 0 = normal
# 1 = White coat hypertension (WCH)
# 2 = masked hyeprtension (MH)
# 3 = ambulatory hypertension (AH)

# noctHTN variable: BP% dipping <10%
cardio<-cardio %>%rowwise()%>% mutate(noctHTN=sum(SYSPCTDIPPING<10,DIAPCTDIPPING<10))
cardio<-left_join(cardio,test %>% select(CASEID,VISIT,SBPPCTAGH2017,DBPPCTAGH2017, age,MALE1FE0,AVHEIGHT))
cardio<-cardio %>% rowwise() %>% mutate(BPclass= bpclass2(WKSYSINDX,WKDIAINDX,SLSYSINDX,SLDIAINDX,WKSYSLOAD,WKDIALOAD,SLSYSLOAD, SLDIALOAD, SBP, DBP, SBPPCTAGH2017,DBPPCTAGH2017))
test<-full_join(cardio %>% select(CASEID,VISIT,ABPMSUCCESS,BPclass,noctHTN),test)
# combine BP class variants into larger groups using BP classification from 2014
test$BPclass.factor<-cut(test$BPclass,0:4,right=FALSE, labels=c("NL","WCH","MH","AH"),ordered_result = TRUE)
test$BPclass.2.factor<-factor(test$BPclass.factor<="WCH", labels=c("NL+WCH","MH+AH"))
# Create variable LVMIp using LVMIp function to calculate LVMI percentiles
test<-test %>% filter(!is.na(LVMI)) %>% rowwise() %>% mutate(LVMIp=LVMIp(MALE1FE0,age,LVMI))
test$BMIz<-qnorm(test$BMIPCTAG/100)
test$GNGDIAG.factor<-factor(test$GNGDIAG<=2,labels=c("glom","non-glom"))

```

# Load data from file

```{r}
load("test3.RData")
```
# Define the study population
The following are inclusion criteria:

```{r}
# define study population
combo_cases<-medsum_full.old %>% filter(VISIT==20,length(grep("\\/",med.corrected)!=0L)) %>% ungroup() %>% select(CASEID) %>% unique() %>% unlist() %>% as.numeric()
cases<-test %>% filter(!CASEID %in% combo_cases,VISIT==20, n_agents>0,!is.na(sum_DDI),ABPMSUCCESS==1,!is.na(BUN), !is.na(CYC_DB),!is.na(SCR),!is.na(AVHEIGHT), !is.na(AVWEIGHT), !is.na(BPstatus2017)) %>% ungroup() %>%select(CASEID) %>% unique %>% unlist %>% as.numeric()# exclude those taking combo medications
test.cohort<-test %>% filter(VISIT==20, CASEID %in% cases)

test.cohort<-test.cohort %>% mutate(sum_DDI.ln=log(sum_DDI))
library(summarytools)
dfSummary(test.cohort %>% select(age,MALE1FE0,CKD_stage,gfr,CKDONST,n_agents,BPclass.factor,SBPZAGH2017,DBPPCTAGH2017, LVMIp,Upc,Upc.factor,BMIz,GNGDIAG,GHTOTINC,RACE,sum_DDI))
```