-
Notifications
You must be signed in to change notification settings - Fork 4
/
CLIP.profile.r
217 lines (188 loc) · 9.29 KB
/
CLIP.profile.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#' Combine Profile Likelihoods from Imputed-Data Model Fits
#'
#' This function uses CLIP (combination of likelihood profiles)
#' to compute the pooled profile of the posterior after multiple imputation.
#'
#' While CLIP.confint iterates to find those values at which the CDF of the
#' pooled posterior equals the confidence levels, CLIP.profile will evaluate
#' the whole profile, which enables plotting and evaluating the skewness of the combined and the completed-data profiles. The combined and completeddata profiles are available as cumulative distribution function (CDF) or in the scaling of relative
#' profile likelihood (minus twice the likelihood ratio statistic compared to the maximum). Using a
#' plot method, the pooled posterior can also be displayed as a density.
#'
#' @param obj Either a list of logistf fits (on multiple imputed data sets), or the result
#' of analysis of a \code{mice} (multiply imputed) object using \code{with.mids}.
#' @param variable The variable of interest, for which confidence intervals should be computed.
#' If missing, confidence intervals for all variables will be computed.
#' @param data A list of data set corresponding to the model fits. Can be left blank if obj was
#' obtained with the dataout=TRUE option or if obj was obtained by mice.
#' @param which Alternatively to variable, the argument which allows to specify the variable to
#' compute the profile for as righthand formula, e.g. which=~X.
#' @param firth If \code{TRUE}, applies the Firth correction. Should correspond to the entry in obj.
#' @param weightvar An optional weighting variable for each observation
#' @param control control parameters for \code{logistf}, usually obtained by \code{logistf.control()}
#' @param offset An optional offset variable
#' @param from Lowest value for the sequence of values for the regression coefficients for which the profile will be computed. Can be left blank.
#' @param to Highest value for the sequence of values for the regression coefficients for which the profile will be computed. Can be left blank
#' @param steps Number of steps for the sequence of values for the regression coefficients for which the profile will be computed
#' @param legacy If \code{TRUE}, only R code will be used. Should be avoided.
#' @param keep If \code{TRUE}, keeps the profiles for each imputed data sets in the output object.
#'
#' @return An object of class \code{CLIP.profile} with items:
#' \item{beta}{The values of the regression coefficient}
#' \item{cdf}{The cumulative distribution function of the posterior}
#' \item{profile}{The profile of the posterior}
#' \item{cdf.matrix}{An imputations x steps matrix with the values of the completed-data CDFs for each beta}
#' \item{profile.matrix}{An imputations x steps matrix with the values of the completed-data profiles for each beta}
#' \item{call}{The function call}
#' @export
#' @author Georg Heinze und Meinhard Plonar
#' @references Heinze G, Ploner M, Beyea J (2013). Confidence intervals after multiple imputation: combining profile
#' likelihood information from logistic regressions. Statistics in Medicine, to appear.
#' @examples
#'
#' #generate data set with NAs
#' freq=c(5,2,2,7,5,4)
#' y<-c(rep(1,freq[1]+freq[2]), rep(0,freq[3]+freq[4]), rep(1,freq[5]), rep(0,freq[6]))
#' x<-c(rep(1,freq[1]), rep(0,freq[2]), rep(1,freq[3]), rep(0,freq[4]), rep(NA,freq[5]),
#' rep(NA,freq[6]))
#' toy<-data.frame(x=x,y=y)
#'
#' # impute data set 5 times
#' set.seed(169)
#' toymi<-list(0)
#' for(i in 1:5){
#' toymi[[i]]<-toy
#' y1<-toymi[[i]]$y==1 & is.na(toymi[[i]]$x)
#' y0<-toymi[[i]]$y==0 & is.na(toymi[[i]]$x)
#' xnew1<-rbinom(sum(y1),1,freq[1]/(freq[1]+freq[2]))
#' xnew0<-rbinom(sum(y0),1,freq[3]/(freq[3]+freq[4]))
#' toymi[[i]]$x[y1==TRUE]<-xnew1
#' toymi[[i]]$x[y0==TRUE]<-xnew0
#' }
#'
#' # logistf analyses of each imputed data set
#' fit.list<-lapply(1:5, function(X) logistf(data=toymi[[X]], y~x, pl=TRUE))
#'
#' # CLIP profile
#' xprof<-CLIP.profile(obj=fit.list, variable="x",data =toymi, keep=TRUE)
#' plot(xprof)
#'
#' #plot as CDF
#' plot(xprof, "cdf")
#'
#' #plot as density
#' plot(xprof, "density")
#'
#' @rdname CLIP.profile
CLIP.profile <- function(obj=NULL, variable, data, which, firth=TRUE, weightvar,
control=logistf.control(), offset=NULL, from=NULL, to=NULL, steps=101, legacy=FALSE, keep=FALSE) {
old <- legacy
if (!is.null(obj)) {
if (is.mira(obj)) {
# assuming a mira object with fits on each imputed data set
# data.imp<-texteval(paste("",obj$call$data,"",sep=""))
data.imp<-eval(obj$call$data) #############GEHT NOCH NICHT!!
nimp<-data.imp$m
data<-lapply(1:nimp, function(x) complete(data.imp, action=x))
fits<-obj$analyses
formula<-as.formula(fits[[1]]$call$formula)
}
else {
# assuming as input a list of data sets (data) and a list of fits (obj)
fits<-obj
if(missing(data)) stop("Please provide data either as list of imputed data sets or by calling logistf on the imputed data sets with dataout=TRUE.\n")
formula<-as.formula(fits[[1]]$call$formula)
nimp<-length(data)
}
}
else {
nimp<-length(data)
stop(paste("Please provide a list of fit objects or a mira object with fits on the ",nimp," imputed data sets.\n"))
}
imputations<-nimp
if (missing(which) & missing(variable))
stop("You must specify which (a one-sided formula) or variable (the name of the variable).")
if (missing(control))
control <- logistf.control()
res<-matrix(0,length(grid),3)
nperimp<-unlist(lapply(1:imputations,function(x) nrow(data[[x]])))
variable.names<-colnames(model.matrix(formula, data = data[[1]]))
# mat.data<-matrix(lapply(1:imputations,function(x) matrix(unlist(data[[x]]),nperimp[[x]],ncol(data[[x]]))),sum(nperimp),ncol(data[[1]]))
imputation.indicator<-unlist(sapply(1:imputations, function(x) rep(x,nperimp[x]))[TRUE]) #[TRUE]makes vector out of matrix
mat.data<-matrix(0,sum(nperimp),ncol(data[[1]])) ### copy list of data set into a matrix
for(i in 1:imputations) mat.data[imputation.indicator==i,1:ncol(data[[i]])]<-as.matrix(data[[i]])
# if(missing(weightvar)) {
# weightvar<-"weights"
# mat.data[,ncol(mat.data)]<-rep(1,nrow(mat.data))
# }
big.data<-data.frame(mat.data)
colnames(big.data)<-colnames(data[[1]])
k<-length(variable.names)
xyw<-matrix(0,sum(nperimp),k+2)
xyw[,1:k]<- model.matrix(formula, data = big.data)
xyw[,k+1]<- model.response(model.frame(as.formula(formula),data=big.data))
if (!missing(which)) {
cov.name <- variable.names
cov.name2 <- colnames(model.matrix(which, data = big.data))
pos <- match(cov.name2, cov.name)
variable <- cov.name2
}
else {
cov.name <- variable.names
cov.name2 <- variable
pos <- match(cov.name2, cov.name)
}
# fit <- logistf.fit(x, y, weight = weight, offset = offset,
# firth = firth, control = control)
# std.pos <- diag(fit$var)[pos]^0.5
# coefs <- fit$beta
# covs <- fit$var
# n <- nrow(x)
# cov.name <- labels(x)[[2]]
if (missing(weightvar)) {
# data<-lapply(1:imputations, function(z) {
# data[[z]]$weightvar<-rep(1,nrow(data[[z]]))
# data[[z]]
# })
# weightvar<-"weightvar"
xyw[,k+2]<-rep(1,nrow(xyw))
}
else xyw[,k+2]<-big.data[,weightvar]
posweight<-k+2
lf<-function(index) logistf.fit(y=xyw[index,k+1], x=xyw[index,1:k], weight=xyw[index,k+2])
if (is.null(fits)) fits<-lapply(1:imputations, function(z) lf(imputation.indicator==z))
if(is.null(from)){
lower.collect<- (unlist(lapply(1:imputations,function(x) fits[[x]]$ci.lower[pos])))
from<-min(lower.collect) ###von dem den index nehmen und davon das PL CI ausrechnen
}
if(is.null(to)){
upper.collect<- (unlist(lapply(1:imputations,function(x) fits[[x]]$ci.upper[pos])))
to<-max(upper.collect)
}
estimate<-mean(unlist(lapply(1:imputations,function(x) fits[[x]]$coefficients[pos])))
iter<-numeric(0)
loglik<-unlist(lapply(1:imputations, function(x) fits[[x]]$loglik['full']))
beta<-t(matrix(unlist(lapply(1:imputations,function(x) fits[[x]]$coefficients)),k,imputations))
lpdf<-function(zz,z) logistf.pdf(x=xyw[imputation.indicator==zz,1:k], y=xyw[imputation.indicator==zz,k+1],
weight=xyw[imputation.indicator==zz,k+2], beta=beta[zz,],loglik=loglik[zz],
pos=pos, firth=firth, offset=offset, control=control, b=z, old=old)$pdf
z_seq<-seq(from, to, (to-from)/steps)
if(keep==FALSE){
f=function(z) mean(unlist(lapply(1:imputations, function(zz) lpdf(zz,z))))
### lasse z laufen von from nach to
### evaluiere f an allen z's und errechne daraus profile
pdf_mat<-NULL
profile.mat<-NULL
pdf_seq<-unlist(lapply(z_seq, function(Z) f(Z)))
} else {
f=function(z) unlist(lapply(1:imputations, function(zz) lpdf(zz,z)))
# pdf_mat<-matrix(0,steps+1,imputations)
pdf_mat<-matrix(unlist(lapply(z_seq, function(Z) f(Z))), imputations, steps+1)
profile.mat<- -qnorm(pdf_mat)**2
pdf_seq<-apply(pdf_mat,2,mean)
}
chisq_seq<- -qnorm(pdf_seq)**2
res<-list(beta=z_seq, cdf=pdf_seq, profile=chisq_seq, cdf.matrix=pdf_mat, profile.matrix=profile.mat, call=match.call())
attr(res,"class")<-c("logistf.CLIP.profile","logistf.profile")
res
}