-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnpdr.R
620 lines (602 loc) · 32.9 KB
/
npdr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
# =========================================================================#
#' diffRegression
#'
#' Wrapper for lm and glm-binomial to run regression for a phenotype diff vector, one attribute diff vector with optional covariate adjustment. Organizes regression statistics into a vector and then all attribute statistics combined in npdr.
#'
#' @param design.matrix.df Desgin matrix with variables: pheno.diff.vec (outcome variable as vector of diffs), attr.diff.vec (one predictor varialbe as vector of diffs) and optional covariates (regressors of non-interest) vector diffs.
#' @param regression.type (\code{"lm"}, \code{"binomial"})
#' @param fast.reg logical, whether regression is run with speedlm or speedglm, default as F
#' @param dof manual input for degrees of freedom, dof=0 lets R stats determine
#'
#' @importFrom stats dist p.adjust predict sd cor binomial lm glm pt var quantile rnorm pnorm runif rbinom qbinom
#'
#' @return vector of regression stats to put into list for npdr and combine into matrix
#'
#' @export
# regression of the neighbor diff vector for one attribute
diffRegression <- function(design.matrix.df, regression.type = "binomial", fast.reg = FALSE, dof = 0) {
# if there are no covariates then ~. model is pheno.diff.vec ~ attr.diff.vec
# otherwise ~. model is pheno.diff.vec ~ attr.diff.vec + covariates
# design.matrix.df must have column named 'pheno.diff.vec'
if (fast.reg) {
if (regression.type == "lm") {
mod <- speedglm::speedlm(pheno.diff.vec ~ ., data = design.matrix.df)
} else { # regression.type == "binomial"
mod <- speedglm::speedglm(pheno.diff.vec ~ ., data = design.matrix.df, family = binomial(link = logit))
}
# use R d.o.f use input dof
res_df <- if (dof == 0) mod$df else dof
} else { # non-speedy version -- but why?
if (regression.type == "lm") {
mod <- lm(pheno.diff.vec ~ ., data = design.matrix.df)
} else { # regression.type == "binomial"
mod <- glm(pheno.diff.vec ~ ., family = binomial(link = logit), data = design.matrix.df)
}
# use R d.o.f use input dof
ifelse(dof == 0, res_df <- mod$df.residual, res_df <- dof)
}
fit <- summary(mod)
coeffs <- fit$coefficients
## create output NPDR summary stats (stats.vec)
if (nrow(coeffs) < 2) {
# for example, a monomorphic SNP might result in attribute stats (row 2) not being created
beta_a <- NA
beta_zscore_a <- NA
pval.att <- NA
message("Regression failure. Possible monomorphic SNP.\n")
} else {
beta_a <- coeffs[2, 1]
beta_zscore_a <- coeffs[2, 3] # standardized beta coefficient (col 3)
## use one-side p-value to test H1: beta>0 for case-control and continuous outcome
pval.att <- pt(beta_zscore_a, res_df, lower.tail = FALSE) # one-sided p-val
}
stats.vec <- c(
pval.att, # one-sided p-value for attribute beta
beta_a, # beta_a for attribute a
beta_zscore_a, # standardized beta for attribut a
coeffs[1, 1], # beta_0, intercept, row 1 is inercept, col 1 is raw beta
coeffs[1, 4] # p-value for intercept, row 1 is intercept, col 4 is p-val
)
if (regression.type == "lm") {
stats.vec <- c(stats.vec, fit$r.squared)
} # add R^2 of fit, R.sqr for continuous outcomes
# cat(res_df,"\n")
return(stats.vec)
}
# =========================================================================#
#' npdr
#'
#' Nearest-Neighbor Projected-Distance Regression (npdr)
#' generalized linear model (GLM) extension of STatistical Inference Relief (STIR)
#' Computes attribute statistical signficance with logistic for case/control and linear model for quantitative outcomes.
#' NPDR allows for categorical (SNP) or numeric (expession) predictor data types.
#' NPDR allows for covariate correction.
#' Observations in the model are projected-distance differences between neighbors.
#'
#' @param outcome character name or length-m numeric outcome vector for linear regression, factor for logistic regression
#' @param dataset m x p matrix of m instances and p attributes, May also include outcome vector but then outcome should be name. Include attr names as colnames.
#' @param regression.type (\code{"lm"} or \code{"binomial"})
#' @param attr.diff.type diff type for attributes (\code{"numeric-abs"} or \code{"numeric-sqr"} for numeric, \code{"allele-sharing"} or \code{"match-mismatch"} for SNP). Phenotype diff uses same numeric diff as attr.diff.type when lm regression. For glm-binomial, phenotype diff is \code{"match-mismatch"} For correlation data (e.g., rs-fMRI), use \code{"correlation-data"}; diffs between two variables (e.g., ROIs) are taken across all their pairs of correlations and the attribute importances are given for the overall variable (e.g,. brain ROI), not individual pairs.
#' @param nbd.method neighborhood method \code{"multisurf"} or \code{"surf"} (no k) or \code{"relieff"} (specify k). Used by nearestNeighbors().
#' @param nbd.metric used in npdrDistances for distance matrix between instances, default: \code{"manhattan"} (numeric). Used by nearestNeighbors(). For \code{"precomputed"}, must specify external.dist matrix.
#' @param knn number of constant nearest hits/misses for \code{"relieff"} (fixed-k). Used by nearestNeighbors().
#' The default knn=0 means use the expected SURF theoretical k with msurf.sd.frac (.5 by default)
#' @param msurf.sd.frac multiplier of the standard deviation from the mean distances; subtracted from mean for SURF or multiSURF.
#' The multiSURF default is msurf.sd.frac=0.5: mean - sd/2. Used by nearestNeighbors().
#' @param covars optional vector or matrix of covariate columns for correction. Or separate data matrix of covariates.
#' @param covar.diff.type string (or string vector) specifying diff type(s) for covariate(s) (\code{"numeric-abs"} for numeric or \code{"match-mismatch"} for categorical).
#' @param glmnet.alpha penalty mixture for npdrNET: default alpha=1 (lasso, L1) alpha=0 (ridge, L2)
#' @param glmnet.lower lower limit for coefficients for npdrNET: lower.limits=0 npdrNET default
#' @param use.glmnet logical, whether glmnet is employed
#' @param glmnet.lam lambda for penalized feature selection. Options: \code{"lambda.1se"} (default), \code{"lambda.min"} or numeric.
#' @param rm.attr.from.dist attributes for removal (possible confounders) from the distance matrix calculation. Argument for nearestNeighbors. None by default c()
#' @param neighbor.sampling "none" or \code{"unique"} if you want to use only unique neighbor pairs (used in nearestNeighbors)
#' @param separate.hitmiss.nbds for case/control data, find neighbors for same (hit) and opposite (miss) classes separately (TRUE) or find nearest neighborhoods before assigning hit/miss groups (FALSE). Uses nearestNeighborsSeparateHitMiss function
#' @param corr.attr.names character vector of p variable names that correspond to the variables used to create the p(p-1) correlation-data predictors. If not specified, integer (1...p) labels used.
#' @param padj.method for p.adjust (\code{"fdr"}, \code{"bonferroni"}, ...)
#' @param fast.reg logical, whether regression is run with speedlm or speedglm, default as F
#' @param dopar.nn logical, whether or not neighborhood is computed in parallel, default as F
#' @param dopar.reg logical, whether or not regression is run in parallel, default as F
#' @param unique.dof use unique neighbor pairs for degrees of freedom. FALSE lets R stats determine regression degrees of freedom
#' @param verbose logical, whether to print out intermediate steps
#' @param fast.dist whether or not distance is computed by faster algorithm in wordspace, default as F
#' @param external.dist optional input distance matrix between samples. Used in conjunction with nbd.metric = \code{"precomputed"}.
#'
#' @return npdr.stats.df: npdr fdr-corrected p-value for each attribute ($pval.adj [1]), raw p-value ($pval.attr [2]), and regression coefficient (beta.attr [3])
#'
#' @importFrom utils capture.output combn write.table
#' @importFrom foreach foreach `%dopar%`
#' @importFrom glmnet glmnet cv.glmnet
#' @import dplyr
#'
#' @examples
#' # Data interface options.
#' # Specify name ("qtrait") of outcome and dataset,
#' # which is a data frame including the outcome column.
#' # ReliefF fixed-k neighborhood, uses surf theoretical default (with msurf.sd.frac=.5)
#' # if you do not specify k or let k=0.
#' npdr.results.df <- npdr(
#' "qtrait", qtrait.3sets$train,
#' regression.type = "lm", nbd.method = "relieff", nbd.metric = "manhattan",
#' attr.diff.type = "manhattan", covar.diff.type = "manhattan",
#' msurf.sd.frac = 0.5, padj.method = "bonferroni")
#'
#' # Specify column index (101) of outcome and dataset,
#' # which is a data frame including the outcome column.
#' # ReliefF fixed-k nbd, choose a k (knn = 10). Or choose msurf.sd.frac
#' npdr.results.df <- npdr(
#' 101, case.control.3sets$train,
#' regression.type = "binomial", nbd.method = "relieff", nbd.metric = "manhattan",
#' attr.diff.type = "manhattan", covar.diff.type = "manhattan",
#' knn = 10, padj.method = "bonferroni")
#'
#' # if outcome vector (pheno.vec) is separate from attribute matrix
#' # multisurf
#' pheno.vec <- case.control.3sets$train$class
#' npdr.results.df <- npdr(
#' pheno.vec, predictors.mat,
#' regression.type = "binomial", nbd.method = "multisurf", nbd.metric = "manhattan",
#' attr.diff.type = "manhattan", covar.diff.type = "manhattan",
#' msurf.sd.frac = 0.5, padj.method = "bonferroni"
#' )
#' # attributes with npdr adjusted p-value less than .05
#' npdr.positives <- row.names(npdr.results.df[npdr.results.df$pva.adj < .05, ])
#' @export
#'
npdr <- function(outcome, dataset,
regression.type = "binomial", attr.diff.type = "numeric-abs",
nbd.method = "multisurf", nbd.metric = "manhattan",
knn = 0, msurf.sd.frac = 0.5,
covars = "none", covar.diff.type = "match-mismatch",
padj.method = "bonferroni", verbose = FALSE,
use.glmnet = FALSE, glmnet.alpha = 1, glmnet.lower = 0,
glmnet.lam = "lambda.1se",
rm.attr.from.dist = c(), neighbor.sampling = "none",
separate.hitmiss.nbds = FALSE,
corr.attr.names = NULL,
fast.reg = FALSE, fast.dist = FALSE,
external.dist=NULL,
dopar.nn = FALSE, dopar.reg = FALSE,
unique.dof = FALSE) {
##### parse the commandline
if (length(outcome) == 1) {
# e.g., outcome="qtrait" or outcome=101 (pheno col index) and dataset is data.frame including outcome variable
pheno.vec <- dataset[, outcome] # get phenotype
attr.mat <- dataset %>% dplyr::select(-outcome) # outcome = "qtrait" or 101
} else { # user specifies a separate phenotype vector
pheno.vec <- outcome # assume users provides a separate outcome data vector
attr.mat <- as.matrix(dataset) # assumes dataset only contains attributes/predictors
}
rm(dataset) # cleanup memory
if (attr.diff.type == "correlation-data") { # corrdata
mynum <- dim(attr.mat)[2]
for (i in seq(1, mynum - 1)) {
mydiv <- i
if ((mydiv * (mydiv - 1)) == mynum) {
my.dimension <- mydiv
break
}
}
num.attr <- my.dimension
} else {
num.attr <- ncol(attr.mat)
}
num.samp <- nrow(attr.mat)
# create a list of attribute indices for selecting columns in stretched matrix
##############################################################################
if (attr.diff.type == "correlation-data") { # corrdata
attr.idx.list <- list()
for (i in 1:num.attr) {
lo.idx <- (i - 1) * (num.attr - 1) + 1
hi.idx <- i * (num.attr - 1)
attr.idx.list[[i]] <- c(lo.idx:hi.idx)
}
}
##### get Neighbors (no phenotype used)
# nbd.method (relieff, multisurf...), nbd.metric (manhattan...), k (for relieff nbd, theoerical surf default)
# msurf.sd.frac used by surf/multisurf relieff for theoretical k
if (verbose) {
cat("Finding nearest neighbor pairs.\n")
}
start_time <- Sys.time()
if (nbd.metric == "precomputed"){
#cat(nbd.metric)
#cat("\n")
#cat(dim(external.dist))
if (separate.hitmiss.nbds) { # separate hit and miss neighborhoods
neighbor.pairs.idx <- nearestNeighborsSeparateHitMiss(
external.dist, pheno.vec, # pre-computed distance
nbd.method = nbd.method,
nbd.metric = nbd.metric, sd.frac = msurf.sd.frac,
k = knn,
att_to_remove = rm.attr.from.dist,
fast.dist = fast.dist,
dopar.nn = dopar.nn)
} else {
# allow neighborhoods to be imbalanced,
# often nearest hits are closer than misses,
# which could dilute the effect of misses
neighbor.pairs.idx <- nearestNeighbors(
external.dist, # pre-computed distance
nbd.method = nbd.method,
nbd.metric = nbd.metric,
sd.frac = msurf.sd.frac, k = knn,
att_to_remove = rm.attr.from.dist,
fast.dist = fast.dist,
dopar.nn = dopar.nn)
}
}else{
if (separate.hitmiss.nbds) { # separate hit and miss neighborhoods
neighbor.pairs.idx <- nearestNeighborsSeparateHitMiss(attr.mat,
pheno.vec,
nbd.method = nbd.method,
nbd.metric = nbd.metric,
sd.frac = msurf.sd.frac,
k = knn,
att_to_remove = rm.attr.from.dist,
fast.dist = fast.dist,
dopar.nn = dopar.nn)
} else {
# allow neighborhoods to be imbalanced,
# often nearest hits are closer than misses,
# which could dilute the effect of misses
neighbor.pairs.idx <- nearestNeighbors(attr.mat,
nbd.method = nbd.method,
nbd.metric = nbd.metric,
sd.frac = msurf.sd.frac, k = knn,
att_to_remove = rm.attr.from.dist,
fast.dist = fast.dist,
dopar.nn = dopar.nn
)
}
}
num.neighbor.pairs <- nrow(neighbor.pairs.idx)
k.ave.empirical <- mean(knnVec(neighbor.pairs.idx))
unique.neighbor.pairs.idx <- uniqueNeighbors(neighbor.pairs.idx)
num.unique.neighbors <- nrow(unique.neighbor.pairs.idx)
if (neighbor.sampling == "unique") {
if (verbose) {
cat("Extracting unique neighbors.\n")
}
# if you only want to return unique neighbors
neighbor.pairs.idx <- unique.neighbor.pairs.idx
}
end_time <- Sys.time()
if (verbose) {
cat("Neighborhood calculation:", capture.output(end_time - start_time), "\n")
cat(num.neighbor.pairs, "total neighbor pairs (possible repeats).\n")
cat(num.unique.neighbors, "unique neighbor pairs.\n")
erf <- function(x) 2 * pnorm(x * sqrt(2)) - 1
# theoretical surf k (sd.frac=.5) for regression problems (does not depend on a hit/miss group)
k.msurf.theory <- knnSURF(num.samp, msurf.sd.frac)
cat("Theoretical (predicted) multiSURF average neighbors: ", k.msurf.theory, ".\n", sep = "")
cat("Empirical (computed from neighborhood) average neighbors: ", k.ave.empirical, ".\n", sep = "")
if (neighbor.sampling == "unique") {
# if you only want to return unique neighbors
num.neighbor.pairs <- nrow(neighbor.pairs.idx)
cat(num.neighbor.pairs, "unique neighbor pairs.\n")
cat("\nPerforming projected distance regression.\n")
}
}
### pheno diff vector for glm-binomial or lm to use in each attribute's diff regression in for loop.
# Not needed in loop.
# create pheno diff vector for linear regression (numeric)
Ri.pheno.vals <- pheno.vec[neighbor.pairs.idx[, 1]]
NN.pheno.vals <- pheno.vec[neighbor.pairs.idx[, 2]]
if (regression.type == "lm") {
pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals, diff.type = "numeric-abs")
} else { # regression.type == "binomial"
# create pheno diff vector for logistic regression (match-mismatch or hit-miss)
pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals, diff.type = "match-mismatch")
# the reference group is the hit group, so the logistic probability is prob of a pair being a miss
pheno.diff.vec <- as.factor(pheno.diff.vec)
}
# ----------------------------------------
# run npdr, each attribute produces a list
npdr.stats.list <- vector("list", num.attr) # initialize
attr.diff.mat <- matrix(0, nrow = nrow(neighbor.pairs.idx), ncol = num.attr)
# for npdrnet later, need matrix because npdrNET operates on all attributes at once
if (length(covars) > 1) { # if covars is a vector or matrix
if (use.glmnet) {
message("penalized npdrNET does not currently support covariates.")
}
# default value is covar="none" (no covariates) which has length 1
covars <- as.matrix(covars) # if covars is just one vector, make sure it's a 1-column matrix
num.covs <- length(covar.diff.type) # or ncol(covars)
# covar.diff.type can be a vector of strings because each column of covars may be a different data type
if (is.null(colnames(covars))) { # if covar vector has no column name, give it one
covar.names <- paste0("cov", seq.int(num.covs)) # cov1, etc.
} else {
covar.names <- colnames(covars) # else get the name from covars
}
covar.diff.df <- data.frame(matrix(, nrow = nrow(neighbor.pairs.idx), ncol = 0))
for (covar.col in (1:num.covs)) {
covar.name <- covar.names[covar.col]
covar.vals <- covars[, covar.col]
Ri.covar.vals <- covar.vals[neighbor.pairs.idx[, 1]]
NN.covar.vals <- covar.vals[neighbor.pairs.idx[, 2]]
covar.diff.vec <- npdrDiff(Ri.covar.vals, NN.covar.vals,
diff.type = covar.diff.type[covar.col]
)
# add covar diff vector to data.frame
# these covars will be included in each attribute's model
covar.diff.df <- covar.diff.df %>%
dplyr::mutate(!!covar.name := covar.diff.vec)
}
}
if (!use.glmnet) { # combine non-glmnet result lists into a matrix
if (dopar.reg) { # perform regressions in parallel
avai.cors <- parallel::detectCores() # - 2
#cl <- parallel::makeCluster(avai.cors)
#doParallel::registerDoParallel(cl)
doParallel::registerDoParallel(cores=avai.cors)
npdr.stats.attr.mat <- foreach(
attr.idx = seq.int(num.attr), .combine = "rbind", .packages = c("dplyr")
) %dopar% {
if (attr.diff.type == "correlation-data") { # corrdata
attr.vals <- attr.mat[, attr.idx.list[[attr.idx]]]
Ri.attr.vals <- attr.vals[neighbor.pairs.idx[, 1], ]
NN.attr.vals <- attr.vals[neighbor.pairs.idx[, 2], ]
attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type = attr.diff.type)
} else {
attr.vals <- attr.mat[, attr.idx]
Ri.attr.vals <- attr.vals[neighbor.pairs.idx[, 1]]
NN.attr.vals <- attr.vals[neighbor.pairs.idx[, 2]]
attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type = attr.diff.type)
}
attr.diff.mat[, attr.idx] <- attr.diff.vec
design.matrix.df <- data.frame(
attr.diff.vec = attr.diff.vec,
pheno.diff.vec = pheno.diff.vec
)
if (length(covars) > 1) { # if covars is a vector or matrix
design.matrix.df <- data.frame(design.matrix.df, covar.diff.df)
}
# design.matrix.df = pheno.diff ~ attr.diff + option covar.diff
if (unique.dof) {
dof <- num.unique.neighbors - 2
} else {
dof <- 0 # uses all neighbor pairs for regression degrees of freedom
}
return(diffRegression(design.matrix.df, regression.type = regression.type, fast.reg = fast.reg, dof = dof))
} # end of foreach loop, regression done in parallel
#parallel::stopCluster(cl)
} else { # non parallel version
for (attr.idx in seq(1, num.attr)) {
if (attr.diff.type == "correlation-data") { # corrdata
attr.vals <- attr.mat[, attr.idx.list[[attr.idx]]]
Ri.attr.vals <- attr.vals[neighbor.pairs.idx[, 1], ]
NN.attr.vals <- attr.vals[neighbor.pairs.idx[, 2], ]
attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type = attr.diff.type)
} else {
attr.vals <- attr.mat[, attr.idx]
Ri.attr.vals <- attr.vals[neighbor.pairs.idx[, 1]]
NN.attr.vals <- attr.vals[neighbor.pairs.idx[, 2]]
attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type = attr.diff.type)
}
attr.diff.mat[, attr.idx] <- attr.diff.vec
# model data.frame to go into lm or glm-binomial
design.matrix.df <- data.frame(
attr.diff.vec = attr.diff.vec,
pheno.diff.vec = pheno.diff.vec
)
### diff vector for each covariate
# optional covariates to add to design.matrix.df model
if (length(covars) > 1) { # if covars is a vector or matrix
design.matrix.df <- data.frame(design.matrix.df, covar.diff.df)
# design.matrix.df$temp <- covar.diff.vec # add the diff covar to the design matrix data frame
# colnames(design.matrix.df)[2+covar.col] <- covar.name # change variable name
}
# design.matrix.df = pheno.diff ~ attr.diff + option covar.diff
if (verbose) {
# cat("running non-parallel npdr.stats.list for attr ", attr.idx,".\n",sep="")
}
if (unique.dof) {
dof <- num.unique.neighbors - 2
} else {
dof <- 0 # uses all neighbor pairs for regression degrees of freedom
}
npdr.stats.list[[attr.idx]] <- diffRegression(design.matrix.df, regression.type = regression.type, fast.reg = fast.reg, dof = dof)
} # end of for loop, regression done for each attribute
# npdr.stats.attr.mat <- bind_rows(npdr.stats.list)
npdr.stats.attr.mat <- data.frame(do.call(rbind, npdr.stats.list))
colnames(npdr.stats.attr.mat) <- c("pval.att", "beta.raw.att", "beta.Z.att", "beta.0", "pval.0")
}
if (attr.diff.type == "correlation-data") { # corrdata
if (is.null(corr.attr.names)) {
att.names <- as.character(c(1:num.attr))
} else {
att.names <- as.character(corr.attr.names)
}
#### Create Results Data Frame for NPDR
# attribute p-values
# relies on first column [, 1] being the p-value for now
# later, first columns become att and adjusted p-value
attr.pvals <- npdr.stats.attr.mat[, 1]
# order-index for sorted attribute-beta p-values
attr.pvals.order.idx <- order(attr.pvals, decreasing = F)
# adjust p-values using Benjamini-Hochberg (default)
attr.pvals.adj <- p.adjust(attr.pvals[attr.pvals.order.idx], method = padj.method)
# order by attribute p-value
npdr.stats.pval_ordered.mat <- npdr.stats.attr.mat[attr.pvals.order.idx, ]
# prepend adjused attribute p-values to first column
npdr.stats.pval_ordered.mat <- cbind(attr.pvals.adj, npdr.stats.pval_ordered.mat)
# prepend attribute column (att)
# att = colnames(attr.mat)
npdr.stats.pval_ordered.mat <- cbind(
data.frame(att = att.names[attr.pvals.order.idx], stringsAsFactors = FALSE),
data.frame(npdr.stats.pval_ordered.mat, row.names = NULL)
)
colnames(npdr.stats.pval_ordered.mat) <- c(
"att", "pval.adj", "pval.att", "beta.raw.att", "beta.Z.att",
"beta.0", "pval.0"
)
# dataframe final output for regular npdr
npdr.stats.df <- data.frame(npdr.stats.pval_ordered.mat)
# npdr.stats.df <- npdr.stats.attr.mat %>% # corrdata
# mutate(att = att.names, # add an attribute column # corrdata
# pval.adj = p.adjust(pval.att, method = padj.method) # adjust p-values # corrdata
# ) %>% arrange(pval.att) %>% # order by attribute p-value # corrdata
# dplyr::select(att, pval.adj, everything()) %>% # reorder columns # corrdata
# as.data.frame() # convert tibbles to df -- can we remove this step? # corrdata
} else { # non-correlation matrix predictors
# npdr.stats.df <- as.data.frame(npdr.stats.attr.mat) %>%
# mutate(att = colnames(attr.mat), # add an attribute column
# pval.adj = p.adjust(pval.att, method = padj.method) # adjust p-values
# ) %>% arrange(pval.att) %>% # order by attribute p-value
# dplyr::select(att, pval.adj, everything()) %>% # reorder columns
# as.data.frame() # convert tibbles to df -- can we remove this step?
#### Create Results Data Frame for NPDR
# attribute p-values
# relies on first column [, 1] being the p-value for now
# later, first columns become att and adjusted p-value
attr.pvals <- npdr.stats.attr.mat[, 1]
# order-index for sorted attribute-beta p-values
attr.pvals.order.idx <- order(attr.pvals, decreasing = F)
# adjust p-values using Benjamini-Hochberg (default)
attr.pvals.adj <- p.adjust(attr.pvals[attr.pvals.order.idx], method = padj.method)
# order by attribute p-value
npdr.stats.pval_ordered.mat <- npdr.stats.attr.mat[attr.pvals.order.idx, ]
# prepend adjused attribute p-values to first column
npdr.stats.pval_ordered.mat <- cbind(attr.pvals.adj, npdr.stats.pval_ordered.mat)
# prepend attribute column (att)
att <- colnames(attr.mat)
npdr.stats.pval_ordered.mat <- cbind(
data.frame(att = att[attr.pvals.order.idx], stringsAsFactors = FALSE),
data.frame(npdr.stats.pval_ordered.mat, row.names = NULL)
)
if (regression.type == "lm") { # stats colnames for lm
colnames(npdr.stats.pval_ordered.mat) <- c(
"att", "pval.adj", "pval.att", "beta.raw.att", "beta.Z.att",
"beta.0", "pval.0", "R.sqr"
)
} else { # stats columns for glm-binomial
colnames(npdr.stats.pval_ordered.mat) <- c(
"att", "pval.adj", "pval.att", "beta.raw.att", "beta.Z.att",
"beta.0", "pval.0"
)
}
# dataframe final output for regular npdr
npdr.stats.df <- data.frame(npdr.stats.pval_ordered.mat)
} # end-else non-correlation-based predictors
} else { # use.glmnet = TRUE, Option npdrNET
# Run glmnet on the diff attribute columns
# Need to create a data matrix with each column as a vector of diffs for each attribute.
# Need matrix because npdrNET operates on all attributes at once.
attr.diff.mat <- matrix(0, nrow = nrow(neighbor.pairs.idx), ncol = num.attr)
for (attr.idx in seq(1, num.attr)) {
if (attr.diff.type == "correlation-data") { # corrdata
attr.vals <- attr.mat[, attr.idx.list[[attr.idx]]]
Ri.attr.vals <- attr.vals[neighbor.pairs.idx[, 1], ]
NN.attr.vals <- attr.vals[neighbor.pairs.idx[, 2], ]
attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type = attr.diff.type)
} else {
attr.vals <- attr.mat[, attr.idx]
Ri.attr.vals <- attr.vals[neighbor.pairs.idx[, 1]]
NN.attr.vals <- attr.vals[neighbor.pairs.idx[, 2]]
attr.diff.vec <- npdrDiff(Ri.attr.vals, NN.attr.vals, diff.type = attr.diff.type)
}
attr.diff.mat[, attr.idx] <- attr.diff.vec
} # end for
#
Ri.pheno.vals <- pheno.vec[neighbor.pairs.idx[, 1]]
NN.pheno.vals <- pheno.vec[neighbor.pairs.idx[, 2]]
if (glmnet.alpha != "cluster") { # temporary trick to cluster attributes by diff vector
if (regression.type == "binomial") {
pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals, diff.type = "match-mismatch")
pheno.diff.vec <- as.factor(pheno.diff.vec)
if (glmnet.lam=="lambda.min"){
# Run glmnet on the diff attribute columns
npdrNET.model <- glmnet::cv.glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "binomial",
lower.limits = glmnet.lower, type.measure = "class"
)
npdrNET.coeffs <- as.matrix(predict(npdrNET.model,
type = "coefficients",
s=npdrNET.model$lambda.min))
}else if(glmnet.lam=="lambda.1se"){
# Run glmnet on the diff attribute columns
npdrNET.model <- glmnet::cv.glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "binomial",
lower.limits = glmnet.lower, type.measure = "class"
)
npdrNET.coeffs <- as.matrix(predict(npdrNET.model,
type = "coefficients",
s=npdrNET.model$lambda.1se))
} else{ # numeric lambda value
npdrNET.model <- glmnet::glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "binomial",
lambda=glmnet.lam, thresh = 1e-14,
lower.limits = glmnet.lower, type.measure = "class"
)
#npdrNET.coeffs <- as.matrix(coef(npdrNET.model))
npdrNET.coeffs <- as.matrix(predict(npdrNET.model,
type = "coefficients",
s=glmnet.lam, exact=T))
}
} else { # "gaussian"
pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals,
diff.type = "numeric-abs")
# Run glmnet on the diff attribute columns
npdrNET.model <- glmnet::cv.glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "gaussian",
lower.limits = glmnet.lower, type.measure = "mse"
)
if (glmnet.lam=="lambda.min"){
# Run glmnet on the diff attribute columns
npdrNET.model <- glmnet::cv.glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "gaussian",
lower.limits = glmnet.lower, type.measure = "mse"
)
npdrNET.coeffs <- as.matrix(predict(npdrNET.model,
type = "coefficients",
s=npdrNET.model$lambda.min))
}else if(glmnet.lam=="lambda.1se"){
# Run glmnet on the diff attribute columns
npdrNET.model <- glmnet::cv.glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "gaussian",
lower.limits = glmnet.lower, type.measure = "mse"
)
npdrNET.coeffs <- as.matrix(predict(npdrNET.model,
type = "coefficients",
s=npdrNET.model$lambda.1se))
} else{ # numeric lambda value
npdrNET.model <- glmnet::glmnet(attr.diff.mat, pheno.diff.vec,
alpha = glmnet.alpha, family = "gaussian",
lambda=glmnet.lam, thresh = 1e-14,
lower.limits = glmnet.lower, type.measure = "mse"
)
#npdrNET.coeffs <- as.matrix(coef(npdrNET.model))
npdrNET.coeffs <- as.matrix(predict(npdrNET.model,
type = "coefficients",
s=glmnet.lam, exact=T))
}
}
if (verbose) {
cat("npdr-glmnet cv lambda values:\n")
cat("lambda.min: ", npdrNET.model$lambda.min,"\n")
cat("lambda.1se: ", npdrNET.model$lambda.1se, "\n")
}
if (attr.diff.type=="correlation-data"){
# attr.mat is ROI pair names, but the correlation metric gives importance of ROIs
# which should be contained in corr.attr.names
row.names(npdrNET.coeffs) <- c("intercept", corr.attr.names) # add variable names
} else{
row.names(npdrNET.coeffs) <- c("intercept", colnames(attr.mat)) # add variable names
}
glmnet.sorted <- as.matrix(npdrNET.coeffs[order(abs(npdrNET.coeffs), decreasing = T), ], ncol = 1) # sort
npdr.stats.df <- data.frame(scores = glmnet.sorted)
} else { # glmnet.alpha == "cluster", so don't do regression and return the attribute diff vectors
# might not need the phenotype diff for clustering, but add anyway.
if (regression.type == "binomial") {
pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals, diff.type = "match-mismatch")
} else { # "gaussian"
pheno.diff.vec <- npdrDiff(Ri.pheno.vals, NN.pheno.vals, diff.type = "numeric-abs")
}
colnames(attr.diff.mat) <- colnames(attr.mat)
# not actually stats, contains a pairs x attr diff matrix
npdr.stats.df <- data.frame(attr.diff.mat, pheno.diff = pheno.diff.vec)
} # end cluster option
} # end glmnetNPDR option
return(npdr.stats.df)
}