-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnpdr.Rd
134 lines (107 loc) · 6.56 KB
/
npdr.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/npdr.R
\name{npdr}
\alias{npdr}
\title{npdr}
\usage{
npdr(
outcome,
dataset,
regression.type = "binomial",
attr.diff.type = "numeric-abs",
nbd.method = "multisurf",
nbd.metric = "manhattan",
knn = 0,
msurf.sd.frac = 0.5,
covars = "none",
covar.diff.type = "match-mismatch",
padj.method = "bonferroni",
verbose = FALSE,
use.glmnet = FALSE,
glmnet.alpha = 1,
glmnet.lower = 0,
glmnet.lam = "lambda.1se",
rm.attr.from.dist = c(),
neighbor.sampling = "none",
separate.hitmiss.nbds = FALSE,
corr.attr.names = NULL,
fast.reg = FALSE,
fast.dist = FALSE,
external.dist = NULL,
dopar.nn = FALSE,
dopar.reg = FALSE,
unique.dof = FALSE
)
}
\arguments{
\item{outcome}{character name or length-m numeric outcome vector for linear regression, factor for logistic regression}
\item{dataset}{m x p matrix of m instances and p attributes. May also include the outcome column, in which case \code{outcome} should be its name. Include attribute names as colnames.}
\item{regression.type}{(\code{"lm"} or \code{"binomial"})}
\item{attr.diff.type}{diff type for attributes (\code{"numeric-abs"} or \code{"numeric-sqr"} for numeric, \code{"allele-sharing"} or \code{"match-mismatch"} for SNP). Phenotype diff uses same numeric diff as attr.diff.type when lm regression. For glm-binomial, phenotype diff is \code{"match-mismatch"}. For correlation data (e.g., rs-fMRI), use \code{"correlation-data"}; diffs between two variables (e.g., ROIs) are taken across all their pairs of correlations and the attribute importances are given for the overall variable (e.g., brain ROI), not individual pairs.}
\item{nbd.method}{neighborhood method \code{"multisurf"} or \code{"surf"} (no k) or \code{"relieff"} (specify k). Used by nearestNeighbors().}
\item{nbd.metric}{used in npdrDistances for distance matrix between instances, default: \code{"manhattan"} (numeric). Used by nearestNeighbors(). For \code{"precomputed"}, must specify external.dist matrix.}
\item{knn}{number of constant nearest hits/misses for \code{"relieff"} (fixed-k). Used by nearestNeighbors().
The default knn=0 means use the expected SURF theoretical k with msurf.sd.frac (.5 by default)}
\item{msurf.sd.frac}{multiplier of the standard deviation from the mean distances; subtracted from mean for SURF or multiSURF.
The multiSURF default is msurf.sd.frac=0.5: mean - sd/2. Used by nearestNeighbors().}
\item{covars}{optional vector or matrix of covariate columns for correction. Or separate data matrix of covariates.}
\item{covar.diff.type}{string (or string vector) specifying diff type(s) for covariate(s) (\code{"numeric-abs"} for numeric or \code{"match-mismatch"} for categorical).}
\item{padj.method}{for p.adjust (\code{"fdr"}, \code{"bonferroni"}, ...)}
\item{verbose}{logical, whether to print out intermediate steps}
\item{use.glmnet}{logical, whether glmnet is employed}
\item{glmnet.alpha}{penalty mixture for npdrNET: default alpha=1 (lasso, L1); alpha=0 gives ridge (L2)}
\item{glmnet.lower}{lower limit for coefficients for npdrNET: lower.limits=0 npdrNET default}
\item{glmnet.lam}{lambda for penalized feature selection. Options: \code{"lambda.1se"} (default), \code{"lambda.min"} or numeric.}
\item{rm.attr.from.dist}{attributes for removal (possible confounders) from the distance matrix calculation. Argument for nearestNeighbors. None by default c()}
\item{neighbor.sampling}{\code{"none"} or \code{"unique"} if you want to use only unique neighbor pairs (used in nearestNeighbors)}
\item{separate.hitmiss.nbds}{for case/control data, find neighbors for same (hit) and opposite (miss) classes separately (TRUE) or find nearest neighborhoods before assigning hit/miss groups (FALSE). Uses nearestNeighborsSeparateHitMiss function}
\item{corr.attr.names}{character vector of p variable names that correspond to the variables used to create the p(p-1) correlation-data predictors. If not specified, integer (1...p) labels used.}
\item{fast.reg}{logical, whether regression is run with speedlm or speedglm, default is FALSE}
\item{fast.dist}{logical, whether or not distance is computed by the faster algorithm in wordspace, default is FALSE}
\item{external.dist}{optional input distance matrix between samples. Used in conjunction with nbd.metric = \code{"precomputed"}.}
\item{dopar.nn}{logical, whether or not neighborhood is computed in parallel, default is FALSE}
\item{dopar.reg}{logical, whether or not regression is run in parallel, default is FALSE}
\item{unique.dof}{use unique neighbor pairs for degrees of freedom. FALSE lets R stats determine regression degrees of freedom}
}
\value{
npdr.stats.df: npdr adjusted p-value for each attribute (\code{$pval.adj} [1]; adjustment method set by \code{padj.method}), raw p-value (\code{$pval.attr} [2]), and regression coefficient (\code{$beta.attr} [3])
}
\description{
Nearest-Neighbor Projected-Distance Regression (npdr)
generalized linear model (GLM) extension of STatistical Inference Relief (STIR)
Computes attribute statistical significance with logistic for case/control and linear model for quantitative outcomes.
NPDR allows for categorical (SNP) or numeric (expression) predictor data types.
NPDR allows for covariate correction.
Observations in the model are projected-distance differences between neighbors.
}
\examples{
# Data interface options: three ways to hand the outcome and the data to npdr().
#
# NOTE(review): "manhattan" is not among the documented attr.diff.type /
# covar.diff.type values (e.g. "numeric-abs", "match-mismatch") -- confirm
# how npdr treats it before copying these calls.

# 1) Outcome given by column name ("qtrait"); the dataset is a data frame
#    that includes the outcome column.
#    ReliefF fixed-k neighborhood: if you do not specify k (or let knn = 0),
#    the SURF theoretical default k is used (with msurf.sd.frac = 0.5).
npdr.results.df <- npdr(
  "qtrait", qtrait.3sets$train,
  regression.type = "lm", nbd.method = "relieff", nbd.metric = "manhattan",
  attr.diff.type = "manhattan", covar.diff.type = "manhattan",
  msurf.sd.frac = 0.5, padj.method = "bonferroni"
)

# 2) Outcome given by column index (101); the dataset is a data frame
#    that includes the outcome column.
#    ReliefF fixed-k neighborhood with an explicit k (knn = 10).
#    Alternatively, leave knn at 0 and choose msurf.sd.frac.
npdr.results.df <- npdr(
  101, case.control.3sets$train,
  regression.type = "binomial", nbd.method = "relieff", nbd.metric = "manhattan",
  attr.diff.type = "manhattan", covar.diff.type = "manhattan",
  knn = 10, padj.method = "bonferroni"
)

# 3) Outcome vector (pheno.vec) supplied separately from the attribute matrix,
#    using the multisurf neighborhood.
train.df <- case.control.3sets$train
pheno.vec <- train.df$class
# Attribute-only data: every column of the training set except the outcome.
predictors.mat <- train.df[, setdiff(colnames(train.df), "class"), drop = FALSE]
npdr.results.df <- npdr(
  pheno.vec, predictors.mat,
  regression.type = "binomial", nbd.method = "multisurf", nbd.metric = "manhattan",
  attr.diff.type = "manhattan", covar.diff.type = "manhattan",
  msurf.sd.frac = 0.5, padj.method = "bonferroni"
)

# Attributes with npdr adjusted p-value less than .05.
# The adjusted p-value column is named pval.adj.
npdr.positives <- row.names(npdr.results.df[npdr.results.df$pval.adj < .05, ])
}