PADOG: Pathway Analysis with Down-weighting of Overlapping Genes

This package implements a general purpose gene set analysis method called PADOG that downplays the importance of genes that apear often accross the sets of genes to be analyzed. The package provides also a benchmark for gene set analysis methods in terms of sensitivity, ranking, specificity (null p values), and FDR using 24 public datasets from KEGGdzPathwaysGEO package.

Tarca, Adi L., et al. "Down-weighting overlapping genes improves gene set analysis." BMC bioinformatics 13.1 (2012): 136.

Installation

the latest released version from Bioconductor:

source("http://bioconductor.org/biocLite.R")
biocLite("PADOG")

the latest development version from github:

ups = installed.packages()[,"Package"]
biocp = c("KEGGdzPathwaysGEO", "KEGG.db", "limma", "AnnotationDbi", 
          "Biobase", "hgu133plus2.db", "hgu133a.db")
todos = setdiff(biocp, ups)
if (length(todos) > 0) {
    source("http://bioconductor.org/biocLite.R")
    biocLite(todos)
}

if (! "devtools" %in% ups) install.packages("devtools")
devtools::install_github("pacifly/PADOG")

Usage

To run PADOG gene set analysis:

### run padog on a colorectal cancer dataset of the 24 benchmark datasets
set = "GSE9348"
data(list=set, package="KEGGdzPathwaysGEO")

# extract from the dataset the required info
dlist = PADOG:::getdataaslist(set)
dataset = dlist$dataset
dat.m = dlist$dat.m
ano = dlist$ano
design = dlist$design
annotation = dlist$annotation
targetGeneSets = dlist$targetGeneSets

# run in serial
myr = padog(esetm = dat.m, group = ano$Group, paired = design=="Paired",
    block = ano$Block, targetgs = targetGeneSets, annotation = annotation,
    gslist = "KEGG.db", organism = "hsa", verbose = TRUE, Nmin = 3, NI = 200,
    plots = FALSE, dseed = 1)

head(myr$res, 20)

# run in parallel using 2 CPU cores
myr2 = padog(esetm = dat.m, group = ano$Group, paired = design=="Paired",
    block = ano$Block, targetgs = targetGeneSets, annotation = annotation,
    gslist = "KEGG.db", organism = "hsa", verbose = TRUE, Nmin = 3, NI = 200,
    plots = FALSE, dseed = 1, paral = TRUE, ncr = 2)

# verify that the result is the same which is a built-in feature
all.equal(myr, myr2)

To compare your gene set analysis method with those implemented in PADOG on benchmark datasets:

Write a "constructor" function to return your gene set analysis method; here we construct some artificial methods that assign uniform (or skewed Beta distributed) p values to gene sets.

setF = function(type=c("pos","uni","neg"), NI=1000, dseed=1, FDRmeth=c("BH","Permutation","holm")) {
    ### type -- uni for uniform p values, pos / neg for excessive false positive / negative 
    type = match.arg(type)
    FDRmeth = match.arg(FDRmeth)
    return(function(set, mygslist, minsize) {
    # define your new gene set analysis method that takes as input:
    # set -- the name of dataset file from the PADOGsetspackage
    # mygslist -- a list of the genesets
    # minsize -- minimum number of genes in a geneset to be considered for analysis 
        set.seed(dseed)
        data(list=set, package="KEGGdzPathwaysGEO", envir=environment())

        # extract from the dataset the required info (you can also use function PADOG:::getdataaslist)
        x = get(set)
        exp = experimentData(x)
        dat.m = exprs(x)
        ano = pData(x)
        dataset = exp@name
        design = notes(exp)$design
        annotation = paste(x@annotation,".db",sep="")
        targetGeneSets = notes(exp)$targetGeneSets

        # get rid of duplicates probesets per ENTREZ ID
        aT1 = filteranot(esetm=dat.m, group=ano$Group, paired=(design == "Paired"),
            block=ano$Block, annotation=annotation)
            
        # remove from gene sets the genes absent in the current dataset
        mygslist = lapply(mygslist, function(z) z[z %in% (aT1$ENTREZID)])
        # drop gene sets with less than minsize genes in the current dataset
        mygslist = mygslist[sapply(mygslist, length) >= minsize]
        
        # this toy method simulate some artificial p values for gene sets
        pres = switch(type,
                    uni = runif(length(mygslist)),
                    pos = rbeta(length(mygslist), 1, 5),
                    neg = rbeta(length(mygslist), 5, 1)
        )
        res = data.frame(ID=names(mygslist), P=pres, stringsAsFactors=FALSE) 
                        
        # obtain null p value matrix (i.e. applying your gene set analysis 
        # method on permuted group labels for NI times) that will be used
        # in estimating fdr if FDRmeth = "Permutation"
        nullp = switch(type,
                    uni = matrix(runif(nrow(res) * NI), ncol = NI),
                    pos = matrix(rbeta(nrow(res) * NI, 1, 5), ncol = NI),
                    neg = matrix(rbeta(nrow(res) * NI, 5, 1), ncol = NI)
        )
        rownames(nullp) = res$ID
        
        # multiple test adjustment        
        res$FDR = switch(FDRmeth,
                        p.adjust(res$P, FDRmeth),
                        Permutation = PADOG:::getFDR(nullp, res$P)
        )
        
        # compute normalized ranks
        res$Rank = sapply(res$P, function(p) ifelse(is.na(p), NA, mean(res$P <= p, na.rm=TRUE) * 100))
        # record method name and dataset name
        res$Method = type
        res$Dataset = dataset

        # return relevant gene sets result and null p values
        return(list(targ = res[res$ID %in% targetGeneSets,], pval = nullp))
    }
    )
}

mymnam = c("pos","uni","neg")
mym = lapply(mymnam, setF, NI=200, dseed=1, FDRmeth="P")
names(mym) = mymnam

# compare your methods with existing ones
out = compFDR(datasets=NULL, existingMethods=c("AbsmT", "GSA", "PADOG"),
    mymethods=mym, gslist="KEGG.db", Nmin=3, NI=200, Npsudo=20, FDRmeth="P", 
    plots=TRUE, verbose=TRUE, use.parallel=TRUE, dseed=1, pkgs=NULL)

Here is an output figure from above running:

Note that the estimated FDR is conservative (i.e. true FDR can be smaller). Contrast with the classic BH (Benjamini & Hochberg) method (set FDRmeth to "BH" in the above running):

As expected, the BH method and permutation based method produce similar FDR estimates when the null p value distribution is uniform. However, note how BH method is fooled when null distribution deviates from uniform.

Documentation

To view documentation for the version of this package installed in your system, start R and enter: browseVignettes("PADOG")

Name		Name	Last commit message	Last commit date
Latest commit History 58 Commits
R		R
build		build
inst		inst
man		man
vignettes		vignettes
.Rbuildignore		.Rbuildignore
.gitignore		.gitignore
COPYING		COPYING
DESCRIPTION		DESCRIPTION
NAMESPACE		NAMESPACE
README.md		README.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

PADOG: Pathway Analysis with Down-weighting of Overlapping Genes

Installation

Usage

Documentation

About

Releases

Packages

Languages

License

pacifly/PADOG

Folders and files

Latest commit

History

Repository files navigation

PADOG: Pathway Analysis with Down-weighting of Overlapping Genes

Installation

Usage

Documentation

About

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages