Merge branch 'develop' into feat/FastSNN

sky-xian · Mar 20, 2018 · a67997c · a67997c
2 parents 71f409a + 91ebd4c
commit a67997c
Show file tree

Hide file tree

Showing 12 changed files with 211 additions and 46 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -59,7 +59,9 @@ Imports:
     lmtest,
     cluster,
     fitdistrplus,
-    png
+    png,
+    doSNOW,
+    foreach
 LinkingTo: Rcpp, RcppEigen, RcppProgress
 License: GPL-3 | file LICENSE
 LazyData: true

diff --git a/NAMESPACE b/NAMESPACE
@@ -157,6 +157,7 @@ export(Shuffle)
 export(SplitDotPlotGG)
 export(SplitObject)
 export(StashIdent)
+export(SubsetByPredicate)
 export(SubsetColumn)
 export(SubsetData)
 export(SubsetRow)
@@ -264,6 +265,7 @@ import(Matrix)
 import(RColorBrewer)
 import(SDMTools)
 import(diffusionMap)
+import(doSNOW)
 import(fpc)
 import(ggplot2)
 import(gridExtra)
@@ -312,6 +314,8 @@ importFrom(dplyr,top_n)
 importFrom(dplyr,ungroup)
 importFrom(dtw,dtw)
 importFrom(fitdistrplus,fitdist)
+importFrom(foreach,"%dopar%")
+importFrom(foreach,foreach)
 importFrom(gdata,drop.levels)
 importFrom(gdata,interleave)
 importFrom(ggplot2,annotation_raster)

diff --git a/R/dimensional_reduction.R b/R/dimensional_reduction.R
@@ -219,10 +219,14 @@ RunICA <- function(
 #' @param genes.use If set, run the tSNE on this subset of genes
 #' (instead of running on a set of reduced dimensions). Not set (NULL) by default
 #' @param seed.use Random seed for the t-SNE
-#' @param do.fast If TRUE, uses the Barnes-hut implementation, which runs
-#' faster, but is less flexible. TRUE by default.
-#' @param do.approx Run FIt-SNE implementation, based on Kluger Lab code on
-#' https://github.com/ChristophH/FIt-SNE
+#' @param tsne.method Select the method to use to compute the tSNE. Available
+#' methods are:
+#' \itemize{
+#' \item{Rtsne: }{Use the Rtsne package Barnes-Hut implementation of tSNE (default)}
+#' \item{tsne: }{standard tsne - not recommended for large datasets}
+#' \item{FIt-SNE: }{Use the FFT-accelerated Interpolation-based t-SNE. Based on
+#' Kluger Lab code found here: https://github.com/ChristophH/FIt-SNE}
+#' }
 #' @param add.iter If an existing tSNE has already been computed, uses the
 #' current tSNE to seed the algorithm and then adds additional iterations on top
 #' of this
@@ -261,8 +265,7 @@ RunTSNE <- function(
   dims.use = 1:5,
   genes.use = NULL,
   seed.use = 1,
-  do.fast = TRUE,
-  do.approx = FALSE,
+  tsne.method = "Rtsne",
   add.iter = 0,
   dim.embed = 2,
   distance.matrix = NULL,
@@ -286,24 +289,29 @@ RunTSNE <- function(
       genes.use = genes.use))
   }
   set.seed(seed = seed.use)
-  if (do.fast) {
-    if (do.approx & is.null(x = distance.matrix)) {
-      data.tsne <- fftRtsne(X = as.matrix(x = data.use), dims = dim.embed, rand_seed = seed.use, ...)
+
+  if(tsne.method == "Rtsne"){
+    if (is.null(x = distance.matrix)) {
+      data.tsne <- Rtsne(X = as.matrix(x = data.use),
+                         dims = dim.embed,
+                         pca = FALSE, ...)
     } else {
-      if (is.null(x = distance.matrix)) {
-        data.tsne <- Rtsne(X = as.matrix(x = data.use), dims = dim.embed, ...)
-      } else {
-        data.tsne <- Rtsne(
-          X = as.matrix(x = distance.matrix),
-          dims = dim.embed,
-          is_distance=TRUE
-        )
-      }
-      data.tsne <- data.tsne$Y
+      data.tsne <- Rtsne(
+        X = as.matrix(x = distance.matrix),
+        dims = dim.embed,
+        is_distance = TRUE,
+        ...
+      )
     }
-  } else {
+    data.tsne <- data.tsne$Y
+  } else if (tsne.method == "FIt-SNE" & is.null(x = distance.matrix)) {
+    data.tsne <- fftRtsne(X = as.matrix(x = data.use), dims = dim.embed, rand_seed = seed.use, ...)
+  } else if (tsne.method == "tsne") {
     data.tsne <- tsne(X = data.use, k = dim.embed, ...)
+  } else {
+    stop ("Invalid tsne.method: Please select from Rtsne, tsne, or FIt-SNE")
   }
+
   if (add.iter > 0) {
     data.tsne <- tsne(
       X = data.use,

diff --git a/R/dimensional_reduction_internal.R b/R/dimensional_reduction_internal.R
@@ -429,7 +429,7 @@ fftRtsne <- function(
     stop('tsne call failed');
   }
   f <- file(description = result_path, open = "rb")
-  # initialError <- readBin(f, integer(), n = 1, size = 8) # Not used
+  initialError <- readBin(f, integer(), n = 1, size = 8)
   n <- readBin(con = f, what = integer(), n = 1, size = 4)
   d <- readBin(con = f, what = integer(), n = 1, size = 4)
   Y <- readBin(con = f, what = numeric(), n = n * d)

diff --git a/R/interaction.R b/R/interaction.R
@@ -306,6 +306,44 @@ AddSamples <- function(
   return(new.object)
 }
 
+
+
+#' Return a subset of the Seurat object.
+#'
+#' Creates a Seurat object containing only a subset of the cells in the
+#' original object. Forms a dataframe by fetching the variables in \code{vars.use}, then
+#' subsets it using \code{base::subset} with \code{predicate} as the filter.
+#' Returns the corresponding subset of the Seurat object. 
+#'
+#' @param object Seurat object
+#' @param vars.use Variables to fetch for use in base::subset. Character vector.
+#' @param predicate String to be parsed into an R expression and evaluated as an input to base::subset.
+#' 
+#' @export
+#'
+#' @examples
+#' pbmc1 <- SubsetByPredicate(object = pbmc_small, 
+#'                       vars.use = c("nUMI", "res.1"), 
+#'                       predicate = "nUMI < 200 & res.1=='3'")
+#' pbmc1
+#'
+SubsetByPredicate = function( 
+  object,
+  vars.use,
+  predicate
+){
+  if( typeof(vars.use) != "character"){
+    stop("predicate should be a character vector. It will be parsed in `subset` as an R expression.")
+  }
+  if( typeof(predicate) != "character"){
+    stop("vars.use should be a character vector. These variables will be passed to FetchData.")
+  }
+  df <- FetchData(object, vars.use) %>% as.data.frame
+  cu <- df %>% subset(eval(parse(text=predicate))) %>% rownames
+  object <- SubsetData(object, cells.use = cu)
+  return( object )
+}
+
 #' Return a subset of the Seurat object
 #'
 #' Creates a Seurat object containing only a subset of the cells in the
@@ -725,7 +763,9 @@ WhichCells <- function(
   if(accept.low >= accept.high) {
     stop("accept.low greater than or equal to accept.high")
   }
-  set.seed(seed = random.seed)
+  if (!is.na(x = random.seed)) {
+    set.seed(seed = random.seed)
+  }
   cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
   ident <- SetIfNull(x = ident, default = unique(x = object@ident))
   bad.remove.idents <- ident.remove[! (ident.remove %in% unique(x = object@ident))]
@@ -1085,7 +1125,9 @@ AddMetaData <- function(object, metadata, col.name = NULL) {
     colnames(x = metadata) <- col.name
   }
   cols.add <- colnames(x = metadata)
-  meta.add <- metadata[rownames(x = object@meta.data), cols.add]
+  #meta.add <- metadata[rownames(x = [email protected]), cols.add]
+  meta.order <- match(rownames(object@meta.data), rownames(metadata))
+  meta.add <- metadata[meta.order, ]
   if (all(is.null(x = meta.add))) {
     stop("Metadata provided doesn't match the cells in this object")
   }

diff --git a/R/preprocessing.R b/R/preprocessing.R
@@ -431,6 +431,9 @@ ScaleDataR <- function(
 #' differences in numerical precision which could affect downstream calculations.
 #' @param check.for.norm Check to see if data has been normalized, if not,
 #' output a warning (TRUE by default)
+#' @param do.par use parallel processing for regressing out variables faster.
+#' If set to TRUE, will use half of the machines available cores (FALSE by default)
+#' @param num.cores If do.par = TRUE, specify the number of cores to use.
 #'
 #' @return Returns a seurat object with object@@scale.data updated with scaled
 #' and/or centered data.
@@ -461,7 +464,9 @@ ScaleData <- function(
   display.progress = TRUE,
   assay.type = "RNA",
   do.cpp = TRUE,
-  check.for.norm = TRUE
+  check.for.norm = TRUE,
+  do.par = FALSE,
+  num.cores = 1
 ) {
   data.use <- SetIfNull(
     x = data.use,
@@ -495,7 +500,9 @@ ScaleData <- function(
       genes.regress = genes.use,
       use.umi = use.umi,
       model.use = model.use,
-      display.progress = display.progress
+      display.progress = display.progress,
+      do.par = do.par,
+      num.cores = num.cores
     )
     if (model.use != "linear") {
       use.umi <- TRUE

diff --git a/R/preprocessing_internal.R b/R/preprocessing_internal.R
@@ -9,20 +9,27 @@
 # @param model.use Use a linear model or generalized linear model (poisson, negative binomial) for the regression. Options are 'linear' (default), 'poisson', and 'negbinom'
 # @param use.umi Regress on UMI count data. Default is FALSE for linear modeling, but automatically set to TRUE if model.use is 'negbinom' or 'poisson'
 # @param display.progress display progress bar for regression procedure.
+# @param do.par use parallel processing for regressing out variables faster.
+# If set to TRUE, will use half of the machines available cores (FALSE by default)
+# @param num.cores If do.par = TRUE, specify the number of cores to use.
 #
 # @return Returns the residuals from the regression model
 #
 #' @import Matrix
+#' @import doSNOW
 #' @importFrom stats as.formula lm residuals glm
 #' @importFrom utils txtProgressBar setTxtProgressBar
+#' @importFrom foreach foreach %dopar%
 #
 RegressOutResid <- function(
   object,
   vars.to.regress,
   genes.regress = NULL,
   model.use = 'linear',
   use.umi = FALSE,
-  display.progress = TRUE
+  display.progress = TRUE,
+  do.par = FALSE,
+  num.cores = 1
 ) {
   possible.models <- c("linear", "poisson", "negbinom")
   if (! model.use %in% possible.models){
@@ -56,7 +63,38 @@ RegressOutResid <- function(
   if (use.umi) {
     data.use <- object@raw.data[genes.regress, object@cell.names, drop = FALSE]
   }
-  for (i in 1:max.bin) {
+
+  # input checking for parallel options
+  if(do.par){
+    if(num.cores == 1){
+      num.cores <- detectCores() / 2
+    } else {
+      if(num.cores > detectCores()){
+        num.cores <- detectCores() - 1
+        warning(paste0("num.cores set greater than number of available cores(", detectCores(), "). Setting num.cores to ", num.cores, "."))
+      }
+    }
+  } else {
+    if(num.cores != 1){
+      num.cores <- 1
+      warning("For parallel processing, please set do.par to TRUE.")
+    }
+  }
+  cl<- parallel::makeCluster(num.cores)
+
+  # using doSNOW library because it supports progress bar update
+  registerDoSNOW(cl)
+
+  opts <- list()
+  if(display.progress)
+  {
+    # define progress bar function
+    progress <- function(n) setTxtProgressBar(pb, n)
+    opts <- list(progress = progress)
+    time_elapsed <- Sys.time()
+  }
+
+  data.resid <- foreach(i = 1:max.bin, .combine = "rbind", .options.snow = opts) %dopar% {
     genes.bin.regress <- rownames(x = data.use)[bin.ind == i]
     gene.expr <- as.matrix(x = data.use[genes.bin.regress, , drop = FALSE])
     new.data <- do.call(
@@ -96,19 +134,17 @@ RegressOutResid <- function(
         }
       )
     )
-    if (i == 1) {
-      data.resid=new.data
-    }
-    if (i > 1) {
-      data.resid=rbind(data.resid,new.data)
-    }
-    if(display.progress) {
-      setTxtProgressBar(pb, i)
-    }
+    new.data
   }
+
   if (display.progress) {
+    time_elapsed <- Sys.time() - time_elapsed
+    cat(paste("\nTime Elapsed: ",time_elapsed, units(time_elapsed)))
     close(pb)
   }
+
+  stopCluster(cl)
+
   rownames(x = data.resid) <- genes.regress
   if (use.umi) {
     data.resid <- log1p(

diff --git a/man/RunTSNE.Rd b/man/RunTSNE.Rd
diff --git a/man/ScaleData.Rd b/man/ScaleData.Rd
diff --git a/man/SubsetByPredicate.Rd b/man/SubsetByPredicate.Rd