bplapply backend for more functions

mjz1 · Jun 21, 2024 · 7da0cdb · 7da0cdb
1 parent 655165d
commit 7da0cdb
Show file tree

Hide file tree

Showing 13 changed files with 49 additions and 39 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -9,9 +9,9 @@
 ^docs$
 ^pkgdown$
 ^\.github$
+^\.vscode$
 ^example$
-^vignettes/example$
-^vignettes/results$
+^results$
 Dockerfile
 ^docker*
 ^dev$

diff --git a/NAMESPACE b/NAMESPACE
@@ -60,6 +60,7 @@ export(segment_cnv)
 export(smooth_counts)
 export(summarise_chr_arm)
 import(SingleCellExperiment)
+importFrom(BiocParallel,bplapply)
 importFrom(Matrix,t)
 importFrom(S4Vectors,mcols)
 importFrom(SummarizedExperiment,"assay<-")

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 # scatools (development version)
 
-* Replace `perform_gc_cor` parallel backend to use `BiocParallel`
+* Added `segment=FALSE` as default
+* Replace `add_gc_cor`, `segment_cnv`, and `merge_segments` parallel backend to use `BiocParallel`
 
 # scatools 0.1.1
 

diff --git a/R/cnv_tools.R b/R/cnv_tools.R
@@ -39,23 +39,23 @@ smooth_counts <- function(sce, assay_name, ncores = 1, smooth_name = paste(assay
 #' @param assay_name Name of the assay to segment
 #' @param new_assay Name of the new assay
 #' @param verbose Verbosity
-#' @param ncores Number of cores to use
 #' @param ... Additional parameters to pass to [DNAcopy::segment]
 #'
 #' @inheritParams DNAcopy::segment
+#' @inheritParams run_scatools
 #'
 #' @return A `SingleCellExperiment` object
 #' @export
 #'
-segment_cnv <- function(sce, assay_name, new_assay = paste(assay_name, "segment", sep = "_"), alpha = 0.2, nperm = 10, min.width = 2, undo.splits = "none", verbose = 0, ncores = 1, ...) {
+segment_cnv <- function(sce, assay_name, new_assay = paste(assay_name, "segment", sep = "_"), alpha = 0.2, nperm = 10, min.width = 2, undo.splits = "none", verbose = 0, bpparam = BiocParallel::SerialParam(), ...) {
   chrs <- as.vector(GenomeInfoDb::seqnames(SummarizedExperiment::rowRanges(sce)))
   starts <- GenomicRanges::start(SummarizedExperiment::rowRanges(sce))
   sample_ids <- colnames(sce)
   # Perform segmentation
   # TODO: Make this function chromosome arm aware (ie segment within arms rather than chrs)
   logger::log_info("Segmenting CNVs")
-  segmented_counts <- pbmcapply::pbmclapply(1:ncol(sce), mc.cores = ncores, FUN = function(i) {
-    x <- as.vector(assay(sce, assay_name)[, i])
+  segmented_counts <- BiocParallel::bplapply(X = 1:ncol(sce), BPPARAM = bpparam, FUN = function(i) {
+    x <- as.vector(SummarizedExperiment::assay(sce, assay_name)[, i])
     obj <- DNAcopy::CNA(genomdat = x, chrom = chrs, maploc = starts, data.type = "logratio", sampleid = sample_ids[i], presorted = T)
     res <- withr::with_seed(3, smoothed_CNA_counts <- DNAcopy::segment(obj,
       alpha = alpha,
@@ -80,30 +80,31 @@ segment_cnv <- function(sce, assay_name, new_assay = paste(assay_name, "segment"
   segmented_counts <- as.matrix(dplyr::bind_rows(segmented_counts))
   rownames(segmented_counts) <- rownames(sce)
 
-  assay(sce, new_assay) <- segmented_counts
+  SummarizedExperiment::assay(sce, new_assay) <- segmented_counts
   return(sce)
 }
 
 
 #' Merge segment levels
 #'
 #' @inheritParams segment_cnv
+#' @inheritParams run_scatools
 #' @param smooth_assay name of assay with smoothed counts
 #' @param segment_assay name of assay with segmented counts
 #'
 #' @return A `SingleCellExperiment` object
 #' @export
 #'
-merge_segments <- function(sce, smooth_assay, segment_assay, new_assay = "segment_merged", ncores = 1) {
+merge_segments <- function(sce, smooth_assay, segment_assay, new_assay = "segment_merged", bpparam = bpparam) {
   smooth_counts <- log2(assay(sce, smooth_assay))
 
-  segment_df <- as.data.frame(assay(sce, segment_assay))
+  segment_df <- as.data.frame(SummarizedExperiment::assay(sce, segment_assay))
   segment_df[segment_df == 0] <- 1e-4
   segment_df <- log2(segment_df)
 
-  logger::log_info("Merging segments using {ncores} cores for {ncol(segment_df)} cells")
+  logger::log_info("Merging segments using {bpparam$workers} cores for {ncol(segment_df)} cells")
 
-  seg_ml_list <- pbmcapply::pbmclapply(seq_along(segment_df), mc.cores = ncores, function(i) {
+  seg_ml_list <- BiocParallel::bplapply(X = seq_along(segment_df), BPPARAM = bpparam, function(i) {
     cell_name <- names(segment_df)[i]
     seg_means_ml <- mergeLevels(
       vecObs = smooth_counts[, i],
@@ -169,7 +170,7 @@ identify_normal <- function(sce, assay_name, group_by = "clusters", method = c("
     n_normal_clusts <- 1
   }
 
-  sce$seg_sd <- colSds(assay(sce, assay_name), na.rm = TRUE)
+  sce$seg_sd <- colSds(SummarizedExperiment::assay(sce, assay_name), na.rm = TRUE)
 
 
   # Take the per cluster medians

diff --git a/R/gc_correction.R b/R/gc_correction.R
@@ -16,7 +16,7 @@
 #' @export
 #'
 add_gc_cor <- function(sce,
-                       gc = rowData(sce)$gc,
+                       gc = SummarizedExperiment::rowData(sce)$gc,
                        assay_name = "counts",
                        method = c("modal", "copykit", "loess"),
                        verbose = FALSE,
@@ -27,19 +27,19 @@ add_gc_cor <- function(sce,
   gc_slot <- paste0("counts_gc_", method)
 
   # Check if valid bins exists and pass correctly
-  if ("valid_bins" %in% names(assays(sce))) {
+  if ("valid_bins" %in% names(SummarizedExperiment::assays(sce))) {
     if (verbose) {
       logger::log_info("Found valid bins in sce object")
     }
-    valid_mat <- assay(sce, "valid_bins")
+    valid_mat <- SummarizedExperiment::assay(sce, "valid_bins")
   } else {
     warning("No valid bins matrix in sce object. Defaulting to all valid.")
     valid_mat <- NULL
   }
 
   if (method == "modal") {
     res <- perform_gc_cor(
-      mat = assay(sce, assay_name),
+      mat = SummarizedExperiment::assay(sce, assay_name),
       gc = gc,
       valid_mat = valid_mat,
       method = method,
@@ -49,16 +49,16 @@ add_gc_cor <- function(sce,
       ...
     )
 
-    assay(sce, gc_slot) <- res$counts
+    SummarizedExperiment::assay(sce, gc_slot) <- res$counts
 
     sce$modal_quantile <- as.integer(gsub("q", "", res$modal_quantiles))
   } else {
-    assay(sce, gc_slot) <- perform_gc_cor(
-      mat = assay(sce, assay_name),
+    SummarizedExperiment::assay(sce, gc_slot) <- perform_gc_cor(
+      mat = SummarizedExperiment::assay(sce, assay_name),
       gc = gc,
       valid_mat = valid_mat,
       method = method,
-      ncores = ncores,
+      bpparam = bpparam,
       verbose = verbose,
       ...
     )
@@ -80,7 +80,6 @@ add_gc_cor <- function(sce,
 #' @param gc GC corresponding to bins (rows) in the matrix
 #' @param valid_mat Matrix of TRUE/FALSE for valid bins. If none provided defaults to all TRUE
 #' @param method Specifies the type of GC correction to perform. One of `'modal', 'copykit', or 'loess'`
-#' @param ncores Number of cores to use if parallel backend is available
 #' @param bpparam BiocParallel params
 #' @param verbose Message verbosity (TRUE/FALSE)
 #' @param ... Additional arguments to be passed to GC correction methods
@@ -95,7 +94,7 @@ perform_gc_cor <- function(mat,
   bpparam, 
   verbose = FALSE, ...) {
   if (verbose) {
-    logger::log_info("Performing GC correction on {ncol(mat)} cells")
+    logger::log_info("Performing GC correction on {ncol(mat)} cells using {bpparam$workers} cores")
     logger::log_info("GC correction method: {method}")
   }
 

diff --git a/R/load_atac_bins.R b/R/load_atac_bins.R
@@ -39,7 +39,7 @@ load_atac_bins <- function(bin_dir,
   sce <- scuttle::addPerFeatureQCMetrics(sce, subsets = get_f_idx(sce$Sample))
 
   if (!is.null(save_to)) {
-    .save_to(object = sce, save_to = save_to, verbose = verbose)
+    save_to(object = sce, save_to = save_to, verbose = verbose)
   }
 
   if (verbose) {

diff --git a/R/run_scatools.R b/R/run_scatools.R
@@ -11,6 +11,7 @@
 #' @param outdir Output directory
 #' @param rmsb_size Remove small bins below a given size. Useful in cases where small bins are leftover at the ends of chromosomes. Defaults to 10% of the binwidth.
 #' @param gc_range GC range for bins to keep. Removes large GC outliers
+#' @param segment Logical. Determines whether to run segmentation or not
 #' @param min_bin_counts A bin requires at least `min_bin_counts` across `min_bin_prop` proportion of cells to be kept
 #' @param min_bin_prop Minimum proportion of cells with at least `min_bin_counts` per bin in order to keep a bin
 #' @param min_cell_counts  A cell requires at least `min_cell_counts` across `min_cell_prop` proportion of bins to be kept
@@ -31,6 +32,7 @@ run_scatools <- function(sample_id,
                          bin_name = prettyMb(getmode(width(bins))),
                          blacklist = NULL,
                          outdir = sample_id,
+                         segment = TRUE,
                          rmsb_size = 0.1*getmode(width(bins)),
                          gc_range = c(0.3, 0.8),
                          overwrite = FALSE,
@@ -110,11 +112,15 @@ run_scatools <- function(sample_id,
     smooth_counts(assay_name = "counts_gc_modal", ncores = ncores) %>%
     calc_ratios(assay_name = "counts_gc_modal_smoothed") %>%
     logNorm(assay_name = "counts_gc_modal_smoothed_ratios", name = "logr_modal") %>%
-    cluster_seurat(assay_name = "counts_gc_modal_smoothed_ratios", resolution = 0.5, verbose = FALSE) %>%
-    segment_cnv(assay_name = "counts_gc_modal_smoothed", ncores = ncores) %>%
-    merge_segments(smooth_assay = "counts_gc_modal_smoothed", segment_assay = "counts_gc_modal_smoothed_segment", ncores = ncores) %>%
-    identify_normal(assay_name = "segment_merged_logratios", group_by = "clusters", method = "gmm")
-  save_to(object = sce_processed, save_to = final_out)
+    cluster_seurat(assay_name = "counts_gc_modal_smoothed_ratios", resolution = 0.5, verbose = FALSE) 
+
+    if (segment) {
+    sce_processed <- segment_cnv(assay_name = "counts_gc_modal_smoothed", bpparam = bpparam) %>%
+      merge_segments(smooth_assay = "counts_gc_modal_smoothed", segment_assay = "counts_gc_modal_smoothed_segment", bpparam = bpparam) %>%
+      identify_normal(assay_name = "segment_merged_logratios", group_by = "clusters", method = "gmm")
+    save_to(object = sce_processed, save_to = final_out)
+    }
+
 
   # Save anndata
   if (save_h5ad == TRUE) {

diff --git a/R/scatools-package.R b/R/scatools-package.R
@@ -7,12 +7,10 @@ utils::globalVariables("where")
 
 
 ## usethis namespace: start
+#' @importFrom BiocParallel bplapply
 #' @importFrom methods as
 #' @importFrom methods is
 #' @importFrom S4Vectors mcols
-#' @importFrom SummarizedExperiment assay
-#' @importFrom SummarizedExperiment assay<-
-#' @importFrom SummarizedExperiment assays
 #' @importFrom stats acf
 #' @importFrom stats approx
 #' @importFrom stats as.dist
@@ -34,6 +32,9 @@ utils::globalVariables("where")
 #' @importFrom stats start
 #' @importFrom stats var
 #' @importFrom stats weighted.mean
+#' @importFrom SummarizedExperiment assay
+#' @importFrom SummarizedExperiment assay<-
+#' @importFrom SummarizedExperiment assays
 #' @importFrom utils read.table
 #' @importFrom utils write.table
 ## usethis namespace: end

diff --git a/man/add_gc_cor.Rd b/man/add_gc_cor.Rd
diff --git a/man/merge_segments.Rd b/man/merge_segments.Rd
diff --git a/man/perform_gc_cor.Rd b/man/perform_gc_cor.Rd
diff --git a/man/run_scatools.Rd b/man/run_scatools.Rd
diff --git a/man/segment_cnv.Rd b/man/segment_cnv.Rd