Skip to content

Commit

Permalink
progress
Browse files Browse the repository at this point in the history
  • Loading branch information
mjz1 committed Jun 17, 2024
1 parent 8a14f9b commit 49dfe0c
Show file tree
Hide file tree
Showing 23 changed files with 4,632 additions and 712 deletions.
1 change: 1 addition & 0 deletions .Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
source("renv/activate.R")
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Imports:
reshape2,
S4Vectors,
scuttle,
Seurat,
stringr,
tidyr,
tidyselect,
Expand All @@ -54,8 +55,8 @@ Suggests:
gtools,
knitr,
rmarkdown,
Seurat,
testthat (>= 3.0.0),
vegan,
zellkonverter
Config/testthat/edition: 3
Depends:
Expand Down
3 changes: 1 addition & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ export(add_gc_cor)
export(add_gc_freq)
export(add_hmmcopy)
export(add_ideal_mat)
export(add_umap_clusters)
export(bin_atac_frags)
export(bin_frags)
export(bin_frags_chr)
Expand Down Expand Up @@ -35,6 +34,7 @@ export(gc_modal_qc_filter)
export(get_assay_dat)
export(get_bin_ids)
export(get_bin_info)
export(get_blacklist)
export(get_chr_arm_bins)
export(get_counts)
export(get_counts1)
Expand Down Expand Up @@ -64,7 +64,6 @@ export(numbat_to_sce)
export(overlap_genes)
export(params_sc_hmm)
export(perform_gc_cor)
export(perform_umap_clustering)
export(phase_snps)
export(plot_cell_cna)
export(plot_cell_multi)
Expand Down
122 changes: 50 additions & 72 deletions R/bin_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#'
#' @inherit bin_frags
#'
#' @param ArrowFiles Named list or vector of ArrowFile paths
#' @param fragment_files Named list or vector of fragment file paths
#' @param bin_name Name of the bins (e.g. `'10Mb'`, `'500Kb'`, `'chr_arm'`). If not provided is automatically detected based on binwidth.
#' @param overwrite Logical. Overwrite previously existing results (default = FALSE)
#' @param return_mat Logical. Return the binned depth matrix (default = FALSE)
Expand All @@ -13,11 +13,12 @@
#' @return If `return_mat=TRUE`, returns a sparse binned depth matrix. Otherwise returns `NULL`
#'
#' @export
bin_atac_frags <- function(ArrowFiles,
bin_atac_frags <- function(sample_id,
fragment_file,
bins,
blacklist = NULL,
bin_name,
blacklist = blacklist,
outdir,
bin_name = prettyMb(getmode(width(bins))),
ncores = 1,
bpparams = BiocParallel::MulticoreParam(
workers = ncores,
Expand All @@ -31,22 +32,18 @@ bin_atac_frags <- function(ArrowFiles,
stopifnot(class(bins) %in% "GRanges")

# Compute fragments per bins and combine
matlist <- lapply(X = seq_along(ArrowFiles), FUN = function(i) {
sample_name <- names(ArrowFiles[i])
ArrowFile <- ArrowFiles[i]
sample_outdir <- file.path(outdir, bin_name)

dir.create(sample_outdir, showWarnings = FALSE, recursive = TRUE)

# Only bin frags if not done already
if (!file.exists(file.path(sample_outdir, "matrix.mtx.gz")) | overwrite) {
logger::log_info("{sample_name} -- Computing fragments in {bin_name} bins using {ncores} cores")
tmp <- bin_frags(ArrowFile = ArrowFile, bins = bins, blacklist = blacklist, outdir = sample_outdir, ncores = ncores, ...)
return(tmp)
} else {
logger::log_info("Binned counts file already found for ", sample_name)
}
})
bin_dir <- file.path(outdir, bin_name)

dir.create(bin_dir, showWarnings = FALSE, recursive = TRUE)

# Only bin frags if not done already
if (!file.exists(file.path(bin_dir, "matrix.mtx.gz")) | overwrite) {
logger::log_info("{sample_id} -- Computing fragments in {bin_name} bins using {ncores} cores")
tmp <- bin_frags(fragment_file = fragment_file, bins = bins, blacklist = blacklist, outdir = bin_dir, ncores = ncores, ...)
return(tmp)
} else {
logger::log_info("Binned counts file already found for {sample_id}")
}

# Invisible return
if (return_mat) {
Expand All @@ -61,9 +58,9 @@ bin_atac_frags <- function(ArrowFiles,

#' Bin fragments from ArrowFile
#'
#' Parallel enabled depth counting of read fragments in given genomic bins from `ArchR` processed ArrowFiles
#' Parallel enabled depth counting of read fragments in given genomic bins from fragment files
#'
#' @param ArrowFile ArrowFile
#' @param fragment_file Fragment file
#' @param bins Bins GRanges object
#' @param outdir Optional: Directory to write the `.mtx`, `barcodes`, and `bins` files
#' @param ncores Number of cores to use
Expand All @@ -73,7 +70,7 @@ bin_atac_frags <- function(ArrowFiles,
#'
#' @return Binned depth sparse matrix
#' @export
bin_frags <- function(ArrowFile, bins, blacklist = NULL, outdir = NULL, ncores = 1, bpparams = BiocParallel::MulticoreParam(workers = ncores, progressbar = TRUE), verbose = FALSE, ...) {
bin_frags <- function(fragment_file, bins, blacklist = NULL, outdir = NULL, ncores = 1, bpparams = BiocParallel::MulticoreParam(workers = ncores, progressbar = TRUE), verbose = FALSE, ...) {
requireNamespace("BiocParallel")

stopifnot(class(bins) %in% "GRanges")
Expand All @@ -87,7 +84,7 @@ bin_frags <- function(ArrowFile, bins, blacklist = NULL, outdir = NULL, ncores =
FUN = bin_frags_chr,
bins = bins,
blacklist = blacklist,
ArrowFile = ArrowFile,
fragment_file = fragment_file,
BPPARAM = bpparams
))

Expand All @@ -107,56 +104,30 @@ bin_frags <- function(ArrowFile, bins, blacklist = NULL, outdir = NULL, ncores =
#'
#' \code{bin_frags_chr} computes the fragments across bins in a single chromosome from an ArchR ArrowFile
#'
#' @param chr A single chromsome to compute depth information
#' @param chrom A single chromosome to compute depth information
#' @param bins A list of bins (can include all chromosomes)
#' @param ArrowFile Path to an ArrowFile generated by `ArchR`
#' @param fragment_file Path to an fragment file to bin
#'
#' @return Sparse matrix of binned fragment counts
#' @export
#'
#' @examples
#' \dontrun{
#' dp_mat <- bin_frags_chr(chr = "chr1", bins = get_chr_arm_bins("hg38"), ArrowFile = ArrowFile)
#' dp_mat <- bin_frags_chr(chrom = "chr1", bins = get_chr_arm_bins("hg38"), ArrowFile = ArrowFile)
#' }
bin_frags_chr <- function(chr, bins, blacklist = NULL, ArrowFile) {
if (!requireNamespace("ArchR", quietly = TRUE)) {
stop("Package \"ArchR\" must be installed to use this function.")
}

stopifnot(length(chr) == 1)
bin_frags_chr <- function(chrom, bins, blacklist = NULL, fragment_file) {
stopifnot(length(chrom) == 1)
stopifnot(class(bins) %in% "GRanges")
stopifnot(file.exists(ArrowFile))

# Fix for parallel HDF5 access (if this function is passed to parallel loop)
# We scope only to this function
withr::local_envvar(c("HDF5_USE_FILE_LOCKING" = FALSE, "RHDF5_USE_FILE_LOCKING" = FALSE))

# Export needed functions
# .getFragsFromArrow <- utils::getFromNamespace(".getFragsFromArrow", "ArchR")
# .availableCells <- utils::getFromNamespace(".availableCells", "ArchR")
# h5closeAll <- utils::getFromNamespace("h5closeAll", "rhdf5")


# Load ArchR package temporarily (if not loaded) to execute the following commands
# This is required as ArchR has depends from rhdf5 and other packages that it does not properly reference using '::'
# Side effect is that the ArchR namespace remains attached afterwards (ie all the packages it depends on)
if (!"ArchR" %in% .packages()) {
withr::local_package(package = "ArchR")
ArchR::addArchRThreads(1)
}

# Export needed functions
.getFragsFromArrow <- utils::getFromNamespace(".getFragsFromArrow", "ArchR")
.availableCells <- utils::getFromNamespace(".availableCells", "ArchR")

# Get cells
cellNames <- .availableCells(ArrowFile)
stopifnot(file.exists(fragment_file))

# Read in Fragments
fragments <- .getFragsFromArrow(ArrowFile, chr = chr, out = "GRanges", cellNames = cellNames)
fragments <- data.table::fread(fragment_file)[, 1:5]
colnames(fragments) <- c("chr", "start", "end", "barcode", "umi")
cells <- unique(fragments$barcode)
fragments <- fragments[fragments$chr == chrom, ]
fragments <- GenomicRanges::makeGRangesFromDataFrame(fragments, keep.extra.columns = T)


# Remove fragments in blacklist regions
# Remove fragments overlapping with blacklist regions
if (!is.null(blacklist)) {
if (!class(blacklist) %in% "GRanges") {
logger::log_warn("Blacklist must be of class 'GRanges'. Provided: {class(blacklist)}")
Expand All @@ -172,15 +143,15 @@ bin_frags_chr <- function(chr, bins, blacklist = NULL, ArrowFile) {


# Chromosome bins
bins_chr <- bins[BSgenome::seqnames(bins) == chr]
bins_chr <- bins[BSgenome::seqnames(bins) == chrom]

# Get overlapping indices
# Note: Each fragment represents two transposition events
# Therefore we count both the start and end site of each independently
start_hits <- GenomicRanges::findOverlaps(
subject = bins_chr,
query = GRanges(
seqnames = chr,
seqnames = chrom,
IRanges::IRanges(
start = GenomicRanges::start(fragments),
end = GenomicRanges::start(fragments)
Expand All @@ -190,7 +161,7 @@ bin_frags_chr <- function(chr, bins, blacklist = NULL, ArrowFile) {
end_hits <- GenomicRanges::findOverlaps(
subject = bins_chr,
query = GenomicRanges::GRanges(
seqnames = chr,
seqnames = chrom,
IRanges::IRanges(
start = GenomicRanges::end(fragments),
end = GenomicRanges::end(fragments)
Expand All @@ -199,18 +170,18 @@ bin_frags_chr <- function(chr, bins, blacklist = NULL, ArrowFile) {
)

# Match Cells
matchID <- S4Vectors::match(mcols(fragments)$RG, cellNames)
matchID <- S4Vectors::match(mcols(fragments)$barcode, cells)

# Create Sparse Matrix
mat <- Matrix::sparseMatrix(
i = c(S4Vectors::to(start_hits), S4Vectors::to(end_hits)),
j = as.vector(c(matchID, matchID)),
x = rep(1, 2 * length(fragments)),
dims = c(length(bins_chr), length(cellNames))
dims = c(length(bins_chr), length(cells))
)

# Name matrix
colnames(mat) <- cellNames
colnames(mat) <- cells
rownames(mat) <- paste(GenomicRanges::seqnames(bins_chr),
GenomicRanges::start(bins_chr),
GenomicRanges::end(bins_chr),
Expand Down Expand Up @@ -275,7 +246,10 @@ get_chr_arm_bins <- function(genome = "hg38", calc_gc = FALSE, bs_genome = NULL)
#' \dontrun{
#' bins <- get_tiled_bins(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38, tilewidth = 500000)
#' }
get_tiled_bins <- function(bs_genome = NULL, tilewidth = 500000, select_chrs = NULL, respect_chr_arms = TRUE) {
get_tiled_bins <- function(bs_genome = NULL,
tilewidth = 500000,
select_chrs = NULL,
respect_chr_arms = TRUE) {
if (is.null(bs_genome)) {
logger::log_error("Must provide 'bs_genome'")
stop()
Expand All @@ -298,16 +272,16 @@ get_tiled_bins <- function(bs_genome = NULL, tilewidth = 500000, select_chrs = N
stop()
}
# Get arm bins
arm_bins <- get_chr_arm_bins(genome = unique(genome(bs_genome))[1])
arm_bins <- get_chr_arm_bins(genome = unique(GenomeInfoDb::genome(bs_genome))[1])
bins <- lapply(X = seq_along(arm_bins), FUN = function(i) {
r <- arm_bins[i]

starts <- seq(start(r), end(r), by = tilewidth)
ends <- c(starts[2:(length(starts))] - 1, end(r))

r_new <- GRanges(seqnames = seqnames(r), ranges = IRanges(start = starts, end = ends), arm = r$arm)
r_new <- GenomicRanges::GRanges(seqnames = seqnames(r), ranges = IRanges(start = starts, end = ends), arm = r$arm)
}) %>%
GRangesList() %>%
GenomicRanges::GRangesList() %>%
unlist()
} else {
bins <- GenomicRanges::tileGenome(BSgenome::seqinfo(bs_genome)[select_chrs],
Expand All @@ -319,6 +293,10 @@ get_tiled_bins <- function(bs_genome = NULL, tilewidth = 500000, select_chrs = N
bins$binwidth <- IRanges::width(bins)

bins$bin_id <- paste(seqnames(bins), start(bins), end(bins), sep = "_")

if (!is.null(select_chrs)) {
bins <- keepSeqlevels(x = bins, value = select_chrs, pruning.mode = "coarse")
}

bins <- add_gc_freq(bs_genome, bins)
bins <- sort(bins)
Expand Down
Loading

0 comments on commit 49dfe0c

Please sign in to comment.