diff --git a/scripts/count_coverage_for_single_sample_by_CODEX.R b/scripts/count_coverage_for_single_sample_by_CODEX.R deleted file mode 100644 index fb1f101..0000000 --- a/scripts/count_coverage_for_single_sample_by_CODEX.R +++ /dev/null @@ -1,52 +0,0 @@ -# docker run --rm -it -v /data/work/home/iwkusmirek/cnv_cov:/data/work/home/iwkusmirek/cnv_cov -w /data/work/home/iwkusmirek/cnv_cov biodatageeks/cnv-opt-codexcov Rscript count_coverage_for_single_sample_by_CODEX.R NA06984.chrom11.ILLUMINA.bwa.CEU.exome.20120522.bam codex.bed 20 11 NA06984_codex.txt - -count_coverage_for_single_sample_by_CODEX <- function(bambedObj, mapqthres) { - ref <- bambedObj$ref - chr <- bambedObj$chr - bamdir <- bambedObj$bamdir - st <- start(ref)[1] - ed <- end(ref)[length(ref)] - Y <- matrix(NA, nrow = length(ref), ncol = 1) - readlength <- rep(NA, 1) - i <- 1 - bamurl <- bamdir[i] - which <- RangesList(quack = IRanges(st - 10000, ed + 10000)) - names(which) <- as.character(chr) - what <- c("pos", "mapq", "qwidth") - flag <- scanBamFlag(isDuplicate = FALSE, isUnmappedQuery = FALSE, - isNotPassingQualityControls = FALSE, - isFirstMateRead = TRUE) - param <- ScanBamParam(which = which, what = what, flag = flag) - bam <- scanBam(bamurl, param = param)[[1]] - mapqfilter <- (bam[["mapq"]] >= mapqthres) - readlength[i] <- round(mean(bam[["qwidth"]])) - if(is.nan(readlength[i])){ - flag <- scanBamFlag(isDuplicate = FALSE, isUnmappedQuery = FALSE, - isNotPassingQualityControls = FALSE) - param <- ScanBamParam(which = which, what = what, flag = flag) - bam <- scanBam(bamurl, param = param)[[1]] - mapqfilter <- (bam[["mapq"]] >= mapqthres) - readlength[i] <- round(mean(bam[["qwidth"]])) - } - message("Getting coverage for sample ", bamurl, ": ", - "read length ", readlength[i], ".", sep = "") - irang <- IRanges(bam[["pos"]][mapqfilter], width = - bam[["qwidth"]][mapqfilter]) - Y[, i] <- countOverlaps(ref, irang) - list(Y = Y, readlength = readlength) -} - -library(CODEX) -args = commandArgs(trailingOnly=TRUE) -if (length(args) != 5) { - stop("Invalid number of arguments!!!", call.=FALSE) -} -bamFile <- args[1] -bedFile <- args[2] -mapqthres <- strtoi(args[3]) -chr <- args[4] -outputFile <- args[5] - -bambedObj <- getbambed(bamdir = bamFile, bedFile = bedFile, sampname = NULL, projectname = NULL, chr) -coverageObj <- count_coverage_for_single_sample_by_CODEX(bambedObj, mapqthres = mapqthres) -write.table(data.frame(coverageObj$Y), sep=",", col.names=F, row.names=F, quote=F, file = outputFile) diff --git a/scripts/run_CODEXCOV_from_Yhat.R b/scripts/run_CODEXCOV_from_Yhat.R deleted file mode 100644 index 788efe1..0000000 --- a/scripts/run_CODEXCOV_from_Yhat.R +++ /dev/null @@ -1,133 +0,0 @@ -segment_without_K <- function(Y_qc, Yhat, sampname_qc, ref_qc, chr, lmax, mode) { - finalcall <- matrix(ncol = 9) - lmax <- lmax - 1 - for (sampno in 1:ncol(Y_qc)) { - message("Segmenting sample ", sampno, ": ", sampname_qc[sampno], - ".") - y <- Y_qc[, sampno] -# yhat <- Yhat[[which(K == optK)]][, sampno] - yhat <- Yhat[,sampno] - num <- length(y) - y <- c(y, rep(0, lmax)) - yhat <- c(yhat, rep(0, lmax)) - i <- rep(1:num, rep(lmax, num)) - j <- rep(1:lmax, num) + i - yact <- rep(0, length(i)) - lambda <- rep(0, length(i)) - for (k in 1:num) { - yact[(lmax * k - (lmax - 1)):(lmax * k)] <- cumsum(y[k:(k + - lmax)])[-1] - lambda[(lmax * k - (lmax - 1)):(lmax * k)] <- cumsum(yhat[k:(k + - lmax)])[-1] - } - i <- i[j <= num] - yact <- yact[j <= num] - lambda <- lambda[j <= num] - j <- j[j <= num] - yact[lambda < 20] <- 20 - lambda[lambda < 20] <- 20 - if (mode == "integer") { - chat <- round(2 * (yact/lambda)) - } - else if (mode == "fraction") { - chat <- 2 * (yact/lambda) - } - lratio <- (1 - chat/2) * lambda + log((chat + 1e-04)/2.0001) * - yact - chat[chat > 5] <- 5 - if (sum(lratio > 0) > 0) { - if (sum(lratio > 0) >= 2) { - finalmat <- (cbind(i, j, yact, lambda, chat, - lratio))[lratio > 0, ] - finalmat <- finalmat[order(-finalmat[, 6]), ] - s <- 1 - while (s <= (nrow(finalmat))) { - rowstart <- finalmat[s, 1] - rowend <- finalmat[s, 2] - rowsel <- (finalmat[, 1] <= rowend & finalmat[, - 2] >= rowstart) - rowsel[s] <- FALSE - finalmat <- finalmat[!rowsel, ] - if (is.vector(finalmat)) { - finalmat <- t(as.matrix(finalmat)) - } - s <- s + 1 - } - } - if (sum(lratio > 0) == 1) { - finalmat <- (cbind(i, j, yact, lambda, chat, - lratio))[lratio > 0, ] - finalmat <- t(as.matrix(finalmat)) - } - finalmat <- round(finalmat, digits = 3) - loglikeij <- cumsum(finalmat[, 6]) - mBIC <- rep(NA, length(loglikeij)) - for (s in 1:nrow(finalmat)) { - tau <- sort(unique(c(as.vector(finalmat[1:s, - 1:2]), 1, num))) - P <- length(tau) - 2 - mbic <- loglikeij[s] - mbic <- mbic - 0.5 * sum(log(tau[2:length(tau)] - - tau[1:(length(tau) - 1)])) - mbic <- mbic + (0.5 - P) * log(num) - mBIC[s] <- mbic - } - mBIC <- round(mBIC, digits = 3) - if (mBIC[1] > 0) { - finalmat <- cbind(rep(sampname_qc[sampno], nrow(finalmat)), - rep(chr, nrow(finalmat)), finalmat) - finalmat <- (cbind(finalmat, mBIC)[1:which.max(mBIC), - ]) - finalcall <- rbind(finalcall, finalmat) - } - } - } - finalcall <- finalcall[-1, ] - if (is.vector(finalcall)) { - finalcall <- t(as.matrix(finalcall)) - } - st <- start(ref_qc)[as.numeric(finalcall[, 3])] - ed <- end(ref_qc)[as.numeric(finalcall[, 4])] - cnvtype <- rep(NA, length(st)) - cnvtype[finalcall[, 7] < 2] <- "del" - cnvtype[finalcall[, 7] > 2] <- "dup" - if (nrow(finalcall) == 1) { - finalcall <- t(as.matrix(c(finalcall[, 1:2], cnvtype, - st, ed, (ed - st + 1)/1000, finalcall[, 3:9]))) - } - else { - finalcall <- cbind(finalcall[, 1:2], cnvtype, st, ed, - (ed - st + 1)/1000, finalcall[, 3:9]) - } - colnames(finalcall) <- c("sample_name", "chr", "cnv", "st_bp", - "ed_bp", "length_kb", "st_exon", "ed_exon", "raw_cov", - "norm_cov", "copy_no", "lratio", "mBIC") - rownames(finalcall) <- rep("", nrow(finalcall)) - lratio = as.numeric(finalcall[, "lratio"]) - finalcall -} - - -run_CODEXCOV <- function(K_from, - K_to, - lmax, - input_cov_table, - input_cov_table_norm, - input_bed, - reference_sample_set_file, - output_calls_file){ - - Y <- read.csv(input_cov_table) - Yhat <- read.csv(input_cov_table_norm) - targets <- read.delim(input_bed) - rownames(Y) <- 1:nrow(Y) - rownames(targets) <- 1:nrow(targets) - samples <- colnames(Y) - - ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) - - finalcall <- segment_without_K(Y, Yhat, samples, ref, targets[1,'chr'], lmax, mode = "integer") - - finalcall <- unify_calls_format(finalcall)$finalcall - write.csv(finalcall, output_calls_file, row.names=F) -} diff --git a/scripts/run_CODEXCOV_knn_by_regions.R b/scripts/run_CODEXCOV_knn_by_regions.R deleted file mode 100644 index ef392c4..0000000 --- a/scripts/run_CODEXCOV_knn_by_regions.R +++ /dev/null @@ -1,46 +0,0 @@ -run_CODEXCOV <- function(K_from, - K_to, - lmax, - number_of_neighbours, - target_id, - input_cov_table, - input_bed, - output_Yhat_file){ - - Y <- read.csv(input_cov_table) - targets <- read.delim(input_bed) - rownames(Y) <- 1:nrow(Y) - rownames(targets) <- 1:nrow(targets) - - correlation <- cor(t(Y[target_id,]),t(Y[,])) - thr <- sort(correlation,decreasing=TRUE)[number_of_neighbours] - indices <- which(correlation >= thr) - indices <- as.numeric(gsub("\\D+", "", indices)) - - print(indices) - - Y <- Y[indices,] - targets <- targets[indices,] - rownames(Y) <- 1:nrow(Y) - rownames(targets) <- 1:nrow(targets) - - ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) - gcmapp1_result <- gcmapp1(targets[1,'chr'], ref) - gc <- gcmapp1_result$gc - - normObj_result <- normObj1(as.matrix(Y), gc, K=K_from:K_to) - Yhat <- normObj_result$Yhat - AIC <- normObj_result$AIC - BIC <- normObj_result$BIC - RSS <- normObj_result$RSS - K <- normObj_result$K - optK <- K[which.max(BIC)] - Yhat_opt <- Yhat[[which(K == optK)]][,] - Yhat_1 <- Yhat[[which(K == 1)]][,] - Yhat_2 <- Yhat[[which(K == 2)]][,] - Yhat_3 <- Yhat[[which(K == 3)]][,] - write.table(t(Yhat_opt[which(indices==target_id),]), output_Yhat_file, sep=',', row.names=FALSE, col.names=FALSE) - write.table(t(Yhat_1[which(indices==target_id),]), paste(output_Yhat_file,'_1',sep=''), sep=',', row.names=FALSE, col.names=FALSE) - write.table(t(Yhat_2[which(indices==target_id),]), paste(output_Yhat_file,'_2',sep=''), sep=',', row.names=FALSE, col.names=FALSE) - write.table(t(Yhat_3[which(indices==target_id),]), paste(output_Yhat_file,'_3',sep=''), sep=',', row.names=FALSE, col.names=FALSE) -} diff --git a/scripts/run_CODEXCOV_random_by_regions.R b/scripts/run_CODEXCOV_random_by_regions.R deleted file mode 100644 index ce71d2f..0000000 --- a/scripts/run_CODEXCOV_random_by_regions.R +++ /dev/null @@ -1,44 +0,0 @@ -run_CODEXCOV <- function(K_from, - K_to, - lmax, - number_of_neighbours, - target_id, - input_cov_table, - input_bed, - output_Yhat_file){ - - Y <- read.csv(input_cov_table) - targets <- read.delim(input_bed) - rownames(Y) <- 1:nrow(Y) - rownames(targets) <- 1:nrow(targets) - - indices <- sample(1:nrow(Y), number_of_neighbours, replace=F) - indices <- sort(unique(c(target_id,indices))) - - print(indices) - - Y <- Y[indices,] - targets <- targets[indices,] - rownames(Y) <- 1:nrow(Y) - rownames(targets) <- 1:nrow(targets) - - ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) - gcmapp1_result <- gcmapp1(targets[1,'chr'], ref) - gc <- gcmapp1_result$gc - - normObj_result <- normObj1(as.matrix(Y), gc, K=K_from:K_to) - Yhat <- normObj_result$Yhat - AIC <- normObj_result$AIC - BIC <- normObj_result$BIC - RSS <- normObj_result$RSS - K <- normObj_result$K - optK <- K[which.max(BIC)] - Yhat_opt <- Yhat[[which(K == optK)]][,] - Yhat_1 <- Yhat[[which(K == 1)]][,] - Yhat_2 <- Yhat[[which(K == 2)]][,] - Yhat_3 <- Yhat[[which(K == 3)]][,] - write.table(t(Yhat_opt[which(indices==target_id),]), output_Yhat_file, sep=',', row.names=FALSE, col.names=FALSE) - write.table(t(Yhat_1[which(indices==target_id),]), paste(output_Yhat_file,'_1',sep=''), sep=',', row.names=FALSE, col.names=FALSE) - write.table(t(Yhat_2[which(indices==target_id),]), paste(output_Yhat_file,'_2',sep=''), sep=',', row.names=FALSE, col.names=FALSE) - write.table(t(Yhat_3[which(indices==target_id),]), paste(output_Yhat_file,'_3',sep=''), sep=',', row.names=FALSE, col.names=FALSE) -}