Skip to content

Commit

Permalink
added input ongen scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Tiffany Amariuta committed Jul 11, 2023
1 parent 2cca863 commit ef2a246
Show file tree
Hide file tree
Showing 5 changed files with 393 additions and 92 deletions.
56 changes: 1 addition & 55 deletions simulation_analysis/CoCoNet/CoCoNet_analysis.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
sim <- as.numeric(commandArgs(trailingOnly=TRUE))
#####
#simulations:
#1. 1000 genes, 10 tissues; done
#2. for each tissue, do kmeans on genes (gene x person matrix?), get 10-15 clusters (hard clustering)
#3. tissue-specific gene adjacency matrix: block diagonal matrix for each tissue separately, 1 if belong to the same cluster. 0 else.
#4. pick tissue 1 as causal tissue
#5. make outcome variable using this one tissue-specific gene adjacency matrix: MVN mean 0 cov matrix Asigma_1^2 + Isigma_0^2 where A is TSGAM
#6. p = 0.02, sigma1 = 0.13, sigma2 = 0.87
#7. apply model, pick tissue with highest log likelihood, can we use CoCoNet() function for this?

#2.
library(data.table)
library(MASS)
library(CoCoNet)
Expand All @@ -29,15 +18,13 @@ causalgenes <- as.matrix(fread("TCSC/simulation_analysis/Nov_Y1_alpha_nCausalGen
for (ti in tissues){
print(ti)
ge <- as.matrix(fread(paste0("TCSC/simulation_analysis/SEG/totalge/",samplesizes[samp],"/TotExp_Nov_Tissue",ti,"_sim",sim,".txt.gz"), header = F))
#ge <- fread(paste0("totalexpression/Nov_0.75_ggcoreg/",samplesizes[samp],"/TotExp_Nov_Group1_Tissue",ti,"_sim",sim,".txt.gz"), header = F)
kobj <- kmeans(ge, centers = 10) #clusters on the rows (genes), cols are people
clusters <- kobj$cluster
adjmat <- matrix(0,nrow(ge),nrow(ge))
for (i in 1:10){
w <- which(clusters == i)
adjmat[w,w] <- 1
}
#if(ti == 0){outcomevar <- mvrnorm(1, mu = rep(0,nrow(adjmat)), Sigma = adjmat*0.13^2 + diag(nrow(adjmat))*0.87^2 )} #pergene, per disease (so just 1)
result = CoCoNet(outcomevar_binary[1:ngenes], max_path = 1, adjmat)
print(result$loglikelihood)
m[samp,ti+1] <- result$loglikelihood
Expand All @@ -47,47 +34,6 @@ causalgenes <- as.matrix(fread("TCSC/simulation_analysis/Nov_Y1_alpha_nCausalGen
write.table(m, file = filename, row.names = F, col.names = F, sep = "\t", quote = F)
system(paste0("gzip TCSC/simulation_analysis/CoCoNet/results/res_",sim,".txt"))

#afterward
sims <- 1000
res_mat <- data.frame()
for (sim in 1:sims){
tryCatch({
res <- as.matrix(fread(paste0("TCSC/simulation_analysis/CoCoNet/results/res_",sim,".txt.gz")))
res_mat <- rbind(res_mat, cbind(1:6,res))
},error=function(cond){message(paste("Didn't finish ", sim))})
}
power <- c()
fpr <- c()
for (i in 1:6){
w <- which(res_mat[,1] == i)
res_mat2 <- res_mat[w,-1]
performance <- sapply(1:nrow(res_mat2), function(x) which.max(as.numeric(res_mat2[x,]))) #1 1 1 2
power[i] <- length(which(performance == 1))/length(performance)*100
fpr[i] <- 100-power[i]
}
####

#https://lulushang.org/docs/Projects/CoCoNet/Reproduce

#load("CoCoNet/outcome_tissue_scale.RData") #double, one column per trait. rows = genes. already for one tissue only?
#load("CoCoNet/tissue_net.RData") #very large.

#gene by gene correlation, squared? how do they compute it? follow tutorial.

#scaled gene level effect sizes, twas res? how do they compute it? follow tutorial.

#A = tissue_net[[1]] #for one tissue
#result = CoCoNet(outcome_tissue_scale[,3], max_path = 1, A) #genes on rows 1 column for disease.


#from https://lulushang.org/docs/Projects/CoCoNet/Reproduce
#library(data.table)
#load("CoCoNet/outcome_cell_scale.RData")
#gene_table = fread("CoCoNet/genetable.txt.gz", header=T)
#motif = fread("CoCoNet/motif.txt.gz") # same as in GTEx tissue PANDA input, downloaded from same website
#ppi = fread("CoCoNet/ppi.txt.gz") # same as in GTEx tissue PANDA input, downloaded from same website
#expr = read.table("CoCoNet/GTEx_droncseq_hip_pcf/GTEx_droncseq_hip_pcf.umi_counts.txt.gz",header=T) # GTEx single cell expression genes x cells
#cell_anno = read.csv("CoCoNet/GTEx_droncseq_hip_pcf/cell_annotation.csv",header=T) # cell type annotation for GTEx single cell expression cells x 6 columns
#library(devtools)
#devtools::install_github('xzhoulab/CoCoNet')


19 changes: 19 additions & 0 deletions simulation_analysis/CoCoNet/CoCoNet_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
library(data.table)
sims <- 1000
res_mat <- data.frame()
for (sim in 1:sims){
tryCatch({
res <- as.matrix(fread(paste0("TCSC/simulation_analysis/CoCoNet/results/res_",sim,".txt.gz")))
res_mat <- rbind(res_mat, cbind(1:6,res))
},error=function(cond){message(paste("Didn't finish ", sim))})
}
power <- c()
fpr <- c()
for (i in 1:6){
w <- which(res_mat[,1] == i)
res_mat2 <- res_mat[w,-1]
performance <- sapply(1:nrow(res_mat2), function(x) which.max(as.numeric(res_mat2[x,]))) #1 1 1 2
power[i] <- length(which(performance == 1))/length(performance)*100
fpr[i] <- 100-power[i]
}

44 changes: 44 additions & 0 deletions simulation_analysis/Nov_reformat_sumstats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
sim_vegene_vesnp <- commandArgs(trailingOnly=TRUE)
s <- strsplit(sim_vegene_vesnp, split = "_")[[1]]
sim <- as.numeric(s[1])
ve_gene <- s[2]
ve_snp <- s[3]
ngwas <- s[4]

library(data.table)

#Make sumstats with this format:
#SNP A1 A2 N CHISQ Z #for ldsc
#rs3766192 T C 25604.000 2.604996 -1.614
#for rolypoly
#chrom pos rsid beta se maf

snps <- fread("TCSC/simulation_analysis/1KG_HM3_chr1.bim", header = F)
frq <- fread("TCSC/simulation_analysis/1000G.EUR.QC.hm3_noMHC.1.frq", header = T)
frq <- frq[match(snps$V2,frq$SNP),]

sumstat_names <- c()
if(as.numeric(ve_gene) != 0){sumstat_names <- c(sumstat_names, paste0("vgene",ve_gene,"_sim",sim,".txt"))}
if(as.numeric(ve_snp) != 0){sumstat_names <- c(sumstat_names, paste0("vgene",ve_gene,"_vsnp",ve_snp,"_sim",sim,".txt"))}

for (i in 1:length(sumstat_names)){
sumstats <- cbind(snps$V2, snps$V5, snps$V6, ngwas, 0, 0) #LDSC SEG
sumstats_rolypoly <- cbind(snps$V1,snps$V4,snps$V2,0,0,frq$MAF) #RolyPoly
mysumstats <- as.matrix(fread(paste0("TCSC/simulation_analysis/GWASsumstats/sumstats_",ngwas,"_",sumstat_names[i],".gz")), header = F)
z <- as.numeric(mysumstats[,2])/as.numeric(mysumstats[,3])
beta <- as.numeric(mysumstats[,2])
se <- as.numeric(mysumstats[,3])
chisq <- z^2
sumstats_rolypoly[,5] <- se
sumstats_rolypoly[,4] <- beta
sumstats[,6] <- z
sumstats[,5] <- chisq

colnames(sumstats) <- c("SNP","A1","A2","N","CHISQ","Z")
write.table(sumstats, file = paste0("TCSC/simulation_analysis/SEG/sumstats/sumstats_",ngwas,"_",sumstat_names[i]), row.names = F, col.names = T, sep = "\t", quote = F)
system(paste0("gzip -f TCSC/simulation_analysis/SEG/sumstats/sumstats_",ngwas,"_",sumstat_names[i]))

colnames(sumstats_rolypoly) <- c("chrom","pos","rsid","beta","se","maf")
write.table(sumstats_rolypoly, file = paste0("TCSC/simulation_analysis/RolyPoly/sumstats/sumstats_",ngwas,"_",sumstat_names[i]), row.names = F, col.names = T, sep = "\t", quote = F)
system(paste0("gzip -f TCSC/simulation_analysis/RolyPoly/sumstats/sumstats_",ngwas,"_",sumstat_names[i]))
}
Loading

0 comments on commit ef2a246

Please sign in to comment.