add figs

Huizhong1993 · Mar 13, 2023 · 869e7c9 · 869e7c9
1 parent 4dbc76d
commit 869e7c9
Show file tree

Hide file tree

Showing 148 changed files with 21,778 additions and 3 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -1,2 +1,8 @@
-# TCM
-Codes used in pan-T cell analysis
+# T Cell Map
+Codes used in pan-cancer T cell study
+
+- "TCellMap.R" is a tool we developed to automatically align and annotate T cells in a scRNA-seq dataset. It uniformly aligns T cells from the query dataset with the T cell maps that we built in our pan-cancer T cell study.
+
+- The python script "Res_largerT.py" functions as a pipeline for the analysis and visualization of SRT data.
+
+- The directories titled "fig1/2/3/4/5/6" contain additional scripts used for generating figures in the manuscript. Furthermore, the "data_preprocess" folder hosts scripts designed to preprocess raw data. Those scripts are used for tasks such as batch-correction and dimensional reduction, etc.
diff --git a/TCellMap.R b/TCellMap.R
@@ -39,18 +39,42 @@ querySeuratObj <- readRDS(opt$queryData)
 DefaultAssay(refSeuratObj) <- "RNA"
 DefaultAssay(querySeuratObj) <- "RNA"
 
+cellCycleGeneT1 <- read_tsv("/rsrch3/scratch/genomic_med/ychu2/projects/p1review/R3Q7/knowledge/public/database/general/cell-cy\
+cle-gene-list.txt")
+cellCycleGeneT2 <- read_tsv("/rsrch3/scratch/genomic_med/ychu2/projects/p1review/R3Q7/knowledge/public/database/general/regev_l\
+ab_cell_cycle_genes.txt")
+
+HSMarkers <- c("HSPA6", "HSPA1A", "HSPA1B", "DNAJB1", "HSPH1", "HSP90AA1", "HSPE1", "HSPB1", "BAG3", "HSPD1", "DNAJA1", "HSP90A\
+B1", "HSPA8", "DNAJB4", "DNAJA4")
+
+DEG_path <- file.path(dirname(opt$referenceData), "snn-single-markers.tsv")
+## DEG_path <- file.path(dirname(CD4_ReferenceDataPath), "snn-single-markers.tsv")
+DEGs <- read_tsv(DEG_path) %>%
+    arrange(cluster, desc(avg_logFC)) %>%
+    filter(avg_logFC > 0) %>%
+    group_by(cluster) %>%
+    top_n(100) %>%
+    pull(gene)
+
 refSeuratObj <- refSeuratObj %>%
     NormalizeData(verbose = T) %>%
     FindVariableFeatures(selection.method = "vst")
 hvgR = VariableFeatures(object = refSeuratObj)
+hvgR <- union(hvgR, HSMarkers)
+hvgR <- intersect(hvgR, DEGs)
+hvgR <- setdiff(hvgR, cellCycleGeneT1$marker)
+hvgR <- setdiff(hvgR, cellCycleGeneT2$marker)
 refSeuratObj <- refSeuratObj %>%
     ScaleData(verbose = T) %>%
     RunPCA(verbose = T, features = hvgR)
 
 querySeuratObj <- querySeuratObj %>%
 NormalizeData(verbose = T) %>%
 FindVariableFeatures(selection.method = "vst")
-hvgQ = VariableFeatures(object = querySeuratObj)
+hvgR <- union(hvgR, HSMarkers)
+hvgR <- intersect(hvgR, DEGs)
+hvgR <- setdiff(hvgR, cellCycleGeneT1$marker)
+hvgR <- setdiff(hvgR, cellCycleGeneT2$marker)
 querySeuratObj <- querySeuratObj %>%
     ScaleData(verbose = T) %>%
     RunPCA(verbose = T, features = hvgQ)

diff --git a/TCellMap2.R b/TCellMap2.R
@@ -0,0 +1,184 @@
+#'--------------------------------------------------------------
+#' filename : SeuratMapping.R
+#' Date : 2021-07-21
+#' contributor : Yanshuo Chu
+#' function: SeuratMapping
+#' R: 4.0.3
+#' seurat: 4
+#'--------------------------------------------------------------
+
+print('<==== SeuratMapping.R ====>')
+
+suppressMessages({
+    library(optparse)
+    library(tidyverse)
+    library(Seurat)
+    library(cowplot)
+})
+
+option_list = list(
+    make_option(c("-d","--data"),
+                type = 'character',
+                help = 'data.rds',
+                metavar = 'character'),
+    make_option(c("-o","--out"),
+                type = 'character',
+                help = 'out',
+                metavar = 'character')
+);
+
+opt_parser = OptionParser(option_list = option_list);
+opt = parse_args(opt_parser);
+
+seuratObj <- readRDS(opt$data)
+
+## NKT_PATH="/rsrch3/scratch/genomic_med/ychu2/data/tmp/Tcellproject/analysis/validate/NKTMAIT_V6/nPC_5/UMAP_dist_0.1_nneighbor_35/p1_NKTMAIT_v6_UMAP_dist_0.1_nneighbor_35_CLUSTER_res_0.3/cluster.rds"
+## seuratObj <- readRDS(NKT_PATH)
+
+DEGsT <- read_tsv(file.path(dirname(opt$data), "snn-single-markers.tsv"))
+DEGs <- unique(DEGsT %>% pull(gene))
+
+Idents(seuratObj) <- seuratObj$batch
+sortedBatch <- sort(table(seuratObj$batch), decreasing = T)
+totalT <- c()
+
+###############################################################################
+#               Here Map to Each Batch, find out best reference               #
+###############################################################################
+# Or based on a score method to map query to reference ########################
+# Compare the score method and together method, check the difference###########
+
+
+for(i in 1:length(sortedBatch)){
+    tempRefObj <- subset(seuratObj, idents = names(sortedBatch)[i])
+    for(j in 1:length(sortedBatch)){
+
+        if(i == j){next}
+        ## if(sortedBatch[i] < 500){next}
+        tryCatch({
+            tempQueryObj <- subset(seuratObj,
+                                   idents = names(sortedBatch)[j])
+
+            DefaultAssay(tempRefObj) <- "RNA"
+            DefaultAssay(tempQueryObj) <- "RNA"
+
+            tempRefObj$reference.cell.type <- tempRefObj$seurat_clusters
+
+            temp.anchors <- FindTransferAnchors(reference = tempRefObj,
+                                                query = tempQueryObj,
+                                                features = DEGs,
+                                                k.filter = NA,
+                                                reduction = "pcaproject",
+                                                reference.reduction = "pca",
+                                                reference.assay = "RNA",
+                                                query.assay = "RNA")
+
+            ## Error: No features to use in finding transfer anchors. To troubleshoot, try explicitly providing features to the features \
+            ## 2 parameter and ensure that they are present in both reference and query assays.
+            ## 1 Execution halted
+
+            #' create a new umap model, exactly the same as the existing one #############
+
+            tempRefObj[["umap.new"]] <- CreateDimReducObject(
+                embeddings = tempRefObj[["umap"]]@cell.embeddings, key = "UMAPnew_")
+            umap.new.model <- list()
+            umap.new.model$n_epochs <- 500
+            umap.new.model$alpha <-1
+            umap.new.model$method <- "umap"
+            umap.new.model$negative_sample_rate <- 5
+            umap.new.model$gamma <- 1
+            umap.new.model$approx_pow <- 0
+            umap.new.model$metric$cosine <- list()
+            umap.new.model$embedding <- tempRefObj[["umap.new"]]@cell.embeddings
+            ab_param <- uwot:::find_ab_params(spread = 1, min_dist = 0.3)
+            umap.new.model$a <- ab_param["a"]
+            umap.new.model$b <- ab_param["b"]
+            tempRefObj[["umap.new"]]@misc$model <- umap.new.model
+
+
+            tempQueryObj <- MapQuery(anchorset = temp.anchors,
+                                     reference = tempRefObj,
+                                     query = tempQueryObj,
+                                     refdata = list(celltype = "reference.cell.type"),
+                                     reference.reduction = "pca",
+                                     reduction.model = "umap.new")
+
+            correctPosition <- as.numeric(
+                tempQueryObj$predicted.celltype == tempQueryObj$seurat_clusters)
+            wrongPosition <- as.numeric(
+                tempQueryObj$predicted.celltype != tempQueryObj$seurat_clusters)
+
+            tempTibble <- tibble(
+                QueryDataSet = names(sortedBatch)[j],
+                RefDataSet = names(sortedBatch)[i],
+                MatchedCellNum = sum(correctPosition),
+                QueryDataCellNum = length(correctPosition),
+                RefDataCellNum = length(Cells(tempRefObj)),
+                ACC = sum(correctPosition)/length(correctPosition))
+
+            totalT <- bind_rows(totalT, tempTibble)
+
+            p1 <- DimPlot(tempRefObj,
+                          reduction = "umap",
+                          group.by = "reference.cell.type",
+                          label = TRUE,
+                          label.size = 3,
+                          repel = TRUE) +
+                NoLegend() +
+                ggtitle("Reference annotations")
+
+            p2 <- DimPlot(tempQueryObj,
+                          reduction = "ref.umap",
+                          group.by = "predicted.celltype", label = TRUE,
+                          label.size = 3, repel = TRUE) +
+                NoLegend() +
+                ggtitle("Query transferred labels")
+
+            p3 <- DimPlot(subset(tempQueryObj, cells = Cells(tempQueryObj)[as.logical(correctPosition)]),
+                          reduction = "umap",
+                          group.by = "predicted.celltype", label = TRUE,
+                          label.size = 3, repel = TRUE) +
+                NoLegend() +
+                ggtitle(paste0("Query correct, ACC=", round(sum(correctPosition)/length(correctPosition), 2)))
+
+            p4 <- DimPlot(subset(tempQueryObj, cells = Cells(tempQueryObj)[as.logical(wrongPosition)]),
+                          reduction = "umap",
+                          group.by = "seurat_clusters", label = TRUE,
+                          label.size = 3, repel = TRUE) +
+                NoLegend() +
+                ggtitle("Query wrong")
+
+            if(str_detect(opt$data, "CD8")){
+                p1 <- p1 + scale_x_reverse()
+                p2 <- p2 + scale_x_reverse()
+                p3 <- p3 + scale_x_reverse()
+                p4 <- p4 + scale_x_reverse()
+            }
+
+            g <- plot_grid(p1,p2,p3,p4, nrow = 1, axis = "bt", align = 'h')
+            pdf(file.path(opt$out, paste0("Query-", names(sortedBatch)[j],
+                                          "_Ref-", names(sortedBatch)[i], "_umap.pdf")),
+                width = 12, height=3)
+            print(g)
+            dev.off()
+        },
+        error = function(e){print(e)}
+        )
+    }
+}
+
+g <- totalT %>% ggplot() +
+    geom_point(aes(x = RefDataSet,
+                   y = QueryDataSet,
+                   size = ACC,
+                   color = RefDataCellNum,
+                   fill = RefDataCellNum), shape = 22) +
+    xlab("RefDataSet") + ylab("QueryDataSet") +
+    theme_classic() +
+    theme(text = element_text(size = 10),
+          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+pdf(file.path(opt$out, "allQuerysACC_point.pdf"))
+print(g)
+dev.off()
+
+write_tsv(totalT, file.path(opt$out, paste0('totalT', "_", Sys.Date(), '.tsv')))
diff --git a/data_preprocess/0_run_seurat_pipeline/FindClusterJobs.sh b/data_preprocess/0_run_seurat_pipeline/FindClusterJobs.sh
@@ -0,0 +1,86 @@
+##!/usr/bin/env bash
+
+module load python/3.7.3-anaconda
+module load R/4.0.3
+
+mainscriptFolder=${1}
+inData=${2}
+currentFolder=${3}
+res=${4}
+reduction=${5}
+npc=${6}
+parentJobName=${7}
+toRunClustering=${8}
+toRunCommonAnalysis=${9}
+toRunCallBack=${10}
+callBackPath=${11}
+
+echo "FindClusterJobs parameters:
+mainscriptFolder=${mainscriptFolder}
+inData=${inData}
+currentFolder=${currentFolder}
+res=${res}
+reduction=${reduction}
+npc=${npc}
+parentJobName=${parentJobName}
+toRunClustering=${toRunClustering}
+toRunCommonAnalysis=${toRunCommonAnalysis}
+toRunCallBack=${toRunCallBack}
+callBackPath=${callBackPath}
+"
+
+runR="Rscript --no-save "
+
+###############################################################################
+#'                              Run Find Cluster                             '#
+###############################################################################
+if [ $toRunClustering = "YES" ]; then
+    ${runR} ${HOME}/configs/public/pipeline/UMAP_CLUSTER_JOBS_EMBEDED/FindCluster.R -d ${inData} -o ${currentFolder}/cluster.rds -r ${reduction} -n ${npc} -e ${res}
+fi
+
+
+
+###############################################################################
+#'                             Run Common Analysis                           '#
+###############################################################################
+srcD=${HOME}/configs/public/src
+ResD=${currentFolder}
+paramD=${HOME}/configs/public/params
+databaseD=${HOME}/configs/public/knowledge/database
+
+if [ $toRunCommonAnalysis = "YES" ]; then
+
+    if [ ! -d ${ResD}/bubbleplot ]; then
+        mkdir -p ${ResD}/bubbleplot
+    fi
+
+    ${runR} ${srcD}/bubble-plot.R -d ${ResD}/cluster.rds -o ${ResD}/bubbleplot -m ${databaseD}/TMarkers.txt
+    ${runR} ${srcD}/bubble-plot.R -d ${ResD}/cluster.rds -o ${ResD}/bubbleplot -m ${databaseD}/topLevel.txt
+    ${runR} ${srcD}/bubble-plot.R -d ${ResD}/cluster.rds -o ${ResD}/bubbleplot -m ${databaseD}/general/generalAll.txt
+
+    if [ ! -d ${ResD}/featureplot ]; then
+        mkdir -p ${ResD}/featureplot
+    fi
+
+    # ${runR} ${srcD}/feature-plot.R -d ${ResD}/cluster.rds -o ${ResD}/featureplot/topLevel -c ${paramD}/feature-plot-origin.json -m ${databaseD}/topLevel.txt
+    # ${runR} ${srcD}/feature-plot.R -d ${ResD}/cluster.rds -o ${ResD}/featureplot/tmarkers -c ${paramD}/feature-plot-origin.json -m ${databaseD}/TMarkers.txt
+
+    ${runR} ${srcD}/qc-by-cluster.R -d ${ResD}/cluster.rds -o ${ResD}/qc-by-cluster.pdf
+    ${runR} ${srcD}/visualize.R -d ${ResD}/cluster.rds
+    # ${runR} ${srcD}/visualize_batch.R -d ${ResD}/cluster.rds
+    # ${runR} ${srcD}/findmarker.r -d ${ResD}/cluster.rds -o ${ResD}/snn-single-markers.tsv
+
+    # ${runR} ${srcD}/snn-marker.R -d ${ResD}/cluster.rds -o ${ResD}/snn-markers.tsv -c ${paramD}/snn-marker.json
+    # ${runR} ${srcD}/snn-heatmap.R -d ${ResD}/cluster.rds -o ${ResD}/markersHeatmap.pdf -c ${paramD}/snn-heatmap.json -m ${ResD}/snn-markers.tsv -p heatmap
+    # python ${srcD}/statMarkers.py --markersTop ${ResD}/snn-markers.tsv --markersDatabase ${databaseD}/immuneCellMarkerAllinBox_Yanshuo.txt --out ${ResD}/markers.ys_celltype.tsv
+    # python ${srcD}/statMarkers.py --markersTop ${ResD}/markers.top.tsv --markersDatabase ${databaseD}/immuneCellMarkerAllinBox_Yanshuo.txt --out ${ResD}/markers.top.ys_celltype.tsv
+fi
+
+###############################################################################
+#'                             Run Callback Script                           '#
+###############################################################################
+if [ $toRunCallBack = "YES" ]; then
+    if [ -f $callBackPath ]; then
+        $callBackPath ${ResD}/cluster.rds
+    fi
+fi