first commit

hermanzhaozzzz · Feb 23, 2022 · 3572d20 · 3572d20
commit 3572d20
Show file tree

Hide file tree

Showing 31 changed files with 164,642 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,27 @@
+/public/
+/.ipynb_checkpoints/
+/.deploy_git/
+/.texpadtmp/
+/.Trash/
+/.snakemake/
+/tmpdir_*/
+sna*.err
+sna*.out
+job.srp*
+.DS_Store
+.Rhistory
+.localized
+slurm-*.out∑/public/
+/.ipynb_checkpoints/
+/.deploy_git/
+/.texpadtmp/
+/.Trash/
+/.snakemake/
+/tmpdir_*/
+sna*.err
+sna*.out
+job.srp*
+.DS_Store
+.Rhistory
+.localized
+slurm-*.out
diff --git a/000_Dispersion_plot.pdf b/000_Dispersion_plot.pdf
diff --git a/001_expression_value_compare.pdf b/001_expression_value_compare.pdf
diff --git a/002_expression_distribution_info_hist.pdf b/002_expression_distribution_info_hist.pdf
diff --git a/003_correlation_cross_samples_heatmap.pdf b/003_correlation_cross_samples_heatmap.pdf
diff --git a/004_DESeq2_MA_plot.pdf b/004_DESeq2_MA_plot.pdf
diff --git a/005_nbinomTest_p_value.pdf b/005_nbinomTest_p_value.pdf
diff --git a/006_PCAplot.pdf b/006_PCAplot.pdf
diff --git a/007_jitter_plot_for_min_pvalue_transcript_counts.pdf b/007_jitter_plot_for_min_pvalue_transcript_counts.pdf
diff --git a/008_select_first_ordered_1000_transcriptsOrGenes_plot_heatmap.pdf b/008_select_first_ordered_1000_transcriptsOrGenes_plot_heatmap.pdf
diff --git a/009_select_first_ordered_50_transcriptsOrGenes_plot_heatmap.pdf b/009_select_first_ordered_50_transcriptsOrGenes_plot_heatmap.pdf
diff --git a/010_select_differential_expression_genes_heatmap.pdf b/010_select_differential_expression_genes_heatmap.pdf
diff --git a/011_select_differential_expression_genes_valcano_plot.pdf b/011_select_differential_expression_genes_valcano_plot.pdf
diff --git a/012_DEG_output_annotate_20210202180437.txt b/012_DEG_output_annotate_20210202180437.txt
diff --git a/013_enrichment_KEGG-and-GO_output_identify_20210202180721.txt b/013_enrichment_KEGG-and-GO_output_identify_20210202180721.txt
diff --git a/014.enrich_circular_network.png b/014.enrich_circular_network.png
diff --git a/015.enrich_barplot.png b/015.enrich_barplot.png
diff --git a/016.enrich_bubble_plot.png b/016.enrich_bubble_plot.png
diff --git a/README.md b/README.md
@@ -0,0 +1,9 @@
+1. https://github.com/hermanzhaozzzz/snakepipes_fastqc-multiqc
+先跑质控流程，主要是看看需不需要trim 5‘ 端的不稳定序列和3'端质量不够高的reads
+2. 跑mapping流程，如果是star mapping，就用这个流程即可
+3. 执行Snakefile进行 cutadapt、Star mapping、bam sort等处理
+4. 后续1，可以选择GATK SNP calling和差异表达分析
+5. 后续2，差异表达基因
+    - 可以使用Snakefile.fc.deseq2.py计算差异表达基因的featureCounts
+    - 并且使用format_all.featureCounts.form.ipynb整理表格数据
+    - 最后使用DESeq2_plots.ipynb绘图
diff --git a/Snakefile.cuffdiff.py b/Snakefile.cuffdiff.py
@@ -0,0 +1,62 @@
+CUFFDIFF = "/home/zhaohuanan/anaconda3/envs/snakepipes_cutadapt-STARmapping-FPKM-sortBAM/bin/cuffdiff"
+HG38_FA = "/home/zhaohuanan/zhaohn_HD/2.database/bwa_hg38/hg38_only_chromosome.fa"
+HG38_GTF = "/home/zhaohuanan/zhaohn_HD/2.database/annotate_hg38/20200714_ComprehensiveGeneAnnotation_Chr_gencode.v29.annotation.gtf"
+
+# --------------------------------------------------------------->>>>>>>
+# vars
+# --------------------------------------------------------------->>>>>>>
+SAMPLES = [
+    'BE4-0706-rep1'
+]
+CTRL = [
+    '2Mock-1_combined'
+]
+
+# SAMPLES = [
+#     '2Mock-1_combined',# 前面试bg后面是exp
+#     '2BE4-All-1_combined',
+#     '2Vector-1_combined',
+#     'BE3-1_combined',
+#     'BE3-2_combined',
+#     'BE4-1_combined',
+#     'BE4-2_combined',
+#     'EM-1_combined',
+#     'EM-2_combined',
+#     'BE4-0706-rep1',
+#     'M2-0706-rep1'
+# ]
+
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# rule all
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule all:
+    input:
+        expand("../bam/293T-RNASeq-{sample}_Aligned_sort.bam",sample=SAMPLES),
+        expand("../bam/293T-RNASeq-{ctrl}_Aligned_sort.bam",ctrl=CTRL),
+        expand("../cuffdiff/{sample}_vs_{ctrl}",sample=SAMPLES,ctrl=CTRL)
+
+
+rule cuffdiff:
+    # 摘要：如何解决cuffdiff运行结果中表达量为0的情况？ cuffdiff -o cdiffout -b ref.fasta -u -p 15 --library-type fr-firststrand \ -L H_3,O_3,F_3 cuffmergeout/merged.gtf \ tophato 阅读全文
+    input:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort.bam",
+        "../bam/293T-RNASeq-{ctrl}_Aligned_sort.bam"
+    output:
+        directory("../cuffdiff/{sample}_vs_{ctrl}")
+#     params:
+#         l1 = '{sample}',
+#         l2 = '{ctrl}'
+    log:
+        "../cuffdiff/{sample}_vs_{ctrl}.log"
+    shell:
+        """
+        {CUFFDIFF} \
+        -o {output} \
+        -b {HG38_FA} \
+        -p 24 \
+        --FDR 0.01 \
+        -u {HG38_GTF} \
+        {input[0]} \
+        {input[1]} \
+        --no-update-check 2>{log}
+        """
diff --git a/Snakefile_main.py b/Snakefile_main.py
@@ -0,0 +1,214 @@
+# _*_ coding: UTF-8 _*_
+
+########################################################################
+# ZHAO Huanan
+# 2021-02-01
+# pcif1 RNA-Seq analysis pipeline
+######################################################################## 
+# before this, make sure you have done fastqc + multiqc 
+# https://github.com/hermanzhaozzzz/snakepipes_fastqc-multiqc
+# and add trim rule to make a better trim
+# --------------------------------------------------------------->>>>>>>
+# pipeline
+# --------------------------------------------------------------->>>>>>>
+# 1. cutadapt
+# 2. STAR alingment 
+# 3. samtools addreplacerg - > RG
+# 4. samtools sort by position
+# 5. picard mark duplicates
+# 6. samtools build bam index
+# --------------------------------------------------------------->>>>>>>
+# software
+# --------------------------------------------------------------->>>>>>>
+# make sure the fastqqc and the multiqc are in you PATH
+# get the application path
+with os.popen("which cutadapt") as path:
+    CUTADAPT = path.read().strip()
+    print('PATH cutadapt:', CUTADAPT)
+with os.popen("which samtools") as path:
+    SAMTOOLS = path.read().strip()
+    print('PATH samtools:', SAMTOOLS)    
+with os.popen("which STAR") as path:
+    STAR = path.read().strip()
+    print('PATH star:', STAR)   
+with os.popen("which java") as path:
+    JAVA = path.read().strip()
+    print('PATH java:', JAVA)
+
+# picard version 2.23
+PICARD = "/home/zhaohuanan/zhaohn_HD/1.apps/picard/picard.jar"
+
+
+# --------------------------------------------------------------->>>>>>>
+# index and files
+# --------------------------------------------------------------->>>>>>>
+HG38_FA = "/home/zhaohuanan/zhaohn_HD/2.database/bwa_hg38/hg38_only_chromosome.fa"
+HG38_FA_DICT = "/home/zhaohuanan/zhaohn_HD/2.database/bwa_hg38/hg38_only_chromosome.fa"
+STAR_HG38_INDEX = "/home/zhaohuanan/zhaohn_HD/2.database/star_hg38"
+HG38_GTF = "/home/zhaohuanan/zhaohn_HD/2.database/annotate_hg38/20200714_ComprehensiveGeneAnnotation_Chr_gencode.v29.annotation.gtf"
+
+# 这里是小鼠的, 没改变量名
+# HG38_FA = "/home/zhaohuanan/zhaohn_HD/2.database/db_genomes/genome_fa/genome_gencode_GRCm38.p6/GRCm38.p6.genome.fa"
+# HG38_FA_DICT = "/home/zhaohuanan/zhaohn_HD/2.database/db_genomes/genome_fa/genome_gencode_GRCm38.p6/GRCm38.p6.genome.fa"
+# STAR_HG38_INDEX = "/home/zhaohuanan/zhaohn_HD/2.database/db_genomes/genome_fa/genome_gencode_GRCm38.p6/star_index_150bp"
+# HG38_GTF = "/home/zhaohuanan/zhaohn_HD/2.database/db_genomes/genome_annotation/genome_gencode_GRCm38.p6/gencode.vM25.annotation.gtf"
+
+
+# --------------------------------------------------------------->>>>>>>
+# vars
+# --------------------------------------------------------------->>>>>>>
+SAMPLES = [
+    "pcif1-WT-1",
+    "pcif1-WT-2",
+    "pcif1-WT-3",
+    "pcif1-WT-4",
+    "pcif1-KO-1",
+    "pcif1-KO-2",
+    "pcif1-KO-3",
+    "pcif1-KO-4"
+#    "test"
+]
+
+
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# rule all
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule all:
+    input:
+        expand("../fastq/{sample}_R1.fq.gz",sample=SAMPLES),
+        expand("../fastq/{sample}_R2.fq.gz",sample=SAMPLES),
+        expand("../fix.fastq/293T-RNASeq-{sample}_R1_cutadapt.fq.gz",sample=SAMPLES),
+        expand("../fix.fastq/293T-RNASeq-{sample}_R2_cutadapt.fq.gz",sample=SAMPLES),
+        expand("../bam/293T-RNASeq-{sample}_Aligned.out.bam",sample=SAMPLES),
+        expand("../bam/293T-RNASeq-{sample}_Aligned.out.fix_RG.bam",sample=SAMPLES),
+        expand("../bam/293T-RNASeq-{sample}_Aligned_sort.bam",sample=SAMPLES),
+        expand("../bam/293T-RNASeq-{sample}_Aligned_sort_MarkDup.bam",sample=SAMPLES),
+        expand("../bam/293T-RNASeq-{sample}_Aligned_sort.bam.bai",sample=SAMPLES),
+#         expand("../fpkm/{sample}",sample=SAMPLES)
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# cutadapter
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule TruSeq_cutadapt:
+    input:
+        "../fix.fastq/293T-RNASeq-{sample}_R1.fastq.gz",
+        "../fix.fastq/293T-RNASeq-{sample}_R2.fastq.gz"
+    output:
+        "../fix.fastq/293T-RNASeq-{sample}_R1_cutadapt.fq.gz",
+        "../fix.fastq/293T-RNASeq-{sample}_R2_cutadapt.fq.gz"
+    log:
+        "../fix.fastq/293T-RNASeq-{sample}_cutadapt.log"
+    shell:# using illumina universal adaptor
+        """
+        srun -T 24 -c 24 \
+        {CUTADAPT} -j 24 --times 1  -e 0.1  -O 3  --quality-cutoff 25 \
+        -m 55 -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
+        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
+        -o {output[0]} -p {output[1]} {input[0]} {input[1]} > {log} 2>&1
+        """
+# rule NexteraSeq_cutadapt:
+#     input:
+#         "../fastq/{sample}_R1.fq.gz",
+#         "../fastq/{sample}_R2.fq.gz"
+#     output:
+#         "../fix.fastq/293T-RNASeq-{sample}_R1_cutadapt.fq.gz",
+#         "../fix.fastq/293T-RNASeq-{sample}_R2_cutadapt.fq.gz"
+#     log:
+#         "../fix.fastq/293T-RNASeq-{sample}_cutadapt.log"
+#     shell:# using illumina NexteraSeq adaptor
+#         """
+#         {CUTADAPT} -j 24 --times 1  -e 0.1  -O 3  --quality-cutoff 25 \
+#         -m 55 -a TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG \
+#         -A GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG \
+#         -o {output[0]} -p {output[1]} {input[0]} {input[1]} > {log} 2>&1
+#         """
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# STAR mapping
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule STAR_mapping:
+    input:
+        fq1 = "../fix.fastq/293T-RNASeq-{sample}_R1_cutadapt.fq.gz",
+        fq2 = "../fix.fastq/293T-RNASeq-{sample}_R2_cutadapt.fq.gz"
+    output:
+        "../bam/293T-RNASeq-{sample}_Aligned.out.bam"
+    log:
+        "../bam/293T-RNASeq-{sample}_Aligned.out.log"
+    params:
+        "../bam/293T-RNASeq-{sample}_"
+    shell:
+        """
+        {STAR} \
+        --genomeDir {STAR_HG38_INDEX} \
+        --runThreadN 24 \
+        --readFilesIn {input.fq1} {input.fq2} \
+        --readFilesCommand zcat \
+        --outFileNamePrefix {params} \
+        --outSAMtype BAM Unsorted \
+        --outSAMstrandField intronMotif \
+        --outFilterIntronMotifs RemoveNoncanonical > {log} 2>&1
+        """
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# add @RG tag (mostly for GATK SNP/SNV calling)
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule add_RG_tag:
+    input:
+        "../bam/293T-RNASeq-{sample}_Aligned.out.bam"
+    output:
+        "../bam/293T-RNASeq-{sample}_Aligned.out.fix_RG.bam"
+    params:
+        tag = "'@RG\\tID:{sample}\\tSM:{sample}\\tPL:ILLUMINA'"
+    shell:
+        """
+        {SAMTOOLS} addreplacerg -r {params.tag} -@ 24 -O BAM -o {output} --reference {HG38_FA_DICT} {input}
+        """
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# samtools sort by position(not sort by name)
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule BAM_sort_by_position:
+    input:
+        "../bam/293T-RNASeq-{sample}_Aligned.out.fix_RG.bam"
+    output:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort.bam"
+    shell:
+        """
+        {SAMTOOLS} sort \
+        -O BAM \
+        -o {output} \
+        -T {output}.temp \
+        -@ 24  \
+        {input}
+        """
+#         -m 4G
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# picard mark duplicate
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule BAM_mark_duplicate:
+    input:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort.bam"
+    output:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort_MarkDup.bam",
+        "../bam/293T-RNASeq-{sample}_Aligned_sort_MarkDup.matrix"
+    log:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort_MarkDup.log"
+    shell:
+        """
+        {JAVA} -Xms100g -Xmx100g -XX:ParallelGCThreads=24 \
+        -jar {PICARD} MarkDuplicates \
+        I={input} \
+        O={output[0]} \
+        M={output[1]} \
+        ASO=coordinate 2>{log}
+        """
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# samtools build bam index
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule BAM_index:
+    input:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort.bam"
+    output:
+        "../bam/293T-RNASeq-{sample}_Aligned_sort.bam.bai"
+    shell:
+        """
+        {SAMTOOLS} index -@ 24 \
+        {input} \
+        {output}
+        """
diff --git a/Snakefile_raw-counts-for-deseq2.py b/Snakefile_raw-counts-for-deseq2.py
@@ -0,0 +1,63 @@
+with os.popen("which featureCounts") as path:
+    FEATURECOUNTS = path.read().strip()
+    print('PATH cutadapt:', FEATURECOUNTS)
+
+
+
+# human 
+# HG38_FA = "/home/zhaohuanan/zhaohn_HD/2.database/bwa_hg38/hg38_only_chromosome.fa"
+# HG38_GTF = "/home/zhaohuanan/zhaohn_HD/2.database/annotate_hg38/20200714_ComprehensiveGeneAnnotation_Chr_gencode.v29.annotation.gtf"
+# mouse
+HG38_FA = "/home/zhaohuanan/zhaohn_HD/2.database/db_genomes/genome_fa/genome_gencode_GRCm38.p6/GRCm38.p6.genome.fa"
+HG38_GTF = "/home/zhaohuanan/zhaohn_HD/2.database/db_genomes/genome_annotation/genome_gencode_GRCm38.p6/gencode.vM25.annotation.gtf"
+# --------------------------------------------------------------->>>>>>>
+# vars
+# --------------------------------------------------------------->>>>>>>
+
+
+# 前面WT后面是TREAT
+SAMPLES = [
+    "pcif1-WT-1",
+    "pcif1-WT-2",
+    "pcif1-WT-3",
+    "pcif1-WT-4",
+    "pcif1-KO-1",
+    "pcif1-KO-2",
+    "pcif1-KO-3",
+    "pcif1-KO-4"
+]
+
+
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+# rule all
+# ------------------------------------------------------------------------------------------>>>>>>>>>>
+rule all:
+    input:
+        '../featureCounts/all_feature.txt'
+
+rule featureCounts:
+    params:
+        BAM = ' '.join(expand("../bam/293T-RNASeq-{sample}_Aligned_sort.bam",sample=SAMPLES))
+    output:
+        '../featureCounts/all_feature.txt'
+    log:
+        '../featureCounts/run_FC.log'
+    shell:
+        """
+        {FEATURECOUNTS} \
+        -T 24 \
+        -p \
+        -t exon \
+        -g gene_id \
+        -a {HG38_GTF} \
+        -o {output} \
+        {params.BAM} 2>{log}
+        """
+
+# -T 使用的线程数
+# -p 如果是paird end 就用
+# -t 将exon作为一个feature
+# -g 将gene_id作为一个feature
+# -a 参考的gtf/gff
+# -o 输出文件
+# 最后加上bam文件，有几个就加几个