-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep1.RNAseq data analysis.r
182 lines (152 loc) · 7.22 KB
/
step1.RNAseq data analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
########################################################
#-------------------------------------------------------
# Topic: Li Yining Data Analysis -- Transcriptome + Metabolome
# Author: Wang Haiquan
# Date: Fri Jul 5 14:36:14 2024
# Mail: [email protected]
#-------------------------------------------------------
########################################################
library(TCseq)
library(ClusterGVis)
library(Biobase)
library(ggsci)
library(openxlsx)
#devtools::install_github("junjunlab/ClusterGVis")
setwd("/data/whq/myproject/others_work/lyn_20240705/")
#-------------------------------------------------------
# Chapter: Data Input
#-------------------------------------------------------
# Read data
lyn_count <- read.xlsx("RNAseq_countsandFPKM.xlsx") %>% filter(!duplicated(GeneSymbol)) %>%
column_to_rownames("GeneSymbol") %>%
dplyr::select(colnames(.) %>% .[str_detect(.,"^(v|([0-9]{1,}(h|d)))-[1-3]$")])
lyn_rpkm <- read.xlsx("RNAseq_countsandFPKM.xlsx") %>% filter(!duplicated(GeneSymbol)) %>%
column_to_rownames("GeneSymbol") %>%
dplyr::select(colnames(.) %>% .[str_detect(.,"^(v|([0-9]{1,}(h|d)))-[1-3]_FPKM")]) %>%
setNames(colnames(.) %>% str_remove("_FPKM"))
# Define groups
lyn_rna_group <- data.frame(row.names = colnames(lyn_count),
group=colnames(lyn_count) %>% str_remove("-[0-3]") %>% factor(.,levels=c("v","12h","1d","2d"),label=c("Veh","12hrs","24hrs","48hrs"))) %>%
.[order(.$group),,drop=F]
# Rearrange data, log transform RPKM
lyn_count <- lyn_count[,rownames(lyn_rna_group)] %>% .[rowSums(lyn_count)>5,]
lyn_rpkm <- lyn_rpkm[,rownames(lyn_rna_group)] %>% .[rownames(lyn_count),] %>% {log2(.+1)}
identical(colnames(lyn_rpkm),colnames(lyn_count))
identical(rownames(lyn_rpkm),rownames(lyn_count))
#-------------------------------------------------------
# Chapter: Quality Control
#-------------------------------------------------------
# Create DGElist object
lyn_DGElist <- lyn_count %>% DGEList(.,group = lyn_rna_group$group) %>% calcNormFactors()
# Quality control
lyn_rpkm %>% t() %>% prcomp(.,scale.=T) -> lyn_rpkm_pca
lyn_rpkm_pca %>% .$x %>% as.data.frame() %>%
ggplot(aes(x=PC1,y=PC2,color=lyn_rna_group$group))+
geom_point()+
geom_text_repel(aes(label=rownames(lyn_rna_group)))+
theme_prism()+
labs(x=paste("PC1(",summary(lyn_rpkm_pca) %>% .$importance %>% .[2,1]*100,"%)",sep = ""),
y=paste("PC2(",summary(lyn_rpkm_pca) %>% .$importance %>% .[2,2]*100,"%)",sep = ""))
pheatmap(cor(lyn_rpkm),
annotation_col = lyn_rna_group)
#-------------------------------------------------------
# Chapter: Trend Analysis
#-------------------------------------------------------
# Further filter genes
lyn_rpkm_mean <- apply(lyn_rpkm,1,function(x){
tapply(x,lyn_rna_group$group,mean)
}) %>% t() %>% .[rowSums(.)>1,]
#getClusters(exp = lyn_rpkm_mean)
lyn_rpkm_mean_cm <- clusterData(exp = as.matrix(lyn_rpkm_mean),
cluster.method = "mfuzz",
cluster.num = 8,
seed = 2024,
scaleData = T)
# Remove genes with membership lower than 0.4
lyn_rpkm_mean_cm$wide.res <- filter(lyn_rpkm_mean_cm$wide.res,membership>0.4)
lyn_rpkm_mean_cm$long.res <- filter(lyn_rpkm_mean_cm$long.res,membership>0.4) %>%
mutate(cluster_name=str_remove(cluster_name," \\(.{1,}\\)"))
visCluster(object = lyn_rpkm_mean_cm,
plot.type = "line",ncol = 4)
# Batch KEGG enrichment
lyn_rpkm_mean_cm_enrichkegg <- lyn_rpkm_mean_cm$wide.res %>%
mutate(cluster=paste0("C",cluster)) %>%
mutate(cluster=factor(cluster,levels=paste0("C",1:8))) %>%
split(.,.$cluster) %>% lapply(.,function(x){
mapIds(org.Mm.eg.db,x$gene,"ENTREZID","SYMBOL") %>%
enrichKEGG(.,"mmu","ncbi-geneid",pvalueCutoff=1) %>%
setReadable(org.Mm.eg.db,"ENTREZID")
})
# Select top 20 KEGG pathways
lyn_rpkm_mean_cm_enrichkegg_df <- lyn_rpkm_mean_cm_enrichkegg %>%
lapply(.,function(x){
x@result %>% .[1:20,]
}) %>% Reduce(rbind,.) %>%
mutate(cluster=rep(names(lyn_rpkm_mean_cm_enrichkegg),each=20)) %>%
dplyr::select("cluster",colnames(.)[colnames(.)!="cluster"])
# Save, download and highlight key information, select 5 terms to display for each cluster
#write.xlsx(lyn_rpkm_mean_cm_enrichkegg_df,"lyn_rpkm_mean_cm_enrichkegg_df_top10.xlsx")
# Construct candidate terms
lyn_rpkm_mean_cm_enrichkegg_selected <- read.xlsx("lyn_rpkm_mean_cm_enrichkegg_df_top10.xlsx",sheet = 3) %>%
dplyr::select(cluster,Description) %>%
mutate(Description=str_remove(Description," - Mus musculus \\(house mouse\\)")) %>%
setNames(c("id","term"))
# Create heatmap
pdf('lyn_rpkm_mean_cm_heatmap.pdf',height = 8,width = 10)
visCluster(object = lyn_rpkm_mean_cm,
plot.type = "both",
show_row_dend = F,line.side = "left",
column_names_rot = 0,
annoTerm.data =lyn_rpkm_mean_cm_enrichkegg_selected,
ctAnno.col = ggsci::pal_aaas()(8),
go.col = rep(ggsci::pal_aaas()(8),each=5),
sample.col = c("Veh"="#66c2a5", "12hrs"="#fc8d62", "24hrs"="#8da0cb", "48hrs"="#e78ac3")
)
dev.off()
#-------------------------------------------------------
# Chapter: Expression of Some Genes
#-------------------------------------------------------
lyn_rpkm[c("Dvl3","Fzd9","Wnt9a","Yap1",
"Gsk3b","Smad2","Smad3","Smad4"),] %>%
mutate(gene=rownames(.)) %>%
melt(id.vars="gene") %>%
merge(.,lyn_rna_group %>% mutate(sample=rownames(.)),by.x="variable",by.y="sample",all.x=T) %>%
setNames(c("sample","gene","rpkm","group")) %>%
ggplot(aes(x=group,y=rpkm,fill=group))+
geom_boxplot()+
stat_compare_means(comparisons = list(c("Veh","12hrs"),c("12hrs","24hrs"),c("12hrs","48hrs")),method = "t.test")+
scale_fill_manual(values = c("Veh"="#66c2a5", "12hrs"="#fc8d62", "24hrs"="#8da0cb", "48hrs"="#e78ac3"))+
facet_wrap(~gene,scales = "free",ncol = 4)+
theme_bw()+NoLegend()+
labs(x="",y="log2(FPKM+1)")
lyn_rpkm[c("Acaca","Acly","Acss2","Fasn"),] %>%
mutate(gene=rownames(.)) %>%
melt(id.vars="gene") %>%
merge(.,lyn_rna_group %>% mutate(sample=rownames(.)),by.x="variable",by.y="sample",all.x=T) %>%
setNames(c("sample","gene","rpkm","group")) %>%
ggplot(aes(x=group,y=rpkm,fill=group))+
geom_boxplot()+
stat_compare_means(comparisons = list(c("Veh","12hrs"),c("Veh","24hrs"),c("Veh","48hrs")),method = "t.test")+
scale_fill_manual(values = c("Veh"="#66c2a5", "12hrs"="#fc8d62", "24hrs"="#8da0cb", "48hrs"="#e78ac3"))+
facet_wrap(~gene,scales = "free",ncol = 4)+
theme_bw()+NoLegend()+
labs(x="",y="log2(FPKM+1)")
#-------------------------------------------------------
# Chapter: Final Processing and Saving of Results
#-------------------------------------------------------
# Check variable sizes
require(dplyr)
require(stringr)
my_obj_in_env <- sapply(ls(),function(x){
a=eval(expr = parse(text = x)) %>% object.size()
b=a/1024^3
b %>% round(.,digits = 3) %>% as.numeric()}) %>%
data.frame(object=names(.),size_GB=.,row.names = NULL) %>% arrange(-size_GB)
my_obj_in_env[1:10,]
# Delete unnecessary variables
#rm()
# Save results
res_dir <- "/data/whq/myproject/others_work/lyn_20240705/RDS/"
image.name <- ""
save.image(paste(res_dir,image.name,sep = ""))
lapply(ls(),function(x){filename=paste(res_dir,x,".rds",sep = "");saveRDS(eval(parse(text = x)),filename)})