forked from annebozack/EBC_summer_2022
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmeth04_analyze_data.R
executable file
·208 lines (162 loc) · 7.48 KB
/
meth04_analyze_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
.libPaths( c('.Rpackages',.libPaths() ) )
#'# Analyze methylation data
#' Using data preprocessed in our script:
#' meth01_process_data.R
#' we have a processed dataset with 30 samples (otherwise we run script 01)
if(!file.exists("data/processed.rda")){
source("code/meth01_process_data.R")
}
# load the data
load("data/processed.rda")
#' load packages
options(warn=-1)
suppressPackageStartupMessages({
library(DMRcate) # for regional analysis
library(magrittr)
library(CpGassoc) # for running association analysis between methylation levels values and phenotype of interest
library(data.table) # for fast aggregation of large data
library(qqman) # for visualization of data
library(stringi) # string manipulation
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
})
options(warn=0)
#' Code categorical variable as factors
pheno$smoker %<>% factor
pheno$sex %<>% factor
## Cleaning up the methylation data
#' Filters a matrix of beta values by distance to single nucleotide polymorphism (SNP) and SNPs with the minor allele frequency (MAF) of 5% (rare variant).
#' Also removes crosshybridising probes and sex-chromosome probes.
dim(beta)
betas.clean = beta[manifest[probe_type=="cg" & !chr %in% c("X","Y")]$index,]
nCpG = dim(betas.clean)[1]
nCpG
#'# Running an Epigenome Wide Association
#' Here we run an EWAS on Smoking status (as a predictor of methylation)
#' First we can run a linear regression on a single CpG that we have already picked
CpG.name = "cg05575921"
pheno$CpG.level <- betas.clean[CpG.name,]
#' Difference in methylation between smokers and non-smokers for this CpG
#' some descriptive statistics
pheno[,.(
Min = min (CpG.level) %>% round(3)
,Mean = mean (CpG.level) %>% round(3)
,Median = median(CpG.level) %>% round(3)
,Max = max (CpG.level) %>% round(3)
,SD = sd (CpG.level) %>% round(3)
,.N),by=smoker] %>% knitr::kable(.)
#' Difference in beta methylation values between Smokers and non smokers
boxplot(CpG.level ~ smoker,pheno,main=paste0("Beta-values\n", CpG.name), col=c("blue","red"),ylim=c(.3,1))
#' Linear regression on betas
lm(CpG.level ~ smoker,data=pheno) %>% summary %>% coef
#' Comparison with m-values
pheno[,CpG.mlevel:=log2(CpG.level/(1-CpG.level))]
pheno[,.(
Min = min (CpG.mlevel) %>% round(3)
,Mean = mean (CpG.mlevel) %>% round(3)
,Median = median(CpG.mlevel) %>% round(3)
,Max = max (CpG.mlevel) %>% round(3)
,SD = sd (CpG.mlevel) %>% round(3)
,.N),by=smoker] %>% knitr::kable(.)
par(mfrow=c(1,2))
boxplot(CpG.level ~ smoker,data=pheno,main=paste0("Beta-values\n",CpG.name), col=c("blue","red"))
boxplot(CpG.mlevel ~ smoker,data=pheno,main=paste0("M-values\n" ,CpG.name), col=c("blue","red"))
#' linear regression on m-values
lm(CpG.mlevel ~ smoker,data=pheno) %>% summary %>% coef
#' We can always extract measures of the relative quality of statistical models - e.g. adjusted R2 - to look at model performance
#' model on betas
lm(CpG.level ~ smoker,data=pheno) %>% summary %$% adj.r.squared
#' model on mvalues
lm(CpG.mlevel ~ smoker,data=pheno) %>% summary %$% adj.r.squared
#'## EWAS and results using CpGassoc
#'see [Barfield et al. Bioinformatics 2012](http://www.ncbi.nlm.nih.gov/pubmed/22451269)
#' Smoking as predictor
#' note that CpGassoc is quite fast for running almost half million regressions!
pheno[,smoke_dummy:=ifelse(smoker=="smoker",1,0)]
system.time(results1 <- cpg.assoc(betas.clean, pheno$smoke_dummy,fdr.cutoff=0.1))
#' there are several components of the results
class(results1)
names(results1)
#' look at a few results
#' here effect size is ~ mean difference in methylation proportion
head(cbind(results1$coefficients[,4:5], P.value=results1$results[,3]))
#' and the top hits
head(cbind(results1$coefficients[,4:5], P.value=results1$results[,3])[order(results1$results[,3]),])
#' check with previous result on our selected CpG (running lm without CpGassoc)
cbind(results1$coefficients[,4:5],results1$results[,c(1,3)])[CpG.name,]
summary(lm(CpG.level~smoker,pheno))
#' Bonferroni significant hits
table(results1$results[,3] < 0.05/(nCpG))
#' FDR significant hits
table(results1$results[,5] < 0.05)
#' EWAS with adjustment for cell types
#' now we can run the linear regression on betas adjusting for cell proportions
#' Need sex as indicator in covariate matrix
#'
#'
#'
results2 = cpg.assoc(
betas.clean
,pheno$smoke_dummy
,covariates=as.data.frame(pheno[,.(sex,CD8,CD4,NK,B,MO,GR)])
)
print(results2)
#'using mvalues
#' Use of M-values reduces heteroscedasticity to meet linear model assumptions, see [Du P, et al. BMC Bioinformatics. 2010](https://pubmed.ncbi.nlm.nih.gov/21118553/).
results3 <- cpg.assoc(
betas.clean
,pheno$smoke_dummy
,covariates=as.data.frame(pheno[,.(sex,CD8,CD4,NK,B,MO,GR)])
,logit.transform=TRUE
)
print(results3)
#' ## Genomic inflation in EWAS
#' qqplot and lambda interpretation
#+ fig.width=13, fig.height=7, dpi=300
par(mfrow=c(1,1))
plot(results1, main="QQ plot for association between methylation and Smoking\nadjusted for cell proportions")
plot(results2, main="QQ plot for association between (mvals) methylation and Smoking\nadjusted for cell proportions")
#' Lambda - this is a summary measure of genomic inflation
#' ratio of observed vs expected median p-value - is there early departure of the qqline
#' estimated at -log10(median=0.5) ~ 0.3 on the x-axis of a qqplot
lambda <- function(p) median(qchisq(p, df=1, lower.tail=FALSE), na.rm=TRUE) / qchisq(0.5, df=1)
#' Lambda before cell type adjustment
lambda(results1$results[,3])
#' Lambda after cell type adjustment
lambda(results2$results[,3])
datamanhat = cbind(results2$results,results2$coefficients)
setDT(datamanhat)
datamanhat = datamanhat[,.(probe_id=CPG.Labels,effect.size,std.error,P.value)]
datamanhat = merge(datamanhat,manifest[,.(probe_id,chr,mapinfo)],by="probe_id")
#' See where the top hits are
datamanhat[order(P.value)][1:7]
#' Volcano Plot-results2
#' Bonferroni threshold
#'
plot(-log10(P.value) ~ effect.size,data=datamanhat,xlab="Estimate",ylab="-log10(p-value)",main="Volcano Plot\nadjusted for cell proportions",ylim=c(0,8))
abline(h = -log10(0.05/(nCpG)), lty=1, col="#FDE725FF", lwd=2)
#'## Manhattan plot for cell-type adjusted EWAS
#' Cast the variable chr (so we can simplify and use a numeric x-axis)
datamanhat[,chr:=as.integer(chr)]
qqman::manhattan(datamanhat,chr="chr",bp="mapinfo",p="P.value",snp="probe_id"
,suggestiveline=FALSE, genomewideline = -log10(0.05/(nCpG)),ylim=c(0,8)
,main = "Manhattan Plot \n adjusted for cell proportions")
#'# Introduction to limma
#' see [Smyth GK. Stat Appl Genet Mol Biol 2004](https://www.ncbi.nlm.nih.gov/pubmed/16646809).
suppressMessages(library(limma,minfi))
#' First we need to define a model
model <- model.matrix( ~smoker+sex+CD4+CD8+NK+B+MO+GR,data=pheno)
EWAS.limma <- eBayes(lmFit(betas.clean, design=model))
Top<-topTable(EWAS.limma, coef=2, number=Inf, sort.by="p")[1:10,]
Top
#' Bind results with annotation
Annot<-as.data.frame(getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19))
Annot.Tops<- Annot[match(rownames(Top),Annot$Name),]
Annot.Tops<-Annot.Tops[,c("UCSC_RefGene_Name","UCSC_RefGene_Group","Relation_to_Island","chr","pos")]
Top<-cbind(Top[,1:5], Annot.Tops)
#' Order by chr and chromosomal position
Top$chr = as.numeric(gsub("chr", "", Top$chr))
Top[order(Top$chr,Top$pos),c(1,6,7,8,9,10)]
#' cleanup
rm(nCpG,CpG.name,datamanhat,lambda,results1,results2,results3,Annot.Tops,EWAS.limma,Top)
gc()
#' End of script 04