-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcell-cycle.Rmd
163 lines (122 loc) · 4.7 KB
/
cell-cycle.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
---
title: "Cell cycle gene"
author: "Po-Yuan Tung"
date: 2018-02-28
output: html_document
---
<!-- The file analysis/chunks.R contains chunks that define default settings
shared across the workflowr files. -->
```{r read-chunk, include=FALSE, cache=FALSE}
knitr::read_chunk("chunks.R")
```
<!-- Update knitr chunk options -->
```{r knitr-opts-chunk, include=FALSE}
```
<!-- Insert the date the file was last updated -->
```{r last-updated, echo=FALSE, results='asis'}
```
<!-- Insert the code version (Git commit SHA1) if Git repository exists and R
package git2r is installed -->
```{r code-version, echo=FALSE, results='asis'}
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Setup
```{r packages, message=FALSE}
library("dplyr")
library("edgeR")
library("knitr")
library("scde")
source("../code/functions.R")
library("Biobase") # has to be loaded last to use `combine`
```
Import data.
```{r import}
eset <- readRDS("../data/eset.rds")
dim(eset)
```
Keep human genes and ERCC
```{r human-genes}
eset <- eset[fData(eset)$source %in% c("H. sapiens", "ERCC") , ]
dim(eset)
```
Only keep high-quality single cells.
```{r quality-cell}
quality <- read.table("../data/quality-single-cells.txt", stringsAsFactors = FALSE)
colnames(quality) <- c("sample", "quality")
eset <- eset[, quality$quality]
dim(eset)
```
Remove zeros.
```{r remove-zeros}
eset <- eset[rowSums(exprs(eset)) != 0, ]
dim(eset)
```
Convert to log2 counts per million.
```{r hist}
log2cpm <- cpm(exprs(eset), log = TRUE)
dim(log2cpm)
```
Filter for cell cycle gene
```{r filter}
## input cell cycle gene Gene sets reflecting 5 cell cycle phases were taken from Table S2 of Macosko2015 Gene ID conversion was done by using the DAVID http://david.abcc.ncifcrf.gov
cell_cycle_genes <- read.table("../data/cellcyclegenes.txt", header = TRUE, sep="\t")
## keep only cell cycle gene
log2cpm_cycle <- log2cpm[rownames(log2cpm) %in% unlist(cell_cycle_genes),]
dim(log2cpm_cycle)
heatmap(log2cpm_cycle)
```
## Pagoda
```{r varinfo}
# We will set the associated weights to 1 for all observations in all cells and calculated variances for each gene
# Set all weights to 1
N <- nrow(log2cpm_cycle)
M <- ncol(log2cpm_cycle)
log2cpm_cycle_w <- matrix(1, N, M)
rownames(log2cpm_cycle_w) <- rownames(log2cpm_cycle)
colnames(log2cpm_cycle_w) <- colnames(log2cpm_cycle)
# Regular variance since equal weights anyway
log2cpm_cycle_var <- apply(log2cpm_cycle, 1, var)
# Create varinfo object to pipe into PAGODA
varinfo <- list('mat' = log2cpm_cycle, 'matw' = log2cpm_cycle_w, 'arv' = log2cpm_cycle_var)
```
```{r create-go}
library(biomaRt)
library(GO.db)
# Initialize the connection to the Ensembl BioMart Service
# Available datasets can be listed with
# listDatasets(useMart("ENSEMBL_MART_ENSEMBL", host = "feb2014.archive.ensembl.org"))
# Use mmusculus_gene_ensembl for mouse
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "feb2014.archive.ensembl.org")
# Constructs a dataframe with two columns: hgnc_symbol and go_id
# If rownames are Ensembl IDs, use ensembl_gene_id as filter value
go <- getBM(attributes = c("ensembl_gene_id", "go_id"), filters = "ensembl_gene_id", values = rownames(log2cpm_cycle), mart = ensembl)
# Use the GO.db library to add a column with the GO-term to the dataframe
go$term <- Term(go$go_id)
# Create a named list of character vectors out of the df
s = split(go$ensembl_gene_id, paste(go$go_id,go$term))
# Saves the list as a R environment
go.env <- list2env(s)
# Test
class(go.env)
```
```{r pagoda}
# Run PAGODA with generated data
pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, batch.center = FALSE)
df <- pagoda.top.aspects(pwpca, return.table = TRUE, plot = TRUE, z.score = 1.96)
df$name
# get full info on the top aspects
tam <- pagoda.top.aspects(pwpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))
# determine overall cell clustering
hc <- pagoda.cluster.cells(tam, varinfo)
# combine pathways that are driven by the same sets of genes
tamr <- pagoda.reduce.loading.redundancy(tam, pwpca)
# combine aspects that show similar patterns
tamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, cell.clustering = hc, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)
# view the top aspects, clustering them by pattern similarity
col.cols <- rbind(groups = cutree(hc, 5))
pagoda.view.aspects(tamr2, cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20), col.cols = col.cols)
# get cell cycle signature and view the top genes
pagoda.show.pathways(c("GO:0007049 cell cycle","GO:0051301 cell division"), varinfo, go.env, cell.clustering = hc, margins = c(1,5), show.cell.dendrogram = TRUE, showRowLabels = TRUE, showPC = TRUE)
```