Skip to content

Commit 97ef1b2

Browse files
authoredDec 18, 2024··
Revert "Add exon-only gri filter script"
1 parent ad07cc9 commit 97ef1b2

File tree

3 files changed

+15
-332
lines changed

3 files changed

+15
-332
lines changed
 

‎utils/GRI_bed_filter/README.md

+7-17
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,13 @@
11
# Genomic Region of Interest filter
2-
32
\
43
\
54

6-
##### Data files for generation:
7-
8-
1. Gencode v46 gene annotation\
9-
2. HGMD 2022 database\
10-
3. Clinvar 8/6/2024 database\
11-
4. SpliceAI v1.3 prediction data\
12-
13-
##### BED file description:
14-
15-
1. Version 1 --- gene only: BED \<--- protein_coding gene + HGMD mutations (50 flanking bp) + Cinvar mutations (50 flanking bp)\
16-
(Note: Promoter upstream 1kb is dropped as requested by Dr.Liu)\
17-
2. Version 2 --- exon only: BED \<--- protein_coding gene + HGMD mutations (50 flanking bp) + Cinvar mutations (50 flanking bp) + SpliceAI predicted positions (50 flanking bp)\
18-
3. Additionally, initial bed files which also contain a source column can be generated, that indicates where each position entry comes from (gencode, hgmd, clinvar, or spliceAI)
5+
##### Data files for generation: \
6+
1. Gencode v46 gene annotation \
7+
2. HGMD 2022 database \
8+
3. Clinvar 8/6/2024 database \
199

20-
##### BED file format:
10+
##### BED file description: \
11+
1. Version 1: BED \<--- protein_coding gene + HGMD mutations (50 flanking bp) + Cinvar mutations (50 flanking bp) \
12+
(Note: Promoter upstream 1kb is dropped as requested by Dr.Liu)
2113

22-
1. chr \| start \| end\
23-
2. The 1st column contains prefix of "chr" for chromosomes, for example, "chr1, chr2, .... chr Y".

‎utils/GRI_bed_filter/version1_gene-only/gri_v1.R

+8-15
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ library(readr)
2222
library(data.table)
2323

2424
# Function to generate GRI region based on genome version
25-
generate_gri_v1 <- function(genome_version = "hg38") {
25+
generate_gri_geneonly <- function(genome_version = "hg38") {
2626
# data preparation --------------------------------------------------------
2727

2828
## hgmd local file name
@@ -34,7 +34,7 @@ generate_gri_v1 <- function(genome_version = "hg38") {
3434
hg38 <- BSgenome.Hsapiens.UCSC.hg38
3535
size <- seqlengths(hg38)
3636
chr_size <-
37-
data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24), ]
37+
data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24),]
3838

3939
##load gencode data
4040
url <-
@@ -58,7 +58,7 @@ generate_gri_v1 <- function(genome_version = "hg38") {
5858
hg19 <- BSgenome.Hsapiens.UCSC.hg19
5959
size <- seqlengths(hg19)
6060
chr_size <-
61-
data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24), ]
61+
data.frame(seqnames = names(size), chr_end = as.integer(size))[c(1:24),]
6262

6363
##load gencode data
6464
url <-
@@ -141,15 +141,15 @@ generate_gri_v1 <- function(genome_version = "hg38") {
141141
mutate(source = "HGMD_2022")
142142

143143
## exclude "rejected variants" in HGMD database
144-
hgmd_noR <- hgmd[-which(hgmd$CLASS == "R"), ]
144+
hgmd_noR <- hgmd[-which(hgmd$CLASS == "R"),]
145145
hgmd_noR <- hgmd_noR |>
146146
dplyr::mutate(seqnames = paste0("chr", seqnames))
147147

148148
## add 50bp flanking region
149149
hgmd_noR <- merge(hgmd_noR, chr_size, by = "seqnames")
150150
hgmd_noR_flank <- hgmd_noR |>
151151
dplyr::mutate(
152-
final_start = ifelse(start > 50, start - 50, 1),
152+
final_start = ifelse(start > 50, start - 50, 0),
153153
final_end = ifelse((end + 50) > chr_end, chr_end, end + 50),
154154
.after = seqnames
155155
)
@@ -201,7 +201,7 @@ generate_gri_v1 <- function(genome_version = "hg38") {
201201
## add flanking region
202202
clinvar_patho_flank <- clinvar_patho |>
203203
mutate(
204-
final_start = ifelse(start > 50, start - 50, 1),
204+
final_start = ifelse(start > 50, start - 50, 0),
205205
final_end = ifelse((end + 50) > chr_end, chr_end, end + 50),
206206
.after = seqnames
207207
)
@@ -233,13 +233,6 @@ generate_gri_v1 <- function(genome_version = "hg38") {
233233
## check bed coverage
234234
coverage_bed(bed_gencode_hgmd_clinvar.dt)
235235

236-
# convert start position from 1-based into 0-based format of bed file
237-
bed_gencode_hgmd_clinvar.dt <- bed_gencode_hgmd_clinvar.dt |>
238-
mutate(start = pmax(0, start - 1))
239-
240-
bed_gencode_hgmd_clinvar <- bed_gencode_hgmd_clinvar |>
241-
mutate(start = pmax(0, start - 1))
242-
243236
# write the bed file
244237
file_name <- paste0("gene_only.", genome_version, ".bed")
245238
write_tsv(bed_gencode_hgmd_clinvar.dt[, c(1:3)],
@@ -255,5 +248,5 @@ generate_gri_v1 <- function(genome_version = "hg38") {
255248
}
256249

257250
# Apply generate_gri_geneonly function to generate bed files --------------
258-
generate_gri_v1("hg19")
259-
generate_gri_v1("hg38")
251+
generate_gri_geneonly("hg19")
252+
generate_gri_geneonly("hg38")

‎utils/GRI_bed_filter/version2_exon-only/gri_v2.R

-300
This file was deleted.

0 commit comments

Comments
 (0)
Please sign in to comment.