Skip to content

Commit

Permalink
New db version
Browse files Browse the repository at this point in the history
  • Loading branch information
Patrice Godard committed Nov 9, 2024
1 parent 42b136f commit ff104b5
Show file tree
Hide file tree
Showing 3 changed files with 6,935 additions and 56 deletions.
310 changes: 254 additions & 56 deletions supp/Build/Genodesy-instance/Rebuild-BED.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -1949,75 +1949,273 @@ BED:::loadProbes(

**End**: `r Sys.time()`

## Probes from biomaRt
<!-- ## Probes from biomaRt -->


<!-- **Start**: `r Sys.time()` -->

<!-- ```{r} -->
<!-- library(biomaRt) -->
<!-- bm <- "ENSEMBL_MART_ENSEMBL" -->
<!-- marts <-listMarts() -->
<!-- version <- ensembl_release -->
<!-- if(grep( -->
<!-- sprintf(" %s$", version), -->
<!-- marts[which(marts$biomart==bm), "version"] -->
<!-- ) == 1) { -->
<!-- version <- NULL -->
<!-- } -->
<!-- orgOfInt <- c("hsapiens", "mmusculus", "rnorvegicus", "sscrofa", "drerio") -->
<!-- for(org in orgOfInt){ -->
<!-- message(org) -->
<!-- mart <- useEnsembl( -->
<!-- biomart = bm, -->
<!-- dataset = paste0(org, "_gene_ensembl"), -->
<!-- version = version -->
<!-- ) -->
<!-- at <- listAttributes(mart) %>% -->
<!-- dplyr::filter( -->
<!-- stringr::str_detect( -->
<!-- description, stringr::regex("probe", ignore_case=TRUE) -->
<!-- ) -->
<!-- ) %>% -->
<!-- dplyr::filter( -->
<!-- !name %in% -->
<!-- c( -->
<!-- "affy_huex_1_0_st_v2", -->
<!-- "affy_moex_1_0_st_v1", -->
<!-- "affy_raex_1_0_st_v1" -->
<!-- ) -->
<!-- ) -->
<!-- for(i in 1:nrow(at)){ -->
<!-- message(" ", i, "/", nrow(at), " platforms") -->
<!-- message(" ", Sys.time()) -->
<!-- platName <- at$name[i] -->
<!-- platDesc <- paste(at$description[i], "(Ensembl BioMart mapping)") -->
<!-- be <- "Transcript" -->
<!-- ## -->
<!-- BED:::loadPlf(name=platName, description=platDesc, be=be) -->
<!-- ## Import mapping with Ens_transcript -->
<!-- toImport <- getBM( -->
<!-- mart=mart, -->
<!-- attributes=c( -->
<!-- "ensembl_transcript_id", -->
<!-- platName -->
<!-- ) -->
<!-- ) %>% -->
<!-- dplyr::as_tibble() %>% -->
<!-- magrittr::set_colnames(c("id", "probeID")) %>% -->
<!-- dplyr::filter(!is.na(probeID) & probeID!="" & !is.na(id) & id!="") %>% -->
<!-- dplyr::select(probeID, id) %>% -->
<!-- dplyr::distinct() -->
<!-- dbname <- "Ens_transcript" -->
<!-- ## -->
<!-- BED:::loadProbes( -->
<!-- d=toImport, -->
<!-- be=be, -->
<!-- platform=platName, -->
<!-- dbname=dbname -->
<!-- ) -->
<!-- } -->
<!-- } -->
<!-- ``` -->

<!-- **End**: `r Sys.time()` -->

## Probes from Ensembl

**Start**: `r Sys.time()`

```{r}
library(biomaRt)
bm <- "ENSEMBL_MART_ENSEMBL"
marts <-listMarts()
version <- ensembl_release
if(grep(
sprintf(" %s$", version),
marts[which(marts$biomart==bm), "version"]
) == 1) {
version <- NULL
## Names of the arrays that must not be loaded as probe platforms.
## load_ensembl_probes() drops every array whose `name` is listed here.
## NOTE(review): these look like biomaRt attribute names; confirm that the
## `name` column of the funcgen `array` table uses the same spelling,
## otherwise this exclusion never matches.
to_exclude <- paste0(
  "affy_",
  c("huex_1_0_st_v2", "moex_1_0_st_v1", "raex_1_0_st_v1")
)
## Download the Ensembl funcgen table dumps needed for probe mapping.
##
## The files are fetched from
## https://ftp.ensembl.org/pub/current_mysql/<organism>_funcgen_<release>_<gv>
## into the directory <wd>/<organism>_funcgen_<release>_<gv>. A file that is
## already present locally is not downloaded again.
##
## ens_def: list-like object providing `organism` (lower-cased, first space
##    replaced by "_"), `release` and `gv`.
##
## Relies on `wd`, which is not defined in this chunk (presumably the working
## directory set earlier in the document -- to be confirmed).
## NOTE(review): the URL points to "current_mysql"; confirm that
## `ens_def$release` always is the current Ensembl release.
##
## Called for its side effect; returns NULL invisibly. Stops with an error
## when a file cannot be downloaded.
dump_ensembl_func <- function(ens_def){
  db_name <- sprintf(
    "%s_funcgen_%s_%s",
    stringr::str_replace(tolower(ens_def$organism), " ", "_"),
    ens_def$release, ens_def$gv
  )
  base_url <- sprintf(
    "https://ftp.ensembl.org/pub/current_mysql/%s",
    db_name
  )
  data_dir <- file.path(wd, db_name)
  dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)
  to_download <- c(
    "array.txt.gz",
    "probe.txt.gz",
    "probe_set.txt.gz",
    "probe_transcript.txt.gz",
    "probe_set_transcript.txt.gz"
  )
  for(f in to_download){
    fp <- file.path(data_dir, f)
    if(file.exists(fp)){
      next
    }
    ## Download next to the final path and rename only on success: an
    ## interrupted transfer must not leave a truncated file that the
    ## file.exists() test above would accept on the next run.
    tmp <- paste0(fp, ".part")
    status <- tryCatch(
      download.file(file.path(base_url, f), tmp, mode = "wb"),
      error = function(e){
        unlink(tmp)
        stop(e)
      }
    )
    if(status != 0 || !file.rename(tmp, fp)){
      unlink(tmp)
      stop("Failed to download ", file.path(base_url, f), call. = FALSE)
    }
  }
  invisible(NULL)
}
orgOfInt <- c("hsapiens", "mmusculus", "rnorvegicus", "sscrofa", "drerio")
for(org in orgOfInt){
message(org)
mart <- useEnsembl(
biomart = bm,
dataset = paste0(org, "_gene_ensembl"),
version = version
## Load the probe and probe set to transcript mappings of one organism in BED.
##
## Reads the funcgen table dumps stored by dump_ensembl_func() in
## <wd>/<organism>_funcgen_<release>_<gv> and, for each retained array with at
## least one mapping, registers the platform (BED:::loadPlf) and loads its
## probe -> Ensembl transcript identifiers (BED:::loadProbes).
##
## ens_def: list-like object providing `organism`, `release` and `gv`.
##
## Relies on `wd` and `to_exclude` being defined in the calling environment.
## Called for its side effects; returns NULL invisibly.
load_ensembl_probes <- function(ens_def){
  data_dir <- file.path(
    wd,
    sprintf(
      "%s_funcgen_%s_%s",
      stringr::str_replace(tolower(ens_def$organism), " ", "_"),
      ens_def$release, ens_def$gv
    )
  )

  ## The dumps are read as header-less tab-delimited files in which \N
  ## stands for a missing value.
  read_dump <- function(file_name, col_names, col_types){
    readr::read_tsv(
      file.path(data_dir, file_name),
      col_names = col_names,
      col_types = col_types,
      na = "\\N"
    )
  }

  ## Arrays: only expression arrays that are not explicitly excluded
  arrays <- read_dump(
    "array.txt.gz",
    col_names = c(
      "array_id",
      "name",
      "format",
      "vendor",
      "description",
      "type",
      "class",
      "is_probeset_array",
      "is_linked_array",
      "has_sense_interrogation"
    ),
    col_types = "ccccccclll"
  ) |>
    dplyr::filter(
      .data$format %in% c("EXPRESSION"),
      !.data$name %in% to_exclude
    )
  probe_arrays <- arrays |>
    dplyr::filter(!.data$is_probeset_array) |>
    dplyr::pull("array_id")

  ## Probes: kept only for arrays that are not probe set arrays.
  ## NOTE(review): `array_chip_id` is compared to `array_id` here and in the
  ## loop below; confirm that both identifiers coincide in the funcgen dump
  ## (no array_chip table is loaded).
  probe <- read_dump(
    "probe.txt.gz",
    col_names = c(
      "probe_id",
      "probe_set_id",
      "name",
      "length",
      "array_chip_id",
      "class",
      "description",
      "probe_seq_id"
    ),
    ## One type per column (8); `length` is the only numeric one
    col_types = "cccncccc"
  ) |>
    dplyr::filter(.data$array_chip_id %in% probe_arrays)

  probe_set <- read_dump(
    "probe_set.txt.gz",
    col_names = c(
      "probe_set_id",
      "name",
      "size",
      "family",
      "array_chip_id"
    ),
    ## One type per column (5); `size` is the only numeric one
    col_types = "ccncc"
  )

  probe_transcript <- read_dump(
    "probe_transcript.txt.gz",
    col_names = c(
      "probe_transcript_id",
      "probe_id",
      "stable_id",
      "description"
    ),
    col_types = "cccc"
  ) |>
    dplyr::filter(.data$probe_id %in% probe$probe_id)

  probe_set_transcript <- read_dump(
    "probe_set_transcript.txt.gz",
    col_names = c(
      "probe_set_transcript_id",
      "probe_set_id",
      "stable_id",
      "description"
    ),
    col_types = "cccc"
  )

  be <- "Transcript"
  dbname <- "Ens_transcript"
  ## seq_len() so that nothing is done when no array is retained
  for(i in seq_len(nrow(arrays))){
    message(" ", i, "/", nrow(arrays), " platforms")
    message(" ", Sys.time())
    platName <- tolower(paste(
      stringr::str_replace(ens_def$organism, " ", "_"),
      arrays$vendor[i], arrays$name[i], sep = "_"
    ))
    is_probe_set <- arrays$is_probeset_array[i]
    platDesc <- paste(
      arrays$vendor[i],
      arrays$name[i],
      arrays$type[i],
      if(is_probe_set) "probe set" else "probe",
      ens_def$organism,
      "(Ensembl mapping)"
    )
    array_id <- arrays$array_id[i]
    if(is_probe_set){
      ## Probe set arrays: map probe set names to transcripts
      toImport <- probe_set |>
        dplyr::filter(.data$array_chip_id == !!array_id) |>
        dplyr::select("probe_set_id", "name") |>
        dplyr::inner_join(
          probe_set_transcript |>
            dplyr::select("probe_set_id", "stable_id"),
          by = "probe_set_id"
        ) |>
        dplyr::select("probeID" = "name", "id" = "stable_id") |>
        dplyr::distinct()
    }else{
      ## Other arrays: map individual probe names to transcripts
      toImport <- probe |>
        dplyr::filter(.data$array_chip_id == !!array_id) |>
        dplyr::select("probe_id", "name") |>
        dplyr::inner_join(
          probe_transcript |> dplyr::select("probe_id", "stable_id"),
          by = "probe_id"
        ) |>
        dplyr::select("probeID" = "name", "id" = "stable_id") |>
        dplyr::distinct()
    }
    ## Platforms without any mapped probe are not registered
    if(nrow(toImport) > 0){
      ## Platform description
      BED:::loadPlf(name=platName, description=platDesc, be=be)
      ## Import mapping with Ens_transcript
      BED:::loadProbes(
        d=toImport,
        be=be,
        platform=platName,
        dbname=dbname
      )
    }
  }
  invisible(NULL)
}
## Download then load the Ensembl probe mappings of each organism, in the
## same order as before. The `ensembl_*` definitions come from elsewhere in
## the document.
ens_defs <- list(
  Drerio = ensembl_Drerio,
  Hsapiens = ensembl_Hsapiens,
  Mmusculus = ensembl_Mmusculus,
  Rnorvegicus = ensembl_Rnorvegicus,
  Sscrofa = ensembl_Sscrofa
)
for(org_name in names(ens_defs)){
  message("Ensembl probes for ", org_name)
  dump_ensembl_func(ens_defs[[org_name]])
  load_ensembl_probes(ens_defs[[org_name]])
}
```

**End**: `r Sys.time()`
Expand Down
3,887 changes: 3,887 additions & 0 deletions supp/Build/Genodesy-instance/Rebuild-BED.html

Large diffs are not rendered by default.

Loading

0 comments on commit ff104b5

Please sign in to comment.