New db version

patzaw · Nov 9, 2024 · ff104b5 · ff104b5
1 parent 42b136f
commit ff104b5
Show file tree

Hide file tree

Showing 3 changed files with 6,935 additions and 56 deletions.
diff --git a/supp/Build/Genodesy-instance/Rebuild-BED.Rmd b/supp/Build/Genodesy-instance/Rebuild-BED.Rmd
@@ -1949,75 +1949,273 @@ BED:::loadProbes(
 
 **End**: `r Sys.time()`
 
-## Probes from biomaRt
+<!-- ## Probes from biomaRt -->
 
 
+<!-- **Start**: `r Sys.time()` -->
+
+<!-- ```{r} -->
+<!-- library(biomaRt) -->
+<!-- bm <- "ENSEMBL_MART_ENSEMBL" -->
+<!-- marts <-listMarts() -->
+<!-- version <- ensembl_release -->
+<!-- if(grep( -->
+<!--    sprintf(" %s$", version), -->
+<!--    marts[which(marts$biomart==bm), "version"] -->
+<!-- ) == 1) { -->
+<!--    version <- NULL -->
+<!-- } -->
+<!-- orgOfInt <- c("hsapiens", "mmusculus", "rnorvegicus", "sscrofa", "drerio") -->
+<!-- for(org in orgOfInt){ -->
+<!--    message(org) -->
+<!--    mart <- useEnsembl( -->
+<!--       biomart = bm, -->
+<!--       dataset = paste0(org, "_gene_ensembl"), -->
+<!--       version = version -->
+<!--    ) -->
+<!--    at <- listAttributes(mart) %>% -->
+<!--       dplyr::filter( -->
+<!--          stringr::str_detect( -->
+<!--             description, stringr::regex("probe", ignore_case=TRUE) -->
+<!--          ) -->
+<!--       ) %>% -->
+<!--       dplyr::filter( -->
+<!--          !name %in% -->
+<!--             c( -->
+<!--                "affy_huex_1_0_st_v2", -->
+<!--                "affy_moex_1_0_st_v1", -->
+<!--                "affy_raex_1_0_st_v1" -->
+<!--             ) -->
+<!--       ) -->
+<!--    for(i in 1:nrow(at)){ -->
+<!--       message("   ", i, "/", nrow(at), " platforms") -->
+<!--       message("      ", Sys.time()) -->
+<!--       platName <- at$name[i] -->
+<!--       platDesc <- paste(at$description[i], "(Ensembl BioMart mapping)") -->
+<!--       be <- "Transcript" -->
+<!--       ## -->
+<!--       BED:::loadPlf(name=platName, description=platDesc, be=be) -->
+<!--       ## Import mapping with Ens_transcript -->
+<!--       toImport <- getBM( -->
+<!--          mart=mart, -->
+<!--          attributes=c( -->
+<!--             "ensembl_transcript_id", -->
+<!--             platName -->
+<!--          ) -->
+<!--       ) %>% -->
+<!--          dplyr::as_tibble() %>% -->
+<!--          magrittr::set_colnames(c("id", "probeID")) %>% -->
+<!--          dplyr::filter(!is.na(probeID) & probeID!="" & !is.na(id) & id!="") %>% -->
+<!--          dplyr::select(probeID, id) %>% -->
+<!--          dplyr::distinct() -->
+<!--       dbname <- "Ens_transcript" -->
+<!--       ## -->
+<!--       BED:::loadProbes( -->
+<!--          d=toImport, -->
+<!--          be=be, -->
+<!--          platform=platName, -->
+<!--          dbname=dbname -->
+<!--       ) -->
+<!--    } -->
+<!-- } -->
+<!-- ``` -->
+
+<!-- **End**: `r Sys.time()` -->
+
+## Probes from Ensembl
+
 **Start**: `r Sys.time()`
 
 ```{r}
-library(biomaRt)
-bm <- "ENSEMBL_MART_ENSEMBL"
-marts <-listMarts()
-version <- ensembl_release
-if(grep(
-   sprintf(" %s$", version),
-   marts[which(marts$biomart==bm), "version"]
-) == 1) {
-   version <- NULL
+## Array names that load_ensembl_probes() drops before importing platforms
+## (compared with the `name` column of the Ensembl array table).
+to_exclude <- c(
+   "affy_huex_1_0_st_v2", "affy_moex_1_0_st_v1",
+   "affy_raex_1_0_st_v1"
+)
+## Download the Ensembl funcgen MySQL dump tables used to map probes to
+## transcripts (array, probe, probe_set and their transcript links).
+##
+## @param ens_def list-like object providing `organism`, `release` and `gv`;
+##   they are combined into the "<organism>_funcgen_<release>_<gv>" name used
+##   both in the remote URL and for the local directory.
+## @return the local directory holding the files, invisibly.
+##
+## The directory is created under the global `wd`. Files already present are
+## not downloaded again.
+## NOTE(review): the URL points to "current_mysql"; presumably that location
+## only serves the current Ensembl release, so `ens_def$release` has to match
+## it -- confirm.
+dump_ensembl_func <- function(ens_def){
+   db_name <- sprintf(
+      "%s_funcgen_%s_%s",
+      stringr::str_replace(tolower(ens_def$organism), " ", "_"),
+      ens_def$release, ens_def$gv
+   )
+   base_url <- sprintf("https://ftp.ensembl.org/pub/current_mysql/%s", db_name)
+   data_dir <- file.path(wd, db_name)
+   ## recursive = TRUE: a missing `wd` must not make every download fail
+   dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)
+   to_download <- c(
+      "array.txt.gz",
+      "probe.txt.gz",
+      "probe_set.txt.gz",
+      "probe_transcript.txt.gz",
+      "probe_set_transcript.txt.gz"
+   )
+   for(f in to_download){
+      fp <- file.path(data_dir, f)
+      if(file.exists(fp)){
+         next
+      }
+      ## Download under a temporary name and rename only on success: an
+      ## interrupted transfer must not leave a truncated file that the
+      ## file.exists() check above would accept on the next run.
+      tmp <- paste0(fp, ".part")
+      done <- FALSE
+      tryCatch(
+         {
+            status <- download.file(file.path(base_url, f), tmp, mode = "wb")
+            done <- status == 0 && file.rename(tmp, fp)
+         },
+         finally = if(!done) unlink(tmp)
+      )
+      if(!done){
+         stop("Could not download ", f, " from ", base_url, call. = FALSE)
+      }
+   }
+   invisible(data_dir)
 }
-orgOfInt <- c("hsapiens", "mmusculus", "rnorvegicus", "sscrofa", "drerio")
-for(org in orgOfInt){
-   message(org)
-   mart <- useEnsembl(
-      biomart = bm,
-      dataset = paste0(org, "_gene_ensembl"),
-      version = version
+## Load probe platforms and their probe-to-transcript mappings into BED from
+## the Ensembl funcgen dump files found in the local dump directory.
+##
+## @param ens_def list-like object providing `organism`, `release` and `gv`;
+##   they identify the "<organism>_funcgen_<release>_<gv>" directory under the
+##   global `wd` from which the five *.txt.gz tables are read.
+## @return NULL, invisibly; called for its side effects.
+##
+## Only arrays whose format is "EXPRESSION" and whose name is not in the
+## global `to_exclude` are considered. For each of them the mapping is built
+## from probe sets or from individual probes, depending on
+## `is_probeset_array`, and BED:::loadPlf() / BED:::loadProbes() are called
+## only when at least one mapping exists.
+load_ensembl_probes <- function(ens_def){
+   data_dir <- file.path(
+      wd,
+      sprintf(
+         "%s_funcgen_%s_%s",
+         stringr::str_replace(tolower(ens_def$organism), " ", "_"),
+         ens_def$release, ens_def$gv
+      )
    )
-   at <- listAttributes(mart) %>%
-      dplyr::filter(
-         stringr::str_detect(
-            description, stringr::regex("probe", ignore_case=TRUE)
-         )
-      ) %>%
+   array <- readr::read_tsv(
+      file.path(data_dir, "array.txt.gz"),
+      col_names = c(
+         "array_id",
+         "name",
+         "format",
+         "vendor",
+         "description",
+         "type",
+         "class",
+         "is_probeset_array",
+         "is_linked_array",
+         "has_sense_interrogation"
+      ),
+      col_types = "ccccccclll",
+      na = "\\N"
+   )
+   array <- array |>
       dplyr::filter(
-         !name %in%
-            c(
-               "affy_huex_1_0_st_v2",
-               "affy_moex_1_0_st_v1",
-               "affy_raex_1_0_st_v1"
-            )
+         format %in% c("EXPRESSION"),
+         ## NOTE(review): `to_exclude` holds lower-case, underscore-separated
+         ## identifiers; confirm that names in array.txt.gz use the same
+         ## form, otherwise this condition never excludes anything.
+         !.data$name %in% to_exclude
       )
-   for(i in 1:nrow(at)){
-      message("   ", i, "/", nrow(at), " platforms")
+   ## Arrays described probe by probe (as opposed to probe sets)
+   probe_arrays <- array |>
+      dplyr::filter(!.data$is_probeset_array) |>
+      dplyr::pull("array_id")
+   probe <- readr::read_tsv(
+      file.path(data_dir, "probe.txt.gz"),
+      col_names = c(
+         "probe_id",
+         "probe_set_id",
+         "name",
+         "length",
+         "array_chip_id",
+         "class",
+         "description",
+         "probe_seq_id"
+      ),
+      ## One type per column name (8); the former string had 9 characters
+      col_types = "cccncccc",
+      na = "\\N"
+   )
+   ## NOTE(review): `array_chip_id` is matched against `array_id` values here
+   ## and in the loop below; confirm that both identifiers coincide in the
+   ## funcgen dumps (no array_chip table is read by this function).
+   probe <- probe |> dplyr::filter(.data$array_chip_id %in% probe_arrays)
+   probe_set <- readr::read_tsv(
+      file.path(data_dir, "probe_set.txt.gz"),
+      col_names = c(
+         "probe_set_id",
+         "name",
+         "size",
+         "family",
+         "array_chip_id"
+      ),
+      ## One type per column name (5); the former string had 6 characters
+      col_types = "cccnc",
+      na = "\\N"
+   )
+   probe_transcript <- readr::read_tsv(
+      file.path(data_dir, "probe_transcript.txt.gz"),
+      col_names = c(
+         "probe_transcript_id",
+         "probe_id",
+         "stable_id",
+         "description"
+      ),
+      col_types = "cccc",
+      na = "\\N"
+   )
+   ## Keep only the links of probes retained above
+   probe_transcript <- probe_transcript |>
+      dplyr::filter(.data$probe_id %in% probe$probe_id)
+   probe_set_transcript <- readr::read_tsv(
+      file.path(data_dir, "probe_set_transcript.txt.gz"),
+      col_names = c(
+         "probe_set_transcript_id",
+         "probe_set_id",
+         "stable_id",
+         "description"
+      ),
+      col_types = "cccc",
+      na = "\\N"
+   )
+   be <- "Transcript"
+   dbname <- "Ens_transcript"
+   ## seq_len(): nothing must run when no array passed the filters
+   ## (1:0 would iterate over 1 and 0 and fail on missing values)
+   for(i in seq_len(nrow(array))){
+      message("   ", i, "/", nrow(array), " platforms")
       message("      ", Sys.time())
-      platName <- at$name[i]
-      platDesc <- paste(at$description[i], "(Ensembl BioMart mapping)")
-      be <- "Transcript"
-      ##
-      BED:::loadPlf(name=platName, description=platDesc, be=be)
-      ## Import mapping with Ens_transcript
-      toImport <- getBM(
-         mart=mart,
-         attributes=c(
-            "ensembl_transcript_id",
-            platName
-         )
-      ) %>%
-         dplyr::as_tibble() %>%
-         magrittr::set_colnames(c("id", "probeID")) %>%
-         dplyr::filter(!is.na(probeID) & probeID!="" & !is.na(id) & id!="") %>%
-         dplyr::select(probeID, id) %>%
-         dplyr::distinct()
-      dbname <- "Ens_transcript"
-      ##
-      BED:::loadProbes(
-         d=toImport,
-         be=be,
-         platform=platName,
-         dbname=dbname
+      ## Platform name: <organism>_<vendor>_<array name>, lower case
+      platName <- tolower(paste(
+         stringr::str_replace(ens_def$organism, " ", "_"),
+         array$vendor[i], array$name[i], sep = "_"
+      ))
+      is_probe_set <- array$is_probeset_array[i]
+      platDesc <- paste(
+         array$vendor[i],
+         array$name[i],
+         array$type[i],
+         if(is_probe_set) "probe set" else "probe",
+         ens_def$organism,
+         "(Ensembl mapping)"
       )
+      array_id <- array$array_id[i]
+      if(is_probe_set){
+         ## Probe set name -> transcript stable identifier
+         toImport <- probe_set |>
+            dplyr::filter(.data$array_chip_id==!!array_id) |>
+            dplyr::select("probe_set_id", "name") |>
+            dplyr::inner_join(
+               probe_set_transcript |>
+                  dplyr::select("probe_set_id", "stable_id"),
+               by = "probe_set_id"
+            ) |>
+            dplyr::select("probeID" = "name", "id" = "stable_id") |>
+            dplyr::distinct()
+      }else{
+         ## Probe name -> transcript stable identifier
+         toImport <- probe |>
+            dplyr::filter(.data$array_chip_id==!!array_id) |>
+            dplyr::select("probe_id", "name") |>
+            dplyr::inner_join(
+               probe_transcript |> dplyr::select("probe_id", "stable_id"),
+               by = "probe_id"
+            ) |>
+            dplyr::select("probeID" = "name", "id" = "stable_id") |>
+            dplyr::distinct()
+      }
+      ## Platforms without any mapped probe are not registered at all
+      if(nrow(toImport) > 0){
+         ## Platform description
+         BED:::loadPlf(name=platName, description=platDesc, be=be)
+         ## Import mapping with Ens_transcript
+         BED:::loadProbes(
+            d=toImport,
+            be=be,
+            platform=platName,
+            dbname=dbname
+         )
+      }
    }
 }
+
+## For each organism: announce it, fetch the funcgen dump files, then import
+## the probe mappings. The helper lives in local() so that it does not add a
+## name to the document environment.
+local({
+   import_probes <- function(label, ens_def){
+      message(label)
+      dump_ensembl_func(ens_def)
+      invisible(load_ensembl_probes(ens_def))
+   }
+   import_probes("Ensembl probes for Drerio", ensembl_Drerio)
+   import_probes("Ensembl probes for Hsapiens", ensembl_Hsapiens)
+   import_probes("Ensembl probes for Mmusculus", ensembl_Mmusculus)
+   import_probes("Ensembl probes for Rnorvegicus", ensembl_Rnorvegicus)
+   import_probes("Ensembl probes for Sscrofa", ensembl_Sscrofa)
+})
+
 ```
 
 **End**: `r Sys.time()`

diff --git a/supp/Build/Genodesy-instance/Rebuild-BED.html b/supp/Build/Genodesy-instance/Rebuild-BED.html