improved add_clusters, better doc, example data and code

tetukas · Jan 30, 2021 · 33f46bd · 33f46bd
1 parent 5101faa
commit 33f46bd
Show file tree

Hide file tree

Showing 10 changed files with 167 additions and 1,044 deletions.
diff --git a/R/clusters.R b/R/clusters.R
@@ -1,5 +1,23 @@
-#' Add gene clusters
+#' Add gene (or feature) clusters
+#'
+#' Converts a table with the two required columns `cluster_id` and `feat_id`
+#' into a link track connecting features belonging to the same cluster over
+#' their entire length. Additionally, the table is joined to the parent feature
+#' table, adding `cluster_id` and all additional columns with cluster
+#' information to the feat table.
+#'
+#' @inheritParams add_subfeats
 #' @export
+#' @examples
+#' gggenomes(emale_seqs, emale_genes) %>%
+#'   add_clusters(emale_cogs) %>%
+#'   flip_nicely() +      # works because clusters become links
+#'   geom_link() +        # works because clusters become links
+#'   geom_seq() +
+#'   # works because cluster info is joined to gene track
+#'   geom_gene(aes(fill=ifelse(is.na(cluster_id), NA,
+#'       str_glue("{cluster_id} [{cluster_size}]")))) +
+#'   scale_fill_discrete("COGs")
 add_clusters <- function(x, ..., .track_id = "genes"){
   UseMethod("add_clusters")
 }
@@ -12,25 +30,57 @@ add_clusters.gggenomes <- function(x, ..., .track_id = "genes"){
 
 #' @export
 add_clusters.gggenomes_layout <- function(x, ..., .track_id = "genes"){
-  if(!has_dots())
-    rlang::abort("no clusters data provided - did you forget parent_track_id as first argument")
+  if(!has_dots()){
+    warn("No clusters data provided - check your arguments")
+    return(x)
+  }
 
   pid <- tidyselect::vars_pull(track_ids(x), {{.track_id}})
   dot_exprs <- enexprs(...) # defuse before list(...)
   tracks <- as_tracks(list(...), dot_exprs, track_ids(x))
-  sublinks <- map(tracks, cluster2sublinks, x$feats[[pid]])
-  x <- add_sublink_tracks(x, {{.track_id}}, sublinks, "none")
+
+  tracks <- map(tracks, function(track){
+    require_vars(track, c("feat_id", "cluster_id"))
+
+    track <- filter(track, feat_id %in% x$feats[[pid]]$feat_id)
+    if(nrow(track) < 1){
+      warn(str_glue("No matches between clusters and parent track based on ",
+          "`track_id`. Check your IDs and arguments"))
+      return(x)
+    }
+
+    if(any(duplicated(track$feat_id))){
+      dup_ids <- track$feat_id[duplicated(track$feat_id)][1:5]
+      abort(c("Duplicated `feat_id`s not allowed:", str_glue("{dup_ids}")))
+    }
+
+    track
+  })
+
+  sublinks <- map(tracks, cluster2sublinks, x$feats[[pid]]) %>%
+    compact # can be empty tibble of all clusters were singletons
+  if(length(sublinks) < length(tracks)){
+    warn("At least one cluster table had only singletons, so no links were produced")
+  }
+  if(length(sublinks)){
+    x <- add_sublink_tracks(x, {{.track_id}}, sublinks, transform="none")
+  }
 
   # this is just q&d - only adds the ids of the first cluster track. Not sure,
   # how to handle adding multiple ones
+  if(length(tracks) > 1){
+    warn(str_glue("If adding multiple cluster tables, all are added as ",
+        "individual link tracks, but only the first table is joined with the ",
+        "parent feat table"))
+  }
+
   x$feats[[pid]] <- left_join(x$feats[[pid]], tracks[[1]])
   x
 }
 
 cluster2sublinks <- function(x, parent_track){
-  x <- filter(x, feat_id %in% parent_track$feat_id)
   x %>% split_by(cluster_id) %>%
-    keep(~nrow(.) > 1) %>%
+    keep(~nrow(.) > 1) %>% # links need >2 members, ignore singletons
     map_df(.id = "cluster_id", function(g){
       mat <- combn(g$feat_id, 2, simplify=TRUE)
       tibble(feat_id = mat[1,], feat_id2 = mat[2,])

diff --git a/R/data.R b/R/data.R
@@ -171,3 +171,17 @@
 #' @source
 #' * Derived & bundled data: `ex("emales/emales-prot-uniref50.tsv")`
 "emale_prot_uniref50"
+
+#' Clusters of orthologs of 6 EMALE proteomes
+#'
+#' One row per feature. Clusters are based on manual curation.
+#'
+#' @format A data frame with 48 rows and 3 columns
+#' \describe{
+#'   \item{cluster_id}{identifier of the cluster}
+#'   \item{feat_id}{identifer of the gene}
+#'   \item{cluster_size}{number of features in the cluster}
+#' }
+#' @source
+#' * Derived & bundled data: `ex("emales/emales-cogs.tsv")`
+"emale_cogs"
diff --git a/data/emale_cogs.rda b/data/emale_cogs.rda
diff --git a/data/emale_genes.rda b/data/emale_genes.rda
diff --git a/inst/extdata/emales/emales-cogs.tsv b/inst/extdata/emales/emales-cogs.tsv
@@ -0,0 +1,49 @@
+cluster_id	feat_id	cluster_size
+cog-MV19	BVI_008A_0002	9
+cog-MV13_2	BVI_008A_0006	7
+cog-ATPase	BVI_008A_0010	8
+cog-MCP	BVI_008A_0011	8
+cog-MV12	BVI_008A_0016	7
+cog-pri-hel	BVI_008A_0020	9
+cog-pri-hel	BVI_008A_0021	9
+cog-MV19	BVI_069_0002	9
+cog-MV13_2	BVI_069_0006	7
+cog-ATPase	BVI_069_0012	8
+cog-MCP	BVI_069_0014	8
+cog-MV12	BVI_069_0017	7
+cog-MV12	BVI_069_0018	7
+cog-pri-hel	BVI_069_0025	9
+cog-MV19	Cflag_017B_0003	9
+cog-MV13_2	Cflag_017B_0007	7
+cog-ATPase	Cflag_017B_0010	8
+cog-MCP	Cflag_017B_0011	8
+cog-MV12	Cflag_017B_0013	7
+cog-pri-hel	Cflag_017B_0018	9
+cog-pri-hel	E4-10_086_0004	9
+cog-MV12	E4-10_086_0009	7
+cog-MCP	E4-10_086_0011	8
+cog-ATPase	E4-10_086_0012	8
+cog-MV13_2	E4-10_086_0015	7
+cog-MV19	E4-10_086_0019	9
+cog-MV19	E4-10_086_0020	9
+cog-pri-hel	E4-10_112_0003	9
+cog-pri-hel	E4-10_112_0007	9
+cog-MV12	E4-10_112_0012	7
+cog-MCP	E4-10_112_0014	8
+cog-ATPase	E4-10_112_0015	8
+cog-ATPase	E4-10_112_0016	8
+cog-MV13_2	E4-10_112_0020	7
+cog-MV13_2	E4-10_112_0021	7
+cog-MV19	E4-10_112_0025	9
+cog-pri-hel	RCC970_016B_0005	9
+cog-pri-hel	RCC970_016B_0006	9
+cog-MV12	RCC970_016B_0011	7
+cog-MCP	RCC970_016B_0013	8
+cog-MCP	RCC970_016B_0014	8
+cog-MCP	RCC970_016B_0015	8
+cog-ATPase	RCC970_016B_0016	8
+cog-ATPase	RCC970_016B_0017	8
+cog-MV13_2	RCC970_016B_0021	7
+cog-MV19	RCC970_016B_0025	9
+cog-MV19	RCC970_016B_0026	9
+cog-MV19	RCC970_016B_0027	9