Skip to content

Commit

Permalink
improved add_clusters, better doc, example data and code
Browse files Browse the repository at this point in the history
  • Loading branch information
thackl committed Jan 30, 2021
1 parent 5101faa commit 33f46bd
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 1,044 deletions.
64 changes: 57 additions & 7 deletions R/clusters.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,23 @@
#' Add gene clusters
#' Add gene (or feature) clusters
#'
#' Converts a table with the two required columns `cluster_id` and `feat_id`
#' into a link track connecting features belonging to the same cluster over
#' their entire length. Additionally, the table is joined to the parent feature
#' table, adding `cluster_id` and all additional columns with cluster
#' information to the feat table.
#'
#' @inheritParams add_subfeats
#' @export
#' @examples
#' gggenomes(emale_seqs, emale_genes) %>%
#' add_clusters(emale_cogs) %>%
#' flip_nicely() + # works because clusters become links
#' geom_link() + # works because clusters become links
#' geom_seq() +
#' # works because cluster info is joined to gene track
#' geom_gene(aes(fill=ifelse(is.na(cluster_id), NA,
#' str_glue("{cluster_id} [{cluster_size}]")))) +
#' scale_fill_discrete("COGs")
add_clusters <- function(x, ..., .track_id = "genes"){
UseMethod("add_clusters")
}
Expand All @@ -12,25 +30,57 @@ add_clusters.gggenomes <- function(x, ..., .track_id = "genes"){

#' @export
add_clusters.gggenomes_layout <- function(x, ..., .track_id = "genes"){
if(!has_dots())
rlang::abort("no clusters data provided - did you forget parent_track_id as first argument")
if(!has_dots()){
warn("No clusters data provided - check your arguments")
return(x)
}

pid <- tidyselect::vars_pull(track_ids(x), {{.track_id}})
dot_exprs <- enexprs(...) # defuse before list(...)
tracks <- as_tracks(list(...), dot_exprs, track_ids(x))
sublinks <- map(tracks, cluster2sublinks, x$feats[[pid]])
x <- add_sublink_tracks(x, {{.track_id}}, sublinks, "none")

tracks <- map(tracks, function(track){
require_vars(track, c("feat_id", "cluster_id"))

track <- filter(track, feat_id %in% x$feats[[pid]]$feat_id)
if(nrow(track) < 1){
warn(str_glue("No matches between clusters and parent track based on ",
"`track_id`. Check your IDs and arguments"))
return(x)
}

if(any(duplicated(track$feat_id))){
dup_ids <- track$feat_id[duplicated(track$feat_id)][1:5]
abort(c("Duplicated `feat_id`s not allowed:", str_glue("{dup_ids}")))
}

track
})

sublinks <- map(tracks, cluster2sublinks, x$feats[[pid]]) %>%
compact # can be empty tibble of all clusters were singletons
if(length(sublinks) < length(tracks)){
warn("At least one cluster table had only singletons, so no links were produced")
}
if(length(sublinks)){
x <- add_sublink_tracks(x, {{.track_id}}, sublinks, transform="none")
}

# this is just q&d - only adds the ids of the first cluster track. Not sure,
# how to handle adding multiple ones
if(length(tracks) > 1){
warn(str_glue("If adding multiple cluster tables, all are added as ",
"individual link tracks, but only the first table is joined with the ",
"parent feat table"))
}

x$feats[[pid]] <- left_join(x$feats[[pid]], tracks[[1]])
x
}

cluster2sublinks <- function(x, parent_track){
x <- filter(x, feat_id %in% parent_track$feat_id)
x %>% split_by(cluster_id) %>%
keep(~nrow(.) > 1) %>%
keep(~nrow(.) > 1) %>% # links need >2 members, ignore singletons
map_df(.id = "cluster_id", function(g){
mat <- combn(g$feat_id, 2, simplify=TRUE)
tibble(feat_id = mat[1,], feat_id2 = mat[2,])
Expand Down
14 changes: 14 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,17 @@
#' @source
#' * Derived & bundled data: `ex("emales/emales-prot-uniref50.tsv")`
"emale_prot_uniref50"

#' Clusters of orthologs of 6 EMALE proteomes
#'
#' One row per feature. Clusters are based on manual curation.
#'
#' @format A data frame with 48 rows and 3 columns
#' \describe{
#' \item{cluster_id}{identifier of the cluster}
#' \item{feat_id}{identifer of the gene}
#' \item{cluster_size}{number of features in the cluster}
#' }
#' @source
#' * Derived & bundled data: `ex("emales/emales-cogs.tsv")`
"emale_cogs"
Binary file added data/emale_cogs.rda
Binary file not shown.
Binary file modified data/emale_genes.rda
Binary file not shown.
49 changes: 49 additions & 0 deletions inst/extdata/emales/emales-cogs.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
cluster_id feat_id cluster_size
cog-MV19 BVI_008A_0002 9
cog-MV13_2 BVI_008A_0006 7
cog-ATPase BVI_008A_0010 8
cog-MCP BVI_008A_0011 8
cog-MV12 BVI_008A_0016 7
cog-pri-hel BVI_008A_0020 9
cog-pri-hel BVI_008A_0021 9
cog-MV19 BVI_069_0002 9
cog-MV13_2 BVI_069_0006 7
cog-ATPase BVI_069_0012 8
cog-MCP BVI_069_0014 8
cog-MV12 BVI_069_0017 7
cog-MV12 BVI_069_0018 7
cog-pri-hel BVI_069_0025 9
cog-MV19 Cflag_017B_0003 9
cog-MV13_2 Cflag_017B_0007 7
cog-ATPase Cflag_017B_0010 8
cog-MCP Cflag_017B_0011 8
cog-MV12 Cflag_017B_0013 7
cog-pri-hel Cflag_017B_0018 9
cog-pri-hel E4-10_086_0004 9
cog-MV12 E4-10_086_0009 7
cog-MCP E4-10_086_0011 8
cog-ATPase E4-10_086_0012 8
cog-MV13_2 E4-10_086_0015 7
cog-MV19 E4-10_086_0019 9
cog-MV19 E4-10_086_0020 9
cog-pri-hel E4-10_112_0003 9
cog-pri-hel E4-10_112_0007 9
cog-MV12 E4-10_112_0012 7
cog-MCP E4-10_112_0014 8
cog-ATPase E4-10_112_0015 8
cog-ATPase E4-10_112_0016 8
cog-MV13_2 E4-10_112_0020 7
cog-MV13_2 E4-10_112_0021 7
cog-MV19 E4-10_112_0025 9
cog-pri-hel RCC970_016B_0005 9
cog-pri-hel RCC970_016B_0006 9
cog-MV12 RCC970_016B_0011 7
cog-MCP RCC970_016B_0013 8
cog-MCP RCC970_016B_0014 8
cog-MCP RCC970_016B_0015 8
cog-ATPase RCC970_016B_0016 8
cog-ATPase RCC970_016B_0017 8
cog-MV13_2 RCC970_016B_0021 7
cog-MV19 RCC970_016B_0025 9
cog-MV19 RCC970_016B_0026 9
cog-MV19 RCC970_016B_0027 9
Loading

0 comments on commit 33f46bd

Please sign in to comment.