Skip to content

Commit

Permalink
clean up EMALE example data set, and add use case as vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
thackl committed May 12, 2020
1 parent 37e5431 commit 74533ba
Show file tree
Hide file tree
Showing 117 changed files with 1,026 additions and 57,863 deletions.
65 changes: 0 additions & 65 deletions data-raw/emales.R

This file was deleted.

239 changes: 239 additions & 0 deletions data-raw/emales.org
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@

asd

#+BEGIN_SRC R
pp <- function(x=NA, fmt="emales-%s.png"){
out <- as_label(enexpr(x))
out <- sprintf(fmt, out)
print(paste0("generating ", out))
ggsave(out, x, type="cairo", width=18, height=8)
}
#+END_SRC


*** Read in the genomes
We start with a fasta file of 33 viral genomes. We read sequence length and some
metadata from the header lines using `read_fai`...

#+BEGIN_SRC R
library(tidyverse)
library(gggenomes)

# parse sequence length and some metadata from fasta file
emale_seqs <- read_fai("emales.fna") %>%
extract(seq_desc, into = c("emale_type", "is_typespecies"), "=(\\S+) \\S+=(\\S+)",
remove=F, convert=T) %>%
arrange(emale_type, length)

# plot the genomes - first six only to keep it simple for this example
emale_seqs_6 <- emale_seqs[1:6,]
p1 <- gggenomes(emale_seqs_6) +
geom_seq() + geom_bin_label()
p1

pp(p1)
#+END_SRC

[[file:emales-p1.png]]

*** Annotate genes
#+BEGIN_SRC sh
# https://github.com/thackl/seq-scripts
seq-join -n emales-concat < emales.fna > emales-concat.fna
# Annotate genes | https://github.com/hyattpd/Prodigal
prodigal -n -t emales-prodigal.train -i emales-concat.fna
prodigal -t emales-prodigal.train -i emales.fna -o emales-prodigal.gff -f gff
# A little help to clean up the prodigal gff | https://github.com/thackl/seq-scripts
gff-clean emales-prodigal.gff > emales.gff
#+END_SRC

#+BEGIN_SRC R
emale_genes <- read_gff("emales.gff") %>%
rename(feature_id=ID) %>% # we'll need this later
mutate(gc_cont=as.numeric(gc_cont)) # per gene GC-content

p2 <- gggenomes(emale_seqs_6, emale_genes) +
geom_seq() + geom_bin_label() +
geom_gene(aes(fill=gc_cont)) +
scale_fill_distiller(palette="Spectral")
p2

pp(p2)
#+END_SRC

[[file:emales-p2.png]]

*** Find terminal inverted repeats
It is known that these type of viruses often have linear genomes with terminal
inverted repeats (TIRs). So let's look for those next.

#+BEGIN_SRC sh
# split into one genome per file | https://bioinf.shenwei.me/seqkit/
seqkit split -i emales.fna
# self-align opposite strands
for fna in `ls emales.fna.split/*.fna`; do
minimap2 -c -B5 -O6 -E3 --rev-only $fna $fna > $fna.paf;
done;
cat emales.fna.split/*.paf > emales-tirs.paf
#+END_SRC

#+BEGIN_SRC R
emale_tirs_paf <- read_paf("emales-tirs.paf") %>%
filter(seq_id1 == seq_id2 & start1 < start2 & map_length > 99 & de < 0.1)
emale_tirs <- bind_rows(
select(emale_tirs_paf, seq_id=seq_id1, start=start1, end=end1, de),
select(emale_tirs_paf, seq_id=seq_id2, start=start2, end=end2, de))

p3 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs) +
geom_seq() + geom_bin_label() +
geom_feature(size=5) +
geom_gene(aes(fill=gc_cont)) +
scale_fill_distiller(palette="Spectral")
p3

pp(p3)
#+END_SRC

[[file:emales-p3.png]]

*** Compare genome synteny
#+BEGIN_SRC sh
# All-vs-all alignment | https://github.com/lh3/minimap2
minimap2 -X -N 50 -p 0.1 -c emales.fna emales.fna > emales.paf
#+END_SRC

#+BEGIN_SRC R
emale_links <- read_paf("emales.paf")

p4 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs, emale_links) +
geom_seq() + geom_bin_label() +
geom_feature(size=5, data=use_features(features)) +
geom_gene(aes(fill=gc_cont)) +
geom_link() +
scale_fill_distiller(palette="Spectral")

p4 <- p4 %>% flip_bins(3:5)
p4

pp(p4)
#+END_SRC

[[file:emales-p4.png]]

*** GC-content
#+BEGIN_SRC sh
# https://github.com/thackl/seq-scripts (bedtools & samtools)
seq-gc -Nbw 50 emales.fna > emales-gc.tsv
#+END_SRC

#+BEGIN_SRC R
emale_gc <- thacklr::read_bed("emales-gc.tsv") %>%
rename(seq_id=contig_id)

p5 <- p4 %>% add_features(emale_gc)
p5 <- p5 + geom_ribbon(aes(x=(x+xend)/2, ymax=y+.24, ymin=y+.38-(.4*score),
group=seq_id, linetype="GC-content"), use_features(emale_gc),
fill="blue", alpha=.5)
p5

pp(p5)
#+END_SRC

*** cluster protein sequences into orthogroups
#+BEGIN_SRC sh
gff2cds --aa --type CDS --source Prodigal_v2.6.3 --fna emales.fna emales.gff > emales.faa
mmseqs easy-cluster emales.faa emales-mmseqs /tmp -e 1e-5 -c 0.7;
cluster-ids -t "cog%03d" < emales-mmseqs_cluster.tsv > emales-cogs.tsv
#+END_SRC

#+BEGIN_SRC R
emale_cogs <- read_tsv("emales-cogs.tsv", col_names = c("feature_id", "cluster_id", "cluster_n"))
emale_cogs %<>% mutate(
cluster_label = paste0(cluster_id, " (", cluster_n, ")"),
cluster_label = fct_lump_min(cluster_label, 5, other_level = "rare"),
cluster_label = fct_lump_min(cluster_label, 15, other_level = "medium"),
cluster_label = fct_relevel(cluster_label, "rare", after=Inf))
emale_cogs


p6 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs, emale_links) %>%
add_features(emale_gc) %>%
add_clusters(genes, emale_cogs) %>%
flip_bins(3:5) +
geom_seq() + geom_bin_label() +
geom_feature(size=5, data=use_features(features)) +
geom_gene(aes(fill=cluster_label)) +
geom_link() +
geom_ribbon(aes(x=(x+xend)/2, ymax=y+.24, ymin=y+.38-(.4*score),
group=seq_id, linetype="GC-content"), use_features(emale_gc),
fill="blue", alpha=.5) +
scale_fill_brewer("Conserved genes", palette="Set3")

p6

pp(p6)
#+END_SRC

[[file:emales-p6.png]]

*** Blast hits and integrated transposons

#+BEGIN_SRC sh
# mavirus.faa - published
blastp -query emales.faa -subject mavirus.faa -outfmt 7 > emales_mavirus-blastp.tsv
perl -ne 'if(/>(\S+) gene=(\S+) product=(.+)/){print join("\t", $1, $2, $3), "\n"}' mavirus.faa > mavirus.tsv
#+END_SRC

#+BEGIN_SRC R
emale_blast <- read_blast("emales_mavirus-blastp.tsv")
emale_blast %<>%
filter(evalue < 1e-3) %>%
select(feature_id=qaccver, start=qstart, end=qend, saccver) %>%
left_join(read_tsv("mavirus.tsv", col_names = c("saccver", "blast_hit", "blast_desc")))

# manual annotations by MFG
emale_transposons <- read_gff("emales-manual.gff", types = c("mobile_element"))


p7 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs, emale_links) %>%
add_features(emale_gc) %>%
add_clusters(genes, emale_cogs) %>%
add_features(emale_transposons) %>%
add_subfeatures(genes, emale_blast, transform="aa2nuc") %>%
flip_bins(3:5) +
geom_feature(aes(color="integrated transposon"),
use_features(emale_transposons), size=7) +
geom_seq() + geom_bin_label() +
geom_link(offset = c(0.3, 0.2), color="white", alpha=.3) +
geom_feature(aes(color="terminal inverted repeat"), use_features(features),
size=4) +
geom_gene(aes(fill=cluster_label)) +
geom_feature(aes(color=blast_desc), use_features(emale_blast), size=2,
position="pile") +
geom_ribbon(aes(x=(x+xend)/2, ymax=y+.24, ymin=y+.38-(.4*score),
group=seq_id, linetype="GC-content"), use_features(emale_gc),
fill="blue", alpha=.5) +
scale_fill_brewer("Conserved genes", palette="Set3") +
scale_color_viridis_d("Blast hits & Features", direction = -1) +
scale_linetype("Graphs") +
ggtitle(expression(paste("Endogenous mavirus-like elements of ",
italic("C. burkhardae"))))

p7

pp(p7)
#+END_SRC

[[file:emales-p7.png]]

*** save example data
#+BEGIN_SRC R
usethis::use_data(emale_seqs, overwrite=TRUE)
usethis::use_data(emale_genes, overwrite=TRUE)
usethis::use_data(emale_links, overwrite=TRUE)
usethis::use_data(emale_tirs, overwrite=TRUE)
usethis::use_data(emale_transposons, overwrite=TRUE)
usethis::use_data(emale_gc, overwrite=TRUE)
usethis::use_data(emale_blast, overwrite=TRUE)
usethis::use_data(emale_cogs, overwrite=TRUE)
#+END_SRC
Binary file added data-raw/emales.tgz
Binary file not shown.
Loading

0 comments on commit 74533ba

Please sign in to comment.