forked from thackl/gggenomes
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
clean up EMALE example data set, and add use case as vignette
- Loading branch information
Showing
117 changed files
with
1,026 additions
and
57,863 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
|
||
asd | ||
|
||
#+BEGIN_SRC R | ||
pp <- function(x=NA, fmt="emales-%s.png"){ | ||
out <- as_label(enexpr(x)) | ||
out <- sprintf(fmt, out) | ||
print(paste0("generating ", out)) | ||
ggsave(out, x, type="cairo", width=18, height=8) | ||
} | ||
#+END_SRC | ||
|
||
|
||
*** Read in the genomes | ||
We start with a fasta file of 33 viral genomes. We read sequence length and some | ||
metadata from the header lines using `read_fai`... | ||
|
||
#+BEGIN_SRC R | ||
library(tidyverse) | ||
library(gggenomes) | ||
|
||
# parse sequence length and some metadata from fasta file | ||
emale_seqs <- read_fai("emales.fna") %>% | ||
extract(seq_desc, into = c("emale_type", "is_typespecies"), "=(\\S+) \\S+=(\\S+)", | ||
remove=F, convert=T) %>% | ||
arrange(emale_type, length) | ||
|
||
# plot the genomes - first six only to keep it simple for this example | ||
emale_seqs_6 <- emale_seqs[1:6,] | ||
p1 <- gggenomes(emale_seqs_6) + | ||
geom_seq() + geom_bin_label() | ||
p1 | ||
|
||
pp(p1) | ||
#+END_SRC | ||
|
||
[[file:emales-p1.png]] | ||
|
||
*** Annotate genes | ||
#+BEGIN_SRC sh | ||
# https://github.com/thackl/seq-scripts | ||
seq-join -n emales-concat < emales.fna > emales-concat.fna | ||
# Annotate genes | https://github.com/hyattpd/Prodigal | ||
prodigal -n -t emales-prodigal.train -i emales-concat.fna | ||
prodigal -t emales-prodigal.train -i emales.fna -o emales-prodigal.gff -f gff | ||
# A little help to clean up the prodigal gff | https://github.com/thackl/seq-scripts | ||
gff-clean emales-prodigal.gff > emales.gff | ||
#+END_SRC | ||
|
||
#+BEGIN_SRC R | ||
emale_genes <- read_gff("emales.gff") %>% | ||
rename(feature_id=ID) %>% # we'll need this later | ||
mutate(gc_cont=as.numeric(gc_cont)) # per gene GC-content | ||
|
||
p2 <- gggenomes(emale_seqs_6, emale_genes) + | ||
geom_seq() + geom_bin_label() + | ||
geom_gene(aes(fill=gc_cont)) + | ||
scale_fill_distiller(palette="Spectral") | ||
p2 | ||
|
||
pp(p2) | ||
#+END_SRC | ||
|
||
[[file:emales-p2.png]] | ||
|
||
*** Find terminal inverted repeats | ||
It is known that these type of viruses often have linear genomes with terminal | ||
inverted repeats (TIRs). So let's look for those next. | ||
|
||
#+BEGIN_SRC sh | ||
# split into one genome per file | https://bioinf.shenwei.me/seqkit/ | ||
seqkit split -i emales.fna | ||
# self-align opposite strands | ||
for fna in `ls emales.fna.split/*.fna`; do | ||
minimap2 -c -B5 -O6 -E3 --rev-only $fna $fna > $fna.paf; | ||
done; | ||
cat emales.fna.split/*.paf > emales-tirs.paf | ||
#+END_SRC | ||
|
||
#+BEGIN_SRC R | ||
emale_tirs_paf <- read_paf("emales-tirs.paf") %>% | ||
filter(seq_id1 == seq_id2 & start1 < start2 & map_length > 99 & de < 0.1) | ||
emale_tirs <- bind_rows( | ||
select(emale_tirs_paf, seq_id=seq_id1, start=start1, end=end1, de), | ||
select(emale_tirs_paf, seq_id=seq_id2, start=start2, end=end2, de)) | ||
|
||
p3 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs) + | ||
geom_seq() + geom_bin_label() + | ||
geom_feature(size=5) + | ||
geom_gene(aes(fill=gc_cont)) + | ||
scale_fill_distiller(palette="Spectral") | ||
p3 | ||
|
||
pp(p3) | ||
#+END_SRC | ||
|
||
[[file:emales-p3.png]] | ||
|
||
*** Compare genome synteny | ||
#+BEGIN_SRC sh | ||
# All-vs-all alignment | https://github.com/lh3/minimap2 | ||
minimap2 -X -N 50 -p 0.1 -c emales.fna emales.fna > emales.paf | ||
#+END_SRC | ||
|
||
#+BEGIN_SRC R | ||
emale_links <- read_paf("emales.paf") | ||
|
||
p4 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs, emale_links) + | ||
geom_seq() + geom_bin_label() + | ||
geom_feature(size=5, data=use_features(features)) + | ||
geom_gene(aes(fill=gc_cont)) + | ||
geom_link() + | ||
scale_fill_distiller(palette="Spectral") | ||
|
||
p4 <- p4 %>% flip_bins(3:5) | ||
p4 | ||
|
||
pp(p4) | ||
#+END_SRC | ||
|
||
[[file:emales-p4.png]] | ||
|
||
*** GC-content | ||
#+BEGIN_SRC sh | ||
# https://github.com/thackl/seq-scripts (bedtools & samtools) | ||
seq-gc -Nbw 50 emales.fna > emales-gc.tsv | ||
#+END_SRC | ||
|
||
#+BEGIN_SRC R | ||
emale_gc <- thacklr::read_bed("emales-gc.tsv") %>% | ||
rename(seq_id=contig_id) | ||
|
||
p5 <- p4 %>% add_features(emale_gc) | ||
p5 <- p5 + geom_ribbon(aes(x=(x+xend)/2, ymax=y+.24, ymin=y+.38-(.4*score), | ||
group=seq_id, linetype="GC-content"), use_features(emale_gc), | ||
fill="blue", alpha=.5) | ||
p5 | ||
|
||
pp(p5) | ||
#+END_SRC | ||
|
||
*** cluster protein sequences into orthogroups | ||
#+BEGIN_SRC sh | ||
gff2cds --aa --type CDS --source Prodigal_v2.6.3 --fna emales.fna emales.gff > emales.faa | ||
mmseqs easy-cluster emales.faa emales-mmseqs /tmp -e 1e-5 -c 0.7; | ||
cluster-ids -t "cog%03d" < emales-mmseqs_cluster.tsv > emales-cogs.tsv | ||
#+END_SRC | ||
|
||
#+BEGIN_SRC R | ||
emale_cogs <- read_tsv("emales-cogs.tsv", col_names = c("feature_id", "cluster_id", "cluster_n")) | ||
emale_cogs %<>% mutate( | ||
cluster_label = paste0(cluster_id, " (", cluster_n, ")"), | ||
cluster_label = fct_lump_min(cluster_label, 5, other_level = "rare"), | ||
cluster_label = fct_lump_min(cluster_label, 15, other_level = "medium"), | ||
cluster_label = fct_relevel(cluster_label, "rare", after=Inf)) | ||
emale_cogs | ||
|
||
|
||
p6 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs, emale_links) %>% | ||
add_features(emale_gc) %>% | ||
add_clusters(genes, emale_cogs) %>% | ||
flip_bins(3:5) + | ||
geom_seq() + geom_bin_label() + | ||
geom_feature(size=5, data=use_features(features)) + | ||
geom_gene(aes(fill=cluster_label)) + | ||
geom_link() + | ||
geom_ribbon(aes(x=(x+xend)/2, ymax=y+.24, ymin=y+.38-(.4*score), | ||
group=seq_id, linetype="GC-content"), use_features(emale_gc), | ||
fill="blue", alpha=.5) + | ||
scale_fill_brewer("Conserved genes", palette="Set3") | ||
|
||
p6 | ||
|
||
pp(p6) | ||
#+END_SRC | ||
|
||
[[file:emales-p6.png]] | ||
|
||
*** Blast hits and integrated transposons | ||
|
||
#+BEGIN_SRC sh | ||
# mavirus.faa - published | ||
blastp -query emales.faa -subject mavirus.faa -outfmt 7 > emales_mavirus-blastp.tsv | ||
perl -ne 'if(/>(\S+) gene=(\S+) product=(.+)/){print join("\t", $1, $2, $3), "\n"}' mavirus.faa > mavirus.tsv | ||
#+END_SRC | ||
|
||
#+BEGIN_SRC R | ||
emale_blast <- read_blast("emales_mavirus-blastp.tsv") | ||
emale_blast %<>% | ||
filter(evalue < 1e-3) %>% | ||
select(feature_id=qaccver, start=qstart, end=qend, saccver) %>% | ||
left_join(read_tsv("mavirus.tsv", col_names = c("saccver", "blast_hit", "blast_desc"))) | ||
|
||
# manual annotations by MFG | ||
emale_transposons <- read_gff("emales-manual.gff", types = c("mobile_element")) | ||
|
||
|
||
p7 <- gggenomes(emale_seqs_6, emale_genes, emale_tirs, emale_links) %>% | ||
add_features(emale_gc) %>% | ||
add_clusters(genes, emale_cogs) %>% | ||
add_features(emale_transposons) %>% | ||
add_subfeatures(genes, emale_blast, transform="aa2nuc") %>% | ||
flip_bins(3:5) + | ||
geom_feature(aes(color="integrated transposon"), | ||
use_features(emale_transposons), size=7) + | ||
geom_seq() + geom_bin_label() + | ||
geom_link(offset = c(0.3, 0.2), color="white", alpha=.3) + | ||
geom_feature(aes(color="terminal inverted repeat"), use_features(features), | ||
size=4) + | ||
geom_gene(aes(fill=cluster_label)) + | ||
geom_feature(aes(color=blast_desc), use_features(emale_blast), size=2, | ||
position="pile") + | ||
geom_ribbon(aes(x=(x+xend)/2, ymax=y+.24, ymin=y+.38-(.4*score), | ||
group=seq_id, linetype="GC-content"), use_features(emale_gc), | ||
fill="blue", alpha=.5) + | ||
scale_fill_brewer("Conserved genes", palette="Set3") + | ||
scale_color_viridis_d("Blast hits & Features", direction = -1) + | ||
scale_linetype("Graphs") + | ||
ggtitle(expression(paste("Endogenous mavirus-like elements of ", | ||
italic("C. burkhardae")))) | ||
|
||
p7 | ||
|
||
pp(p7) | ||
#+END_SRC | ||
|
||
[[file:emales-p7.png]] | ||
|
||
*** save example data | ||
#+BEGIN_SRC R | ||
usethis::use_data(emale_seqs, overwrite=TRUE) | ||
usethis::use_data(emale_genes, overwrite=TRUE) | ||
usethis::use_data(emale_links, overwrite=TRUE) | ||
usethis::use_data(emale_tirs, overwrite=TRUE) | ||
usethis::use_data(emale_transposons, overwrite=TRUE) | ||
usethis::use_data(emale_gc, overwrite=TRUE) | ||
usethis::use_data(emale_blast, overwrite=TRUE) | ||
usethis::use_data(emale_cogs, overwrite=TRUE) | ||
#+END_SRC |
Binary file not shown.
Oops, something went wrong.