Skip to content

Commit

Permalink
Merge pull request #244 from rhysnewell/add-taxvamb2
Browse files Browse the repository at this point in the history
Add TaxVAMB
  • Loading branch information
AroneyS authored Jan 17, 2025
2 parents a7cf255 + e80b764 commit d824335
Show file tree
Hide file tree
Showing 15 changed files with 464 additions and 9 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test-aviary.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ jobs:
python test/test_assemble.py
python test/test_recover.py
python test/test_run_checkm.py -b
python test/test_convert_metabuli.py
24 changes: 19 additions & 5 deletions aviary/aviary.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,11 +278,11 @@ def main():

base_group.add_argument(
'--download', '--download',
help='Downloads the requested GTDB, EggNOG, SingleM, & CheckM2 databases',
help='Downloads the requested GTDB, EggNOG, SingleM, CheckM2, & Metabuli databases',
dest='download',
default=[],
nargs="*",
choices=["gtdb", "eggnog", "singlem", "checkm2"]
choices=["gtdb", "eggnog", "singlem", "checkm2", "metabuli"]
)

base_group.add_argument(
Expand Down Expand Up @@ -590,12 +590,13 @@ def main():
binning_group.add_argument(
'--extra-binners', '--extra_binners', '--extra-binner', '--extra_binner',
help='Optional list of extra binning algorithms to run. Can be any combination of: \n'
'maxbin, maxbin2, concoct, comebin \n'
'maxbin, maxbin2, concoct, comebin, taxvamb \n'
'These binners are skipped by default as they can have long runtimes \n'
'N.B. specifying "maxbin" and "maxbin2" are equivalent \n',
'N.B. specifying "maxbin" and "maxbin2" are equivalent \n'
'N.B. specifying "taxvamb" will also run metabuli for contig taxonomic assignment \n',
dest='extra_binners',
nargs='*',
choices=["maxbin", "maxbin2", "concoct", "comebin"]
choices=["maxbin", "maxbin2", "concoct", "comebin", "taxvamb"]
)

binning_group.add_argument(
Expand Down Expand Up @@ -1187,6 +1188,13 @@ def main():
required=False,
)

configure_options.add_argument(
'--metabuli-db-path', '--metabuli_db_path',
help='Path to the local metabuli database',
dest='metabuli_db_path',
required=False,
)

add_workflow_arg(configure_options, ['download_databases'], help=argparse.SUPPRESS)

###########################################################################
Expand Down Expand Up @@ -1234,13 +1242,17 @@ def main():
if args.singlem_metapackage_path is not None:
Config.set_db_path(args.singlem_metapackage_path, db_name='SINGLEM_METAPACKAGE_PATH')

if args.metabuli_db_path is not None:
Config.set_db_path(args.metabuli_db_path, db_name='METABULI_DB_PATH')

logging.info("The current aviary environment variables are:")
logging.info(f"CONDA_ENV_PATH: {Config.get_software_db_path('CONDA_ENV_PATH', '--conda-prefix')}")
logging.info(f"TMPDIR: {Config.get_software_db_path('TMPDIR', '--tmpdir')}")
logging.info(f"GTDBTK_DATA_PATH: {Config.get_software_db_path('GTDBTK_DATA_PATH', '--gtdb-path')}")
logging.info(f"EGGNOG_DATA_DIR: {Config.get_software_db_path('EGGNOG_DATA_DIR', '--eggnog-db-path')}")
logging.info(f"CHECKM2DB: {Config.get_software_db_path('CHECKM2DB', '--checkm2-db-path')}")
logging.info(f"SINGLEM_METAPACKAGE_PATH: {Config.get_software_db_path('SINGLEM_METAPACKAGE_PATH', '--singlem-metapackage-path')}")
logging.info(f"METABULI_DB_PATH: {Config.get_software_db_path('METABULI_DB_PATH', '--metabuli-db-path')}")
if not args.download:
logging.info("All paths set. Exiting without downloading databases. If you wish to download databases use --download")
sys.exit(0)
Expand Down Expand Up @@ -1295,6 +1307,8 @@ def manage_env_vars(args):
args.checkm2_db_path = Config.get_software_db_path('CHECKM2DB', '--checkm2-db-path')
if args.singlem_metapackage_path is None:
args.singlem_db_path = Config.get_software_db_path('SINGLEM_METAPACKAGE_PATH', '--singlem-metapackage-path')
if args.metabuli_db_path is None:
args.metabuli_db_path = Config.get_software_db_path('METABULI_DB_PATH', '--metabuli-db-path')
except AttributeError:
pass

Expand Down
1 change: 1 addition & 0 deletions aviary/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def get_software_db_path(db_name='CONDA_ENV_PATH', software_flag='--conda-prefix
print(f'Please set this variable to your default server/home directory containing {db_name}.'.center(100))
print(f'Alternatively, use {software_flag} flag.'.center(100))
print(f'Note: This variable must point to the DIRECTORY containing the files, not the files themselves'.center(100))
print('Note: This can be set to an arbitrary string if you do not need this database'.center(100))
print('=' * 100)
signal.alarm(120)
os.environ[db_name] = input(f'Input path to directory for {db_name} now:').strip()
Expand Down
6 changes: 6 additions & 0 deletions aviary/envs/metabuli.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- metabuli=1.0.*
12 changes: 12 additions & 0 deletions aviary/modules/annotation/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ rule download_databases:
'logs/download_eggnog.log' if "eggnog" in config["download"] else [],
'logs/download_singlem.log' if "singlem" in config["download"] else [],
'logs/download_checkm2.log' if "checkm2" in config["download"] else [],
'logs/download_metabuli.log' if "metabuli" in config["download"] else [],
threads: 1
log:
temp("logs/download.log")
Expand Down Expand Up @@ -136,6 +137,17 @@ rule download_checkm2:
'checkm2 database --download --path {params.checkm2_folder} 2> {log}; '
'mv {params.checkm2_folder}/CheckM2_database/*.dmnd {params.checkm2_folder}/; '

rule download_metabuli:
params:
metabuli_folder = os.path.expanduser(config['metabuli_folder'])
conda:
'../../envs/metabuli.yaml'
threads: 1
log:
'logs/download_metabuli.log'
shell:
'metabuli databases GTDB {params.metabuli_folder} tmp 2> {log} 2&>1 '

rule checkm2:
input:
mag_folder = config['mag_directory'],
Expand Down
103 changes: 102 additions & 1 deletion aviary/modules/binning/binning.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
localrules: get_bam_indices, vamb_jgi_filter, vamb_skip, amber_checkm_output, finalise_stats, recover_mags, recover_mags_no_singlem
localrules: get_bam_indices, vamb_jgi_filter, vamb_skip, convert_metabuli, amber_checkm_output, finalise_stats, recover_mags, recover_mags_no_singlem

ruleorder: dereplicate_and_get_abundances_paired > dereplicate_and_get_abundances_interleaved
ruleorder: checkm_rosella > amber_checkm_output
Expand Down Expand Up @@ -203,6 +203,106 @@ rule vamb_skip:
"touch data/vamb_bins/skipped"


rule taxvamb_abundance_tsv:
"""
VAMB post v4 has to have a coverage file with the following format:
A TSV file with the header being "contigname" followed by one samplename per sample,
and the values in the TSV file being precomputed abundances.
"""
input:
fasta = ancient(config["fasta"]),
done = ancient("data/coverm.filt.cov")
output:
vamb_bams_done = "data/coverm.vamb.cov"
threads:
config["max_threads"]
run:
# Short-only
# contigName contigLen totalAvgDepth assembly.fasta/wgsim.1.fq.gz.bam assembly.fasta/wgsim.1.fq.gz.bam-var
# Short + long
# contigName contigLen totalAvgDepth assembly.fasta/wgsim.1.fq.gz.bam assembly.fasta/wgsim.1.fq.gz.bam-var assembly.fasta/wgsim.css.fastq.gz Trimmed Mean assembly.fasta/wgsim.css.fastq.gz Variance
import pandas as pd
coverm_out = pd.read_csv("data/coverm.filt.cov", sep='\t')
coverm_out.drop(columns=["contigLen", "totalAvgDepth"], inplace=True)
coverm_out.rename(columns={"contigName": "contigname"}, inplace=True)
columns_to_drop = [col for col in coverm_out.columns if col.endswith('-var') or col.endswith(' Variance')]
coverm_out.drop(columns=columns_to_drop, inplace=True)
coverm_out.to_csv("data/coverm.vamb.cov", sep='\t', index=False)

rule metabuli_taxonomy:
input:
fasta = ancient(config["fasta"]),
output:
"data/metabuli_taxonomy/done"
threads:
config["max_threads"]
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 128*1024*attempt),
mem_gb = lambda wildcards, attempt: min(int(config["max_memory"]), 128*attempt),
runtime = lambda wildcards, attempt: 48*60*attempt,
params:
metabuli_db = config['metabuli_folder'],
conda:
"../../envs/metabuli.yaml"
log:
"logs/metabuli.log"
benchmark:
"benchmarks/metabuli.benchmark.txt"
shell:
"rm -rf data/metabuli_taxonomy/; "
"mkdir -p data/metabuli_taxonomy && "
"metabuli classify "
"{input.fasta} {params.metabuli_db}/gtdb data/metabuli_taxonomy tax > {log} 2>&1 "
"--seq-mode 1 --threads {threads} --max-ram {resources.mem_gb} "
"&& touch {output[0]}"

rule convert_metabuli:
input:
"data/metabuli_taxonomy/done",
filt_cov = ancient("data/coverm.filt.cov")
output:
"data/metabuli_taxonomy/taxonomy.tsv"
threads:
config["max_threads"]
params:
report = "data/metabuli_taxonomy/tax_report.tsv",
classifications = "data/metabuli_taxonomy/tax_classifications.tsv",
log:
"logs/metabuli_convert.log"
script:
"scripts/convert_metabuli.py"

rule taxvamb:
input:
coverage = ancient("data/coverm.vamb.cov"),
fasta = ancient(config["fasta"]),
taxonomy = "data/metabuli_taxonomy/taxonomy.tsv"
params:
min_bin_size = config["min_bin_size"],
min_contig_size = config["min_contig_size"],
vamb_threads = min(int(config["max_threads"]), 16) // 2 # vamb use double the threads you give it
threads:
config["max_threads"]
resources:
mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 128*1024*attempt),
runtime = lambda wildcards, attempt: 48*60*attempt,
gpus = 1 if config["request_gpu"] else 0
output:
"data/taxvamb_bins/done"
conda:
"envs/taxvamb.yaml"
log:
"logs/taxvamb.log"
benchmark:
"benchmarks/taxvamb.benchmark.txt"
shell:
"rm -rf data/taxvamb_bins/; "
"bash -c 'vamb bin taxvamb --outdir data/taxvamb_bins/ -p {params.vamb_threads} --fasta {input.fasta} "
"--abundance_tsv {input.coverage} --taxonomy {input.taxonomy} "
"--minfasta {params.min_bin_size} -m {params.min_contig_size} > {log} 2>&1 && touch {output[0]}' || "
"touch {output[0]} && mkdir -p data/taxvamb_bins/bins"


rule metabat2:
input:
coverage = ancient("data/coverm.cov"),
Expand Down Expand Up @@ -649,6 +749,7 @@ rule das_tool:
semibin_done = [] if "semibin" in config["skip_binners"] else "data/semibin_refined/done",
comebin_done = [] if "comebin" in config["skip_binners"] else "data/comebin_bins/done",
vamb_done = [] if "vamb" in config["skip_binners"] else "data/vamb_bins/done",
taxvamb_done = [] if "taxvamb" in config["skip_binners"] else "data/taxvamb_bins/done",
threads:
config["max_threads"]
resources:
Expand Down
14 changes: 14 additions & 0 deletions aviary/modules/binning/envs/taxvamb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- samtools
- python>=3.9.0,<3.12
- numpy==1.24.2
- pytorch==1.13.1
- pycoverm==0.6.2
- pip
- git
- pip:
- git+https://github.com/RasmussenLab/vamb.git@8fa3280
2 changes: 1 addition & 1 deletion aviary/modules/binning/envs/vamb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ channels:
- bioconda
- defaults
dependencies:
- vamb
- vamb==3.0.2
- samtools
70 changes: 70 additions & 0 deletions aviary/modules/binning/scripts/convert_metabuli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

import pandas as pd
import re

def create_conversion_dict(ids, labels):
map_ids_metabuli = {}
taxstring = []
tax_regex = re.compile("^[pcofgs]__")
for k, v in zip(ids, labels):
tax = v.strip()
if tax in ["unclassified", "root"]:
continue
if tax.startswith("d__"):
taxstring = [tax]
else:
tax_tag = re.match(tax_regex, tax)
if tax_tag:
replace_pos = [t.startswith(tax_tag[0]) for t in taxstring]
if sum(replace_pos) > 0:
taxstring = taxstring[:replace_pos.index(True)]
taxstring.append(tax)
else:
# For assignments to specific genome ids
pass

map_ids_metabuli[k] = ",".join(taxstring)

return map_ids_metabuli

def process_classifications(df_clas, map_ids_metabuli, coverm_out):
df_clas['contigs'] = df_clas[1]
df_clas['predictions'] = df_clas[2].map(map_ids_metabuli)

out = pd.merge(df_clas, coverm_out, on="contigs", how="inner")

return out[['contigs', 'predictions']]


if __name__ == '__main__':
report = snakemake.params.report
classifications = snakemake.params.classifications
filt_cov = snakemake.input.filt_cov
output_file = snakemake.output[0]
threads = snakemake.threads
log = snakemake.log[0]

with open(log, "w") as logf:
logf.write("Starting conversion of metabuli predictions\n")

df_report = pd.read_csv(report, delimiter='\t', header=None)

map_ids_metabuli = create_conversion_dict(df_report[4], df_report[5])
with open(log, 'a') as logf:
logf.write(f"Conversion dictionary created\n")

# df_clas: classified_flag, read_id, taxonomy_identifier, ...
df_clas = pd.read_csv(classifications, delimiter='\t', header=None)
with open(log, 'a') as logf:
logf.write(f"{len(df_clas)} classifications read\n")

coverm_out = pd.read_csv(filt_cov, sep='\t')[['contigName']]
coverm_out.rename(columns={"contigName": "contigs"}, inplace=True)

df_clas = process_classifications(df_clas, map_ids_metabuli, coverm_out)
with open(log, 'a') as logf:
logf.write(f"Conversion complete\n")
logf.write(f"{len(df_clas)} contigs passed filtering\n")

df_clas.to_csv(output_file, sep='\t', index=False)
3 changes: 2 additions & 1 deletion aviary/modules/binning/scripts/das_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
('maxbin2', 'fasta'),
('vamb', 'fna'),
('comebin', 'fa'),
('taxvamb', 'fna'),
]
refined_binners_to_use = [
('rosella', 'fna'),
Expand All @@ -23,7 +24,7 @@
for (binner, extension) in unrefined_binners_to_use:
if binner not in snakemake.config['skip_binners']:
extra = ''
if binner == 'vamb':
if binner == 'vamb' or binner == 'taxvamb':
extra = 'bins/'
elif binner == 'comebin':
extra = 'comebin_res/comebin_res_bins/'
Expand Down
10 changes: 9 additions & 1 deletion aviary/modules/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def __init__(self,
self.skip_singlem = True
self.binning_only = args.binning_only

self.skip_binners = ["maxbin2", "concoct", "comebin"]
self.skip_binners = ["maxbin2", "concoct", "comebin", "taxvamb"]
if args.extra_binners:
for binner in args.extra_binners:
binner = binner.lower()
Expand All @@ -138,6 +138,8 @@ def __init__(self,
self.skip_binners.remove("concoct")
elif binner == "comebin":
self.skip_binners.remove("comebin")
elif binner == "taxvamb":
self.skip_binners.remove("taxvamb")
else:
logging.warning(f"Unknown extra binner {binner} specified. Skipping...")

Expand Down Expand Up @@ -278,10 +280,15 @@ def __init__(self,
self.singlem = args.singlem_metapackage_path
else:
self.singlem = Config.get_software_db_path('SINGLEM_METAPACKAGE_PATH', '--singlem-metapackage-path')
if args.metabuli_db_path is not None:
self.metabuli = args.metabuli_db_path
else:
self.metabuli = Config.get_software_db_path('METABULI_DB_PATH', '--metabuli-db-path')
except AttributeError:
self.gtdbtk = Config.get_software_db_path('GTDBTK_DATA_PATH', '--gtdb-path')
self.eggnog = Config.get_software_db_path('EGGNOG_DATA_DIR', '--eggnog-db-path')
self.singlem = Config.get_software_db_path('SINGLEM_METAPACKAGE_PATH', '--singlem-metapackage-path')
self.metabuli = Config.get_software_db_path('METABULI_DB_PATH', '--metabuli-db-path')
# self.enrichm = Config.get_software_db_path('ENRICHM_DB', '--enrichm-db-path')

try:
Expand Down Expand Up @@ -421,6 +428,7 @@ def make_config(self):
conf["gtdbtk_folder"] = self.gtdbtk
conf["eggnog_folder"] = self.eggnog
conf["singlem_metapackage"] = self.singlem
conf["metabuli_folder"] = self.metabuli
conf["strain_analysis"] = self.strain_analysis
conf["checkm2_db_folder"] = self.checkm2_db
conf["use_checkm2_scores"] = self.use_checkm2_scores
Expand Down
1 change: 1 addition & 0 deletions test/test_assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def test_assemble_simple_inputs(self):
f"GTDBTK_DATA_PATH=. "
f"CHECKM2DB=. "
f"EGGNOG_DATA_DIR=. "
f"METABULI_DB_PATH=. "
f"SINGLEM_METAPACKAGE_PATH=. "
f"aviary assemble "
f"-1 {FORWARD_READS} "
Expand Down
Loading

0 comments on commit d824335

Please sign in to comment.