diff --git a/analysis/arabidopsis/Snakefile b/analysis/arabidopsis/Snakefile
index 8e78ccb..182a6d7 100644
--- a/analysis/arabidopsis/Snakefile
+++ b/analysis/arabidopsis/Snakefile
@@ -62,8 +62,8 @@ ruleorder: get_kmer_spectrum > get_embeddings
 
 rule all:
     input:
+        expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits),
         #"output/embedding/embeddings/kmers.parquet",
-        #expand("output/merged_dataset/{config[window_size]}/{config[step_size]}/{config[add_rc]}/balanced/data/{split}", split=splits),
         #"output/assemblies.tex",
         #expand("output/whole_genome/modisco/{model}/report", model=models),
         #expand("output/whole_genome/subset/promoter/modisco/{model}/report", model=models),
@@ -71,7 +71,7 @@ rule all:
         #"output/embedding/subset_no_repeats/windows.parquet",
         #expand("output/embedding/subset_no_repeats/umap/{model}.parquet", model=models),
         #expand("output/embedding/subset_no_repeats/leiden_{resolution}_/{model}.parquet", model=models, resolution=[0.3]),
-        expand("output/embedding/umap/{model}.parquet", model=models + ["kmers_5", "kmers_3", "kmers_6", "kmers_4"]),
+        #expand("output/embedding/umap/{model}.parquet", model=models + ["kmers_5", "kmers_3", "kmers_6", "kmers_4"]),
         #expand("output/funtfbs/logits/{model}.parquet", model=models),
         #expand("output/whole_genome/logits/{model}.parquet", model=models),
         #expand("output/simulated_variants/vep/{model}.parquet", model=models),
@@ -105,13 +105,13 @@ rule filter_assemblies:
         """
 
 
-#module make_dataset_from_ncbi:
-#    snakefile:
-#        "https://github.com/songlab-cal/gpn/workflow/make_dataset_from_ncbi/Snakefile"
-#    config: config
-#
-#
-#use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
+module make_dataset_from_ncbi:
+    snakefile:
+        "https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile"
+    config: config
+
+
+use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
 
 
 rule make_assemblies_latex_table:
diff --git a/analysis/arabidopsis/test.sh b/analysis/arabidopsis/test.sh
index 74c19a3..f4b0b52 100644
--- a/analysis/arabidopsis/test.sh
+++ b/analysis/arabidopsis/test.sh
@@ -1,12 +1,10 @@
 WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_test \
     --fp16 --prediction_loss_only True --remove_unused_columns False \
-    --dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
+    --dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
     --soft_masked_loss_weight_test 0.0 \
     --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
     --output_dir /tmp/dir \
     --per_device_eval_batch_size 2048 \
-    --model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart/checkpoint-30000 \
-
-#    --model_name_or_path gonzalobenegas/gpn-brassicales \
+    --model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart/checkpoint-30000 \
 
 # note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
diff --git a/analysis/arabidopsis/train.sh b/analysis/arabidopsis/train.sh
index 52af82c..401117d 100644
--- a/analysis/arabidopsis/train.sh
+++ b/analysis/arabidopsis/train.sh
@@ -1,37 +1,12 @@
 WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \
     --fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
-    --dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
-    --soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \
+    --dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
+    --soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
     --weight_decay 0.01 --optim adamw_torch \
     --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
     --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 120000 --warmup_steps 1000 \
     --learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
-    --run_name ConvNet_batch2048_weight0 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0 --model_type ConvNet \
+    --run_name ConvNet_batch2048_weight0.1 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1 --model_type ConvNet \
     --per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \
-    --resume_from_checkpoint /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-30000 \
-    --ignore_data_skip \
 
 # note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
-
-#    --run_name RoFormer_12_weight0.5_v2 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_7/RoFormer_12_weight0.5_v2 --model_type roformer --config_overrides vocab_size=7 \
-
-#    --torch_compile \ # not working, will wait until stable version
-#    --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 5000 \
-#    --logging_steps 5000 --max_steps 50000 --warmup_steps 500 \
-    #--save_strategy epoch --evaluation_strategy epoch --num_train_epochs 1 --warmup_ratio 0.01 \
-    #--run_name RoFormer_8 --output_dir output/checkpoints/RoFormer_8 --model_type roformer --config_overrides vocab_size=7,num_hidden_layers=8,num_attention_heads=8,hidden_size=512,intermediate_size=2048 \
-
-#    --learning_rate 1e-4 --lr_scheduler_type cosine \
-#    --run_name RoFormer_12 --output_dir output/checkpoints/RoFormer_12 --model_type roformer --config_overrides vocab_size=7 \
-#    --per_device_train_batch_size 128 --per_device_eval_batch_size 128 --gradient_accumulation_steps 2 \
-
-#WANDB_PROJECT=GPN_Arabidopsis_6 python -m gpn.run_mlm --do_train --do_eval \
-#    --fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
-#    --dataset_name output/dataset/mlm/gpn --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
-#    --soft_masked_loss_weight_train 1.0 --soft_masked_loss_weight_evaluation 1.0 \
-#    --weight_decay 0.01 --optim adamw_torch \
-#    --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
-#    --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 400000 --warmup_steps 1000 \
-#    --learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
-#    --run_name ConvNet_25_weight1.0_batch256 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_6/ConvNet_25_weight1.0_batch256 --model_type ConvNet \
-#    --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --gradient_accumulation_steps 1 \
\ No newline at end of file
diff --git a/analysis/arabidopsis/train_secondpart.sh b/analysis/arabidopsis/train_secondpart.sh
index e607d53..36a4f03 100644
--- a/analysis/arabidopsis/train_secondpart.sh
+++ b/analysis/arabidopsis/train_secondpart.sh
@@ -1,11 +1,13 @@
 WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \
     --fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
-    --dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
-    --soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \
+    --dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
+    --soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
     --weight_decay 0.01 --optim adamw_torch \
     --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
     --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 30000 --warmup_steps 0 \
     --learning_rate 1e-3 --lr_scheduler_type cosine \
-    --run_name ConvNet_batch2048_weight0_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart \
+    --run_name ConvNet_batch2048_weight0.1_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart \
     --per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \
-    --model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-120000 \
\ No newline at end of file
+    --model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1/checkpoint-120000 \
+
+# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
\ No newline at end of file
diff --git a/workflow/make_dataset_from_ncbi/README.md b/workflow/make_dataset_from_ncbi/README.md
new file mode 100644
index 0000000..bef0296
--- /dev/null
+++ b/workflow/make_dataset_from_ncbi/README.md
@@ -0,0 +1,18 @@
+# Workflow to create a training dataset for any set of taxa
+For example usage, check out `analysis/arabidopsis/Snakefile` and `analysis/arabidopsis/config.yaml`.
+
+As a quick preview, here's how you could integrate this as a Snakemake sub-workflow:
+```python
+rule all:
+    input:
+        expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits),
+
+
+module make_dataset_from_ncbi:
+    snakefile:
+        "https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile"
+    config: config
+
+
+use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
+```
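+
+The sub-workflow is driven by a few entries in the Snakemake `config`. The sketch below is illustrative only: the key names are taken from the snippet above and from the workflow Snakefile, the `assemblies_path` value is a made-up placeholder, and the numbers mirror the Arabidopsis example; see `analysis/arabidopsis/config.yaml` for the real settings.
+```yaml
+# Illustrative values; check analysis/arabidopsis/config.yaml for the actual configuration.
+assemblies_path: "assemblies.tsv"  # placeholder path to the NCBI assembly metadata table (TSV)
+window_size: 512   # matches the 512 in output/merged_dataset/512/256/True/... above
+step_size: 256     # matches the 256 above; presumably the stride between windows
+add_rc: True       # matches the True above; presumably whether reverse-complement windows are added
+```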
diff --git a/workflow/make_dataset_from_ncbi/Snakefile b/workflow/make_dataset_from_ncbi/Snakefile
index 5a4ed15..3ad1a3e 100644
--- a/workflow/make_dataset_from_ncbi/Snakefile
+++ b/workflow/make_dataset_from_ncbi/Snakefile
@@ -9,6 +9,12 @@ from tqdm import tqdm
 tqdm.pandas()
 
 
+# Assembly metadata downloaded from NCBI Genome
+# (https://www.ncbi.nlm.nih.gov/data-hub/genome).
+# You can choose a set of taxa and apply filters such as annotation level
+# and assembly level.
+# Check out the script gpn/filter_assemblies.py for more details, such as
+# how to subsample or how to keep only one assembly per genus.
 assemblies = pd.read_csv(config["assemblies_path"], sep="\t", index_col=0)
 assemblies["Assembly Name"] = assemblies["Assembly Name"].str.replace(" ", "_")
 assemblies["genome_path"] = (