forked from songlab-cal/gpn
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b167669
commit 2cb20e9
Showing
6 changed files
with
44 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,10 @@ | ||
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_test \ | ||
--fp16 --prediction_loss_only True --remove_unused_columns False \ | ||
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
--soft_masked_loss_weight_test 0.0 \ | ||
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \ | ||
--output_dir /tmp/dir \ | ||
--per_device_eval_batch_size 2048 \ | ||
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart/checkpoint-30000 \ | ||
|
||
# --model_name_or_path gonzalobenegas/gpn-brassicales \ | ||
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart/checkpoint-30000 \ | ||
|
||
# note: there's a bug in the Hugging Face Trainer with iterable datasets: per_device_*_batch_size is interpreted as the total batch size |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,12 @@ | ||
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \ | ||
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \ | ||
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
--soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \ | ||
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \ | ||
--weight_decay 0.01 --optim adamw_torch \ | ||
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \ | ||
--save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 120000 --warmup_steps 1000 \ | ||
--learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \ | ||
--run_name ConvNet_batch2048_weight0 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0 --model_type ConvNet \ | ||
--run_name ConvNet_batch2048_weight0.1 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1 --model_type ConvNet \ | ||
--per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \ | ||
--resume_from_checkpoint /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-30000 \ | ||
--ignore_data_skip \ | ||
|
||
# note: there's a bug in the Hugging Face Trainer with iterable datasets: per_device_*_batch_size is interpreted as the total batch size | ||
|
||
# --run_name RoFormer_12_weight0.5_v2 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_7/RoFormer_12_weight0.5_v2 --model_type roformer --config_overrides vocab_size=7 \ | ||
|
||
# --torch_compile \ # not working, will wait until stable version | ||
# --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 5000 \ | ||
# --logging_steps 5000 --max_steps 50000 --warmup_steps 500 \ | ||
#--save_strategy epoch --evaluation_strategy epoch --num_train_epochs 1 --warmup_ratio 0.01 \ | ||
#--run_name RoFormer_8 --output_dir output/checkpoints/RoFormer_8 --model_type roformer --config_overrides vocab_size=7,num_hidden_layers=8,num_attention_heads=8,hidden_size=512,intermediate_size=2048 \ | ||
|
||
# --learning_rate 1e-4 --lr_scheduler_type cosine \ | ||
# --run_name RoFormer_12 --output_dir output/checkpoints/RoFormer_12 --model_type roformer --config_overrides vocab_size=7 \ | ||
# --per_device_train_batch_size 128 --per_device_eval_batch_size 128 --gradient_accumulation_steps 2 \ | ||
|
||
#WANDB_PROJECT=GPN_Arabidopsis_6 python -m gpn.run_mlm --do_train --do_eval \ | ||
# --fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \ | ||
# --dataset_name output/dataset/mlm/gpn --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
# --soft_masked_loss_weight_train 1.0 --soft_masked_loss_weight_evaluation 1.0 \ | ||
# --weight_decay 0.01 --optim adamw_torch \ | ||
# --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \ | ||
# --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 400000 --warmup_steps 1000 \ | ||
# --learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \ | ||
# --run_name ConvNet_25_weight1.0_batch256 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_6/ConvNet_25_weight1.0_batch256 --model_type ConvNet \ | ||
# --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --gradient_accumulation_steps 1 \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,13 @@ | ||
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \ | ||
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \ | ||
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
--soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \ | ||
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \ | ||
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \ | ||
--weight_decay 0.01 --optim adamw_torch \ | ||
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \ | ||
--save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 30000 --warmup_steps 0 \ | ||
--learning_rate 1e-3 --lr_scheduler_type cosine \ | ||
--run_name ConvNet_batch2048_weight0_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart \ | ||
--run_name ConvNet_batch2048_weight0.1_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart \ | ||
--per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \ | ||
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-120000 \ | ||
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1/checkpoint-120000 \ | ||
|
||
# note: there's a bug in the Hugging Face Trainer with iterable datasets: per_device_*_batch_size is interpreted as the total batch size |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Workflow to create a training dataset for any set of taxa | ||
For example usage, check out `analysis/arabidopsis/Snakefile` and `analysis/arabidopsis/config.yaml`. | ||
|
||
As a quick preview, here's how you could integrate this as a Snakemake sub-workflow: | ||
```python | ||
rule all: | ||
input: | ||
expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits), | ||
|
||
|
||
module make_dataset_from_ncbi: | ||
snakefile: | ||
"https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile" | ||
config: config | ||
|
||
|
||
use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_* | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters