Polish ncbi workflow
gonzalobenegas committed Apr 12, 2023
1 parent b167669 commit 2cb20e9
Showing 6 changed files with 44 additions and 45 deletions.
18 changes: 9 additions & 9 deletions analysis/arabidopsis/Snakefile
@@ -62,16 +62,16 @@ ruleorder: get_kmer_spectrum > get_embeddings

rule all:
    input:
        expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits),
        #"output/embedding/embeddings/kmers.parquet",
        #expand("output/merged_dataset/{config[window_size]}/{config[step_size]}/{config[add_rc]}/balanced/data/{split}", split=splits),
        #"output/assemblies.tex",
        #expand("output/whole_genome/modisco/{model}/report", model=models),
        #expand("output/whole_genome/subset/promoter/modisco/{model}/report", model=models),
        #expand("output/whole_genome/bed_probs/{model}/{nuc}.bw", model=models, nuc=NUCLEOTIDES),
        #"output/embedding/subset_no_repeats/windows.parquet",
        #expand("output/embedding/subset_no_repeats/umap/{model}.parquet", model=models),
        #expand("output/embedding/subset_no_repeats/leiden_{resolution}_/{model}.parquet", model=models, resolution=[0.3]),
        expand("output/embedding/umap/{model}.parquet", model=models + ["kmers_5", "kmers_3", "kmers_6", "kmers_4"]),
        #expand("output/embedding/umap/{model}.parquet", model=models + ["kmers_5", "kmers_3", "kmers_6", "kmers_4"]),
        #expand("output/funtfbs/logits/{model}.parquet", model=models),
        #expand("output/whole_genome/logits/{model}.parquet", model=models),
        #expand("output/simulated_variants/vep/{model}.parquet", model=models),
@@ -105,13 +105,13 @@ rule filter_assemblies:
"""


#module make_dataset_from_ncbi:
# snakefile:
# "https://github.com/songlab-cal/gpn/workflow/make_dataset_from_ncbi/Snakefile"
# config: config
#
#
#use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
module make_dataset_from_ncbi:
    snakefile:
        "https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile"
    config: config


use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*


rule make_assemblies_latex_table:
6 changes: 2 additions & 4 deletions analysis/arabidopsis/test.sh
@@ -1,12 +1,10 @@
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_test \
--fp16 --prediction_loss_only True --remove_unused_columns False \
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--dataset_name output/merged_dataset/balanced/512/256/True --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_test 0.0 \
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
--output_dir /tmp/dir \
--per_device_eval_batch_size 2048 \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart/checkpoint-30000 \

# --model_name_or_path gonzalobenegas/gpn-brassicales \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart/checkpoint-30000 \

# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
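
As an aside, the commented-out `gonzalobenegas/gpn-brassicales` line above refers to the publicly released checkpoint; below is a minimal sketch of loading it for masked-LM inference outside this test script, under the assumption that importing `gpn.model` registers the architecture with the `transformers` Auto classes.

```python
# Hedged sketch, not part of this commit: load the released checkpoint referenced in the
# commented-out --model_name_or_path line above.
import gpn.model  # assumed to register the GPN ConvNet classes with the Auto* factories
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gonzalobenegas/tokenizer-dna-mlm")
model = AutoModelForMaskedLM.from_pretrained("gonzalobenegas/gpn-brassicales")
model.eval()
```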
31 changes: 3 additions & 28 deletions analysis/arabidopsis/train.sh
@@ -1,37 +1,12 @@
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
--weight_decay 0.01 --optim adamw_torch \
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
--save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 120000 --warmup_steps 1000 \
--learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
--run_name ConvNet_batch2048_weight0 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0 --model_type ConvNet \
--run_name ConvNet_batch2048_weight0.1 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1 --model_type ConvNet \
--per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \
--resume_from_checkpoint /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-30000 \
--ignore_data_skip \

# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size

# --run_name RoFormer_12_weight0.5_v2 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_7/RoFormer_12_weight0.5_v2 --model_type roformer --config_overrides vocab_size=7 \

# --torch_compile \ # not working, will wait until stable version
# --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 5000 \
# --logging_steps 5000 --max_steps 50000 --warmup_steps 500 \
#--save_strategy epoch --evaluation_strategy epoch --num_train_epochs 1 --warmup_ratio 0.01 \
#--run_name RoFormer_8 --output_dir output/checkpoints/RoFormer_8 --model_type roformer --config_overrides vocab_size=7,num_hidden_layers=8,num_attention_heads=8,hidden_size=512,intermediate_size=2048 \

# --learning_rate 1e-4 --lr_scheduler_type cosine \
# --run_name RoFormer_12 --output_dir output/checkpoints/RoFormer_12 --model_type roformer --config_overrides vocab_size=7 \
# --per_device_train_batch_size 128 --per_device_eval_batch_size 128 --gradient_accumulation_steps 2 \

#WANDB_PROJECT=GPN_Arabidopsis_6 python -m gpn.run_mlm --do_train --do_eval \
# --fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
# --dataset_name output/dataset/mlm/gpn --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
# --soft_masked_loss_weight_train 1.0 --soft_masked_loss_weight_evaluation 1.0 \
# --weight_decay 0.01 --optim adamw_torch \
# --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
# --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 400000 --warmup_steps 1000 \
# --learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
# --run_name ConvNet_25_weight1.0_batch256 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_6/ConvNet_25_weight1.0_batch256 --model_type ConvNet \
# --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --gradient_accumulation_steps 1 \
10 changes: 6 additions & 4 deletions analysis/arabidopsis/train_secondpart.sh
@@ -1,11 +1,13 @@
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
--weight_decay 0.01 --optim adamw_torch \
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
--save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 30000 --warmup_steps 0 \
--learning_rate 1e-3 --lr_scheduler_type cosine \
--run_name ConvNet_batch2048_weight0_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart \
--run_name ConvNet_batch2048_weight0.1_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart \
--per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-120000 \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1/checkpoint-120000 \

# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
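
The recurring note about the Trainer and iterable datasets can be made concrete with a little arithmetic; a hedged illustration follows (the GPU count is made up, the other values come from the scripts above).

```python
# Hedged illustration of the note above, not part of this commit.
per_device = 2048  # --per_device_train_batch_size used in these scripts
n_gpus = 4         # illustrative value
grad_accum = 1     # --gradient_accumulation_steps used in these scripts
print("map-style dataset, expected global batch size:", per_device * n_gpus * grad_accum)
print("iterable dataset, actual global batch size (per the note):", per_device)
```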
18 changes: 18 additions & 0 deletions workflow/make_dataset_from_ncbi/README.md
@@ -0,0 +1,18 @@
# Workflow to create a training dataset for any set of taxa
For example usage, check out `analysis/arabidopsis/Snakefile` and `analysis/arabidopsis/config.yaml`.

As a quick preview, here's how you could integrate this as a Snakemake sub-workflow:
```python
rule all:
    input:
        expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits),


module make_dataset_from_ncbi:
    snakefile:
        "https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile"
    config: config


use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
```
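
For orientation, here is a minimal sketch of the config values and `splits` the snippet above expects; the key names come from the `expand` pattern and the sub-workflow's Snakefile, while the concrete values, the input path, and the split names are assumptions (the real ones live in `analysis/arabidopsis/config.yaml`).

```python
# Hedged sketch; values and split names are assumptions, see analysis/arabidopsis/config.yaml.
config = {
    "assemblies_path": "input/assemblies.tsv",  # hypothetical path to the NCBI Genome metadata table
    "window_size": 512,  # window length (matches the dataset paths in the training scripts)
    "step_size": 256,    # stride between consecutive windows
    "add_rc": True,      # presumably whether to also emit the reverse complement of each window
}
splits = ["train", "validation", "test"]  # assumed split names
```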
6 changes: 6 additions & 0 deletions workflow/make_dataset_from_ncbi/Snakefile
@@ -9,6 +9,12 @@ from tqdm import tqdm
tqdm.pandas()


# Assembly metadata downloaded from NCBI Genome
# (https://www.ncbi.nlm.nih.gov/data-hub/genome).
# You can choose a set of taxa and apply filters such as annotation level
# or assembly level.
# Check out the script gpn/filter_assemblies.py for more details, such as
# how to subsample or how to keep only one assembly per genus.
assemblies = pd.read_csv(config["assemblies_path"], sep="\t", index_col=0)
assemblies["Assembly Name"] = assemblies["Assembly Name"].str.replace(" ", "_")
assemblies["genome_path"] = (
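
The new comment block points to `gpn/filter_assemblies.py` for subsampling and keeping one assembly per genus; below is a minimal hedged sketch of that kind of filtering (the column name and the genus heuristic are assumptions, not the script's actual logic).

```python
import pandas as pd

# Hedged sketch, not the actual gpn/filter_assemblies.py: keep one assembly per genus
# and subsample the result, as described in the comment block above.
assemblies = pd.read_csv("assemblies.tsv", sep="\t", index_col=0)  # hypothetical input path
assemblies["genus"] = assemblies["Organism Name"].str.split().str[0]  # assumed column name
one_per_genus = assemblies.drop_duplicates(subset="genus", keep="first")
subsampled = one_per_genus.sample(n=min(100, len(one_per_genus)), random_state=42)
subsampled.to_csv("filtered_assemblies.tsv", sep="\t")
```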
