Polish ncbi workflow
gonzalobenegas committed Apr 12, 2023
1 parent b167669 commit 2cb20e9
Showing 6 changed files with 44 additions and 45 deletions.
18 changes: 9 additions & 9 deletions analysis/arabidopsis/Snakefile
@@ -62,16 +62,16 @@ ruleorder: get_kmer_spectrum > get_embeddings

rule all:
    input:
        expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits),
        #"output/embedding/embeddings/kmers.parquet",
        #expand("output/merged_dataset/{config[window_size]}/{config[step_size]}/{config[add_rc]}/balanced/data/{split}", split=splits),
        #"output/assemblies.tex",
        #expand("output/whole_genome/modisco/{model}/report", model=models),
        #expand("output/whole_genome/subset/promoter/modisco/{model}/report", model=models),
        #expand("output/whole_genome/bed_probs/{model}/{nuc}.bw", model=models, nuc=NUCLEOTIDES),
        #"output/embedding/subset_no_repeats/windows.parquet",
        #expand("output/embedding/subset_no_repeats/umap/{model}.parquet", model=models),
        #expand("output/embedding/subset_no_repeats/leiden_{resolution}_/{model}.parquet", model=models, resolution=[0.3]),
        expand("output/embedding/umap/{model}.parquet", model=models + ["kmers_5", "kmers_3", "kmers_6", "kmers_4"]),
        #expand("output/embedding/umap/{model}.parquet", model=models + ["kmers_5", "kmers_3", "kmers_6", "kmers_4"]),
        #expand("output/funtfbs/logits/{model}.parquet", model=models),
        #expand("output/whole_genome/logits/{model}.parquet", model=models),
        #expand("output/simulated_variants/vep/{model}.parquet", model=models),
@@ -105,13 +105,13 @@ rule filter_assemblies:
"""


#module make_dataset_from_ncbi:
# snakefile:
# "https://github.com/songlab-cal/gpn/workflow/make_dataset_from_ncbi/Snakefile"
# config: config
#
#
#use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
module make_dataset_from_ncbi:
    snakefile:
        "https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile"
    config: config


use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*


rule make_assemblies_latex_table:
6 changes: 2 additions & 4 deletions analysis/arabidopsis/test.sh
@@ -1,12 +1,10 @@
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_test \
--fp16 --prediction_loss_only True --remove_unused_columns False \
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--dataset_name output/merged_dataset/balanced/512/256/True --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_test 0.0 \
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
--output_dir /tmp/dir \
--per_device_eval_batch_size 2048 \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart/checkpoint-30000 \

# --model_name_or_path gonzalobenegas/gpn-brassicales \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart/checkpoint-30000 \

# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
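
As an aside, the commented-out `gonzalobenegas/gpn-brassicales` line above refers to the publicly released checkpoint; below is a minimal sketch of loading it for masked-LM inference outside this test script, under the assumption that importing `gpn.model` registers the architecture with the `transformers` Auto classes.

```python
# Hedged sketch, not part of this commit: load the released checkpoint referenced in the
# commented-out --model_name_or_path line above.
import gpn.model  # assumed to register the GPN ConvNet classes with the Auto* factories
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gonzalobenegas/tokenizer-dna-mlm")
model = AutoModelForMaskedLM.from_pretrained("gonzalobenegas/gpn-brassicales")
model.eval()
```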
31 changes: 3 additions & 28 deletions analysis/arabidopsis/train.sh
@@ -1,37 +1,12 @@
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
--weight_decay 0.01 --optim adamw_torch \
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
--save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 120000 --warmup_steps 1000 \
--learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
--run_name ConvNet_batch2048_weight0 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0 --model_type ConvNet \
--run_name ConvNet_batch2048_weight0.1 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1 --model_type ConvNet \
--per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \
--resume_from_checkpoint /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-30000 \
--ignore_data_skip \

# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size

# --run_name RoFormer_12_weight0.5_v2 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_7/RoFormer_12_weight0.5_v2 --model_type roformer --config_overrides vocab_size=7 \

# --torch_compile \ # not working, will wait until stable version
# --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 5000 \
# --logging_steps 5000 --max_steps 50000 --warmup_steps 500 \
#--save_strategy epoch --evaluation_strategy epoch --num_train_epochs 1 --warmup_ratio 0.01 \
#--run_name RoFormer_8 --output_dir output/checkpoints/RoFormer_8 --model_type roformer --config_overrides vocab_size=7,num_hidden_layers=8,num_attention_heads=8,hidden_size=512,intermediate_size=2048 \

# --learning_rate 1e-4 --lr_scheduler_type cosine \
# --run_name RoFormer_12 --output_dir output/checkpoints/RoFormer_12 --model_type roformer --config_overrides vocab_size=7 \
# --per_device_train_batch_size 128 --per_device_eval_batch_size 128 --gradient_accumulation_steps 2 \

#WANDB_PROJECT=GPN_Arabidopsis_6 python -m gpn.run_mlm --do_train --do_eval \
# --fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
# --dataset_name output/dataset/mlm/gpn --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
# --soft_masked_loss_weight_train 1.0 --soft_masked_loss_weight_evaluation 1.0 \
# --weight_decay 0.01 --optim adamw_torch \
# --dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
# --save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 400000 --warmup_steps 1000 \
# --learning_rate 1e-3 --lr_scheduler_type constant_with_warmup \
# --run_name ConvNet_25_weight1.0_batch256 --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_6/ConvNet_25_weight1.0_batch256 --model_type ConvNet \
# --per_device_train_batch_size 64 --per_device_eval_batch_size 64 --gradient_accumulation_steps 1 \
10 changes: 6 additions & 4 deletions analysis/arabidopsis/train_secondpart.sh
@@ -1,11 +1,13 @@
WANDB_PROJECT=GPN_Arabidopsis_multispecies python -m gpn.run_mlm --do_train --do_eval \
--fp16 --report_to wandb --prediction_loss_only True --remove_unused_columns False \
--dataset_name output/merged_dataset/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.0 --soft_masked_loss_weight_evaluation 0.0 \
--dataset_name output/merged_dataset/512/256/True/balanced --tokenizer_name gonzalobenegas/tokenizer-dna-mlm \
--soft_masked_loss_weight_train 0.1 --soft_masked_loss_weight_evaluation 0.0 \
--weight_decay 0.01 --optim adamw_torch \
--dataloader_num_workers 16 --preprocessing_num_workers 32 --seed 42 \
--save_strategy steps --save_steps 10000 --evaluation_strategy steps --eval_steps 10000 --logging_steps 10000 --max_steps 30000 --warmup_steps 0 \
--learning_rate 1e-3 --lr_scheduler_type cosine \
--run_name ConvNet_batch2048_weight0_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0_secondpart \
--run_name ConvNet_batch2048_weight0.1_secondpart --output_dir /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1_secondpart \
--per_device_train_batch_size 2048 --per_device_eval_batch_size 2048 --gradient_accumulation_steps 1 \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0/checkpoint-120000 \
--model_name_or_path /scratch/users/gbenegas/checkpoints/GPN_Arabidopsis_multispecies/ConvNet_batch2048_weight0.1/checkpoint-120000 \

# note: there's a bug in huggingface trainer with iterable dataset, the per_device_*_batch_size is interpreted as total batch size
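
The recurring note about the Trainer and iterable datasets can be made concrete with a little arithmetic; a hedged illustration follows (the GPU count is made up, the other values come from the scripts above).

```python
# Hedged illustration of the note above, not part of this commit.
per_device = 2048  # --per_device_train_batch_size used in these scripts
n_gpus = 4         # illustrative value
grad_accum = 1     # --gradient_accumulation_steps used in these scripts
print("map-style dataset, expected global batch size:", per_device * n_gpus * grad_accum)
print("iterable dataset, actual global batch size (per the note):", per_device)
```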
18 changes: 18 additions & 0 deletions workflow/make_dataset_from_ncbi/README.md
@@ -0,0 +1,18 @@
# Workflow to create a training dataset for any set of taxa
For example usage, check out `analysis/arabidopsis/Snakefile` and `analysis/arabidopsis/config.yaml`.

As a quick preview, here's how you could integrate this as a Snakemake sub-workflow:
```python
rule all:
    input:
        expand(f"output/merged_dataset/{config['window_size']}/{config['step_size']}/{config['add_rc']}/balanced/data/{{split}}", split=splits),


module make_dataset_from_ncbi:
    snakefile:
        "https://raw.githubusercontent.com/songlab-cal/gpn/main/workflow/make_dataset_from_ncbi/Snakefile"
    config: config


use rule * from make_dataset_from_ncbi as make_dataset_from_ncbi_*
```
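
For orientation, here is a minimal sketch of the config values and `splits` the snippet above expects; the key names come from the `expand` pattern and the sub-workflow's Snakefile, while the concrete values, the input path, and the split names are assumptions (the real ones live in `analysis/arabidopsis/config.yaml`).

```python
# Hedged sketch; values and split names are assumptions, see analysis/arabidopsis/config.yaml.
config = {
    "assemblies_path": "input/assemblies.tsv",  # hypothetical path to the NCBI Genome metadata table
    "window_size": 512,  # window length (matches the dataset paths in the training scripts)
    "step_size": 256,    # stride between consecutive windows
    "add_rc": True,      # presumably whether to also emit the reverse complement of each window
}
splits = ["train", "validation", "test"]  # assumed split names
```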
6 changes: 6 additions & 0 deletions workflow/make_dataset_from_ncbi/Snakefile
@@ -9,6 +9,12 @@ from tqdm import tqdm
tqdm.pandas()


# Assembly metadata downloaded from NCBI Genome
# (https://www.ncbi.nlm.nih.gov/data-hub/genome).
# You can choose a set of taxa and apply filters such as annotation level
# or assembly level.
# Check out the script gpn/filter_assemblies.py for more details, such as
# how to subsample or how to keep only one assembly per genus.
assemblies = pd.read_csv(config["assemblies_path"], sep="\t", index_col=0)
assemblies["Assembly Name"] = assemblies["Assembly Name"].str.replace(" ", "_")
assemblies["genome_path"] = (
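
The new comment block points to `gpn/filter_assemblies.py` for subsampling and keeping one assembly per genus; below is a minimal hedged sketch of that kind of filtering (the column name and the genus heuristic are assumptions, not the script's actual logic).

```python
import pandas as pd

# Hedged sketch, not the actual gpn/filter_assemblies.py: keep one assembly per genus
# and subsample the result, as described in the comment block above.
assemblies = pd.read_csv("assemblies.tsv", sep="\t", index_col=0)  # hypothetical input path
assemblies["genus"] = assemblies["Organism Name"].str.split().str[0]  # assumed column name
one_per_genus = assemblies.drop_duplicates(subset="genus", keep="first")
subsampled = one_per_genus.sample(n=min(100, len(one_per_genus)), random_state=42)
subsampled.to_csv("filtered_assemblies.tsv", sep="\t")
```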
