eval and checkpoint period bug fix for utils
theyorubayesian committed Jun 18, 2023
1 parent e004b27 commit 51fe7bd
Showing 4 changed files with 33 additions and 34 deletions.
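In each of these scripts, `EVAL_PERIOD` and `CHECKPOINT_PERIOD` start out as the sentinel value `auto` and are resolved to `num_steps_per_epoch` inside the per-task loop. Before this commit the resolution overwrote the sentinel itself, so after the first iteration `[[ $EVAL_PERIOD == "auto" ]]` could never match again and every later task or language silently reused the first one's epoch length. The fix resolves into scratch variables (`_EVAL_PERIOD`, `_CHECKPOINT_PERIOD`) that the `--eval_period`/`--checkpoint_period` flags now consume, with the `||` branch copying a non-`auto` setting through unchanged. A minimal sketch of the before/after behaviour, using an invented two-task loop and made-up step counts (not part of the diff):

# Minimal sketch, assuming two tasks whose epochs are 120 and 450 steps long.
EVAL_PERIOD=auto
for num_steps_per_epoch in 120 450; do
    # old: the first iteration replaces "auto" with 120, so the second task silently keeps 120
    # [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch

    # new: the sentinel survives, and an explicit numeric EVAL_PERIOD is copied through by the || branch
    [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD

    echo "eval every $_EVAL_PERIOD steps"   # prints 120, then 450
done

(The `cond && a || b` shorthand is safe here only because a plain variable assignment cannot fail; if `a` could fail, `b` would also run.)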
28 changes: 14 additions & 14 deletions scripts/lafand_mt.sh
@@ -1,33 +1,33 @@
{
set -x;
set -e;
- export CUDA_VISIBLE_DEVICES="0,1"
+ export CUDA_VISIBLE_DEVICES="2,3"

# Pass `true` if you set env var `DATA_GCP_DIR` to a local path on your machine
USING_LOCAL_DATASET=true
# Pass full bucket dir for dataset if dataset is not on local.
DATASET_DIR=data/lafand
TRAIN_BATCH_SIZE=16
- EVAL_BATCH_SIZE=64
- INFER_BATCH_SIZE=64
- CHECKPOINT="gs://awarawa/T5_1_1_base/checkpoint_524288"
+ EVAL_BATCH_SIZE=32
+ INFER_BATCH_SIZE=32
+ CHECKPOINT="gs://awarawa/T5_1_1_large/checkpoint_300000"
CHECKPOINT_PERIOD=auto
- MODEL_SIZE="base"
+ MODEL_SIZE="large"
EVAL_PERIOD=auto
# Please pass FEATURE_LENGTHS as string dictionary.
FEATURE_LENGTHS="{'inputs': 512, 'targets': 200}"
# We pretrained for 524288 steps if you use the final checkpoints.
# If you use any other checkpoint, take note of its pre-trained steps.
- PRETRAINED_STEPS=524288
+ PRETRAINED_STEPS=300000
FT_NUM_EPOCHS=5
- OUTPUT_DIR="arawat5_base_lafand_hau_pcm_swa"
+ OUTPUT_DIR="arawat5_large_lafand_ibo_yor_zul"
mkdir -p logs/$OUTPUT_DIR
REMOVE_CHECKPOINTS=true
# ---------------------------------------------

# LANGUAGES=("hau" "pcm" "swa" "ibo" "yor" "zul")
- LANGUAGES=("hau" "pcm" "swa")
- # LANGUAGES=("ibo" "yor" "zul")
+ # LANGUAGES=("hau" "pcm" "swa")
+ LANGUAGES=("ibo" "yor" "zul")
for language in ${LANGUAGES[@]}
do
# TODO: You can check the task name format in src/teva/tasks.py
@@ -44,8 +44,8 @@
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD

for seed in 1 2 3
do
@@ -59,8 +59,8 @@
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
@@ -90,7 +90,7 @@
if [[ $REMOVE_CHECKPOINTS == "true" ]]; then
for ckpt in ${checkpoints[@]};
do
- rm -rf $ckpt
+ rm -rf $seed_output_dir/$ckpt
done
fi
done
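The final hunk above also fixes checkpoint cleanup: `checkpoints` is built from `ls $seed_output_dir` (the same listing pattern visible in scripts/task_ft_and_eval.sh below), so it holds bare directory names, and `rm -rf $ckpt` therefore resolved against the current working directory and most likely removed nothing, leaving the fine-tuned checkpoints on disk. A hedged sketch of the corrected loop, with a hypothetical output directory and the assumption that `PRETRAINED_STEPS` is already set earlier in the script:

# Sketch only: the directory name is invented, PRETRAINED_STEPS is assumed set.
seed_output_dir="arawat5_large_lafand_ibo_yor_zul/yor_1"
checkpoints=($(ls $seed_output_dir | grep checkpoint | grep -v $PRETRAINED_STEPS))
for ckpt in ${checkpoints[@]}; do
    rm -rf $seed_output_dir/$ckpt    # was: rm -rf $ckpt, a relative name that does not exist under $PWD
done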
8 changes: 4 additions & 4 deletions scripts/masakhanews_ft.sh
@@ -45,8 +45,8 @@ do
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD
# ------------------------------------------------------------------------

for seed in 1 2 3
@@ -65,8 +65,8 @@ do
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
13 changes: 6 additions & 7 deletions scripts/task_ft_and_eval.sh
@@ -46,8 +46,8 @@
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD
# ------------------------------------------------------------------------

for seed in 1 2 3
@@ -59,24 +59,23 @@
# Replace `--gininfer_eval.utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE` with `--no_infer_eval``
# to disable inference evaluation during training.
# You will need to run evaluation on the checkpoints after training is done.
- # TODO: Remove the `--cuda_12` command if you're not on CUDA 12
+ # TODO: Pass `--cuda_12` command if you're on CUDA 12
bash scripts/t5_utils.sh \
--action finetune \
--task $task \
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
- --cuda_12 \
--gin.infer_eval/utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE \
>& logs/$OUTPUT_DIR/${task}_${seed}_ft.log \
&& finetuned=true

- checkpoints=($(ls $seed_output_dir | grep checkpoint | grep -v "524288"))
+ checkpoints=($(ls $seed_output_dir | grep checkpoint | grep -v $PRETRAINED_STEPS))

# Uncomment if you are using `no_infer_eval` when finetuning.
# This will run inference evaluation on checkpoints produced during training
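Two smaller changes ride along with the period fix here: `--cuda_12` is no longer passed by default (the TODO now says to add it only on CUDA 12), and the checkpoint listing no longer hard-codes 524288. That number is the step count of the final pretraining checkpoint, so when fine-tuning resumes from a different checkpoint (e.g. 300000) the old `grep -v "524288"` failed to exclude the restored pretraining checkpoint from the list that is later evaluated or deleted. Filtering on `$PRETRAINED_STEPS` keeps the exclusion in sync with whichever checkpoint was loaded. A small illustration with an invented directory listing:

# Illustration only; the checkpoint names below are made up.
PRETRAINED_STEPS=300000
printf 'checkpoint_300000\ncheckpoint_300150\ncheckpoint_300300\n' | grep -v "524288"            # old: keeps checkpoint_300000 too
printf 'checkpoint_300000\ncheckpoint_300150\ncheckpoint_300300\n' | grep -v $PRETRAINED_STEPS   # new: only the fine-tuned checkpoints remain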
18 changes: 9 additions & 9 deletions scripts/xlsum.sh
@@ -1,6 +1,7 @@
{
set -x;
- # export CUDA_VISIBLE_DEVICES="0,1,2,3"
+ set -e;
+ export CUDA_VISIBLE_DEVICES="0,1,2,3"

# Pass `true` if you set env var `DATA_GCP_DIR` to a local path on your machine
USING_LOCAL_DATASET=true
@@ -9,7 +10,7 @@
TRAIN_BATCH_SIZE=32
EVAL_BATCH_SIZE=16
INFER_BATCH_SIZE=64 # TODO: Reduce by half if OOM error during inference_evaluation
- CHECKPOINT="gs://awarawa/T5_1_1_base/checkpoint_524288" # TODO: Change to the checkpoint you want to value on
+ CHECKPOINT="gs://awarawa/T5_1_1_large/checkpoint_250000" # TODO: Change to the checkpoint you want to value on
CHECKPOINT_PERIOD=auto # If auto, we save checkpoint after every epoch. Otherwise set to value.
MODEL_SIZE="base"
EVAL_PERIOD=auto # If auto, we run evaluations after every epoch. Otherwise set to value.
@@ -19,7 +20,7 @@
# If you use any other checkpoint, take note of its pre-trained steps.
PRETRAINED_STEPS=524288
FT_NUM_EPOCHS=5
- OUTPUT_DIR="arawat5_base_xlsum" # TODO: Change to unique output dir
+ OUTPUT_DIR="arawat5_large_ckpt_250k_xlsum" # TODO: Change to unique output dir
mkdir -p logs/$OUTPUT_DIR

REMOVE_CHECKPOINTS=true
@@ -46,8 +47,8 @@
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD
# ------------------------------------------------------------------------

for seed in 1 2 3
@@ -56,7 +56,7 @@

# For some tasks, running inference_evaluation during training causes OOM
# no matter how small the `INFER_BATCH_SIZE`
- # Replace `--gininfer_eval.utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE` with `--no_infer_eval``
+ # Replace `--gin.infer_eval.utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE` with `--no_infer_eval``
# to disable inference evaluation during training.
# You will need to run evaluation on the checkpoints after training is done.
# TODO: Remove the `--cuda_12` command if you're not on CUDA 12
@@ -66,12 +67,11 @@
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
- --cuda_12 \
--gin.infer_eval/utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE \
>& logs/$OUTPUT_DIR/${task}_${seed}_ft.log \
&& finetuned=true
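xlsum.sh also picks up the stricter shell settings already present in scripts/lafand_mt.sh: `set -e;` is enabled and `CUDA_VISIBLE_DEVICES` is exported rather than left commented out. With `set -e`, an unguarded command that exits non-zero aborts the script instead of letting execution silently continue after a failure. A two-line reminder of the behaviour (not part of the diff):

set -e
false                 # any unguarded command exiting non-zero stops execution here
echo "never reached"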
