eval and checkpoint period bug fix for utils
theyorubayesian committed Jun 18, 2023
1 parent e004b27 commit 51fe7bd
Showing 4 changed files with 33 additions and 34 deletions.
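In each of these scripts, `EVAL_PERIOD` and `CHECKPOINT_PERIOD` start out as the sentinel value `auto` and are resolved to `num_steps_per_epoch` inside the per-task loop. Before this commit the resolution overwrote the sentinel itself, so after the first iteration `[[ $EVAL_PERIOD == "auto" ]]` could never match again and every later task or language silently reused the first one's epoch length. The fix resolves into scratch variables (`_EVAL_PERIOD`, `_CHECKPOINT_PERIOD`) that the `--eval_period`/`--checkpoint_period` flags now consume, with the `||` branch copying a non-`auto` setting through unchanged. A minimal sketch of the before/after behaviour, using an invented two-task loop and made-up step counts (not part of the diff):

# Minimal sketch, assuming two tasks whose epochs are 120 and 450 steps long.
EVAL_PERIOD=auto
for num_steps_per_epoch in 120 450; do
    # old: the first iteration replaces "auto" with 120, so the second task silently keeps 120
    # [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch

    # new: the sentinel survives, and an explicit numeric EVAL_PERIOD is copied through by the || branch
    [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD

    echo "eval every $_EVAL_PERIOD steps"   # prints 120, then 450
done

(The `cond && a || b` shorthand is safe here only because a plain variable assignment cannot fail; if `a` could fail, `b` would also run.)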
28 changes: 14 additions & 14 deletions scripts/lafand_mt.sh
@@ -1,33 +1,33 @@
{
set -x;
set -e;
- export CUDA_VISIBLE_DEVICES="0,1"
+ export CUDA_VISIBLE_DEVICES="2,3"

# Pass `true` if you set env var `DATA_GCP_DIR` to a local path on your machine
USING_LOCAL_DATASET=true
# Pass full bucket dir for dataset if dataset is not on local.
DATASET_DIR=data/lafand
TRAIN_BATCH_SIZE=16
- EVAL_BATCH_SIZE=64
- INFER_BATCH_SIZE=64
- CHECKPOINT="gs://awarawa/T5_1_1_base/checkpoint_524288"
+ EVAL_BATCH_SIZE=32
+ INFER_BATCH_SIZE=32
+ CHECKPOINT="gs://awarawa/T5_1_1_large/checkpoint_300000"
CHECKPOINT_PERIOD=auto
- MODEL_SIZE="base"
+ MODEL_SIZE="large"
EVAL_PERIOD=auto
# Please pass FEATURE_LENGTHS as string dictionary.
FEATURE_LENGTHS="{'inputs': 512, 'targets': 200}"
# We pretrained for 524288 steps if you use the final checkpoints.
# If you use any other checkpoint, take note of its pre-trained steps.
- PRETRAINED_STEPS=524288
+ PRETRAINED_STEPS=300000
FT_NUM_EPOCHS=5
- OUTPUT_DIR="arawat5_base_lafand_hau_pcm_swa"
+ OUTPUT_DIR="arawat5_large_lafand_ibo_yor_zul"
mkdir -p logs/$OUTPUT_DIR
REMOVE_CHECKPOINTS=true
# ---------------------------------------------

# LANGUAGES=("hau" "pcm" "swa" "ibo" "yor" "zul")
- LANGUAGES=("hau" "pcm" "swa")
- # LANGUAGES=("ibo" "yor" "zul")
+ # LANGUAGES=("hau" "pcm" "swa")
+ LANGUAGES=("ibo" "yor" "zul")
for language in ${LANGUAGES[@]}
do
# TODO: You can check the task name format in src/teva/tasks.py
@@ -44,8 +44,8 @@
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD

for seed in 1 2 3
do
@@ -59,8 +59,8 @@
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
@@ -90,7 +90,7 @@
if [[ $REMOVE_CHECKPOINTS == "true" ]]; then
for ckpt in ${checkpoints[@]};
do
- rm -rf $ckpt
+ rm -rf $seed_output_dir/$ckpt
done
fi
done
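The final hunk above also fixes checkpoint cleanup: `checkpoints` is built from `ls $seed_output_dir` (the same listing pattern visible in scripts/task_ft_and_eval.sh below), so it holds bare directory names, and `rm -rf $ckpt` therefore resolved against the current working directory and most likely removed nothing, leaving the fine-tuned checkpoints on disk. A hedged sketch of the corrected loop, with a hypothetical output directory and the assumption that `PRETRAINED_STEPS` is already set earlier in the script:

# Sketch only: the directory name is invented, PRETRAINED_STEPS is assumed set.
seed_output_dir="arawat5_large_lafand_ibo_yor_zul/yor_1"
checkpoints=($(ls $seed_output_dir | grep checkpoint | grep -v $PRETRAINED_STEPS))
for ckpt in ${checkpoints[@]}; do
    rm -rf $seed_output_dir/$ckpt    # was: rm -rf $ckpt, a relative name that does not exist under $PWD
done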
8 changes: 4 additions & 4 deletions scripts/masakhanews_ft.sh
@@ -45,8 +45,8 @@ do
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD
# ------------------------------------------------------------------------

for seed in 1 2 3
@@ -65,8 +65,8 @@ do
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
13 changes: 6 additions & 7 deletions scripts/task_ft_and_eval.sh
@@ -46,8 +46,8 @@
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD
# ------------------------------------------------------------------------

for seed in 1 2 3
@@ -59,24 +59,23 @@
# Replace `--gininfer_eval.utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE` with `--no_infer_eval``
# to disable inference evaluation during training.
# You will need to run evaluation on the checkpoints after training is done.
- # TODO: Remove the `--cuda_12` command if you're not on CUDA 12
+ # TODO: Pass `--cuda_12` command if you're on CUDA 12
bash scripts/t5_utils.sh \
--action finetune \
--task $task \
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
- --cuda_12 \
--gin.infer_eval/utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE \
>& logs/$OUTPUT_DIR/${task}_${seed}_ft.log \
&& finetuned=true

- checkpoints=($(ls $seed_output_dir | grep checkpoint | grep -v "524288"))
+ checkpoints=($(ls $seed_output_dir | grep checkpoint | grep -v $PRETRAINED_STEPS))

# Uncomment if you are using `no_infer_eval` when finetuning.
# This will run inference evaluation on checkpoints produced during training
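Two smaller changes ride along with the period fix here: `--cuda_12` is no longer passed by default (the TODO now says to add it only on CUDA 12), and the checkpoint listing no longer hard-codes 524288. That number is the step count of the final pretraining checkpoint, so when fine-tuning resumes from a different checkpoint (e.g. 300000) the old `grep -v "524288"` failed to exclude the restored pretraining checkpoint from the list that is later evaluated or deleted. Filtering on `$PRETRAINED_STEPS` keeps the exclusion in sync with whichever checkpoint was loaded. A small illustration with an invented directory listing:

# Illustration only; the checkpoint names below are made up.
PRETRAINED_STEPS=300000
printf 'checkpoint_300000\ncheckpoint_300150\ncheckpoint_300300\n' | grep -v "524288"            # old: keeps checkpoint_300000 too
printf 'checkpoint_300000\ncheckpoint_300150\ncheckpoint_300300\n' | grep -v $PRETRAINED_STEPS   # new: only the fine-tuned checkpoints remain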
18 changes: 9 additions & 9 deletions scripts/xlsum.sh
@@ -1,6 +1,7 @@
{
set -x;
- # export CUDA_VISIBLE_DEVICES="0,1,2,3"
+ set -e;
+ export CUDA_VISIBLE_DEVICES="0,1,2,3"

# Pass `true` if you set env var `DATA_GCP_DIR` to a local path on your machine
USING_LOCAL_DATASET=true
@@ -9,7 +10,7 @@
TRAIN_BATCH_SIZE=32
EVAL_BATCH_SIZE=16
INFER_BATCH_SIZE=64 # TODO: Reduce by half if OOM error during inference_evaluation
- CHECKPOINT="gs://awarawa/T5_1_1_base/checkpoint_524288" # TODO: Change to the checkpoint you want to value on
+ CHECKPOINT="gs://awarawa/T5_1_1_large/checkpoint_250000" # TODO: Change to the checkpoint you want to value on
CHECKPOINT_PERIOD=auto # If auto, we save checkpoint after every epoch. Otherwise set to value.
MODEL_SIZE="base"
EVAL_PERIOD=auto # If auto, we run evaluations after every epoch. Otherwise set to value.
@@ -19,7 +20,7 @@
# If you use any other checkpoint, take note of its pre-trained steps.
PRETRAINED_STEPS=524288
FT_NUM_EPOCHS=5
- OUTPUT_DIR="arawat5_base_xlsum" # TODO: Change to unique output dir
+ OUTPUT_DIR="arawat5_large_ckpt_250k_xlsum" # TODO: Change to unique output dir
mkdir -p logs/$OUTPUT_DIR

REMOVE_CHECKPOINTS=true
@@ -46,8 +47,8 @@
# TRAIN_STEPS MUST ALWAYS BE pre-trained steps + no. of fine-tuning steps.
train_steps=$((PRETRAINED_STEPS + ft_steps))

- [[ $EVAL_PERIOD == "auto" ]] && EVAL_PERIOD=$num_steps_per_epoch
- [[ $CHECKPOINT_PERIOD == "auto" ]] && CHECKPOINT_PERIOD=$num_steps_per_epoch
+ [[ $EVAL_PERIOD == "auto" ]] && _EVAL_PERIOD=$num_steps_per_epoch || _EVAL_PERIOD=$EVAL_PERIOD
+ [[ $CHECKPOINT_PERIOD == "auto" ]] && _CHECKPOINT_PERIOD=$num_steps_per_epoch || _CHECKPOINT_PERIOD=$CHECKPOINT_PERIOD
# ------------------------------------------------------------------------

for seed in 1 2 3
@@ -56,7 +56,7 @@

# For some tasks, running inference_evaluation during training causes OOM
# no matter how small the `INFER_BATCH_SIZE`
- # Replace `--gininfer_eval.utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE` with `--no_infer_eval``
+ # Replace `--gin.infer_eval.utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE` with `--no_infer_eval``
# to disable inference evaluation during training.
# You will need to run evaluation on the checkpoints after training is done.
# TODO: Remove the `--cuda_12` command if you're not on CUDA 12
@@ -66,12 +67,11 @@
--feature_lengths "$FEATURE_LENGTHS" \
--batch_size $TRAIN_BATCH_SIZE \
--checkpoint $CHECKPOINT \
- --checkpoint_period $CHECKPOINT_PERIOD \
- --eval_period $EVAL_PERIOD \
+ --checkpoint_period $_CHECKPOINT_PERIOD \
+ --eval_period $_EVAL_PERIOD \
--train_steps $train_steps \
--model_size $MODEL_SIZE \
--output_dir $seed_output_dir \
- --cuda_12 \
--gin.infer_eval/utils.DatasetConfig.batch_size=$INFER_BATCH_SIZE \
>& logs/$OUTPUT_DIR/${task}_${seed}_ft.log \
&& finetuned=true
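xlsum.sh also picks up the stricter shell settings already present in scripts/lafand_mt.sh: `set -e;` is enabled and `CUDA_VISIBLE_DEVICES` is exported rather than left commented out. With `set -e`, an unguarded command that exits non-zero aborts the script instead of letting execution silently continue after a failure. A two-line reminder of the behaviour (not part of the diff):

set -e
false                 # any unguarded command exiting non-zero stops execution here
echo "never reached"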
