
Commit 28d5366
Use fast tokenizer by default.
yizhongw committed Sep 22, 2023
1 parent 7040019 commit 28d5366
Showing 9 changed files with 64 additions and 13 deletions.
8 changes: 7 additions & 1 deletion eval/bbh/run_eval.py
@@ -168,7 +168,8 @@ def main(args):
         tokenizer_name_or_path=args.tokenizer_name_or_path,
         load_in_8bit=args.load_in_8bit,
         device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-        gptq_model=args.gptq
+        gptq_model=args.gptq,
+        use_fast_tokenizer=not args.use_slow_tokenizer,
     )

     performance = {}
@@ -224,6 +225,11 @@ def main(args):
         default=None,
         help="if specified, we will load the tokenizer from here."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
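Every eval script below repeats the same two-part change: a new --use_slow_tokenizer flag, whose negation is forwarded as use_fast_tokenizer to the model loader. A minimal self-contained sketch of that pattern (the loader is load_hf_lm_and_tokenizer from eval/utils.py, whose signature change closes this commit; the flag names are taken verbatim from the diffs):

import argparse

from eval.utils import load_hf_lm_and_tokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--model_name_or_path", type=str, default=None)
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument(
    "--use_slow_tokenizer",
    action="store_true",
    help="If given, we will use the slow tokenizer."
)
args = parser.parse_args()

# Fast is now the default, so the flag is an explicit opt-out rather than opt-in.
model, tokenizer = load_hf_lm_and_tokenizer(
    model_name_or_path=args.model_name_or_path,
    tokenizer_name_or_path=args.tokenizer_name_or_path,
    use_fast_tokenizer=not args.use_slow_tokenizer,
)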
8 changes: 7 additions & 1 deletion eval/codex_humaneval/run_eval.py
@@ -46,7 +46,8 @@ def main(args):
         load_in_8bit=args.load_in_8bit,
         # device map is determined by the number of gpus available.
         device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-        gptq_model=args.gptq
+        gptq_model=args.gptq,
+        use_fast_tokenizer=not args.use_slow_tokenizer,
     )

     # these stop sequences are those mentioned in the codex paper.
@@ -141,6 +142,11 @@ def main(args):
         default=None,
         help="If specified, we will load the tokenizer from here."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
8 changes: 7 additions & 1 deletion eval/gsm/run_eval.py
@@ -82,7 +82,8 @@ def main(args):
         tokenizer_name_or_path=args.tokenizer_name_or_path,
         load_in_8bit=args.load_in_8bit,
         device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-        gptq_model=args.gptq
+        gptq_model=args.gptq,
+        use_fast_tokenizer=not args.use_slow_tokenizer,
     )
     new_line_token = tokenizer.encode("\n", add_special_tokens=False)[-1]  # get the last token because the tokenizer may add space tokens at the start.
     outputs = generate_completions(
@@ -166,6 +167,11 @@ def main(args):
         default=None,
         help="if specified, we will load the tokenizer from here."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
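The new_line_token line in the first hunk of this file takes the last encoded id because SentencePiece-style tokenizers commonly emit a leading space piece when "\n" is encoded on its own. A short illustration of that behavior (the checkpoint name is only an example of a LLaMA-style tokenizer; exact ids depend on the vocabulary):

from transformers import AutoTokenizer

# Example checkpoint only; any LLaMA-style SentencePiece tokenizer behaves similarly.
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
ids = tokenizer.encode("\n", add_special_tokens=False)
# ids may come back as e.g. [29871, 13]: a leading space piece followed by the
# newline token, so [-1] picks out the newline itself.
new_line_token = ids[-1]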
8 changes: 7 additions & 1 deletion eval/mmlu/run_eval.py
@@ -152,7 +152,8 @@ def main(args):
         tokenizer_name_or_path=args.tokenizer_name_or_path,
         load_in_8bit=args.load_in_8bit,
         device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-        gptq_model=args.gptq
+        gptq_model=args.gptq,
+        use_fast_tokenizer=not args.use_slow_tokenizer,
     )

     subjects = sorted(
@@ -268,6 +269,11 @@ def main(args):
         default=None,
         help="if specified, we will load the tokenizer from here."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
16 changes: 13 additions & 3 deletions eval/predict.py
@@ -39,10 +39,15 @@ def parse_args():
         type=str,
         help="Huggingface tokenizer name or path."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
-        help="OpenAI engine name.")
+        help="OpenAI engine name. This should be exclusive with `model_name_or_path`.")
     parser.add_argument(
         "--input_files",
         type=str,
@@ -146,7 +151,11 @@ def parse_args():
             raise ValueError("Either `messages` or `prompt` should be in the instance.")
         prompts.append(prompt)
     if args.use_vllm:
-        model = vllm.LLM(model=args.model_name_or_path)
+        model = vllm.LLM(
+            model=args.model_name_or_path,
+            tokenizer=args.tokenizer_name_or_path if args.tokenizer_name_or_path else args.model_name_or_path,
+            tokenizer_mode="slow" if args.use_slow_tokenizer else "auto",
+        )
         sampling_params = vllm.SamplingParams(
             temperature=args.temperature if args.do_sample else 0,
             top_p=args.top_p,
@@ -160,7 +169,8 @@ def parse_args():
             tokenizer_name_or_path=args.tokenizer_name_or_path,
             load_in_8bit=args.load_in_8bit,
             device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-            gptq_model=args.gptq
+            gptq_model=args.gptq,
+            use_fast_tokenizer=not args.use_slow_tokenizer,
         )
         outputs = generate_completions(
             model=model,
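On the vLLM branch above, the slow/fast choice is expressed through tokenizer_mode rather than use_fast_tokenizer: "auto" uses the fast tokenizer when one is available, while "slow" always forces the slow implementation. A standalone sketch of that branch configured as in the diff (the model path and prompt are placeholders):

import vllm

model = vllm.LLM(
    model="example/model-path",      # placeholder checkpoint
    tokenizer="example/model-path",  # defaults to the model path when not set
    tokenizer_mode="auto",           # "slow" would force the slow tokenizer
)
sampling_params = vllm.SamplingParams(temperature=0, top_p=1.0, max_tokens=512)
outputs = model.generate(["Hello, world!"], sampling_params)
print(outputs[0].outputs[0].text)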
11 changes: 8 additions & 3 deletions eval/toxigen/run_eval.py
@@ -50,6 +50,7 @@ def eval_vllm_model(
     model = vllm.LLM(
         model=args.model_name_or_path,
         tokenizer=tokenizer if tokenizer else args.model_name_or_path,
+        tokenizer_mode="slow" if args.use_slow_tokenizer else "auto",
     )
     prompts = []
     chat_formatting_function = dynamic_import_function(args.chat_formatting_function) if args.use_chat_format else None
@@ -273,10 +274,9 @@ def main(args):
         model_name_or_path=args.model_name_or_path,
         tokenizer_name_or_path=args.tokenizer_name_or_path,
         load_in_8bit=args.load_in_8bit,
-        device_map="balanced_low_0"
-        if torch.cuda.device_count() > 1
-        else "auto",
+        device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
         gptq_model=args.gptq,
+        use_fast_tokenizer=not args.use_slow_tokenizer,
     )
     results = eval_hf_model(
         args,
@@ -322,6 +322,11 @@ def main(args):
         default=None,
         help="if specified, we will load the tokenizer from here.",
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
8 changes: 7 additions & 1 deletion eval/truthfulqa/run_eval.py
@@ -339,7 +339,8 @@ def main(args):
         tokenizer_name_or_path=args.tokenizer_name_or_path,
         load_in_8bit=args.load_in_8bit,
         device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-        gptq_model=args.gptq
+        gptq_model=args.gptq,
+        use_fast_tokenizer=not args.use_slow_tokenizer,
     )
     print("Running generations!")
     run_answers(
@@ -426,6 +427,11 @@ def main(args):
         default=None,
         help="If specified, we will load the tokenizer from here."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
8 changes: 7 additions & 1 deletion eval/tydiqa/run_eval.py
@@ -103,7 +103,8 @@ def main(args):
             tokenizer_name_or_path=args.tokenizer_name_or_path,
             load_in_8bit=args.load_in_8bit,
             device_map="balanced_low_0" if torch.cuda.device_count() > 1 else "auto",
-            gptq_model=args.gptq
+            gptq_model=args.gptq,
+            use_fast_tokenizer=not args.use_slow_tokenizer,
         )
     else:
         import tiktoken
@@ -260,6 +261,11 @@ def main(args):
         default=None,
         help="if specified, we will load the tokenizer from here."
     )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If given, we will use the slow tokenizer."
+    )
     parser.add_argument(
         "--openai_engine",
         type=str,
2 changes: 1 addition & 1 deletion eval/utils.py
@@ -194,7 +194,7 @@ def load_hf_lm_and_tokenizer(
     load_in_8bit=False,
     convert_to_half=False,
     gptq_model=False,
-    use_fast_tokenizer=False,
+    use_fast_tokenizer=True,
     padding_side="left",
 ):

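The body of load_hf_lm_and_tokenizer is not part of this diff; presumably the parameter is forwarded to AutoTokenizer.from_pretrained's use_fast argument, so the flipped default now selects the Rust-backed tokenizers implementation unless --use_slow_tokenizer is given. A hedged sketch of that forwarding (the helper below is illustrative, not the repository's actual function body):

from transformers import AutoTokenizer

def load_tokenizer(name_or_path, use_fast_tokenizer=True, padding_side="left"):
    # Assumed forwarding; the commit only shows the signature change, not the body.
    tokenizer = AutoTokenizer.from_pretrained(name_or_path, use_fast=use_fast_tokenizer)
    tokenizer.padding_side = padding_side
    return tokenizer

tok = load_tokenizer("gpt2")
print(tok.is_fast)  # True under the new default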
