Skip to content

Commit

Permalink
Refactor test inference
Browse files (browse the repository at this point in the history)
  • Loading branch information
Stanislas0 committed Nov 29, 2022
1 parent 38f1623 commit 3281f0b
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
2 changes: 2 additions & 0 deletions codegeex/torch/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def get_token_stream(
topp: float = 1.0,
topk: int = 0.0,
greedy: bool = False,
recompute: bool = False,
):
context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eos_token_id, seq_length)

Expand Down Expand Up @@ -197,6 +198,7 @@ def get_token_stream(
topp=topp,
topk=topk,
greedy=greedy,
recompute=recompute,
)

for tokens, lengths in batch_token_iterator:
Expand Down
9 changes: 8 additions & 1 deletion tests/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from codegeex.megatron.initialize import initialize_megatron
from codegeex.megatron.model import CodeGeeXModel
from codegeex.megatron.code_generation_utils import get_token_stream
from codegeex.quantization import quantize

torch.set_printoptions(precision=8)

Expand Down Expand Up @@ -80,7 +81,7 @@ def add_code_generation_args(parser):
group.add_argument(
"--ws-encoding-length",
type=int,
default=80,
default=10,
help="Length of whitespace encoding",
)
group.add_argument(
Expand Down Expand Up @@ -123,6 +124,10 @@ def add_code_generation_args(parser):
default=None,
help='Identify the type of programming language to generate',
)
group.add_argument(
"--quantize",
action="store_true",
)

return parser

Expand Down Expand Up @@ -151,6 +156,8 @@ def main():
model.eval()
if args.fp16 and args.ln_fp16:
model.half()
if args.quantize:
model = quantize(model, weight_bit_width=8, backend="megatron")
model.cuda()

with open(args.prompt_file, "r") as f:
Expand Down

0 comments on commit 3281f0b

Please sign in to comment.