Skip to content

Commit

Permalink
Tokenizer and formatter test case
Browse files Browse the repository at this point in the history
  • Loading branch information
jelmervdl committed Apr 3, 2024
1 parent ba2dacb commit 99ebb4a
Showing 1 changed file with 84 additions and 0 deletions.
84 changes: 84 additions & 0 deletions test_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
from unittest import TestCase
from llama.tokenizer import ChatFormat, Tokenizer


class TokenizerTests(TestCase):
    """Tests for ``Tokenizer`` and ``ChatFormat`` from ``llama.tokenizer``.

    The tests pin exact token ids, so they need the real tokenizer model
    file. Its location is read from the ``TOKENIZER_PATH`` environment
    variable.
    """

    def setUp(self):
        """Build the tokenizer and chat formatter shared by each test.

        The test is skipped (rather than erroring with a bare ``KeyError``)
        when ``TOKENIZER_PATH`` is unset or empty, because nothing can be
        verified without the tokenizer model.
        """
        tokenizer_path = os.environ.get("TOKENIZER_PATH")
        if not tokenizer_path:
            self.skipTest(
                "TOKENIZER_PATH is not set; point it at the tokenizer model "
                "file to run these tests"
            )
        self.tokenizer = Tokenizer(tokenizer_path)
        self.format = ChatFormat(self.tokenizer)

    def test_special_tokens(self):
        """The begin-of-text special token maps to its expected id."""
        self.assertEqual(
            self.tokenizer.special_tokens["<|begin_of_text|>"],
            128000,
        )

    def test_encode(self):
        """Encoding with ``bos`` and ``eos`` wraps the text token ids."""
        self.assertEqual(
            self.tokenizer.encode(
                "This is a test sentence.",
                bos=True,
                eos=True,
            ),
            [128000, 2028, 374, 264, 1296, 11914, 13, 128001],
        )

    def test_decode(self):
        """Decoding renders the special tokens as their literal text."""
        self.assertEqual(
            self.tokenizer.decode(
                [128000, 2028, 374, 264, 1296, 11914, 13, 128001],
            ),
            "<|begin_of_text|>This is a test sentence.<|end_of_text|>",
        )

    def test_encode_message(self):
        """A single message is encoded as header, blank line, body, eot."""
        message = {
            "role": "user",
            "content": "This is a test sentence.",
        }
        self.assertEqual(
            self.format.encode_message(message),
            [
                128006,  # <|start_header_id|>
                882,  # "user"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
                2028, 374, 264, 1296, 11914, 13,  # "This is a test sentence."
                128009,  # <|eot_id|>
            ],
        )

    def test_encode_dialog(self):
        """A dialog prompt encodes every message, then opens an assistant turn."""
        dialog = [
            {
                "role": "system",
                "content": "This is a test sentence.",
            },
            {
                "role": "user",
                "content": "This is a response.",
            },
        ]
        self.assertEqual(
            self.format.encode_dialog_prompt(dialog),
            [
                128000,  # <|begin_of_text|>
                128006,  # <|start_header_id|>
                9125,  # "system"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
                2028, 374, 264, 1296, 11914, 13,  # "This is a test sentence."
                128009,  # <|eot_id|>
                128006,  # <|start_header_id|>
                882,  # "user"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
                2028, 374, 264, 2077, 13,  # "This is a response."
                128009,  # <|eot_id|>
                # The prompt ends with an open assistant header (no body and
                # no <|eot_id|>) for the model to complete.
                128006,  # <|start_header_id|>
                78191,  # "assistant"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
            ],
        )

0 comments on commit 99ebb4a

Please sign in to comment.