Skip to content

Commit

Permalink
fix <eos> tokenization
Browse files — browse the repository at this point in the history
  • Loading branch information
tloen committed Mar 16, 2023
1 parent 6f21821 commit 6b69ea8
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
EPOCHS = 3 # we don't need 3 tbh
LEARNING_RATE = 3e-4 # the Karpathy constant
CUTOFF_LEN = 256 # 256 accounts for about 96% of the data
LORA_R = 8
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

Expand All @@ -27,7 +27,7 @@
device_map="auto",
)
tokenizer = LLaMATokenizer.from_pretrained(
"decapoda-research/llama-7b-hf", add_eos_token=True
"decapoda-research/llama-7b-hf", add_eos_token=False
)

model = prepare_model_for_int8_training(model)
Expand Down Expand Up @@ -70,7 +70,7 @@ def generate_prompt(data_point):

data = data.shuffle().map(
lambda data_point: tokenizer(
generate_prompt(data_point),
generate_prompt(data_point) + tokenizer.eos_token,
truncation=True,
max_length=CUTOFF_LEN,
padding="max_length",
Expand Down

0 comments on commit 6b69ea8

Please sign in to comment.