Update codebase
hauntsaninja committed Jan 3, 2023
1 parent 0f8ec70 commit 40d9b1f
Showing 12 changed files with 57 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_wheels.yml
@@ -16,7 +16,7 @@ jobs:
# cibuildwheel builds linux wheels inside a manylinux container
# it also takes care of procuring the correct python version for us
os: [ubuntu-latest, windows-latest, macos-latest]
-python-version: [39, 310, 311]
+python-version: [38, 39, 310, 311]

steps:
- uses: actions/checkout@v3
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,12 @@
# Changelog

This is the changelog for the open source version of tiktoken.

## [v0.1.2]
- Avoid use of `blobfile` for public files
- Add support for Python 3.8
- Add py.typed
- Improve the public tests

## [v0.1.1]
- Initial release
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -1,6 +1,8 @@
include *.svg
include *.toml
include *.md
include Makefile
global-include py.typed
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
[project]
name = "tiktoken"
-dependencies = ["blobfile>=2", "regex>=2022.1.18"]
+dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.8"

[build-system]
build-backend = "setuptools.build_meta"
2 changes: 2 additions & 0 deletions scripts/redact.py
@@ -9,6 +9,8 @@ def redact_file(path: Path, dry_run: bool) -> None:
return

text = path.read_text()
+if not text:
+    return

first_line = text.splitlines()[0]
if "redact" in first_line:
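The guard added above matters because an empty file has no first line: splitting an empty string yields an empty list, so the `[0]` lookup two lines later would raise. A minimal illustration, not part of the commit:

text = ""
assert text.splitlines() == []          # an empty file has no lines at all
try:
    first_line = text.splitlines()[0]   # this is exactly what the new guard prevents
except IndexError:
    print("empty file: nothing to redact")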
3 changes: 2 additions & 1 deletion setup.py
@@ -4,7 +4,7 @@
public = True

if public:
-version = "0.1.1"
+version = "0.1.2"

setup(
name="tiktoken",
@@ -18,6 +18,7 @@
debug=False,
)
],
+package_data={"tiktoken": ["py.typed"]},
packages=["tiktoken", "tiktoken_ext"],
zip_safe=False,
)
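The new `package_data` entry ships the PEP 561 marker (the empty tiktoken/py.typed file added below), which tells type checkers to trust the package's inline annotations. A quick post-install check, assuming an installed tiktoken and Python 3.9+ for importlib.resources.files; not part of the commit:

import importlib.resources

# The marker file is empty; only its presence in the installed package matters.
marker = importlib.resources.files("tiktoken").joinpath("py.typed")
print("py.typed present:", marker.is_file())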
8 changes: 8 additions & 0 deletions tests/test_simple_public.py
@@ -2,10 +2,18 @@


def test_simple():
# Note that there are more actual tests, they're just not currently public :-)
enc = tiktoken.get_encoding("gpt2")
assert enc.encode("hello world") == [31373, 995]
assert enc.decode([31373, 995]) == "hello world"
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]

enc = tiktoken.get_encoding("cl100k_base")
assert enc.encode("hello world") == [15339, 1917]
assert enc.decode([15339, 1917]) == "hello world"
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]

for enc_name in tiktoken.list_encoding_names():
enc = tiktoken.get_encoding(enc_name)
for token in range(10_000):
assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
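The loop at the end asserts a round-trip property for every registered encoding: decoding a token id to bytes and re-encoding those bytes must give back the same id. Restated for a single concrete id, as a sketch rather than part of the commit:

import tiktoken

enc = tiktoken.get_encoding("gpt2")
token_id = 31373  # the first id produced for "hello world" in the assertions above
token_bytes = enc.decode_single_token_bytes(token_id)
assert enc.encode_single_token(token_bytes) == token_id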
2 changes: 2 additions & 0 deletions tiktoken/core.py
@@ -1,3 +1,5 @@
+from __future__ import annotations

import functools
from concurrent.futures import ThreadPoolExecutor
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union
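The `from __future__ import annotations` added here (and in tiktoken/load.py and tiktoken/registry.py below) plausibly supports the new Python 3.8 target: with the future import, annotations are stored as strings rather than evaluated, so newer annotation syntax still imports cleanly on 3.8. A minimal illustration of the effect; the motive is an inference, not stated in the commit:

from __future__ import annotations


def head(items: list[int]) -> int | None:
    # Without the future import, Python 3.8 evaluates these annotations at
    # definition time and fails: list[int] needs 3.9+ and int | None needs 3.10+.
    # With it, the annotations stay as strings and 3.8 imports this module fine.
    return items[0] if items else None


print(head([1, 2, 3]))  # 1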
17 changes: 13 additions & 4 deletions tiktoken/load.py
@@ -1,10 +1,21 @@
+from __future__ import annotations

import base64
import hashlib
import json
import os
import uuid

import blobfile
+import requests


+def read_file(blobpath: str) -> bytes:
+    if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
+        with blobfile.BlobFile(blobpath, "rb") as f:
+            return f.read()
+    # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
+    return requests.get(blobpath).content
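The new read_file helper dispatches on scheme: paths that are not plain HTTP(S) still go through blobfile, while public HTTP(S) URLs are fetched with requests and no credentials. A hedged usage sketch of this internal helper, using one of the URLs introduced later in this commit:

from tiktoken.load import read_file

# Fetch a public BPE ranks file over plain HTTPS (no blobfile auth involved).
ranks_bytes = read_file(
    "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
)
print(len(ranks_bytes), "bytes downloaded")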


def read_file_cached(blobpath: str) -> bytes:
@@ -17,8 +28,7 @@ def read_file_cached(blobpath: str) -> bytes:

if cache_dir == "":
# disable caching
-with blobfile.BlobFile(blobpath, "rb") as f:
-    return f.read()
+return read_file(blobpath)

cache_key = hashlib.sha1(blobpath.encode()).hexdigest()

@@ -27,8 +37,7 @@ def read_file_cached(blobpath: str) -> bytes:
with open(cache_path, "rb") as f:
return f.read()

-with blobfile.BlobFile(blobpath, "rb") as f:
-    contents = f.read()
+contents = read_file(blobpath)

os.makedirs(cache_dir, exist_ok=True)
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"
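The cache write begins by composing a uniquely named .tmp path; the rest of the function is collapsed in this view. What usually follows is a write-to-temp-then-rename sequence, sketched below as a general pattern (an assumption about the collapsed lines, not a claim about tiktoken's exact code):

import os
import uuid


def atomic_write_bytes(path: str, data: bytes) -> None:
    # Write to a uniquely named sibling file first, then rename into place.
    # On POSIX the rename is atomic within one filesystem, so concurrent readers
    # never observe a half-written cache entry.
    tmp_path = path + "." + str(uuid.uuid4()) + ".tmp"
    with open(tmp_path, "wb") as f:
        f.write(data)
    os.rename(tmp_path, path)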
Empty file added tiktoken/py.typed
2 changes: 2 additions & 0 deletions tiktoken/registry.py
@@ -1,3 +1,5 @@
+from __future__ import annotations

import importlib
import pkgutil
import threading
16 changes: 11 additions & 5 deletions tiktoken_ext/openai_public.py
@@ -9,8 +9,8 @@

def gpt2():
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
-vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe",
-encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json",
+vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
+encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
)
return {
"name": "gpt2",
@@ -22,7 +22,9 @@ def gpt2():


def r50k_base():
-mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
+mergeable_ranks = load_tiktoken_bpe(
+    "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
+)
return {
"name": "r50k_base",
"explicit_n_vocab": 50257,
@@ -33,7 +35,9 @@ def r50k_base():


def p50k_base():
-mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
+mergeable_ranks = load_tiktoken_bpe(
+    "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
+)
return {
"name": "p50k_base",
"explicit_n_vocab": 50281,
@@ -44,7 +48,9 @@ def p50k_base():


def cl100k_base():
-mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
+mergeable_ranks = load_tiktoken_bpe(
+    "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+)
special_tokens = {
ENDOFTEXT: 100257,
FIM_PREFIX: 100258,
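Every URL change in this file is the same rewrite: the az://openaipublic/... form becomes the public Azure Blob Storage HTTPS endpoint for the same account and path, so the files can be fetched without blobfile auth. A small sketch of that mapping, assuming the usual az://<account>/<path> convention; the helper name is made up for illustration:

def az_to_public_https(az_url: str) -> str:
    # az://openaipublic/encodings/cl100k_base.tiktoken
    #   -> https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
    assert az_url.startswith("az://")
    account, _, path = az_url[len("az://"):].partition("/")
    return f"https://{account}.blob.core.windows.net/{path}"


assert az_to_public_https("az://openaipublic/encodings/cl100k_base.tiktoken") == (
    "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
)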
