Skip to content

Commit

Permalink
Build wheels; update codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
hauntsaninja committed Dec 15, 2022
1 parent a1a9f16 commit 1f098ca
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 4 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/build_wheels.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Builds binary wheels for every supported OS / CPython combination, plus a
# source distribution, and uploads them all to a single "dist" artifact.
name: Build wheels

on: [push, pull_request, workflow_dispatch]

# One active run per pull-request branch (or per run id when there is no
# head ref); a newer run in the same group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build_wheels:
    name: py${{ matrix.python-version }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        # cibuildwheel builds linux wheels inside a manylinux container
        # it also takes care of procuring the correct python version for us
        os: [ubuntu-latest, windows-latest, macos-latest]
        # Written without the dot because the value is interpolated into the
        # CIBW_BUILD selector below (cp39-*, cp310-*, cp311-*).
        python-version: [39, 310, 311]

    steps:
      - uses: actions/checkout@v3

      # NOTE(review): the action ref was garbled to "[email protected]" in the
      # copy this was restored from; v2.11.3 is assumed - confirm before merging.
      - uses: pypa/cibuildwheel@v2.11.3
        env:
          CIBW_BUILD: "cp${{ matrix.python-version}}-*"

      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./wheelhouse/*.whl

  build_sdist:
    name: sdist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        name: Install Python
        with:
          python-version: "3.9"
      - name: Run check-manifest
        run: |
          pip install check-manifest
          check-manifest -v
      - name: Build sdist
        run: |
          pip install --upgrade build
          python -m build --sdist
      # Same artifact name as the wheel job, so the sdist lands next to the wheels.
      - uses: actions/upload-artifact@v3
        with:
          name: dist
          path: ./dist/*.tar.gz
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ include *.svg
include *.toml
include Makefile
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The tokeniser API is documented in `tiktoken/core.py`.

## Performance

`tiktoken` is between 3-6x faster than huggingface's tokeniser:
`tiktoken` is 3-6x faster than a comparable open source tokeniser:

![image](./perf.svg)

Expand Down
1 change: 1 addition & 0 deletions perf.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 24 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,30 @@
name = "tiktoken"
dependencies = ["blobfile>=2", "regex>=2022.1.18"]
dynamic = ["version"]
requires-python = ">=3.9"

[build-system]
requires = ["setuptools", "wheel", "setuptools-rust"]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]

# Configuration for cibuildwheel, which the "Build wheels" workflow runs.
[tool.cibuildwheel]
build-frontend = "build"
build-verbosity = 1

# Linux: install a Rust toolchain non-interactively with rustup before building,
# and append cargo's bin directory to PATH so the build can find it.
linux.before-all = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y"
linux.environment = { PATH = "$PATH:$HOME/.cargo/bin" }
# macOS: add the Apple Silicon Rust target needed for the arm64 wheels below.
macos.before-all = "rustup target add aarch64-apple-darwin"

# Build selectors to skip: 32-bit Linux (manylinux and musllinux) and 32-bit Windows.
skip = [
"*-manylinux_i686",
"*-musllinux_i686",
"*-win32",
]
macos.archs = ["x86_64", "arm64"]
# arm64 wheels cannot be tested when cross-compiling on an Intel host, so their
# tests are skipped; setting test-skip (CIBW_TEST_SKIP) also silences the
# warnings cibuildwheel would otherwise print about them.
test-skip = "*-macosx_arm64"

# Test each built wheel: install pytest, then run the project's tests directory.
before-test = "pip install pytest"
test-command = "pytest {project}/tests"

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
public = True

if public:
version = "0.1"
version = "0.1.1"

setup(
name="tiktoken",
Expand Down
11 changes: 11 additions & 0 deletions tests/test_simple_public.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import tiktoken


def test_simple():
    """Check that "hello world" round-trips through gpt2 and cl100k_base."""
    text = "hello world"
    # (encoding name, token ids expected for `text`), checked in this order.
    expected_by_encoding = (
        ("gpt2", [31373, 995]),
        ("cl100k_base", [15339, 1917]),
    )
    for encoding_name, token_ids in expected_by_encoding:
        enc = tiktoken.get_encoding(encoding_name)
        assert enc.encode(text) == token_ids
        assert enc.decode(token_ids) == text
2 changes: 2 additions & 0 deletions tiktoken/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ def encode_with_unstable(
See `encode` for more details on `allowed_special` and `disallowed_special`.
This API should itself be considered unstable.
```
>>> enc.encode_with_unstable("hello fanta")
([31373], [(277, 4910), (5113, 265), ..., (8842,)])
Expand Down
29 changes: 28 additions & 1 deletion tiktoken_ext/openai_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@ def gpt2():
}


def r50k_base():
    """Return the definition dict for the ``r50k_base`` encoding.

    The merge ranks are read with ``load_tiktoken_bpe`` from the
    ``r50k_base.tiktoken`` file; the only special token is ``ENDOFTEXT``,
    mapped to id 50256.
    """
    ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
    split_pattern = (
        r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
    )
    specials = {ENDOFTEXT: 50256}
    return {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": split_pattern,
        "mergeable_ranks": ranks,
        "special_tokens": specials,
    }


def p50k_base():
    """Return the definition dict for the ``p50k_base`` encoding.

    The merge ranks are read with ``load_tiktoken_bpe`` from the
    ``p50k_base.tiktoken`` file; the only special token is ``ENDOFTEXT``,
    mapped to id 50256, and the explicit vocabulary size is 50281.
    """
    ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
    split_pattern = (
        r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
    )
    specials = {ENDOFTEXT: 50256}
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": split_pattern,
        "mergeable_ranks": ranks,
        "special_tokens": specials,
    }


def cl100k_base():
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
special_tokens = {
Expand All @@ -38,4 +60,9 @@ def cl100k_base():
}


ENCODING_CONSTRUCTORS = {"gpt2": gpt2, "cl100k_base": cl100k_base}
# Registry of the public encodings: maps each encoding name to the
# zero-argument function in this module that builds its definition.
ENCODING_CONSTRUCTORS = {
"gpt2": gpt2,
"r50k_base": r50k_base,
"p50k_base": p50k_base,
"cl100k_base": cl100k_base,
}

0 comments on commit 1f098ca

Please sign in to comment.