diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 24e0da27..777a0acc 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -15,13 +15,13 @@ jobs: matrix: # cibuildwheel builds linux wheels inside a manylinux container # it also takes care of procuring the correct python version for us - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, windows-latest, macos-13] python-version: [38, 39, 310, 311, 312] steps: - uses: actions/checkout@v4 - - uses: pypa/cibuildwheel@v2.16.5 + - uses: pypa/cibuildwheel@v2.18.0 env: CIBW_BUILD: "cp${{ matrix.python-version}}-*" @@ -48,7 +48,7 @@ jobs: platforms: arm64 - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.18.0 env: CIBW_BUILD: "cp${{ matrix.python-version}}-*" CIBW_ARCHS: aarch64 diff --git a/CHANGELOG.md b/CHANGELOG.md index 049fdb4d..f94795ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ This is the changelog for the open source version of tiktoken. +## [v0.7.0] +- Support for `gpt-4o` +- Performance improvements + ## [v0.6.0] - Optimise regular expressions for a 20% performance improvement, thanks to @paplorinc! - Add `text-embedding-3-*` models to `encoding_for_model` diff --git a/Cargo.toml b/Cargo.toml index 14588580..4efb156f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tiktoken" -version = "0.6.0" +version = "0.7.0" edition = "2021" rust-version = "1.57.0" diff --git a/README.md b/README.md index 748578b6..124d5828 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,11 @@ OpenAI's models. ```python import tiktoken -enc = tiktoken.get_encoding("cl100k_base") +enc = tiktoken.get_encoding("o200k_base") assert enc.decode(enc.encode("hello world")) == "hello world" # To get the tokeniser corresponding to a specific model in the OpenAI API: -enc = tiktoken.encoding_for_model("gpt-4") +enc = tiktoken.encoding_for_model("gpt-4o") ``` The open source version of `tiktoken` can be installed from PyPI: diff --git a/pyproject.toml b/pyproject.toml index 47aada31..7cc7cb10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "tiktoken" -version = "0.6.0" +version = "0.7.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" readme = "README.md" license = {file = "LICENSE"} @@ -42,4 +42,3 @@ test-command = "pytest {project}/tests --import-mode=append" [[tool.cibuildwheel.overrides]] select = "*linux_aarch64" test-command = """python -c 'import tiktoken; enc = tiktoken.get_encoding("gpt2"); assert enc.encode("hello world") == [31373, 995]'""" - diff --git a/tiktoken/model.py b/tiktoken/model.py index 17532aee..6ecd7232 100644 --- a/tiktoken/model.py +++ b/tiktoken/model.py @@ -6,6 +6,7 @@ # TODO: these will likely be replaced by an API endpoint MODEL_PREFIX_TO_ENCODING: dict[str, str] = { # chat + "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13 "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc. "gpt-35-turbo-": "cl100k_base", # Azure deployment name @@ -18,6 +19,7 @@ MODEL_TO_ENCODING: dict[str, str] = { # chat + "gpt-4o": "o200k_base", "gpt-4": "cl100k_base", "gpt-3.5-turbo": "cl100k_base", "gpt-3.5": "cl100k_base", # Common shorthand diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py index 330ecabb..6b29a711 100644 --- a/tiktoken_ext/openai_public.py +++ b/tiktoken_ext/openai_public.py @@ -88,10 +88,40 @@ def cl100k_base(): } +def o200k_base(): + mergeable_ranks = load_tiktoken_bpe( + "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", + expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d", + ) + special_tokens = { + ENDOFTEXT: 199999, + ENDOFPROMPT: 200018, + } + # This regex could be made more efficient + pat_str = "|".join( + [ + r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""", + r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""", + r"""\p{N}{1,3}""", + r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""", + r"""\s*[\r\n]+""", + r"""\s+(?!\S)""", + r"""\s+""", + ] + ) + return { + "name": "o200k_base", + "pat_str": pat_str, + "mergeable_ranks": mergeable_ranks, + "special_tokens": special_tokens, + } + + ENCODING_CONSTRUCTORS = { "gpt2": gpt2, "r50k_base": r50k_base, "p50k_base": p50k_base, "p50k_edit": p50k_edit, "cl100k_base": cl100k_base, + "o200k_base": o200k_base, }