forked from dqbd/tiktoken
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
28 changed files
with
1,756 additions
and
1,245 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
name: Build Java JAR | ||
|
||
on: [push, pull_request, workflow_dispatch] | ||
|
||
concurrency: | ||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} | ||
cancel-in-progress: true | ||
|
||
jobs: | ||
build_jni: | ||
name: jni on ${{ matrix.os }} | ||
runs-on: ${{ matrix.os }} | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
os: [ubuntu-latest, windows-latest, macos-latest] | ||
include: | ||
- os: ubuntu-latest | ||
outdir: linux_64 | ||
- os: windows-latest | ||
outdir: windows_64 | ||
- os: macos-latest | ||
outdir: osx_64 | ||
steps: | ||
- uses: actions/checkout@v3 | ||
|
||
- name: Install rust toolchain | ||
uses: actions-rs/toolchain@v1 | ||
with: | ||
# stable doesn't have --out-dir | ||
toolchain: nightly | ||
override: true | ||
|
||
- name: Build | ||
working-directory: ./jni | ||
# TODO: 32bit vs 64bit? | ||
# https://github.com/scijava/native-lib-loader | ||
run: cargo build --release -Z unstable-options --out-dir ../build/natives/${{ matrix.outdir }}/ | ||
|
||
- uses: actions/upload-artifact@v3 | ||
with: | ||
name: natives | ||
path: ./build/natives/* | ||
|
||
build_java: | ||
name: java | ||
runs-on: ${{ matrix.os }} | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
os: [ubuntu-latest, windows-latest, macos-latest] | ||
needs: [build_jni] | ||
|
||
steps: | ||
- uses: actions/checkout@v3 | ||
|
||
- name: Load outputs | ||
uses: actions/download-artifact@v3 | ||
with: | ||
name: natives | ||
path: natives | ||
|
||
- name: Set up JDK 11 | ||
uses: actions/setup-java@v3 | ||
with: | ||
java-version: '11' | ||
distribution: 'microsoft' | ||
architecture: x64 | ||
cache: maven | ||
|
||
- name: Build with Maven | ||
working-directory: ./java | ||
run: mvn --batch-mode package failsafe:integration-test | ||
|
||
- uses: actions/upload-artifact@v3 | ||
with: | ||
name: java | ||
path: ./java/target/*.jar | ||
|
||
# TODO: publish to maven (only from ubuntu) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,15 @@ | ||
[package] | ||
name = "tiktoken" | ||
version = "0.2.0" | ||
edition = "2021" | ||
rust-version = "1.57.0" | ||
[workspace] | ||
|
||
[lib] | ||
name = "_tiktoken" | ||
crate-type = ["cdylib"] | ||
|
||
[dependencies] | ||
wasm-bindgen = "0.2.83" | ||
js-sys = "0.3.61" | ||
anyhow = "1.0.69" | ||
base64 = "0.21.0" | ||
gloo-utils = { version = "0.1", features = ["serde"] } | ||
serde = { version = "1.0", features = ["derive"] } | ||
|
||
# tiktoken dependencies | ||
fancy-regex = "0.10.0" | ||
regex = "1.7.0" | ||
rustc-hash = "1.1.0" | ||
bstr = "1.0.1" | ||
members = [ | ||
"core", | ||
"python", | ||
"jni", | ||
] | ||
|
||
[profile.release] | ||
incremental = true | ||
opt-level = "s" | ||
lto = true | ||
|
||
[features] | ||
default = ["inline"] | ||
inline = [] | ||
opt-level = 'z' # Optimize for size | ||
lto = true # Enable link-time optimization | ||
codegen-units = 1 # Reduce number of codegen units to increase optimizations | ||
panic = 'abort' # Abort on panic | ||
strip = true # Strip symbols from binary |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,144 +1,103 @@ | ||
# ⏳ tiktoken | ||
|
||
tiktoken is a [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with | ||
OpenAI's models, forked from the original tiktoken library to provide NPM bindings for Node and other JS runtimes. | ||
tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with | ||
OpenAI's models. | ||
|
||
The open source version of `tiktoken` can be installed from NPM: | ||
```python | ||
import tiktoken | ||
enc = tiktoken.get_encoding("gpt2") | ||
assert enc.decode(enc.encode("hello world")) == "hello world" | ||
|
||
# To get the tokeniser corresponding to a specific model in the OpenAI API: | ||
enc = tiktoken.encoding_for_model("text-davinci-003") | ||
``` | ||
npm install @dqbd/tiktoken | ||
``` | ||
|
||
## Usage | ||
|
||
Basic usage follows: | ||
The open source version of `tiktoken` can be installed from PyPI: | ||
``` | ||
pip install tiktoken | ||
``` | ||
|
||
```typescript | ||
import assert from "node:assert"; | ||
import { get_encoding, encoding_for_model } from "@dqbd/tiktoken"; | ||
The tokeniser API is documented in `tiktoken/core.py`. | ||
|
||
const enc = get_encoding("gpt2"); | ||
assert( | ||
new TextDecoder().decode(enc.decode(enc.encode("hello world"))) === | ||
"hello world" | ||
); | ||
Example code using `tiktoken` can be found in the | ||
[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb). | ||
|
||
// To get the tokeniser corresponding to a specific model in the OpenAI API: | ||
const enc = encoding_for_model("text-davinci-003"); | ||
|
||
// Extend existing encoding with custom special tokens | ||
const enc = encoding_for_model("gpt2", { | ||
"<|im_start|>": 100264, | ||
"<|im_end|>": 100265, | ||
}); | ||
``` | ||
## Performance | ||
|
||
If desired, you can create a Tiktoken instance directly with custom ranks, special tokens and regex pattern: | ||
`tiktoken` is between 3x and 6x faster than a comparable open source tokeniser: | ||
|
||
```typescript | ||
import { Tiktoken } from "../pkg"; | ||
import { readFileSync } from "fs"; | ||
![image](./perf.svg) | ||
|
||
const encoder = new Tiktoken( | ||
readFileSync("./ranks/gpt2.tiktoken").toString("utf-8"), | ||
{ "<|endoftext|>": 50256, "<|im_start|>": 100264, "<|im_end|>": 100265 }, | ||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" | ||
); | ||
``` | ||
Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from | ||
`tokenizers==0.13.2` and `transformers==4.24.0`. | ||
|
||
## Compatibility | ||
|
||
As this is a WASM library, there might be some issues with specific runtimes. If you encounter any issues, please open an issue. | ||
## Getting help | ||
|
||
| Runtime | Status | Notes | | ||
| ------------------- | ------ | ------------------------------- | | ||
| Node.js | ✅ | | | ||
| Bun | ✅ | | | ||
| Vite | ✅ | See [here](#vite) for notes | | ||
| Next.js | ✅ 🚧 | See [here](#nextjs) for caveats | | ||
| Vercel Edge Runtime | 🚧 | Work in progress | | ||
| Cloudflare Workers | 🚧 | Untested | | ||
| Deno | ❌ | Currently unsupported | | ||
Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues). | ||
|
||
### [Vite](#vite) | ||
If you work at OpenAI, make sure to check the internal documentation or feel free to contact | ||
@shantanu. | ||
|
||
If you are using Vite, you will need to add both the `vite-plugin-wasm` and `vite-plugin-top-level-await`. Add the following to your `vite.config.js`: | ||
|
||
```js | ||
import wasm from "vite-plugin-wasm"; | ||
import topLevelAwait from "vite-plugin-top-level-await"; | ||
import { defineConfig } from "vite"; | ||
## Extending tiktoken | ||
|
||
export default defineConfig({ | ||
plugins: [wasm(), topLevelAwait()], | ||
}); | ||
``` | ||
You may wish to extend `tiktoken` to support new encodings. There are two ways to do this. | ||
|
||
### [Next.js](#nextjs) | ||
|
||
Both API routes and `/pages` are supported with some caveats. To overcome issues with importing `/node` variant and incorrect `__dirname` resolution, you can import the package from `@dqbd/tiktoken/bundler` instead. | ||
**Create your `Encoding` object exactly the way you want and simply pass it around.** | ||
|
||
```typescript | ||
import { get_encoding } from "@dqbd/tiktoken/bundler"; | ||
import { NextApiRequest, NextApiResponse } from "next"; | ||
```python | ||
cl100k_base = tiktoken.get_encoding("cl100k_base") | ||
|
||
export default function handler(req: NextApiRequest, res: NextApiResponse) { | ||
return res.status(200).json({ | ||
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment | ||
message: get_encoding("gpt2").encode(`Hello World ${Math.random()}`), | ||
}); | ||
} | ||
# In production, load the arguments directly instead of accessing private attributes | ||
# See openai_public.py for examples of arguments for specific encodings | ||
enc = tiktoken.Encoding( | ||
# If you're changing the set of special tokens, make sure to use a different name | ||
# It should be clear from the name what behaviour to expect. | ||
name="cl100k_im", | ||
pat_str=cl100k_base._pat_str, | ||
mergeable_ranks=cl100k_base._mergeable_ranks, | ||
special_tokens={ | ||
**cl100k_base._special_tokens, | ||
"<|im_start|>": 100264, | ||
"<|im_end|>": 100265, | ||
} | ||
) | ||
``` | ||
|
||
Additional Webpack configuration is also required, see https://github.com/vercel/next.js/issues/29362. | ||
|
||
```typescript | ||
class WasmChunksFixPlugin { | ||
apply(compiler) { | ||
compiler.hooks.thisCompilation.tap("WasmChunksFixPlugin", (compilation) => { | ||
compilation.hooks.processAssets.tap( | ||
{ name: "WasmChunksFixPlugin" }, | ||
(assets) => | ||
Object.entries(assets).forEach(([pathname, source]) => { | ||
if (!pathname.match(/\.wasm$/)) return; | ||
compilation.deleteAsset(pathname); | ||
|
||
const name = pathname.split("/")[1]; | ||
const info = compilation.assetsInfo.get(pathname); | ||
compilation.emitAsset(name, source, info); | ||
}) | ||
); | ||
}); | ||
} | ||
} | ||
|
||
const config = { | ||
webpack(config, { isServer, dev }) { | ||
config.experiments = { | ||
asyncWebAssembly: true, | ||
layers: true, | ||
}; | ||
|
||
if (!dev && isServer) { | ||
config.output.webassemblyModuleFilename = "chunks/[id].wasm"; | ||
config.plugins.push(new WasmChunksFixPlugin()); | ||
} | ||
**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.** | ||
|
||
return config; | ||
}, | ||
}; | ||
``` | ||
This is only useful if you need `tiktoken.get_encoding` to find your encoding; otherwise, prefer | ||
option 1 (creating the `Encoding` object yourself and passing it around). | ||
|
||
To properly resolve `tsconfig.json`, use either `moduleResolution: "node16"` or `moduleResolution: "nodenext"`: | ||
To do this, you'll need to create a namespace package under `tiktoken_ext`. | ||
|
||
```json | ||
{ | ||
"compilerOptions": { | ||
"moduleResolution": "node16" | ||
} | ||
} | ||
Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file: | ||
``` | ||
my_tiktoken_extension | ||
├── tiktoken_ext | ||
│ └── my_encodings.py | ||
└── setup.py | ||
``` | ||
|
||
## Acknowledgements | ||
`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`. | ||
This is a dictionary from an encoding name to a function that takes no arguments and returns | ||
arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see | ||
`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`. | ||
|
||
Your `setup.py` should look something like this: | ||
```python | ||
from setuptools import setup, find_namespace_packages | ||
|
||
setup( | ||
name="my_tiktoken_extension", | ||
packages=find_namespace_packages(include=['tiktoken_ext*']), | ||
install_requires=["tiktoken"], | ||
... | ||
) | ||
``` | ||
|
||
- https://github.com/zurawiki/tiktoken-rs | ||
Then simply `pip install ./my_tiktoken_extension` and you should be able to use your | ||
custom encodings! Make sure **not** to use an editable install. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
[package] | ||
name = "tiktoken_core" | ||
version = "0.2.0" | ||
edition = "2021" | ||
rust-version = "1.57.0" | ||
|
||
[lib] | ||
name = "_tiktoken_core" | ||
crate-type = ["lib"] | ||
|
||
[dependencies] | ||
# tiktoken dependencies | ||
fancy-regex = "0.10.0" | ||
regex = "1.7.0" | ||
rustc-hash = "1.1.0" | ||
bstr = "1.0.1" | ||
reqwest = { version = "0.11.14", features = ["blocking"] } | ||
sha1 = "0.10.5" | ||
json = "0.12.4" | ||
base64 = "0.21.0" | ||
lazy_static = "1.4.0" |
Oops, something went wrong.