Skip to content

Commit

Permalink
Bump version, sync codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
hauntsaninja committed Mar 13, 2023
1 parent b2e85f1 commit 3e86200
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 22 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

This is the changelog for the open source version of tiktoken.

## [v0.3.1]
- Build aarch64 wheels
- Make `blobfile` an optional dependency

Thank you to @messense for the environment variable that makes cargo not OOM under emulation!

## [v0.3.0]
- Improve performance by 5-20%; thank you to @nistath!
- Add `gpt-3.5-turbo` models to `encoding_for_model`
Expand All @@ -14,6 +20,8 @@ This is the changelog for the open source version of tiktoken.
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
- Improve portability of caching logic

Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections

## [v0.1.2]
- Avoid use of `blobfile` for public files
- Add support for Python 3.8
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tiktoken"
version = "0.3.0"
version = "0.3.1"
edition = "2021"
rust-version = "1.57.0"

Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
[project]
name = "tiktoken"
version = "0.3.0"
version = "0.3.1"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
authors = [{name = "Shantanu Jain"}, {email = "[email protected]"}]
dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
optional-dependencies = {blobfile = ["blobfile>=2"]}
requires-python = ">=3.8"

[project.urls]
Expand Down
27 changes: 9 additions & 18 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,23 @@ fn _byte_pair_merge<T>(
// The rank of the last item in the vector is not a valid value.
let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect();

// NOTE: using a macro here because a closure fails to get inlined
// according to optimization remarks.
// A closure also cannot capture a reference to `piece` without
// the borrow checker complaining about the mutable borrows during
// the assignments later in this code.
macro_rules! get_rank {
($start_idx:expr, $skip:expr) => {{
let start_idx: usize = $start_idx;
let skip: usize = $skip;
let get_rank = {
#[inline(always)]
|parts: &Vec<(usize, usize)>, start_idx: usize, skip: usize| {
if (start_idx + skip + 2) < parts.len() {
ranks
.get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0])
.map(|r| *r)
.copied()
} else {
None
}
}};
($idx:expr) => {{
get_rank!($idx, 0)
}};
}
}
};

// We look up the ranks once in the beginning and iteratively update
// them during each merge, which reduces the number of rank lookups.
for i in 0..parts.len() - 2 {
match get_rank!(i) {
match get_rank(&parts, i, 0) {
Some(rank) => {
// usize::MAX is a sentinel value and cannot be a valid rank
debug_assert!(rank != usize::MAX);
Expand Down Expand Up @@ -89,9 +80,9 @@ fn _byte_pair_merge<T>(
// parts[i] and parts[i-1] before removing, which could thrash
// the cache. Thus, we update the rank calculation by skipping over
// parts[i + 1], by invoking `get_rank` with `skip = 1`.
parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX);
parts[i].1 = get_rank(&parts, i, 1).unwrap_or(usize::MAX);
if i > 0 {
parts[i - 1].1 = get_rank!(i - 1, 1).unwrap_or(usize::MAX);
parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX);
}

parts.remove(i + 1);
Expand Down
12 changes: 12 additions & 0 deletions tests/test_simple_public.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import subprocess
import sys

import tiktoken


Expand Down Expand Up @@ -28,3 +31,12 @@ def test_encoding_for_model():
assert enc.name == "p50k_edit"
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
assert enc.name == "cl100k_base"


def test_optional_blobfile_dependency():
    """Check that importing tiktoken does not import blobfile as a side effect."""
    # The check runs in a child interpreter so that modules already loaded in
    # this test process cannot influence what ends up in `sys.modules`.
    child_source = """
import tiktoken
import sys
assert "blobfile" not in sys.modules
"""
    command = [sys.executable, "-c", child_source]
    # check_call raises CalledProcessError if the child's assertion fails.
    subprocess.check_call(command)
13 changes: 12 additions & 1 deletion tiktoken/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@
import tempfile
import uuid

import blobfile
import requests


def read_file(blobpath: str) -> bytes:
if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
try:
import blobfile
except ImportError:
raise ImportError(
"blobfile is not installed. Please install it by running `pip install blobfile`."
)
with blobfile.BlobFile(blobpath, "rb") as f:
return f.read()
# avoiding blobfile for public files helps avoid auth issues, like MFA prompts
Expand Down Expand Up @@ -93,6 +98,12 @@ def decode_data_gym(value: str) -> bytes:


def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
    """Write BPE ranks to ``tiktoken_bpe_file`` in the tiktoken text format.

    One line is written per token, ordered by ascending rank. Each line is the
    base64-encoded token bytes, a single space, the decimal rank, and a newline.

    Args:
        bpe_ranks: Mapping from token bytes to rank.
        tiktoken_bpe_file: Destination path, opened through ``blobfile.BlobFile``
            in binary write mode.

    Raises:
        ImportError: If the optional ``blobfile`` package is not installed.
    """
    try:
        # Imported lazily: blobfile is an optional dependency, so it must not be
        # required merely to import this module.
        import blobfile
    except ImportError as e:
        # Chain explicitly so the original import failure is kept as __cause__.
        raise ImportError(
            "blobfile is not installed. Please install it by running `pip install blobfile`."
        ) from e
    with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
        for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
            f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
Expand Down

0 comments on commit 3e86200

Please sign in to comment.