Skip to content

Commit

Permalink
✨ feat(rust): Convert project to a multi-crate workspace
Browse files Browse the repository at this point in the history
This commit restructures the project from a single crate into a multi-crate Cargo workspace with two members: 'rs-tiktoken' (the Rust library) and 'py-tiktoken' (the PyO3 extension module that depends on it). Separating the Rust and Python modules makes the codebase easier to navigate and maintain. setup.py and MANIFEST.in are also updated to reflect the new directory structure.

Refs: #24
  • Loading branch information
Miuler committed Jul 18, 2023
1 parent f28ce4c commit be02293
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 28 deletions.
26 changes: 5 additions & 21 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,21 +1,5 @@
[package]
name = "tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[lib]
name = "_tiktoken"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.19.0", features = ["extension-module"] }

# tiktoken dependencies
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"

[profile.release]
incremental = true
[workspace]
members = [
"rs-tiktoken",
"py-tiktoken",
]
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ include Makefile
global-include py.typed
recursive-include scripts *.py
recursive-include tests *.py
recursive-include src *.rs
recursive-include py-tiktoken *.rs
recursive-include rs-tiktoken *.rs
22 changes: 22 additions & 0 deletions py-tiktoken/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "py-tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[lib]
name = "_tiktoken"
crate-type = ["cdylib"]

[dependencies]
tiktoken = { path = "../rs-tiktoken" }
pyo3 = { version = "0.19.0", features = ["extension-module"] }

# tiktoken dependencies
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"

[profile.release]
incremental = true
1 change: 1 addition & 0 deletions py-tiktoken/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod tiktoken_py;
4 changes: 2 additions & 2 deletions src/tiktoken_py.rs → py-tiktoken/src/tiktoken_py.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use pyo3::PyResult;
use pyo3::types::{PyBytes, PyList, PyTuple};
use rustc_hash::FxHashMap as HashMap;

use crate::tiktoken::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};
use tiktoken::core::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};

#[pyclass]
pub struct PyCoreBPE {
Expand Down Expand Up @@ -181,7 +181,7 @@ pub fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
mod tests {
use rustc_hash::FxHashMap as HashMap;

use crate::tiktoken::byte_pair_split;
use crate::core::byte_pair_split;

#[test]
fn very_simple_test() {
Expand Down
15 changes: 15 additions & 0 deletions rs-tiktoken/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[dependencies]
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"
once_cell = "1.18.0"

[profile.release]
incremental = true
2 changes: 1 addition & 1 deletion src/tiktoken.rs → rs-tiktoken/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap<Vec<u8>, usize>) ->

pub struct FakeThreadId(NonZeroU64);

pub fn hash_current_thread() -> usize {
fn hash_current_thread() -> usize {
// It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
// that works great for our use case of avoiding collisions in our array. Unfortunately,
// it's private. However, there are only so many ways you can layout a u64, so just transmute
Expand Down
66 changes: 66 additions & 0 deletions rs-tiktoken/src/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//! WARNING: This code is under active development. Functionality,
//! behavior, and the interface may change in future updates.

use std::collections::HashMap;
use once_cell::sync::Lazy;
use regex::Regex;


pub struct Encoding {
/// The name of the encoding. It should be clear from the name of the encoding
/// what behaviour to expect, in particular, encodings with different special tokens
/// should have different names.
pub name: &'static str,
/// A regex pattern string that is used to split the input text.
pub pat_str: Regex,
/// A dictionary mapping mergeable token bytes to their ranks. The ranks
/// must correspond to merge priority.
pub mergeable_ranks: HashMap<&'static str, u32>,
/// A dictionary mapping special token strings to their token values.
pub special_tokens: HashMap<&'static str, u32>,
/// The number of tokens in the vocabulary. If provided, it is checked
/// that the number of mergeable tokens and special tokens is equal to this number.
pub explicit_n_vocab: Option<u32>,
}

pub static GPT2: Lazy<Encoding> = Lazy::new(|| {
let mergeable_ranks = Default::default();
let special_tokens = [
("<|endoftext|>", 50256)
].iter().cloned().collect();

Encoding{
name: "gpt2",
pat_str: Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+").unwrap(),
mergeable_ranks,
special_tokens,
explicit_n_vocab: Some(50257),
}
});

/// Placeholder entry point: takes no arguments, performs no work and
/// returns `()`.
///
/// NOTE(review): presumably intended to look up an `Encoding` by name, like
/// the Python `tiktoken.get_encoding(...)` calls sketched in this file's test
/// module — confirm the intended signature before implementing.
pub fn get_encoding() {}

// Unit tests for the encoding definitions.
#[cfg(test)]
mod test {
use super::*;

// Placeholder test: the body contains no executable assertions yet, so it
// always passes. The comments below are Python-style pseudocode describing
// the behavior that is expected once the Rust API exists.
#[test]
fn test_simple() {
// Expected results for the "gpt2" encoding:
// enc = tiktoken.get_encoding("gpt2")
// assert enc.encode("hello world") == [31373, 995]
// assert enc.decode([31373, 995]) == "hello world"
// assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
//
// Expected results for the "cl100k_base" encoding:
// enc = tiktoken.get_encoding("cl100k_base")
// assert enc.encode("hello world") == [15339, 1917]
// assert enc.decode([15339, 1917]) == "hello world"
// assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
//
// Round-trip check of the first 10_000 token ids for every listed encoding:
// for enc_name in tiktoken.list_encoding_names():
// enc = tiktoken.get_encoding(enc_name)
// for token in range(10_000):
// assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
}
}
5 changes: 3 additions & 2 deletions src/lib.rs → rs-tiktoken/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// This check is new and seems buggy (possibly with PyO3 interaction)
pub mod tiktoken_py;
pub mod tiktoken;
pub mod core;
pub mod encoding;
mod model;
3 changes: 3 additions & 0 deletions rs-tiktoken/src/model.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
//! WARNING: This code is under active development. Functionality,
//! behavior, and the interface may change in future updates.

3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
name="tiktoken",
rust_extensions=[
RustExtension(
"tiktoken._tiktoken",
target="tiktoken._tiktoken",
binding=Binding.PyO3,
# Between our use of editable installs and wanting to use Rust for performance sensitive
# code, it makes sense to just always use --release
debug=False,
path="py-tiktoken/Cargo.toml",
)
],
package_data={"tiktoken": ["py.typed"]},
Expand Down

0 comments on commit be02293

Please sign in to comment.