Skip to content

Commit

Permalink
✨ feat(rust): Convert project to a multi-crate workspace
Browse files Browse the repository at this point in the history
This commit restructures the project from a single-crate layout into a multi-crate workspace with two members: 'rs-tiktoken' (the core Rust library) and 'py-tiktoken' (the Python bindings). Separating the Rust core from the Python extension module clarifies the codebase's organization and makes each part easier to maintain. setup.py is updated to match the new directory structure.

Refs: #24
  • Loading branch information
Miuler committed Jul 18, 2023
1 parent f28ce4c commit 9db8696
Show file tree
Hide file tree
Showing 11 changed files with 128 additions and 28 deletions.
26 changes: 5 additions & 21 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,21 +1,5 @@
[package]
name = "tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[lib]
name = "_tiktoken"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.19.0", features = ["extension-module"] }

# tiktoken dependencies
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"

[profile.release]
incremental = true
# Workspace root: the project is split into the pure-Rust core (rs-tiktoken)
# and the PyO3 bindings crate (py-tiktoken).
[workspace]
# Virtual workspaces default to the version-1 feature resolver even when
# members use edition 2021 (Cargo warns about this); opt in to resolver "2"
# explicitly so feature unification matches the edition-2021 behavior.
resolver = "2"
members = [
    "rs-tiktoken",
    "py-tiktoken",
]

# Cargo only honors [profile.*] sections in the workspace root; the copies left
# in the member manifests are ignored. Restore the pre-split release setting
# here so it actually takes effect.
[profile.release]
incremental = true
22 changes: 22 additions & 0 deletions py-tiktoken/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "py-tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[lib]
# The compiled extension must be importable from Python as `_tiktoken`.
name = "_tiktoken"
crate-type = ["cdylib"]

[dependencies]
# Core tokenizer logic lives in the sibling rs-tiktoken crate.
tiktoken = { path = "../rs-tiktoken" }
pyo3 = { version = "0.19.0", features = ["extension-module"] }

# tiktoken dependencies
# NOTE(review): these were carried over from the pre-split crate; with the core
# logic moved to rs-tiktoken, fancy-regex, regex and bstr may no longer be used
# directly by this crate — verify and prune.
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"

# NOTE(review): Cargo ignores [profile.*] sections in non-root workspace
# members; this setting only takes effect from the workspace root Cargo.toml.
[profile.release]
incremental = true
1 change: 1 addition & 0 deletions py-tiktoken/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
//! Python bindings crate: exposes the Rust tiktoken core to Python via PyO3
//! (built as a `cdylib` named `_tiktoken`).
pub mod tiktoken_py;
4 changes: 2 additions & 2 deletions src/tiktoken_py.rs → py-tiktoken/src/tiktoken_py.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use pyo3::PyResult;
use pyo3::types::{PyBytes, PyList, PyTuple};
use rustc_hash::FxHashMap as HashMap;

use crate::tiktoken::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};
use tiktoken::core::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};

#[pyclass]
pub struct PyCoreBPE {
Expand Down Expand Up @@ -181,7 +181,7 @@ pub fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
mod tests {
use rustc_hash::FxHashMap as HashMap;

use crate::tiktoken::byte_pair_split;
use crate::core::byte_pair_split;

#[test]
fn very_simple_test() {
Expand Down
15 changes: 15 additions & 0 deletions rs-tiktoken/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
# Pure-Rust tokenizer core, free of any Python/PyO3 dependency.
name = "tiktoken"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

[dependencies]
fancy-regex = "0.11.0"
regex = "1.8.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"
once_cell = "1.18.0"

# NOTE(review): Cargo ignores [profile.*] sections in non-root workspace
# members; this setting only takes effect from the workspace root Cargo.toml.
[profile.release]
incremental = true
2 changes: 1 addition & 1 deletion src/tiktoken.rs → rs-tiktoken/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap<Vec<u8>, usize>) ->

pub struct FakeThreadId(NonZeroU64);

pub fn hash_current_thread() -> usize {
fn hash_current_thread() -> usize {
// It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
// that works great for our use case of avoiding collisions in our array. Unfortunately,
// it's private. However, there are only so many ways you can layout a u64, so just transmute
Expand Down
49 changes: 49 additions & 0 deletions rs-tiktoken/src/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//! WARNING: This code is under active development. Functionality,
//! behavior, and the interface may change in future updates.

use std::collections::HashMap;
use once_cell::sync::Lazy;
use regex::Regex;


/// A BPE tokenizer definition: the text-splitting pattern, the mergeable-token
/// ranks, the special tokens, and an optional expected vocabulary size.
pub struct Encoding {
    /// The name of the encoding. It should be clear from the name of the encoding
    /// what behaviour to expect, in particular, encodings with different special tokens
    /// should have different names.
    pub name: &'static str,
    /// A regex pattern string that is used to split the input text.
    pub pat_str: Regex,
    /// A dictionary mapping mergeable token bytes to their ranks. The ranks
    /// must correspond to merge priority.
    ///
    /// NOTE(review): keys here are `&str`, but upstream tiktoken keys ranks by
    /// raw bytes; non-UTF-8 token byte sequences cannot be represented as
    /// `&str` — confirm whether the key type should be `&[u8]`/`Vec<u8>`.
    pub mergeable_ranks: HashMap<&'static str, u32>,
    /// A dictionary mapping special token strings to their token values.
    pub special_tokens: HashMap<&'static str, u32>,
    /// The number of tokens in the vocabulary. If provided, it is checked
    /// that the number of mergeable tokens and special tokens is equal to this number.
    pub explicit_n_vocab: Option<u32>,
}

/// Lazily-constructed definition of the GPT-2 encoding.
///
/// NOTE(review): `pat_str` contains the look-ahead `(?!\S)`, and the `regex`
/// crate does not support look-around, so `Regex::new(...).unwrap()` below
/// panics the first time `GPT2` is dereferenced. The pattern needs the
/// `fancy-regex` crate (already a dependency of this crate), which in turn
/// requires changing the type of `Encoding::pat_str`.
pub static GPT2: Lazy<Encoding> = Lazy::new(|| {
    // NOTE(review): the ranks map is empty here even though explicit_n_vocab
    // claims 50257 — presumably the real ranks are meant to be loaded from the
    // vocabulary data later; confirm before relying on this encoding.
    let mergeable_ranks = Default::default();
    let special_tokens = [
        ("<|endoftext|>", 50256)
    ].iter().cloned().collect();

    Encoding{
        name: "gpt2",
        pat_str: Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+").unwrap(),
        mergeable_ranks,
        special_tokens,
        explicit_n_vocab: Some(50257),
    }
});

#[cfg(test)]
mod tests {
    use super::*;

    /// Sanity-check the statically declared GPT-2 encoding metadata.
    ///
    /// The original test only bound `GPT2.name` to an unused local and
    /// asserted nothing; assert the declared contract instead.
    ///
    /// NOTE(review): dereferencing `GPT2` compiles `pat_str` with the `regex`
    /// crate, which rejects the look-ahead in the pattern, so this test will
    /// panic at initialization until that is fixed — it pins the intended
    /// metadata either way.
    #[test]
    fn gpt2_metadata() {
        assert_eq!(GPT2.name, "gpt2");
        assert_eq!(GPT2.explicit_n_vocab, Some(50257));
        assert_eq!(GPT2.special_tokens.get("<|endoftext|>"), Some(&50256));
    }
}
31 changes: 31 additions & 0 deletions rs-tiktoken/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// NOTE: this appears to refer to a new compiler/Clippy check that seemed buggy
// (possibly a PyO3 interaction) — TODO: identify which check, or remove this note.
pub mod core;
pub mod encoding;
mod model;

/// Stub — presumably intended to look up an [`encoding::Encoding`] by name,
/// mirroring Python tiktoken's `get_encoding`; currently does nothing and
/// returns `()`. TODO(review): implement and give it a real signature.
pub fn get_encoding() {

}

#[cfg(test)]
mod test {


    /// Placeholder for integration tests to be ported from the Python test
    /// suite; the commented block below is the original Python kept as the
    /// porting plan (it exercises get_encoding, encode/decode round-trips,
    /// allowed special tokens, and single-token round-trips per encoding).
    #[test]
    fn test_simple() {
        // enc = tiktoken.get_encoding("gpt2")
        // assert enc.encode("hello world") == [31373, 995]
        // assert enc.decode([31373, 995]) == "hello world"
        // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
        //
        // enc = tiktoken.get_encoding("cl100k_base")
        // assert enc.encode("hello world") == [15339, 1917]
        // assert enc.decode([15339, 1917]) == "hello world"
        // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
        //
        // for enc_name in tiktoken.list_encoding_names():
        //     enc = tiktoken.get_encoding(enc_name)
        //     for token in range(10_000):
        //         assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
    }
}
Empty file added rs-tiktoken/src/model.rs
Empty file.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
name="tiktoken",
rust_extensions=[
RustExtension(
"tiktoken._tiktoken",
target="tiktoken._tiktoken",
binding=Binding.PyO3,
# Between our use of editable installs and wanting to use Rust for performance sensitive
# code, it makes sense to just always use --release
debug=False,
path="py-tiktoken/Cargo.toml",
)
],
package_data={"tiktoken": ["py.typed"]},
Expand Down
3 changes: 0 additions & 3 deletions src/lib.rs

This file was deleted.

0 comments on commit 9db8696

Please sign in to comment.