From 9db86969ad9e2b7a81ad6e0535baf02300a05b7b Mon Sep 17 00:00:00 2001 From: Hector Miuler Malpica Gallegos Date: Tue, 18 Jul 2023 17:46:38 -0500 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(rust):=20Convert=20project=20t?= =?UTF-8?q?o=20a=20multi-crate=20workspace?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit restructures the project from a single-crate workspace into a multi-crate workspace, dividing it into 'rs-tiktoken' and 'py-tiktoken'. This is done to improve the clarity of the organization of the codebase and make the Rust and Python modules separate for easier code maintenance. The setup.py is also updated to reflect these changes in the directory structure. Refs: #24 --- Cargo.toml | 26 +++--------- py-tiktoken/Cargo.toml | 22 ++++++++++ py-tiktoken/src/lib.rs | 1 + {src => py-tiktoken/src}/tiktoken_py.rs | 4 +- rs-tiktoken/Cargo.toml | 15 +++++++ src/tiktoken.rs => rs-tiktoken/src/core.rs | 2 +- rs-tiktoken/src/encoding.rs | 49 ++++++++++++++++++++++ rs-tiktoken/src/lib.rs | 31 ++++++++++++++ rs-tiktoken/src/model.rs | 0 setup.py | 3 +- src/lib.rs | 3 -- 11 files changed, 128 insertions(+), 28 deletions(-) create mode 100644 py-tiktoken/Cargo.toml create mode 100644 py-tiktoken/src/lib.rs rename {src => py-tiktoken/src}/tiktoken_py.rs (98%) create mode 100644 rs-tiktoken/Cargo.toml rename src/tiktoken.rs => rs-tiktoken/src/core.rs (99%) create mode 100644 rs-tiktoken/src/encoding.rs create mode 100644 rs-tiktoken/src/lib.rs create mode 100644 rs-tiktoken/src/model.rs delete mode 100644 src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 948b9f13..908492d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,5 @@ -[package] -name = "tiktoken" -version = "0.4.0" -edition = "2021" -rust-version = "1.57.0" - -[lib] -name = "_tiktoken" -crate-type = ["cdylib"] - -[dependencies] -pyo3 = { version = "0.19.0", features = ["extension-module"] } - -# tiktoken dependencies -fancy-regex = "0.11.0" -regex = 
"1.8.3" -rustc-hash = "1.1.0" -bstr = "1.5.0" - -[profile.release] -incremental = true +[workspace] +members = [ + "rs-tiktoken", + "py-tiktoken", +] \ No newline at end of file diff --git a/py-tiktoken/Cargo.toml b/py-tiktoken/Cargo.toml new file mode 100644 index 00000000..e02a8121 --- /dev/null +++ b/py-tiktoken/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "py-tiktoken" +version = "0.4.0" +edition = "2021" +rust-version = "1.57.0" + +[lib] +name = "_tiktoken" +crate-type = ["cdylib"] + +[dependencies] +tiktoken = { path = "../rs-tiktoken" } +pyo3 = { version = "0.19.0", features = ["extension-module"] } + +# tiktoken dependencies +fancy-regex = "0.11.0" +regex = "1.8.3" +rustc-hash = "1.1.0" +bstr = "1.5.0" + +[profile.release] +incremental = true diff --git a/py-tiktoken/src/lib.rs b/py-tiktoken/src/lib.rs new file mode 100644 index 00000000..e13657a2 --- /dev/null +++ b/py-tiktoken/src/lib.rs @@ -0,0 +1 @@ +pub mod tiktoken_py; diff --git a/src/tiktoken_py.rs b/py-tiktoken/src/tiktoken_py.rs similarity index 98% rename from src/tiktoken_py.rs rename to py-tiktoken/src/tiktoken_py.rs index cb7cc6df..90157116 100644 --- a/src/tiktoken_py.rs +++ b/py-tiktoken/src/tiktoken_py.rs @@ -10,7 +10,7 @@ use pyo3::PyResult; use pyo3::types::{PyBytes, PyList, PyTuple}; use rustc_hash::FxHashMap as HashMap; -use crate::tiktoken::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS}; +use tiktoken::core::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS}; #[pyclass] pub struct PyCoreBPE { @@ -181,7 +181,7 @@ pub fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> { mod tests { use rustc_hash::FxHashMap as HashMap; - use crate::tiktoken::byte_pair_split; + use crate::core::byte_pair_split; #[test] fn very_simple_test() { diff --git a/rs-tiktoken/Cargo.toml b/rs-tiktoken/Cargo.toml new file mode 100644 index 00000000..520a6ebf --- /dev/null +++ b/rs-tiktoken/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "tiktoken" +version = "0.4.0" +edition = "2021" +rust-version = "1.57.0" + 
+[dependencies] +fancy-regex = "0.11.0" +regex = "1.8.3" +rustc-hash = "1.1.0" +bstr = "1.5.0" +once_cell = "1.18.0" + +[profile.release] +incremental = true diff --git a/src/tiktoken.rs b/rs-tiktoken/src/core.rs similarity index 99% rename from src/tiktoken.rs rename to rs-tiktoken/src/core.rs index 53c1a075..e32a37c3 100644 --- a/src/tiktoken.rs +++ b/rs-tiktoken/src/core.rs @@ -152,7 +152,7 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, usize>) -> pub struct FakeThreadId(NonZeroU64); -pub fn hash_current_thread() -> usize { +fn hash_current_thread() -> usize { // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter // that works great for our use case of avoiding collisions in our array. Unfortunately, // it's private. However, there are only so many ways you can layout a u64, so just transmute diff --git a/rs-tiktoken/src/encoding.rs b/rs-tiktoken/src/encoding.rs new file mode 100644 index 00000000..cdc1671f --- /dev/null +++ b/rs-tiktoken/src/encoding.rs @@ -0,0 +1,49 @@ +//! WARNING: This code is under active development. Functionality, +//! behavior, and the interface may change in future updates. + +use std::collections::HashMap; +use once_cell::sync::Lazy; +use fancy_regex::Regex; + + +pub struct Encoding { + /// The name of the encoding. It should be clear from the name of the encoding + /// what behaviour to expect, in particular, encodings with different special tokens + /// should have different names. + pub name: &'static str, + /// A compiled regex pattern that is used to split the input text. + pub pat_str: Regex, + /// A dictionary mapping mergeable token bytes to their ranks. The ranks + /// must correspond to merge priority. + pub mergeable_ranks: HashMap<&'static str, u32>, + /// A dictionary mapping special token strings to their token values. + pub special_tokens: HashMap<&'static str, u32>, + /// The number of tokens in the vocabulary. 
If provided, it is checked + /// that the number of mergeable tokens and special tokens is equal to this number. + pub explicit_n_vocab: Option, +} + +pub static GPT2: Lazy<Encoding> = Lazy::new(|| { + let mergeable_ranks = Default::default(); + let special_tokens = [ + ("<|endoftext|>", 50256) + ].iter().cloned().collect(); + + Encoding{ + name: "gpt2", + pat_str: Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+").unwrap(), + mergeable_ranks, + special_tokens, + explicit_n_vocab: Some(50257), + } +}); + +#[cfg(test)] +mod tes { + use super::*; + + #[test] + fn test() { + let a = GPT2.name; + } +} \ No newline at end of file diff --git a/rs-tiktoken/src/lib.rs b/rs-tiktoken/src/lib.rs new file mode 100644 index 00000000..e94db18a --- /dev/null +++ b/rs-tiktoken/src/lib.rs @@ -0,0 +1,31 @@ +// This check is new and seems buggy (possibly with PyO3 interaction) +pub mod core; +pub mod encoding; +mod model; + +pub fn get_encoding() { + +} + +#[cfg(test)] +mod test { + + + #[test] + fn test_simple() { + // enc = tiktoken.get_encoding("gpt2") + // assert enc.encode("hello world") == [31373, 995] + // assert enc.decode([31373, 995]) == "hello world" + // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256] + // + // enc = tiktoken.get_encoding("cl100k_base") + // assert enc.encode("hello world") == [15339, 1917] + // assert enc.decode([15339, 1917]) == "hello world" + // assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257] + // + // for enc_name in tiktoken.list_encoding_names(): + // enc = tiktoken.get_encoding(enc_name) + // for token in range(10_000): + // assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token + } +} diff --git a/rs-tiktoken/src/model.rs b/rs-tiktoken/src/model.rs new file mode 100644 index 00000000..e69de29b diff --git a/setup.py b/setup.py index a22e8e5d..b0b42967 100644 --- a/setup.py +++ b/setup.py @@ -5,11 +5,12 @@ 
name="tiktoken", rust_extensions=[ RustExtension( - "tiktoken._tiktoken", + target="tiktoken._tiktoken", binding=Binding.PyO3, # Between our use of editable installs and wanting to use Rust for performance sensitive # code, it makes sense to just always use --release debug=False, + path="py-tiktoken/Cargo.toml", ) ], package_data={"tiktoken": ["py.typed"]}, diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 54210bd5..00000000 --- a/src/lib.rs +++ /dev/null @@ -1,3 +0,0 @@ -// This check is new and seems buggy (possibly with PyO3 interaction) -pub mod tiktoken_py; -pub mod tiktoken; \ No newline at end of file