Skip to content

Commit

Permalink
Merge pull request zesterer#476 from CraftSpider/regex-up
Browse files Browse the repository at this point in the history
Swap to regex-automata and significantly improve regex performance
  • Loading branch information
zesterer committed Jul 14, 2023
2 parents 6f3371f + 94e237c commit 342d2ec
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 59 deletions.
10 changes: 6 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ lexical-numbers = ["lexical", "unstable"]
# Adds impl of Parser for either::Either
either = ["dep:either"]

# Enables regex combinators
regex = ["dep:regex-automata"]

# An alias of all features that work with the stable compiler.
# Do not use this feature, its removal is not considered a breaking change and its behaviour may change.
# If you're working on chumsky and you're adding a feature that does not require nightly support, please add it to this list.
Expand All @@ -59,14 +62,13 @@ all-features = true
rustdoc-args = ["--cfg", "docsrs"]

[dependencies]
hashbrown = "0.13"
hashbrown = "0.14"
stacker = { version = "0.1", optional = true }
# Enables regex combinators
regex = { version = "1.7", optional = true }
regex-automata = { version = "0.3", optional = true }
spin = { version = "0.9", features = ["once"], default-features = false, optional = true }
lexical = { version = "6.1.1", default-features = false, features = ["parse-integers", "parse-floats", "format"], optional = true }
either = { version = "1.8.1", optional = true }
unicode-ident = "1.0.9"
unicode-ident = "1.0.10"

[dev-dependencies]
ariadne = "0.2"
Expand Down
62 changes: 61 additions & 1 deletion benches/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,69 @@ fn bench_then(c: &mut Criterion) {
});
}

/// Benchmarks the `regex` combinator against a handful of small inputs.
///
/// Three patterns are compiled once up front and then parsed repeatedly inside
/// the `"regex"` benchmark group. Only built when the `regex` feature is on.
#[cfg(feature = "regex")]
fn bench_regex(c: &mut Criterion) {
    // Patterns under test: a plain literal, a leading character class, and a
    // counted repetition of a non-capturing group.
    let literal = regex::<_, _, extra::Default>("foo");
    let char_class = regex::<_, _, extra::Default>("[fF]oo");
    let repeated = regex::<_, _, extra::Default>("(?:abc){4}");

    let mut regex_group = c.benchmark_group("regex");

    // Literal pattern; the parse is expected to succeed.
    regex_group.bench_function(BenchmarkId::new("foo", "foo"), |bencher| {
        bencher.iter(|| {
            let outcome = black_box(literal.parse(black_box("foo")));
            outcome.into_result().unwrap();
        })
    });

    // Literal pattern; the parse is expected to produce an error.
    regex_group.bench_function(BenchmarkId::new("foo", "barfoofoofoo"), |bencher| {
        bencher.iter(|| {
            let outcome = black_box(literal.parse(black_box("barfoofoofoo")));
            outcome.into_result().unwrap_err();
        })
    });

    // Character-class pattern, lowercase input; the parse is expected to succeed.
    regex_group.bench_function(BenchmarkId::new("[fF]oo", "foo"), |bencher| {
        bencher.iter(|| {
            let outcome = black_box(char_class.parse(black_box("foo")));
            outcome.into_result().unwrap()
        })
    });

    // Character-class pattern, uppercase input; the parse is expected to succeed.
    regex_group.bench_function(BenchmarkId::new("[fF]oo", "Foo"), |bencher| {
        bencher.iter(|| {
            let outcome = black_box(char_class.parse(black_box("Foo")));
            outcome.into_result().unwrap()
        })
    });

    // Character-class pattern; the parse is expected to produce an error.
    regex_group.bench_function(BenchmarkId::new("[fF]oo", "barFoofoo"), |bencher| {
        bencher.iter(|| {
            let outcome = black_box(char_class.parse(black_box("barFoofoo")));
            outcome.into_result().unwrap_err()
        })
    });

    // Repetition pattern; the parse is expected to succeed.
    regex_group.bench_function(BenchmarkId::new("(?:abc){4}", "abcabcabcabc"), |bencher| {
        bencher.iter(|| {
            let outcome = black_box(repeated.parse(black_box("abcabcabcabc")));
            outcome.into_result().unwrap()
        })
    });
}

/// No-op stand-in used when the `regex` feature is disabled, so that
/// `bench_regex` can still be named as a benchmark target.
#[cfg(not(feature = "regex"))]
fn bench_regex(_criterion: &mut Criterion) {}

criterion_group!(
name = benches;
config = utils::make_criterion();
targets = bench_choice, bench_or, bench_group, bench_then,
targets = bench_choice, bench_or, bench_group, bench_then, bench_regex,
);
criterion_main!(benches);
45 changes: 44 additions & 1 deletion src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,10 @@ pub trait SliceInput<'a>: ExactSizeInput<'a> {
/// The unsized slice type of this input. For [`&str`] it's `&str`, and for [`&[T]`] it will be `&[T]`.
type Slice;

/// Get the full slice of the input
#[doc(hidden)]
fn full_slice(&self) -> Self::Slice;

/// Get a slice from a start and end offset
// TODO: Make unsafe
#[doc(hidden)]
Expand Down Expand Up @@ -249,6 +253,11 @@ impl<'a> StrInput<'a, char> for &'a str {}
impl<'a> SliceInput<'a> for &'a str {
type Slice = &'a str;

// The input is itself a `&'a str`, so the full slice is just a copy of that
// reference (keeping the `'a` lifetime rather than the lifetime of `&self`).
#[inline(always)]
fn full_slice(&self) -> Self::Slice {
*self
}

#[inline(always)]
fn slice(&self, range: Range<Self::Offset>) -> Self::Slice {
&self[range]
Expand Down Expand Up @@ -301,6 +310,11 @@ impl<'a> StrInput<'a, u8> for &'a [u8] {}
impl<'a, T> SliceInput<'a> for &'a [T] {
type Slice = &'a [T];

// The input is itself a `&'a [T]`, so the full slice is just a copy of that
// reference (keeping the `'a` lifetime rather than the lifetime of `&self`).
#[inline(always)]
fn full_slice(&self) -> Self::Slice {
*self
}

#[inline(always)]
fn slice(&self, range: Range<Self::Offset>) -> Self::Slice {
&self[range]
Expand Down Expand Up @@ -375,6 +389,11 @@ impl<'a, const N: usize> StrInput<'a, u8> for &'a [u8; N] {}
impl<'a, T: 'a, const N: usize> SliceInput<'a> for &'a [T; N] {
type Slice = &'a [T];

// The input is a `&'a [T; N]`; copying it out of `self` and returning it as
// `Self::Slice` (`&'a [T]`) coerces the array reference to a slice.
#[inline(always)]
fn full_slice(&self) -> Self::Slice {
*self
}

#[inline(always)]
fn slice(&self, range: Range<Self::Offset>) -> Self::Slice {
&self[range]
Expand Down Expand Up @@ -535,6 +554,11 @@ where
{
type Slice = I::Slice;

// Forwards to the wrapped input's `SliceInput::full_slice`, called with
// fully-qualified syntax on the `input` field.
#[inline(always)]
fn full_slice(&self) -> Self::Slice {
<I as SliceInput>::full_slice(&self.input)
}

#[inline(always)]
fn slice(&self, range: Range<Self::Offset>) -> Self::Slice {
<I as SliceInput>::slice(&self.input, range)
Expand Down Expand Up @@ -642,6 +666,11 @@ where
{
type Slice = I::Slice;

// Forwards to the wrapped input's `SliceInput::full_slice`, called with
// fully-qualified syntax on the `input` field.
#[inline(always)]
fn full_slice(&self) -> Self::Slice {
<I as SliceInput>::full_slice(&self.input)
}

#[inline(always)]
fn slice(&self, range: Range<Self::Offset>) -> Self::Slice {
<I as SliceInput>::slice(&self.input, range)
Expand Down Expand Up @@ -758,6 +787,11 @@ where
{
type Slice = I::Slice;

// Forwards to the wrapped input's `SliceInput::full_slice`, called with
// fully-qualified syntax on the `input` field.
#[inline(always)]
fn full_slice(&self) -> Self::Slice {
<I as SliceInput>::full_slice(&self.input)
}

#[inline(always)]
fn slice(&self, range: Range<Self::Offset>) -> Self::Slice {
<I as SliceInput>::slice(&self.input, range)
Expand Down Expand Up @@ -1300,6 +1334,15 @@ impl<'a, 'parse, I: Input<'a>, E: ParserExtra<'a, I>> InputRef<'a, 'parse, I, E>
let _ = self.next_inner();
}

/// Get the full slice of the underlying input by forwarding to
/// `self.input.full_slice()`, available through the `I: SliceInput<'a>` bound.
///
/// No offset range is applied here; callers that only want part of the input
/// must restrict the returned slice themselves.
// Crate-internal helper: the `cfg_attr` below silences the dead-code lint when
// the `regex` feature is disabled.
#[cfg_attr(not(feature = "regex"), allow(dead_code))]
#[inline]
pub(crate) fn full_slice(&self) -> I::Slice
where
I: SliceInput<'a>,
{
self.input.full_slice()
}

/// Get a slice of the input that covers the given offset range.
#[inline]
pub fn slice(&self, range: Range<Offset<'a, 'parse, I>>) -> I::Slice
Expand Down Expand Up @@ -1336,7 +1379,7 @@ impl<'a, 'parse, I: Input<'a>, E: ParserExtra<'a, I>> InputRef<'a, 'parse, I, E>
self.input.slice_from(range)
}

#[cfg_attr(not(feature = "regex"), allow(dead_code))]
#[cfg_attr(not(feature = "lexical-numbers"), allow(dead_code))]
#[inline(always)]
pub(crate) fn slice_trailing_inner(&self) -> I::Slice
where
Expand Down
22 changes: 13 additions & 9 deletions src/regex.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
//! Implementations of regex-based parsers

use super::*;
use regex_automata::{meta, Anchored, Input as ReInput};

/// See [`regex()`].
pub struct Regex<C: Char, I, E> {
regex: C::Regex,
regex: meta::Regex,
#[allow(dead_code)]
phantom: EmptyPhantom<(E, I)>,
phantom: EmptyPhantom<(C, E, I)>,
}

impl<C: Char, I, E> Copy for Regex<C, I, E> where C::Regex: Copy {}
impl<C: Char, I, E> Clone for Regex<C, I, E>
where
C::Regex: Clone,
{
impl<C: Char, I, E> Clone for Regex<C, I, E> {
fn clone(&self) -> Self {
Self {
regex: self.regex.clone(),
Expand All @@ -25,7 +22,7 @@ where
/// Match input based on a provided regex pattern
pub fn regex<C: Char, I, E>(pattern: &str) -> Regex<C, I, E> {
Regex {
regex: C::new_regex(pattern),
regex: meta::Regex::new(pattern).expect("Failed to compile regex"),
phantom: EmptyPhantom::new(),
}
}
Expand All @@ -39,7 +36,14 @@ where
#[inline]
fn go<M: Mode>(&self, inp: &mut InputRef<'a, '_, I, E>) -> PResult<M, &'a C::Str> {
let before = inp.offset();
match C::match_regex(&self.regex, inp.slice_trailing_inner()) {

let re_in = ReInput::new(inp.full_slice())
.anchored(Anchored::Yes)
.range(before.offset..);

let res = self.regex.find(re_in).map(|m| m.len());

match res {
Some(len) => {
let before = inp.offset();
inp.skip_bytes(len);
Expand Down
45 changes: 1 addition & 44 deletions src/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,7 @@ pub trait Char: Sized + Copy + PartialEq + fmt::Debug + Sealed + 'static {
/// The default unsized [`str`]-like type of a linear sequence of this character.
///
/// For [`char`], this is [`str`]. For [`u8`], this is [`[u8]`].
type Str: ?Sized + AsRef<Self::Str> + 'static;

/// The type of a regex expression which can match on this type
#[cfg(feature = "regex")]
type Regex;

#[cfg(feature = "regex")]
#[doc(hidden)]
fn new_regex(pattern: &str) -> Self::Regex;
#[cfg(feature = "regex")]
#[doc(hidden)]
fn match_regex(regex: &Self::Regex, trailing: &Self::Str) -> Option<usize>;
type Str: ?Sized + AsRef<[u8]> + AsRef<Self::Str> + 'static;

/// Convert the given ASCII character to this character type.
fn from_ascii(c: u8) -> Self;
Expand Down Expand Up @@ -66,22 +55,6 @@ impl Sealed for char {}
impl Char for char {
type Str = str;

#[cfg(feature = "regex")]
type Regex = ::regex::Regex;

#[cfg(feature = "regex")]
fn new_regex(pattern: &str) -> Self::Regex {
::regex::Regex::new(pattern).expect("Failed to compile regex")
}
#[cfg(feature = "regex")]
#[inline]
fn match_regex(regex: &Self::Regex, trailing: &Self::Str) -> Option<usize> {
regex
.find(trailing)
.filter(|m| m.start() == 0)
.map(|m| m.end())
}

fn from_ascii(c: u8) -> Self {
c as char
}
Expand Down Expand Up @@ -119,22 +92,6 @@ impl Sealed for u8 {}
impl Char for u8 {
type Str = [u8];

#[cfg(feature = "regex")]
type Regex = ::regex::bytes::Regex;

#[cfg(feature = "regex")]
fn new_regex(pattern: &str) -> Self::Regex {
::regex::bytes::Regex::new(pattern).expect("Failed to compile regex")
}
#[cfg(feature = "regex")]
#[inline]
fn match_regex(regex: &Self::Regex, trailing: &Self::Str) -> Option<usize> {
regex
.find(trailing)
.filter(|m| m.start() == 0)
.map(|m| m.end())
}

fn from_ascii(c: u8) -> Self {
c
}
Expand Down

0 comments on commit 342d2ec

Please sign in to comment.