-
Notifications
You must be signed in to change notification settings - Fork 15
/
engines.toml
697 lines (664 loc) · 20.1 KB
/
engines.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
# This file defines all of the regex engines that 'rebar' knows how to run. In
# all cases, regex engines are executed over a sub-process boundary.
#
# While a sub-process boundary is not actually necessary to benchmark many
# regex engines (for example, it could be done in-process via a C API), we
# still use a sub-process boundary in all cases to ensure our methodology
# remains invariant with respect to other regex engines. The communication at
# the process boundary is designed such that sub-process overhead plays no role
# in the measurements.
#
# The reason for using a sub-process boundary at all is because many regex
# engines cannot be or cannot easily be exposed via a C API. Thus, rebar uses
# the lowest possible common denominator: the process boundary. That is,
# probably every regex engine that could ever be worth benchmarking should be
# able to be exposed via some kind of process.
#
# The format for this file is defined at FORMAT.md#engine-toml-format. Although
# it's likely enough to just look at existing examples if you want to add a new
# engine.
# The regex crate for Rust. The standard or "default" regex engine for Rust.
#
# URL: https://github.com/rust-lang/regex
[[engine]]
name = "rust/regex"
cwd = "engines/rust/regex"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# The regress regex engine, written in Rust. It is meant to support EcmaScript
# syntax.
#
# URL: https://github.com/ridiculousfish/regress
[[engine]]
name = "regress"
cwd = "engines/regress"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# The RE2 regex engine out of Google, written in C++. Responsible for
# rekindling interest in using automata for regex engine implementations. Also
# an ancestor to Rust's regex crate, RE2/J and Go's standard library regexp
# package.
#
# URL: https://github.com/google/re2
[[engine]]
name = "re2"
cwd = "engines/re2"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.dependency]]
bin = "cc"
args = ["--version"]
[[engine.dependency]]
bin = "c++"
args = ["--version"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# PCRE2 brings "Perl compatible regular expressions" to C as a standalone
# library. This engine uses the "standard" PCRE2 API that utilizes an
# interpreter as an implementation detail.
#
# URL: https://github.com/PCRE2Project/pcre2
[[engine]]
name = "pcre2"
cwd = "engines/pcre2"
[engine.version]
bin = "./target/release/main"
args = ["interp", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["interp"]
[[engine.dependency]]
bin = "cc"
args = ["--version"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# PCRE2 brings "Perl compatible regular expressions" to C as a standalone
# library. This engine uses PCRE2's homegrown JIT built via the sljit compiler.
#
# URL: https://github.com/PCRE2Project/pcre2
# URL: https://github.com/zherczeg/sljit
[[engine]]
name = "pcre2/jit"
cwd = "engines/pcre2"
[engine.version]
bin = "./target/release/main"
args = ["jit", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["jit"]
[[engine.dependency]]
bin = "cc"
args = ["--version"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# The regex engine included as part of Go's standard library. It's ancestor is
# RE2, although it lacks RE2's lazy DFA.
#
# URL: https://pkg.go.dev/regexp
[[engine]]
name = "go/regexp"
cwd = "engines/go"
[engine.version]
bin = "./main"
args = ["version"]
regex = 'go(?P<version>[0-9]\.[0-9]+\.[0-9]+)'
[engine.run]
bin = "./main"
[[engine.build]]
bin = "go"
args = ["build"]
[[engine.clean]]
bin = "rm"
args = ["-f", "main"]
# The regex engine included as part of Python's standard library.
#
# URL: https://docs.python.org/3/library/re.html
[[engine]]
name = "python/re"
cwd = "engines/python"
[engine.version]
bin = "ve/bin/python"
args = ["--version"]
regex = '(?m)^Python (?P<version>.+)$'
[engine.run]
bin = "ve/bin/python"
args = ["main.py", "re"]
[[engine.dependency]]
bin = "python"
args = ["--version"]
regex = '(?m)^Python 3\.'
[[engine.dependency]]
bin = "virtualenv"
args = ["--version"]
regex = '(?m)^virtualenv\s+'
[[engine.build]]
bin = "virtualenv"
args = ["ve"]
[[engine.clean]]
bin = "rm"
args = ["-rf", "./ve"]
# The third party Python 'regex' module. Supports oodles of features.
#
# URL: https://github.com/mrabarnett/mrab-regex
[[engine]]
name = "python/regex"
cwd = "engines/python"
[engine.version]
bin = "ve/bin/pip"
args = ["show", "regex"]
regex = '(?m)^Version: (?P<version>.+)$'
[engine.run]
bin = "ve/bin/python"
args = ["main.py", "regex"]
[[engine.dependency]]
bin = "python"
args = ["--version"]
regex = '(?m)^Python 3\.'
[[engine.dependency]]
bin = "virtualenv"
args = ["--version"]
regex = '(?m)^virtualenv\s+'
[[engine.build]]
bin = "virtualenv"
args = ["ve"]
[[engine.build]]
bin = "ve/bin/pip"
args = ["install", "regex"]
[[engine.clean]]
bin = "rm"
args = ["-rf", "./ve"]
# The venerable Hyperscan regex engine. Notable for its best-in-class literal
# optimizations, support for stream searching and excellent support for
# searching for many regexes at once with good scaling properties. Its main
# quirks are that it struggles with finding the start of a match and always
# reports the position of every match instead of the leftmost-first or
# leftmost-longest match.
#
# Sadly, it looks like Hyperscan might be a dead (or nearly dead) project. We
# should consider switching (or just adding) Vectorscan...
#
# URL: https://github.com/intel/hyperscan
#
# Note that we currently use a third party Rust binding to Hyperscan. I
# generally tried to avoid these for other engines (like RE2 and PCRE2), but
# I ran out of steam. Hyperscan's API is a bit more complicated to wrap and
# its build is also more complex. I did briefly audit the binding library and
# things look reasonableish. (But not the 'regex' API wrapper. It allocates new
# scratch space for every search. Definitely do not use that ever.)
#
# URL: https://github.com/flier/rust-hyperscan
[[engine]]
name = "hyperscan"
cwd = "engines/hyperscan"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.dependency]]
bin = "cc"
args = ["--version"]
[[engine.dependency]]
bin = "c++"
args = ["--version"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# .NET's default or interpreter regex engine.
#
# URL: https://learn.microsoft.com/en-us/dotnet/standard/base-types/regular-expressions
[[engine]]
name = "dotnet"
cwd = "engines/dotnet"
[engine.version]
bin = "bin/Release/net7.0/main"
args = ["version"]
[engine.run]
bin = "bin/Release/net7.0/main"
args = ["interp"]
[[engine.dependency]]
bin = "dotnet"
args = ["--list-sdks"]
regex = '(?m)^7\.'
[[engine.build]]
bin = "dotnet"
args = ["build", "-c", "Release"]
[[engine.clean]]
bin = "dotnet"
args = ["clean", "-c", "Release"]
# .NET's "compiled" regex engine.
#
# This is like the interpreter engine, but sets RegexOptions.Compiled.
#
# URL: https://learn.microsoft.com/en-us/dotnet/standard/base-types/regular-expressions
[[engine]]
name = "dotnet/compiled"
cwd = "engines/dotnet"
[engine.version]
bin = "bin/Release/net7.0/main"
args = ["version"]
[engine.run]
bin = "bin/Release/net7.0/main"
args = ["compiled"]
[[engine.dependency]]
bin = "dotnet"
args = ["--list-sdks"]
regex = '(?m)^7\.'
[[engine.build]]
bin = "dotnet"
args = ["build", "-c", "Release"]
[[engine.clean]]
bin = "dotnet"
args = ["clean", "-c", "Release"]
# .NET's "non-backtracking" regex engine.
#
# This is like the interpreter engine, but sets RegexOptions.NonBacktracking.
#
# Note that this was introduced in .NET 7.
#
# URL: https://learn.microsoft.com/en-us/dotnet/standard/base-types/regular-expressions
[[engine]]
name = "dotnet/nobacktrack"
cwd = "engines/dotnet"
[engine.version]
bin = "bin/Release/net7.0/main"
args = ["version"]
[engine.run]
bin = "bin/Release/net7.0/main"
args = ["nobacktrack"]
[[engine.dependency]]
bin = "dotnet"
args = ["--list-sdks"]
regex = '(?m)^7\.'
[[engine.build]]
bin = "dotnet"
args = ["build", "-c", "Release"]
[[engine.clean]]
bin = "dotnet"
args = ["clean", "-c", "Release"]
# Perl's regex engine.
#
# URL: https://perldoc.perl.org/perlre
[[engine]]
name = "perl"
cwd = "engines/perl"
[engine.version]
bin = "perl"
args = ["--version"]
regex = 'This is perl 5, .+\(v(?P<version>[0-9]+\.[0-9]+\.[0-9]+)\)'
[engine.run]
bin = "perl"
args = ["main.pl"]
# Java's regex engine on the HotSpot VM.
#
# URL: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
[[engine]]
name = "java/hotspot"
cwd = "engines/java"
[engine.version]
bin = "java"
args = ["Main", "version"]
# We specifically require a mention of 'HotSpot' to avoid accidentally
# benchmarking something other than what we advertise.
regex = '^Java HotSpot.*VM (?P<version>.*)'
[engine.run]
bin = "java"
args = ["Main"]
[[engine.build]]
bin = "javac"
args = ["Main.java"]
[[engine.clean]]
bin = "sh"
args = ["-c", "rm *.class"]
# Javascript's regex engine from v8.
#
# URL: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
#
# Also known as irregexp:
# https://blog.chromium.org/2009/02/irregexp-google-chromes-new-regexp.html
#
# Note that v8 also has an experimental non-backtracking regex engine[1] that
# might be interesting to include as well, but it's not clear what the best way
# to include it is.
#
# [1]: https://v8.dev/blog/non-backtracking-regexp
[[engine]]
name = "javascript/v8"
cwd = "engines/javascript"
[engine.version]
bin = "node"
args = ["--version"]
regex = 'v(?P<version>.*)'
[engine.run]
bin = "node"
args = ["main.js"]
# A C++ regex engine bundled with ICU.
#
# URL: https://unicode-org.github.io/icu/userguide/strings/regexp.html
[[engine]]
name = "icu"
cwd = "engines/icu"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.dependency]]
bin = "pkg-config"
args = ["--libs", "--cflags", "icu-uc", "icu-i18n"]
regex = '(?m)^-licui18n -licuuc -licudata\b'
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# RUST REGEX CRATE INTERNAL REGEX ENGINES
#
# These reflect different regex engines used by 'rust/regex' internally, and
# are split out here to aide in the development of 'rust/regex' itself by
# comparing the perf of different regex engines directly. These generally help
# inform heuristics for knowing when to use one regex engine over the other.
# (Heuristics are not the only mechanism for deciding which engine to use, but
# they play a part. For example, some regex engines like the one-pass DFA can
# only run a relatively small subset of all regexes.)
# A "meta" regex engine that combines several others: a PikeVM, a backtracker,
# a one-pass DFA, a lazy DFA, a full DFA, Aho-Corasick, Teddy and memmem.
#
# The "meta" regex engine is nearly equivalent to the 'rust/regex' engine,
# as the latter is "just" a light wrapper over the meta engine. The meta
# engine principally exposes a lower level API and doesn't do any internal
# synchronization to access mutable scratch space. (Because it requires the
# caller to pass that mutable scratch space directly, unlike the higher level
# regex crate API.)
[[engine]]
name = "rust/regex/meta"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["meta", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["meta"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# The "dense DFA" regex engine. Basically, a full DFA with a classic table
# based transition table.
[[engine]]
name = "rust/regex/dense"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["dense", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["dense"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# The "sparse DFA" regex engine. A full DFA, but represents its transition
# table using a sparse format. Basically, slower transitions but less memory.
[[engine]]
name = "rust/regex/sparse"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["sparse", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["sparse"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# A hybrid NFA/DFA or "lazy DFA." Like a DFA, but builds itself during a
# search. It avoids the worst case exponential time/space complexity of fully
# building a DFA by guaranteeing that at most one transition (and one new
# state) are compiled for each byte in the haystack. Main downside is that in
# the worst cases, search can be quite slow.
[[engine]]
name = "rust/regex/hybrid"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["hybrid", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["hybrid"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# A full DFA limited to a subset of regexes that are "one pass." That is,
# conceptually, the regex does not require backtracking. Because of this
# property, the DFA contains at most one state for each state in the NFA, and
# thus it does not suffer from the usual worst case exponential space/time
# complexity of a full DFA.
#
# The main upside of a one-pass DFA is that it can resolve capture groups and
# handle Unicode word boundaries.
[[engine]]
name = "rust/regex/onepass"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["onepass", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["onepass"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# A bounded backtracking engine. Only used for small haystacks/regexes. It
# avoids worst case exponential time in exchange for heap space used to ensure
# any combination of haystack position and NFA state are only visited once.
[[engine]]
name = "rust/regex/backtrack"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["backtrack", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["backtrack"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# The Thompson NFA construction adapted to support capture groups, aka, the
# "Pike VM." Can handle everything and has worst case linear time/space
# complexity (with the regex held constant), but is usually quite slow in
# practice because of the overhead of NFA simulation.
[[engine]]
name = "rust/regex/pikevm"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["pikevm", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["pikevm"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# This is for benchmarking NFA compilation times. It doesn't make sense to use
# this for anything other than a 'model = "compile"' benchmark.
[[engine]]
name = "rust/regex/nfa"
cwd = "engines/rust/regex-automata"
[engine.version]
bin = "./target/release/main"
args = ["nfa", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["nfa"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# This is for benchmarking HIR translation times. It doesn't make sense to use
# this for anything other than a 'model = "compile"' benchmark.
[[engine]]
name = "rust/regex/hir"
cwd = "engines/rust/regex-syntax"
[engine.version]
bin = "./target/release/main"
args = ["hir", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["hir"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# This is for benchmarking AST parsing times. It doesn't make sense to use
# this for anything other than a 'model = "compile"' benchmark.
[[engine]]
name = "rust/regex/ast"
cwd = "engines/rust/regex-syntax"
[engine.version]
bin = "./target/release/main"
args = ["ast", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["ast"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# A fully compiled Aho-Corasick DFA. This is sometimes used in the meta regex
# engine.
[[engine]]
name = "rust/aho-corasick/dfa"
cwd = "engines/rust/aho-corasick"
[engine.version]
bin = "./target/release/main"
args = ["dfa", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["dfa"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# A fully compiled Aho-Corasick NFA. This is sometimes used in the meta regex
# engine.
[[engine]]
name = "rust/aho-corasick/nfa"
cwd = "engines/rust/aho-corasick"
[engine.version]
bin = "./target/release/main"
args = ["nfa", "--version"]
[engine.run]
bin = "./target/release/main"
args = ["nfa"]
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# Single substring search. Used whenever possible to accelerate searches.
# Utilizes heuristic frequency distribution of bytes, SIMD, Two-Way, memchr and
# Rabin-Karp.
[[engine]]
name = "rust/memchr/memmem"
cwd = "engines/rust/memchr"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]
# OLD RUST REGEX ENGINE
# This is the "old" regex crate before its internals were switched over to
# regex-automata. It's commented out by default because it's not generally
# interesting, but we do support it because it can be interesting to compare
# perf of the rewrite with the old version, particularly in cases where there
# was a regression.
#
# Note that in order to actually get measurements from this regex engine,
# you'll have to also uncomment some lines in rebar's Cargo.toml, as it is not
# compiled in by default.
[[engine]]
name = "rust/regexold"
cwd = "engines/rust/regex-old"
[engine.version]
bin = "./target/release/main"
args = ["--version"]
[engine.run]
bin = "./target/release/main"
[[engine.build]]
bin = "cargo"
args = ["build", "--release"]
[[engine.clean]]
bin = "cargo"
args = ["clean"]