using System.Diagnostics; using System.Text; using System.Text.RegularExpressions; /// The configuration of a benchmark, as parsed from the incoming KLV /// data. struct Config { public string engine; public string? name; public string? model; public string? pattern; public bool caseInsensitive; public bool unicode; // rebar benchmarks permit the haystack to be invalid UTF-8, // but .NET's regex engine can only search sequences of UTF-16 // code units (as far as I can tell). So there's no point in // trying to represent the haystack as a byte[]. If this runner // program is called with a haystack that contains invalid UTF-8, // then it will throw an exception. public string? haystack; public int maxIters; public int maxWarmupIters; public long maxTime; public long maxWarmupTime; public Config(string engineName) => engine = engineName; public Regex CompileRegex() => CompilePattern(pattern!); public Regex CompilePattern(string pat) { // We enable the invariant culture to avoid any sort of tailoring. // Tailoring might be useful to benchmark, but rebar doesn't do it. // Primarily because most (not all) regex engines don't support it. RegexOptions options = RegexOptions.CultureInvariant; if (caseInsensitive) { options |= RegexOptions.IgnoreCase; } options |= engine switch { "interp" => RegexOptions.None, "compiled" => RegexOptions.Compiled, "nobacktrack" => RegexOptions.NonBacktracking, _ => throw new Exception($"unrecognized engine '${engine}'"), }; return new Regex(pat, options); } } /// A single Key-Length-Value item. struct OneKLV { /// The key name. public string key; /// The value contents. public string value; /// /// The length, in bytes, used up by this KLV item. /// This is useful for parsing a sequence of KLV items. /// This length says how much to skip ahead to start /// parsing the next KLV item. /// public int len; public OneKLV(ReadOnlySpan raw) { // The default for the UTF-8 encoding is to lossily decode bytes. // But we really don't want to do that here. We want to return an // error, since otherwise, this runner could silently search a slightly // different haystack than what other runner programs do. // // This means that this runner program must only be used in benchmarks // with valid UTF-8. This is not an arbitrary decision. As far as I can // tell, C#'s regex engine only works on String or ReadOnlySpan, // both of which are Unicode strings and incapable of representing // arbitrary bytes. Encoding utf8 = Encoding.GetEncoding( "utf-8", new EncoderExceptionFallback(), new DecoderExceptionFallback() ); var keyEnd = raw.IndexOf((byte)':'); if (keyEnd < 0) { throw new Exception("invalid KLV item: could not find first ':'"); } key = utf8.GetString(raw.Slice(0, keyEnd)); raw = raw.Slice(keyEnd + 1); var valueLenEnd = raw.IndexOf((byte)':'); if (valueLenEnd < 0) { throw new Exception("invalid KLV item: could not find second ':'"); } int valueLen = int.Parse(utf8.GetString(raw.Slice(0, valueLenEnd))); if (raw[valueLenEnd + 1 + valueLen] != '\n') { throw new Exception("invalid KLV item: no line terminator"); } value = utf8.GetString(raw.Slice(valueLenEnd + 1, valueLen)); len = keyEnd + 1 + valueLenEnd + 1 + valueLen + 1; } } /// /// A representation of the data we gather from a single benchmark execution. /// That is, the time it took to run and the count reported for verification. /// /// /// The duration, in nanoseconds. This might not always /// have nanosecond resolution, but its units are always /// nanoseconds. /// /// /// The count reported by the benchmark. This is checked /// against what is expected in the benchmark definition /// by rebar. /// record struct Sample(long duration, int count); class Program { static void Main(string[] args) { if (args.Length != 1) { throw new Exception( "Usage: main " ); } if (args[0] == "version") { Console.WriteLine(Environment.Version); return; } // Read all of stdin into a span of bytes MemoryStream stdinCopy = new(); using (Stream stdin = Console.OpenStandardInput()) { stdin.CopyTo(stdinCopy); } ReadOnlySpan raw = stdinCopy.GetBuffer().AsSpan( 0, (int)stdinCopy.Length ); // OK, now read each of our KLV items and build up our config. Config config = new(args[0]); while (!raw.IsEmpty) { var klv = new OneKLV(raw); raw = raw.Slice(klv.len); switch (klv.key) { case "name": config.name = klv.value; break; case "model": config.model = klv.value; break; case "pattern": if (config.pattern != null) { throw new Exception("only one pattern is supported"); } config.pattern = klv.value; break; case "case-insensitive": config.caseInsensitive = klv.value == "true"; break; case "unicode": config.unicode = klv.value == "unicode"; break; case "haystack": config.haystack = klv.value; break; case "max-iters": config.maxIters = int.Parse(klv.value); break; case "max-warmup-iters": config.maxWarmupIters = int.Parse(klv.value); break; case "max-time": config.maxTime = long.Parse(klv.value); break; case "max-warmup-time": config.maxWarmupTime = long.Parse(klv.value); break; default: throw new Exception($"unrecognized KLV key {klv.key}"); } } if (config.model != "regex-redux" && config.pattern == null) { throw new Exception("missing pattern, must be provided once"); } // Run our selected model and print the samples. List samples = config.model switch { "compile" => ModelCompile(config), "count" => ModelCount(config), "count-spans" => ModelCountSpans(config), "count-captures" => ModelCountCaptures(config), "grep" => ModelGrep(config), "grep-captures" => ModelGrepCaptures(config), "regex-redux" => ModelRegexRedux(config), _ => throw new Exception( $"unknown benchmark model {config.model}" ), }; foreach (Sample s in samples) { Console.WriteLine($"{s.duration},{s.count}"); } } static List ModelCompile(Config config) { return RunAndCount( config, re => re.Count(config.haystack!), config.CompileRegex ); } static List ModelCount(Config config) { var re = config.CompileRegex(); return RunAndCount( config, n => n, () => re.Count(config.haystack!) ); } static List ModelCountSpans(Config config) { var re = config.CompileRegex(); return RunAndCount( config, n => n, () => { int count = 0; foreach (ValueMatch m in re.EnumerateMatches(config.haystack!)) { // This is not quite the same as most other regex // engines, which report span lengths in terms of // number of bytes. This is in terms of UTF-16 code // units. We deal this by permitting different counts // for .NET regex engines in the benchmark definition. count += m.Length; } return count; } ); } static List ModelCountCaptures(Config config) { var re = config.CompileRegex(); return RunAndCount( config, n => n, () => { int count = 0; Match m = re.Match(config.haystack!); while (m.Success) { foreach (Group g in m.Groups) { if (g.Success) { count++; } } m = m.NextMatch(); } return count; } ); } static List ModelGrep(Config config) { var re = config.CompileRegex(); return RunAndCount( config, n => n, () => { int count = 0; var span = config.haystack.AsSpan(); foreach (ReadOnlySpan line in span.EnumerateLines()) { if (re.IsMatch(line)) { count++; } } return count; } ); } static List ModelGrepCaptures(Config config) { var re = config.CompileRegex(); return RunAndCount( config, n => n, () => { int count = 0; var span = config.haystack.AsSpan(); foreach (ReadOnlySpan line in span.EnumerateLines()) { Match m = re.Match(line.ToString()); while (m.Success) { foreach (Group g in m.Groups) { if (g.Success) { count++; } } m = m.NextMatch(); } } return count; } ); } static List ModelRegexRedux(Config config) { return RunAndCount( config, n => n, () => { var expected = """ agggtaaa|tttaccct 6 [cgt]gggtaaa|tttaccc[acg] 26 a[act]ggtaaa|tttacc[agt]t 86 ag[act]gtaaa|tttac[agt]ct 58 agg[act]taaa|ttta[agt]cct 113 aggg[acg]aaa|ttt[cgt]ccct 31 agggt[cgt]aa|tt[acg]accct 31 agggta[cgt]a|t[acg]taccct 32 agggtaa[cgt]|[acg]ttaccct 43 1016745 1000000 547899 """; var result = new StringBuilder(); var seq = config.haystack!; var ilen = seq.Length; seq = config.CompilePattern(@">[^\n]*\n|\n").Replace(seq, ""); var clen = seq.Length; var variants = new string[] { @"agggtaaa|tttaccct", @"[cgt]gggtaaa|tttaccc[acg]", @"a[act]ggtaaa|tttacc[agt]t", @"ag[act]gtaaa|tttac[agt]ct", @"agg[act]taaa|ttta[agt]cct", @"aggg[acg]aaa|ttt[cgt]ccct", @"agggt[cgt]aa|tt[acg]accct", @"agggta[cgt]a|t[acg]taccct", @"agggtaa[cgt]|[acg]ttaccct", }; foreach (string variant in variants) { var re = config.CompilePattern(variant); var count = re.Count(seq); result.AppendLine($"{variant} {count}"); } seq = config.CompilePattern(@"tHa[Nt]").Replace(seq, "<4>"); seq = config.CompilePattern(@"aND|caN|Ha[DS]|WaS").Replace(seq, "<3>"); seq = config.CompilePattern(@"a[NSt]|BY").Replace(seq, "<2>"); seq = config.CompilePattern(@"<[^>]*>").Replace(seq, "|"); seq = config.CompilePattern(@"\|[^|][^|]*\|").Replace(seq, "-"); result.AppendLine(""); result.AppendLine($"{ilen}"); result.AppendLine($"{clen}"); result.AppendLine($"{seq.Length}"); if (result.ToString() != expected) { throw new Exception("result did not match expected"); } return seq.Length; } ); } // Takes in a benchmark config, a closure that returns the count from the // benchmark function and a benchmark function that returns a result that // can be converted into a count. As output, it returns a list of samples // generated by repeatedly running the 'bench' function and timing how long // it takes. // // In practice, 'bench' returns the count and 'count' is just the identity // function in all except for one case: measuring compilation time. In // that case, 'bench' returns the regex object itself, and 'count' runs the // regex to get the count. // // The 'count' function is not part of the measurement. static List RunAndCount( Config config, Func count, Func bench ) { Stopwatch warmupTimer = Stopwatch.StartNew(); for (int i = 0; i < config.maxWarmupIters; i++) { var result = bench(); count(result); if (warmupTimer.Elapsed.TotalNanoseconds >= config.maxWarmupTime) { break; } } List samples = new(); Stopwatch runTimer = Stopwatch.StartNew(); for (int i = 0; i < config.maxIters; i++) { Stopwatch benchTimer = Stopwatch.StartNew(); var result = bench(); var elapsed = benchTimer.Elapsed.TotalNanoseconds; var n = count(result); samples.Add(new Sample((long)elapsed, n)); if (runTimer.Elapsed.TotalNanoseconds >= config.maxTime) { break; } } return samples; } }