From 5163d5585a5c2014f92fb1668b356bee30625a83 Mon Sep 17 00:00:00 2001
From: Jakob Nybo Nissen
Date: Thu, 27 Jun 2024 18:50:47 +0200
Subject: [PATCH] Add optimised findall(isequal(::Char), ::String) (#54593)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This uses the same approach as the existing findnext and findprev
functions in the same file.

The following benchmark:
```julia
using BenchmarkTools
s = join(rand('A':'z', 10000));
@btime findall(==('c'), s);
```

Gives these results:
* This PR: 3.489 μs
* 1.11-beta1: 31.970 μs
---
 base/strings/search.jl | 45 +++++++++++++++++++++++++++++++++++++++++++++
 test/strings/search.jl | 16 ++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/base/strings/search.jl b/base/strings/search.jl
index 493e241723867..b9c14f06e0898 100644
--- a/base/strings/search.jl
+++ b/base/strings/search.jl
@@ -12,6 +12,19 @@ abstract type AbstractPattern end
 
 nothing_sentinel(i) = i == 0 ? nothing : i
 
+# Return the last code unit of `c` as a `UInt8`: view the Char as a UInt32 and
+# shift out the low-order bytes that `ncodeunits(c)` does not count.
+function last_utf8_byte(c::Char)
+    u = reinterpret(UInt32, c)
+    # NOTE(review): `& 31` presumably just keeps the shift amount provably in 0:31.
+    shift = ((4 - ncodeunits(c)) * 8) & 31
+    (u >> shift) % UInt8
+end
+
+# Whether the given byte is guaranteed to be the only byte in a Char
+# This holds even in the presence of invalid UTF8
+is_standalone_byte(x::UInt8) = (x < 0x80) | (x > 0xf7)
+
 function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
                   s::Union{String, SubString{String}}, i::Integer)
     if i < 1 || i > sizeof(s)
@@ -112,6 +125,38 @@ function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
     end
 end
 
+# Optimised `findall` for an `isequal`/`==` Char predicate on a `String` or
+# `SubString{String}`. Searches for the last byte of the Char with `_search`
+# and keeps the index of every hit at which iterating `s` yields that Char.
+function findall(
+    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
+    s::Union{String, SubString{String}}
+)
+    c = Char(pred.x)::Char
+    byte = last_utf8_byte(c)
+    ncu = ncodeunits(c)
+
+    # If only one byte, and can't be part of another Char: Forward to memchr.
+    ncu == 1 && is_standalone_byte(byte) && return findall(==(byte), codeunits(s))
+    result = Int[]
+    i = firstindex(s)
+    while true
+        i = _search(s, byte, i)
+        iszero(i) && return result
+        i += 1
+        index = i - ncu
+        # If the char is invalid, it's possible that its first byte is
+        # inside another char. If so, indexing into the string will throw an
+        # error, so we need to check for valid indices.
+        isvalid(s, index) || continue
+        # We use iterate here instead of indexing, because indexing wastefully
+        # checks for valid index. It would be better if there was something like
+        # try_getindex(::String, ::Int) we could use.
+        char = first(something(iterate(s, index)))
+        pred(char) && push!(result, index)
+    end
+end
+
 """
     findfirst(pattern::AbstractString, string::AbstractString)
     findfirst(pattern::AbstractPattern, string::String)
diff --git a/test/strings/search.jl b/test/strings/search.jl
index d328168bfa466..e737096b3371d 100644
--- a/test/strings/search.jl
+++ b/test/strings/search.jl
@@ -395,6 +395,22 @@ s_18109 = "fooα🐨βcd3"
     @test findall("aa", "aaaaaa", overlap=true) == [1:2, 2:3, 3:4, 4:5, 5:6]
 end
 
+@testset "Findall char in string" begin
+    @test findall(==('w'), "wabcwewwawk") == [1, 5, 7, 8, 10]
+    @test isempty(findall(isequal('w'), "abcde!,"))
+    @test findall(==('读'), "联国读大会一九四二月十读日第号决通过并颁布读") == [7, 34, 64]
+
+    # Empty string
+    @test isempty(findall(isequal('K'), ""))
+    @test isempty(findall(isequal('α'), ""))
+
+    # Finds an invalid char ONLY if it's at a char boundary in the string,
+    # i.e. iterating the string would emit the given char.
+    @test findall(==('\xfe'), "abκæøc\xfeα\xfeβå!") == [10, 13]
+    @test isempty(findall(==('\xaf'), "abκæ读α\xe8\xaf\xfeβå!"))
+    @test isempty(findall(==('\xc3'), ";æ"))
+end
+
 # issue 37280
 @testset "UInt8, Int8 vector" begin
     for T in [Int8, UInt8], VT in [Int8, UInt8]