Skip to content

Commit

Permalink
Add optimised findall(isequal(::Char), ::String) (JuliaLang#54593)
Browse files Browse the repository at this point in the history
This uses the same approach as the existing findnext and findprev
functions in the same file.

The following benchmark:
```julia
using BenchmarkTools
s = join(rand('A':'z', 10000));
@btime findall(==('c'), s);
```
Gives these results:
* This PR: 3.489 μs
* 1.11-beta1: 31.970 μs
  • Loading branch information
jakobnissen committed Jun 27, 2024
1 parent f3298ee commit 5163d55
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
39 changes: 39 additions & 0 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ abstract type AbstractPattern end

# Map the "not found" sentinel `0` to `nothing`; pass every other value through.
function nothing_sentinel(i)
    i == 0 && return nothing
    return i
end

# Return the final code unit of `c`'s encoding as a `UInt8`.
#
# The `UInt32` bit pattern of `c` is shifted right by one byte for every code
# unit that `c` does not use (out of a maximum of four), which moves the last
# used code unit into the lowest byte.
function last_utf8_byte(c::Char)
    # For 1-4 code units the shift is 0, 8, 16 or 24, so the `& 31` mask does
    # not change the value.
    # NOTE(review): the mask presumably lets the compiler skip the shift-range
    # check — confirm.
    unused_bits = (8 * (4 - ncodeunits(c))) & 31
    bits = reinterpret(UInt32, c)
    return (bits >> unused_bits) % UInt8
end

# Whether the given byte is guaranteed to be the only byte in a Char.
# This holds even in the presence of invalid UTF-8.
# True for every byte outside the closed range 0x80:0xf7.
is_standalone_byte(x::UInt8) = !(0x80 <= x <= 0xf7)

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
if i < 1 || i > sizeof(s)
Expand Down Expand Up @@ -112,6 +122,35 @@ function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
end
end

# Specialised `findall` for an equality predicate on a character in a `String`
# or `SubString{String}`. Returns a `Vector{Int}` of the indices in `s` at which
# a character satisfying `pred` starts.
#
# Strategy: search for the *last* code unit of the needle, step back to where
# the needle would have to start, and confirm the match there.
function findall(
    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
    s::Union{String, SubString{String}}
)
    needle = Char(pred.x)::Char
    tail = last_utf8_byte(needle)

    # A byte that can never be part of a longer Char can be located by
    # searching the raw code units directly.
    if is_standalone_byte(tail)
        return findall(==(tail), codeunits(s))
    end

    width = ncodeunits(needle)
    hits = Int[]
    from = firstindex(s)
    while true
        at = _search(s, tail, from)
        iszero(at) && break
        # Resume the next search just past this occurrence of the byte.
        from = at + 1
        # Index at which the needle would begin if it ends at `at`.
        start = from - width
        # For an invalid needle the candidate start may fall inside another
        # char; such positions are not valid indices and are skipped.
        if isvalid(s, start)
            # `iterate` is used instead of indexing so the index validity is
            # not checked a second time.
            candidate = first(something(iterate(s, start)))
            pred(candidate) && push!(hits, start)
        end
    end
    return hits
end

"""
findfirst(pattern::AbstractString, string::AbstractString)
findfirst(pattern::AbstractPattern, string::String)
Expand Down
16 changes: 16 additions & 0 deletions test/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,22 @@ s_18109 = "fooα🐨βcd3"
@test findall("aa", "aaaaaa", overlap=true) == [1:2, 2:3, 3:4, 4:5, 5:6]
end

@testset "Findall char in string" begin
    @test findall(==('w'), "wabcwewwawk") == [1, 5, 7, 8, 10]
    # A String needle is not an AbstractChar, so this does not dispatch to the
    # Char-specialised method; no Char equals a String, so the result is empty.
    @test isempty(findall(isequal("w"), "abcde!,"))
    # Char needle that does not occur in the haystack.
    @test isempty(findall(isequal('w'), "abcde!,"))
    # Multi-byte needle: results are byte indices, and every char in this
    # haystack is three code units wide.
    @test findall(==('读'), "联国读大会一九四二月十读日第号决通过并颁布读") == [7, 34, 64]

    # SubString haystack: indices are relative to the substring
    @test findall(==('读'), SubString("a读b读", 2)) == [1, 5]

    # Empty string
    @test isempty(findall(isequal('K'), ""))
    @test isempty(findall(isequal('α'), ""))

    # Finds an invalid char ONLY if it's at a char boundary in the string,
    # i.e. iterating the string would emit the given char.
    @test findall(==('\xfe'), "abκæøc\xfeα\xfeβå!") == [10, 13]
    @test isempty(findall(==('\xaf'), "abκæ读α\xe8\xaf\xfeβå!"))
    @test isempty(findall(==('\xc3'), ""))
end

# issue 37280
@testset "UInt8, Int8 vector" begin
for T in [Int8, UInt8], VT in [Int8, UInt8]
Expand Down

0 comments on commit 5163d55

Please sign in to comment.