Skip to content

Commit

Permalink
expose findfirst findnext for UInt8 vector (#37283)
Browse files Browse the repository at this point in the history
* expose findfirst findnext for UInt8 vector

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update test/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* address comments

* address comments add OffsetArray test

* add findlast findprev

* implement comments

* let _(r)searchindex handle exception

* fix _rsearchindex special behavior

* style fix

* restrict to 1-indexed array

* Update test/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* Update base/strings/search.jl

Co-authored-by: Milan Bouchet-Valat <[email protected]>

* address comments

* change sentinel value to firstindex - 1

* NEWS for find* on Vector of U/Int8

* Update NEWS.md

shorten NEWS

Co-authored-by: Milan Bouchet-Valat <[email protected]>
Co-authored-by: Steven G. Johnson <[email protected]>
  • Loading branch information
3 people authored Oct 26, 2020
1 parent c293757 commit 5d8225a
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 31 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ New language features
* The library name passed to `ccall` or `@ccall` can now be an expression involving
global variables and function calls. The expression will be evaluated the first
time the `ccall` executes ([#36458]).
* `findfirst`, `findnext`, `findlast`, and `findprev` now support `AbstractVector{<:Union{Int8,UInt8}}` (pattern, array) arguments ([#37283]).
* `ꜛ` (U+A71B), `ꜜ` (U+A71C) and `ꜝ` (U+A71D) can now also be used as operator
suffixes. They can be tab-completed from `\^uparrow`, `\^downarrow` and `\^!` in the REPL
([#37542]).
Expand Down
151 changes: 120 additions & 31 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,25 @@ true
"""
function findfirst(ch::AbstractChar, string::AbstractString)
    # Searching for a character is the predicate search with an equality test on `ch`.
    matches_ch = ==(ch)
    return findfirst(matches_ch, string)
end

"""
    findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}},
              A::AbstractVector{<:Union{Int8,UInt8}})

Return the index range of the first occurrence of the sequence `pattern` in
vector `A`, or `nothing` if `pattern` does not occur.

!!! compat "Julia 1.6"
    This method requires at least Julia 1.6.

# Examples
```jldoctest
julia> findfirst([0x52, 0x62], [0x40, 0x52, 0x62, 0x63])
2:3
```
"""
function findfirst(pattern::AbstractVector{<:Union{Int8,UInt8}},
                   A::AbstractVector{<:Union{Int8,UInt8}})
    # The forward search begins at the very first index of `A`.
    origin = firstindex(A)
    return _search(A, pattern, origin)
end

# AbstractString implementation of the generic findnext interface
function findnext(testf::Function, s::AbstractString, i::Integer)
i = Int(i)
Expand Down Expand Up @@ -166,29 +185,34 @@ function _search_bloom_mask(c)
end

_nthbyte(s::String, i) = codeunit(s, i)
_nthbyte(a::Union{AbstractVector{UInt8},AbstractVector{Int8}}, i) = a[i]
_nthbyte(t::AbstractVector, index) = t[index + (firstindex(t)-1)]

function _searchindex(s::String, t::String, i::Integer)
    # Fast path: a pattern whose last index is 1 holds exactly one character,
    # so the search reduces to looking for that character.
    if lastindex(t) == 1
        hit = findnext(isequal(t[1]), s, i)
        return something(hit, 0)
    end
    # General case: run the byte-vector search over the code units of both strings.
    haystack = unsafe_wrap(Vector{UInt8}, s)
    needle = unsafe_wrap(Vector{UInt8}, t)
    return _searchindex(haystack, needle, i)
end

function _searchindex(s::ByteArray, t::ByteArray, i::Integer)
n = sizeof(t)
m = sizeof(s)
function _searchindex(s::AbstractVector{<:Union{Int8,UInt8}},
t::AbstractVector{<:Union{Int8,UInt8}},
_i::Integer)
sentinel = firstindex(s) - 1
n = length(t)
m = length(s)
i = Int(_i) - sentinel
(i < 1 || i > m+1) && throw(BoundsError(s, _i))

if n == 0
return 1 <= i <= m+1 ? max(1, i) : 0
return 1 <= i <= m+1 ? max(1, i) : sentinel
elseif m == 0
return 0
return sentinel
elseif n == 1
return something(findnext(isequal(_nthbyte(t,1)), s, i), 0)
return something(findnext(isequal(_nthbyte(t,1)), s, i), sentinel)
end

w = m - n
if w < 0 || i - 1 > w
return 0
return sentinel
end

bloom_mask = UInt64(0)
Expand All @@ -215,7 +239,8 @@ function _searchindex(s::ByteArray, t::ByteArray, i::Integer)

# match found
if j == n - 1
return i+1
# restore in case `s` is an OffsetArray
return i+firstindex(s)
end

# no match, try to rule out the next character
Expand All @@ -232,16 +257,16 @@ function _searchindex(s::ByteArray, t::ByteArray, i::Integer)
i += 1
end

0
sentinel
end

function _search(s::Union{AbstractString,ByteArray},
t::Union{AbstractString,AbstractChar,Int8,UInt8},
function _search(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}},
t::Union{AbstractString,AbstractChar,AbstractVector{<:Union{Int8,UInt8}}},
i::Integer)
idx = _searchindex(s,t,i)
if isempty(t)
idx:idx-1
elseif idx > 0
elseif idx >= firstindex(s)
idx:(idx + lastindex(t) - 1)
else
nothing
Expand Down Expand Up @@ -274,7 +299,7 @@ julia> findnext("Lang", "JuliaLang", 2)
6:9
```
"""
findnext(t::AbstractString, s::AbstractString, i::Integer) = _search(s, t, Int(i))
findnext(t::AbstractString, s::AbstractString, start::Integer) = _search(s, t, Int(start))

"""
findnext(ch::AbstractChar, string::AbstractString, start::Integer)
Expand All @@ -293,8 +318,32 @@ julia> findnext('o', "Hello to the world", 6)
8
```
"""
findnext(ch::AbstractChar, string::AbstractString, ind::Integer) =
findnext(==(ch), string, ind)
findnext(ch::AbstractChar, string::AbstractString, start::Integer) =
findnext(==(ch), string, start)

"""
    findnext(pattern::AbstractVector{<:Union{Int8,UInt8}},
             A::AbstractVector{<:Union{Int8,UInt8}},
             start::Integer)

Return the index range of the next occurrence of the sequence `pattern` in
vector `A`, searching from position `start`, or `nothing` if there is none.

!!! compat "Julia 1.6"
    This method requires at least Julia 1.6.

# Examples
```jldoctest
julia> findnext([0x52, 0x62], [0x52, 0x62, 0x72], 3) === nothing
true

julia> findnext([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3)
4:5
```
"""
function findnext(pattern::AbstractVector{<:Union{Int8,UInt8}},
                  A::AbstractVector{<:Union{Int8,UInt8}},
                  start::Integer)
    # The shared forward-search helper takes the haystack first, then the pattern.
    return _search(A, pattern, start)
end

"""
findlast(pattern::AbstractString, string::AbstractString)
Expand All @@ -314,6 +363,23 @@ julia> findfirst("Julia", "JuliaLang")
function findlast(pattern::AbstractString, string::AbstractString)
    # A backward search that begins at the final index of `string` finds the last match.
    start = lastindex(string)
    return findprev(pattern, string, start)
end

"""
    findlast(pattern::AbstractVector{<:Union{Int8,UInt8}},
             A::AbstractVector{<:Union{Int8,UInt8}})

Find the last occurrence of `pattern` in array `A`. For a non-empty `A` this is
equivalent to [`findprev(pattern, A, lastindex(A))`](@ref).

Return `nothing` if `pattern` does not occur in `A`. When `A` is empty, an empty
`pattern` matches with the empty range `firstindex(A):firstindex(A)-1`, and any
other `pattern` yields `nothing`.

!!! compat "Julia 1.6"
    This method requires at least Julia 1.6.

# Examples
```jldoctest
julia> findlast([0x52, 0x62], [0x52, 0x62, 0x52, 0x62])
3:4
```
"""
function findlast(pattern::AbstractVector{<:Union{Int8,UInt8}},
                  A::AbstractVector{<:Union{Int8,UInt8}})
    if isempty(A)
        # For an empty vector `lastindex(A) == firstindex(A) - 1`, a start position
        # the backward search rejects with a `BoundsError`. Answer directly so that
        # `findlast` agrees with `findfirst` on empty input.
        lo = firstindex(A)
        return isempty(pattern) ? (lo:lo-1) : nothing
    end
    return findprev(pattern, A, lastindex(A))
end

"""
findlast(ch::AbstractChar, string::AbstractString)
Expand Down Expand Up @@ -387,21 +453,24 @@ function _rsearchindex(s::String, t::String, i::Integer)
end
end

function _rsearchindex(s::ByteArray, t::ByteArray, k::Integer)
n = sizeof(t)
m = sizeof(s)
function _rsearchindex(s::AbstractVector{<:Union{Int8,UInt8}}, t::AbstractVector{<:Union{Int8,UInt8}}, _k::Integer)
sentinel = firstindex(s) - 1
n = length(t)
m = length(s)
k = Int(_k) - sentinel
k < 1 && throw(BoundsError(s, _k))

if n == 0
return 0 <= k <= m ? max(k, 1) : 0
return 0 <= k <= m ? max(k, 1) : sentinel
elseif m == 0
return 0
return sentinel
elseif n == 1
return something(findprev(isequal(_nthbyte(t,1)), s, k), 0)
return something(findprev(isequal(_nthbyte(t,1)), s, k), sentinel)
end

w = m - n
if w < 0 || k <= 0
return 0
return sentinel
end

bloom_mask = UInt64(0)
Expand All @@ -426,9 +495,9 @@ function _rsearchindex(s::ByteArray, t::ByteArray, k::Integer)
j += 1
end

# match found
# match found, restore in case `s` is an OffsetArray
if j == n
return i
return i + sentinel
end

# no match, try to rule out the next character
Expand All @@ -445,16 +514,16 @@ function _rsearchindex(s::ByteArray, t::ByteArray, k::Integer)
i -= 1
end

0
sentinel
end

function _rsearch(s::Union{AbstractString,ByteArray},
t::Union{AbstractString,AbstractChar,Int8,UInt8},
function _rsearch(s::Union{AbstractString,AbstractVector{<:Union{Int8,UInt8}}},
t::Union{AbstractString,AbstractChar,AbstractVector{<:Union{Int8,UInt8}}},
i::Integer)
idx = _rsearchindex(s,t,i)
if isempty(t)
idx:idx-1
elseif idx > 0
elseif idx > firstindex(s) - 1
idx:(idx + lastindex(t) - 1)
else
nothing
Expand Down Expand Up @@ -503,9 +572,29 @@ julia> findprev('o', "Hello to the world", 18)
15
```
"""
findprev(ch::AbstractChar, string::AbstractString, ind::Integer) =
findprev(==(ch), string, ind)
findprev(ch::AbstractChar, string::AbstractString, start::Integer) =
findprev(==(ch), string, start)

"""
    findprev(pattern::AbstractVector{<:Union{Int8,UInt8}},
             A::AbstractVector{<:Union{Int8,UInt8}},
             start::Integer)

Return the index range of the previous occurrence of the sequence `pattern` in
vector `A`, searching from position `start`, or `nothing` if there is none.

!!! compat "Julia 1.6"
    This method requires at least Julia 1.6.

# Examples
```jldoctest
julia> findprev([0x52, 0x62], [0x40, 0x52, 0x62, 0x52, 0x62], 3)
2:3
```
"""
function findprev(pattern::AbstractVector{<:Union{Int8,UInt8}},
                  A::AbstractVector{<:Union{Int8,UInt8}},
                  start::Integer)
    # The shared backward-search helper takes the haystack first, then the pattern.
    return _rsearch(A, pattern, start)
end
"""
occursin(needle::Union{AbstractString,AbstractPattern,AbstractChar}, haystack::AbstractString)
Expand Down
30 changes: 30 additions & 0 deletions test/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,36 @@ s_18109 = "fooα🐨βcd3"
@test findall("aa", "aaaaaa", overlap=true) == [1:2, 2:3, 3:4, 4:5, 5:6]
end

# issue 37280
@testset "UInt8, Int8 vector" begin
    # Cover every combination of haystack (`T`) and needle (`VT`) element type.
    for T in (Int8, UInt8), VT in (Int8, UInt8)
        haystack = T[0x40, 0x52, 0x62, 0x52, 0x62]
        len = length(haystack)

        # Single-element needles: one absent from the haystack, one present twice.
        absent = VT[0x30]
        single = VT[0x52]
        @test findfirst(absent, haystack) === nothing
        @test findfirst(single, haystack) === 2:2
        @test findlast(absent, haystack) === nothing
        @test findlast(single, haystack) === 4:4

        needle = VT[0x52, 0x62]

        # Forward search
        @test findfirst(needle, haystack) === 2:3
        @test findnext(needle, haystack, 2) === 2:3
        @test findnext(needle, haystack, 3) === 4:5
        # 1 idx too far is allowed
        @test findnext(needle, haystack, len + 1) === nothing
        @test_throws BoundsError findnext(needle, haystack, -3)
        @test_throws BoundsError findnext(needle, haystack, len + 2)

        # Backward search
        @test findlast(needle, haystack) === 4:5
        @test findprev(needle, haystack, 3) === 2:3
        @test findprev(needle, haystack, 5) === 4:5
        @test findprev(needle, haystack, 2) === nothing
        @test findprev(needle, haystack, len + 1) == findlast(needle, haystack)
        @test findprev(needle, haystack, len + 2) == findlast(needle, haystack)
        @test_throws BoundsError findprev(needle, haystack, -3)
    end
end

# issue 32568
for T = (UInt, BigInt)
for x = (4, 5)
Expand Down

8 comments on commit 5d8225a

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily package evaluation, I will reply here when finished:

@nanosoldier runtests(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - successfully executed benchmarks. A full report can be found here. cc @christopher-dG

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your package evaluation job has completed - possible new issues were detected. A full report can be found here. cc @maleadt

@maleadt
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PkgEval test:

@nanosoldier runtests(["JSON", "Crayons"])

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something went wrong when running your job:

NanosoldierError: failed to run tests: UndefVarError: contains not defined

Logs and partial data can be found here
cc @maleadt

@maleadt
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanosoldier runtests(["JSON", "Crayons"])

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your package evaluation job has completed - no issues were detected. A full report can be found here. cc @maleadt

Please sign in to comment.