Skip to content

Commit

Permalink
fix isequal_normalized for combining-char reordering (#52447)
Browse files Browse the repository at this point in the history
Fixes #52408.

(Note that this function was added in Julia 1.8, in #42493.)

In the future it would be good to further optimize this function by
adding a fast path for the common case of strings that are mostly ASCII
characters. Perhaps simply skip ahead to the first byte that doesn't
match before we begin doing decomposition etcetera.
  • Loading branch information
stevengj committed Dec 19, 2023
1 parent 91d87c6 commit 3b250c7
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 11 deletions.
81 changes: 70 additions & 11 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,19 @@ end

using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK

function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, offset::Integer, options::Integer)
    # Decompose `codepoint` with utf8proc, writing the resulting codepoints into
    # `dest` starting at index `1 + offset` (the first `offset` entries are left alone).
    # `options` is the utf8proc option bitmask (e.g. UTF8PROC_DECOMPOSE).
    #
    # Returns the count reported by utf8proc; a negative code is turned into an
    # exception via `utf8proc_error`.  Callers compare the result against the
    # buffer length and retry with a larger buffer, so the count is presumably
    # allowed to exceed the space that was available.
    capacity = length(dest) - offset
    count = GC.@preserve dest begin
        # the raw pointer is only valid while `dest` is rooted by GC.@preserve
        out = pointer(dest, 1 + offset)
        @ccall utf8proc_decompose_char(codepoint::UInt32, out::Ptr{UInt32}, capacity::Int, options::Cint, C_NULL::Ptr{Cint})::Int
    end
    if count < 0
        utf8proc_error(count)
    end
    return count
end

# would be good to have higher-level accessor functions in utf8proc. alternatively,
# we could mirror the whole utf8proc_property_t struct in Julia, but that is annoying
# because of the bitfields.
# Canonical combining class of a codepoint (0x0000 means "non-combining").
#
# The value is read straight out of the record returned by `utf8proc_get_property`,
# viewed as an array of 16-bit words; this relies on the combining class being the
# second 16-bit field of utf8proc's property struct.
#
# Codepoints below U+0300 (COMBINING GRAVE ACCENT, the first character with a nonzero
# combining class) and above U+10FFFF short-circuit to 0x0000 without calling into C.
combining_class(uc::Integer) =
    0x000300 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
# Malformed characters have no codepoint, so they are treated as non-combining.
# (`ismalformed` is qualified so this works whether or not the name is in scope.)
combining_class(c::AbstractChar) = Base.ismalformed(c) ? 0x0000 : combining_class(UInt32(c))

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
Expand All @@ -225,6 +232,9 @@ As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
!!! compat "Julia 1.8"
The `isequal_normalized` function was added in Julia 1.8.
# Examples
For example, the string `"noël"` can be constructed in two canonically equivalent ways
Expand All @@ -251,29 +261,78 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
function decompose_next_char!(c, state, d, options, s)
n = _decompose_char!(c, d, options)
if n > length(d) # may be possible in future Unicode versions?
n = _decompose_char!(c, resize!(d, n), options)
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
    # Allocate one small codepoint scratch buffer per string and hand everything to
    # the buffer-taking implementation, which takes `chartransform` positionally.
    buf1 = Vector{UInt32}(undef, 4)
    buf2 = Vector{UInt32}(undef, 4)
    return _isequal_normalized!(s1, s2, buf1, buf2, chartransform; casefold=casefold, stripmark=stripmark)
end

# like isequal_normalized, but takes pre-allocated codepoint buffers as arguments, and chartransform is a positional argument
function _isequal_normalized!(s1::AbstractString, s2::AbstractString,
d1::Vector{UInt32}, d2::Vector{UInt32}, chartransform::F=identity;
casefold::Bool=false, stripmark::Bool=false) where {F}
# Read characters from `s`, starting at the iteration result `state` (a
# `(char, next_state)` tuple), and decompose them into the codepoint buffer `d`
# (grown as needed).  Reading continues until the last codepoint written is
# non-combining or `s` is exhausted; the collected codepoints are then sorted by
# combining class.
#
# `chartransform` and `casefold` are captured from the enclosing function;
# `options` is the utf8proc option bitmask passed to `_decompose_char!`.
#
# Returns `(1, n, state′)`: the index of the first codepoint in `d`, the number of
# valid codepoints in `d`, and the iteration result for the next unread character
# (`nothing` at end of string).
function decompose_next_chars!(state, d, options, s)
    local n
    offset = 0
    @inbounds while true
        # read a char and decompose it to d
        c = chartransform(UInt32(state[1]))
        state = iterate(s, state[2])
        if c < 0x80 # fast path for common ASCII case
            n = 1 + offset
            n > length(d) && resize!(d, 2n)
            # ASCII case folding: 'A':'Z' (0x41:0x5A) map to 'a':'z'
            d[n] = casefold ? (0x41 ≤ c ≤ 0x5A ? c+0x20 : c) : c
            break # ASCII characters are all zero combining class
        else
            while true
                n = _decompose_char!(c, d, offset, options) + offset
                if n > length(d)
                    # buffer too small: grow it and decompose the same char again
                    resize!(d, 2n)
                    continue
                end
                break
            end
        end

        # decomposed chars must be sorted in ascending order of combining class,
        # which means we need to keep fetching chars until we get to non-combining
        # NOTE(review): if `_decompose_char!` can return 0 (presumably possible when
        # marks are stripped) while `offset == 0`, this reads `d[0]` under
        # `@inbounds` -- confirm against the caller and utf8proc.
        (iszero(combining_class(d[n])) || isnothing(state)) && break # non-combining
        offset = n
    end

    # sort by combining class
    if n < 32 # almost always true
        for j1 = 2:n # insertion sort
            cc = combining_class(d[j1])
            iszero(cc) && continue # don't re-order non-combiners
            for j2 = j1:-1:2
                # `≤` (not `<`) keeps the sort stable: marks of the same class
                # must retain their relative order
                combining_class(d[j2-1]) ≤ cc && break
                d[j2-1], d[j2] = d[j2], d[j2-1]
            end
        end
    else # avoid n^2 complexity in crazy large-n case
        j = 1
        @views while j < n
            # j₀ = index of the next non-combining codepoint after j (or n+1 if none);
            # only the run d[j:j₀-1] is sorted
            j₀ = j + something(findnext(iszero ∘ combining_class, d[j+1:n], 1), n+1-j)
            sort!(d[j:j₀-1], by=combining_class)
            j = j₀
        end
    end

    # split return statement to help type inference:
    return state === nothing ? (1, n, nothing) : (1, n, state)
end
options = UTF8PROC_DECOMPOSE
casefold && (options |= UTF8PROC_CASEFOLD)
stripmark && (options |= UTF8PROC_STRIPMARK)
i1,i2 = iterate(s1),iterate(s2)
d1,d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
n1 = n2 = 0 # lengths of codepoint buffers
j1 = j2 = 1 # indices in d1, d2
while true
if j1 > n1
i1 === nothing && return i2 === nothing && j2 > n2
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1)
end
if j2 > n2
i2 === nothing && return false
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2)
end
d1[j1] == d2[j2] || return false
j1 += 1; j2 += 1
Expand Down
67 changes: 67 additions & 0 deletions stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
using Test
using Unicode
using Unicode: normalize, isassigned, julia_chartransform
import Random

Random.seed!(12345)

@testset "string normalization" begin
# normalize (Unicode normalization etc.):
Expand Down Expand Up @@ -455,6 +458,9 @@ end
@test !Base.Unicode.isvalid(Char, overlong_char)
end

# Reference implementation used to cross-check `isequal_normalized`: normalize both
# strings in full (forwarding all keyword options to `normalize`) and compare the
# results.  Simple, but it materializes two normalized strings.
function isequal_normalized_naive(s1, s2; kws...)
    lhs = normalize(s1; kws...)
    rhs = normalize(s2; kws...)
    return lhs == rhs
end

@testset "Unicode equivalence" begin
@test isequal_normalized("no\u00EBl", "noe\u0308l")
@test !isequal_normalized("no\u00EBl", "noe\u0308l ")
Expand All @@ -466,4 +472,65 @@ end
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)

# issue #52408
# Regression and randomized tests for comparing strings whose combining characters
# appear in different (but canonically equivalent) orders.
@testset "Sorting combining characters" begin
for str in ("\u5bc\u5b0", "j\u5ae\u5bf\u5b2\u5b4") # julia#52408 examples
@test isequal_normalized(str, normalize(str))
end

# first codepoint in every possible Unicode combining class
let cc_chars = UInt32[0x00000334, 0x00016ff0, 0x0000093c, 0x00003099, 0x0000094d, 0x000005b0, 0x000005b1, 0x000005b2, 0x000005b3, 0x000005b4, 0x000005b5, 0x000005b6, 0x000005b7, 0x000005b8, 0x000005b9, 0x000005bb, 0x000005bc, 0x000005bd, 0x000005bf, 0x000005c1, 0x000005c2, 0x0000fb1e, 0x0000064b, 0x0000064c, 0x0000064d, 0x00000618, 0x00000619, 0x0000061a, 0x00000651, 0x00000652, 0x00000670, 0x00000711, 0x00000c55, 0x00000c56, 0x00000e38, 0x00000e48, 0x00000eb8, 0x00000ec8, 0x00000f71, 0x00000f72, 0x00000f74, 0x00000321, 0x00001dce, 0x0000031b, 0x00001dfa, 0x00000316, 0x0000059a, 0x0000302e, 0x0001d16d, 0x000005ae, 0x00000301, 0x00000315, 0x0000035c, 0x0000035d, 0x00000345],
vowels = ['a', 'e', 'i', 'o', 'u', 'å', 'é', 'î', 'ö', 'ü'], Vowels = [vowels; uppercase.(vowels)]
# Build a string of `n` groups; each group is 1-5 random entries of `Vowels`
# followed by 0 to `n_cc` entries of `cc_chars` drawn with replacement.
function randcc(n, n_cc) # random string with lots of combining chars
buf = IOBuffer()
for _ = 1:n
print.(buf, rand(Vowels, rand(1:5)))
print.(buf, Char.(rand(cc_chars, rand(0:n_cc))))
end
return String(take!(buf))
end
# Random strings must compare equal to themselves and to their normalized form,
# and the result must agree with the naive normalize-then-compare reference,
# with and without case folding.
for _ = 1:100
s = randcc(10,10)
ns = normalize(s)
cs = normalize(s, casefold=true)
@test isequal_normalized(s, s)
# print the offending string so a failure of the next @test can be reproduced
if !isequal_normalized(s, ns)
@show s
end
@test isequal_normalized(s, ns)
@test isequal_normalized(cs, ns) == isequal_normalized_naive(cs, ns)
@test isequal_normalized(cs, ns, casefold=true) ==
isequal_normalized_naive(cs, ns, casefold=true)
end
# Up to 1000 combining characters per group (presumably enough to hit the
# large-run branch of the implementation's sort).
for _ = 1:3
s = randcc(5,1000) # exercise sort!-based fallback
@test isequal_normalized(s, normalize(s))
end
# Build two strings from `n` groups; both get the same 1-5 random vowels per
# group, followed by the same random subset of `cc_chars` in two independently
# shuffled orders.
function randcc2(n, n_cc) # 2 strings with equivalent reordered combiners
buf1 = IOBuffer()
buf2 = IOBuffer()
# inclusion probability per entry, so about `n_cc` entries are picked on average
p = n_cc / length(cc_chars)
for _ = 1:n
a = join(rand(Vowels, rand(1:5)))
print(buf1, a)
print(buf2, a)

# chars from distinct combining classes
# are canonically equivalent when re-ordered
c = Random.randsubseq(cc_chars, p)
# `c` is shuffled in place twice: once for each output string
print.(buf1, Char.(Random.shuffle!(c)))
print.(buf2, Char.(Random.shuffle!(c)))
end
return String(take!(buf1)), String(take!(buf2))
end
# Reordered marks from distinct combining classes must compare equal.
for _ = 1:100
s1, s2 = randcc2(10,10)
@test isequal_normalized(s1, s2)
end
end

# combining characters in the same class are inequivalent if re-ordered:
@test !isequal_normalized("x\u0334\u0335", "x\u0335\u0334")
end
end

0 comments on commit 3b250c7

Please sign in to comment.