Skip to content

Commit

Permalink
add Unicode.julia_chartransform Julia-parser normalization (JuliaLang…
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed Oct 18, 2021
1 parent 1b64755 commit 50fcb03
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 15 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ Standard library changes
#### Unicode
* Added function `isequal_normalized` to check for Unicode equivalence without
explicitly constructing normalized strings ([#42493]).
* The `Unicode.normalize` function now accepts a `chartransform` keyword that can
be used to supply custom character mappings, and a `Unicode.julia_chartransform`
function is provided to reproduce the mapping used in identifier normalization
by the Julia parser ([#42561]).


Deprecated or removed
---------------------
Expand Down
42 changes: 33 additions & 9 deletions base/strings/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,20 +145,43 @@ const UTF8PROC_STRIPMARK = (1<<13)

# Throw an `ErrorException` whose message is the text that utf8proc's
# `utf8proc_errmsg` returns for the status code `result`.
function utf8proc_error(result)
    message = unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result))
    return error(message)
end

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), C_NULL, 0, options)
nwords < 0 && utf8proc_error(nwords)
# Static trampoline around the user-supplied callback: invokes `callback` on
# `codepoint` and converts whatever it returns to a `UInt32`.
function utf8proc_custom_func(codepoint::UInt32, callback::Any)
    mapped = callback(codepoint)
    return UInt32(mapped)::UInt32
end

# Fast path used when no custom character mapping is requested (`identity`):
# calls utf8proc's plain `utf8proc_decompose` on the bytes of `str`, passing
# `buffer` and `nwords` as the output buffer and its size. Returns the
# non-negative result of the C call; a negative result is turned into an error.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity))
    status = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                   str, sizeof(str), buffer, nwords, options)
    if status < 0
        utf8proc_error(status)
    end
    return status
end
# General path for an arbitrary `chartransform` callable of type `T`: calls
# `utf8proc_decompose_custom`, handing it a C-callable pointer to
# `utf8proc_custom_func` together with `chartransform` itself (passed by
# reference) so the C side can call back into Julia. Returns the non-negative
# result of the C call; a negative result is turned into an error.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::T) where T
    # C-callable entry point specialized on the callback type `T`
    trampoline = @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{T}))
    status = ccall(:utf8proc_decompose_custom, Int,
                   (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{T}),
                   str, sizeof(str), buffer, nwords, options, trampoline, chartransform)
    if status < 0
        utf8proc_error(status)
    end
    return status
end

# Transform `str` with utf8proc according to the bit flags in `options`,
# applying `chartransform` (default `identity`) through `utf8proc_decompose`,
# and return the result as a new `String`.
#
# Both decomposition passes go through `utf8proc_decompose` so that the same
# `chartransform` is used for sizing and for filling the buffer; no separate
# direct `ccall` pass is made.
function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
    # sizing pass: null buffer of size 0; the result is used to size `buffer`
    nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform)
    # 4 bytes are reserved per decomposed word
    buffer = Base.StringVector(nwords*4)
    # filling pass into the freshly allocated buffer
    nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform)
    # re-encode the buffer in place; the result is the final byte length
    nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
    nbytes < 0 && utf8proc_error(nbytes)
    return String(resize!(buffer, nbytes))
end

# from julia_charmap.h, used by julia_chartransform in the Unicode stdlib
# Maps an original codepoint to its replacement codepoint. Keep this table in
# sync with `charmap` in src/flisp/julia_charmap.h.
const _julia_charmap = Dict{UInt32,UInt32}(
    0x025B => 0x03B5, # latin small letter open e -> greek small letter epsilon
    0x00B5 => 0x03BC, # micro sign -> greek small letter mu
    0x00B7 => 0x22C5, # middle dot -> dot operator
    0x0387 => 0x22C5, # greek ano teleia -> dot operator
    0x2212 => 0x002D, # minus sign -> hyphen-minus
)

# Fallback for other string types: convert to `String` and forward. The default
# value of `chartransform` also provides the two-argument form
# `utf8proc_map(s, flags)`, so no separate two-argument method is defined.
utf8proc_map(s::AbstractString, flags::Integer, chartransform=identity) = utf8proc_map(String(s), flags, chartransform)

# Documented in Unicode module
function normalize(
Expand All @@ -176,6 +199,7 @@ function normalize(
casefold::Bool=false,
lump::Bool=false,
stripmark::Bool=false,
chartransform=identity,
)
flags = 0
stable && (flags = flags | UTF8PROC_STABLE)
Expand All @@ -198,7 +222,7 @@ function normalize(
casefold && (flags = flags | UTF8PROC_CASEFOLD)
lump && (flags = flags | UTF8PROC_LUMP)
stripmark && (flags = flags | UTF8PROC_STRIPMARK)
utf8proc_map(s, flags)
utf8proc_map(s, flags, chartransform)
end

function normalize(s::AbstractString, nf::Symbol)
Expand Down
5 changes: 4 additions & 1 deletion src/flisp/julia_charmap.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/* Array of {original codepoint, replacement codepoint} normalizations
to perform on Julia identifiers, to canonicalize characters that
are both easily confused and easily inputted by accident. */
are both easily confused and easily inputted by accident.
Important: when this table is updated, also update the corresponding table
in base/strings/unicode.jl */
static const uint32_t charmap[][2] = {
{ 0x025B, 0x03B5 }, // latin small letter open e -> greek small letter epsilon
{ 0x00B5, 0x03BC }, // micro sign -> greek small letter mu
Expand Down
1 change: 1 addition & 0 deletions stdlib/Unicode/docs/src/index.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Unicode

```@docs
Unicode.julia_chartransform
Unicode.isassigned
Unicode.isequal_normalized
Unicode.normalize
Expand Down
66 changes: 62 additions & 4 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,50 @@ module Unicode

export graphemes, isequal_normalized

"""
    Unicode.julia_chartransform(c::Union{Char,Integer})

Map the Unicode character (`Char`) or codepoint (`Integer`) `c` to the corresponding
"equivalent" character or codepoint, respectively, according to the custom equivalence
used within the Julia parser (in addition to NFC normalization).

For example, `'µ'` (U+00B5 micro) is treated as equivalent to `'μ'` (U+03BC mu) by
Julia's parser, so `julia_chartransform` performs this transformation while leaving
other characters unchanged:

```jldoctest
julia> Unicode.julia_chartransform('\u00B5')
'μ': Unicode U+03BC (category Ll: Letter, lowercase)

julia> Unicode.julia_chartransform('x')
'x': ASCII/Unicode U+0078 (category Ll: Letter, lowercase)
```

`julia_chartransform` is mainly useful for passing to the [`Unicode.normalize`](@ref)
function in order to mimic the normalization used by the Julia parser:

```jl
julia> s = "\u00B5o\u0308"
"µö"

julia> s2 = Unicode.normalize(s, compose=true, stable=true, chartransform=Unicode.julia_chartransform)
"μö"

julia> collect(s2)
2-element Vector{Char}:
 'μ': Unicode U+03BC (category Ll: Letter, lowercase)
 'ö': Unicode U+00F6 (category Ll: Letter, lowercase)

julia> s2 == string(Meta.parse(s))
true
```

!!! compat "Julia 1.8"
    This function was introduced in Julia 1.8.
"""
function julia_chartransform end
# Table lookup: codepoints absent from the table are returned unchanged.
julia_chartransform(codepoint::UInt32) = get(Base.Unicode._julia_charmap, codepoint, codepoint)
# Other integer types are converted to `UInt32` first, so the result is a `UInt32`.
julia_chartransform(codepoint::Integer) = julia_chartransform(UInt32(codepoint))
# Characters are mapped via their codepoint and converted back to `Char`.
julia_chartransform(char::Char) = Char(julia_chartransform(UInt32(char)))

"""
Unicode.normalize(s::AbstractString; keywords...)
Unicode.normalize(s::AbstractString, normalform::Symbol)
Expand Down Expand Up @@ -42,6 +86,13 @@ options (which all default to `false` except for `compose`) are specified:
* `rejectna=true`: throw an error if unassigned code points are found
* `stable=true`: enforce Unicode versioning stability (never introduce characters missing from earlier Unicode versions)
You can also use the `chartransform` keyword (which defaults to `identity`) to pass an arbitrary
*function* mapping `Integer` codepoints to codepoints, which is called on each
character in `s` as it is processed, in order to perform arbitrary additional normalizations.
For example, by passing `chartransform=Unicode.julia_chartransform`, you can apply a few Julia-specific
character normalizations that are performed by Julia when parsing identifiers (in addition to
NFC normalization: `compose=true, stable=true`).
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
# Examples
Expand All @@ -58,6 +109,9 @@ julia> Unicode.normalize("JuLiA", casefold=true)
julia> Unicode.normalize("JúLiA", stripmark=true)
"JuLiA"
```
!!! compat "Julia 1.8"
The `chartransform` keyword argument requires Julia 1.8.
"""
function normalize end
# Normal-form variant: forwards `s` and the normal-form symbol `nf` unchanged to
# the implementation in `Base.Unicode`.
function normalize(s::AbstractString, nf::Symbol)
    return Base.Unicode.normalize(s, nf)
end
Expand Down Expand Up @@ -98,12 +152,16 @@ function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32},
end

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
and other combining characters.
As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
# Examples
For example, the string `"noël"` can be constructed in two canonically equivalent ways
Expand All @@ -130,7 +188,7 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
function decompose_next_char!(c, state, d, options, s)
n = _decompose_char!(c, d, options)
if n > length(d) # may be possible in future Unicode versions?
Expand All @@ -148,11 +206,11 @@ function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bo
while true
if j1 > n1
i1 === nothing && return i2 === nothing && j2 > n2
j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1)
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
end
if j2 > n2
i2 === nothing && return false
j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2)
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
end
d1[j1] == d2[j2] || return false
j1 += 1; j2 += 1
Expand Down
8 changes: 7 additions & 1 deletion stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

using Test
using Unicode
using Unicode: normalize, isassigned
using Unicode: normalize, isassigned, julia_chartransform

@testset "string normalization" begin
# normalize (Unicode normalization etc.):
Expand All @@ -25,6 +25,11 @@ using Unicode: normalize, isassigned
@test normalize("\t\r", stripcc=true) == " "
@test normalize("\t\r", stripcc=true, newline2ls=true) == " \u2028"
@test normalize("\u0072\u0307\u0323", :NFC) == "\u1E5B\u0307" #26917

# julia_chartransform identifier normalization
@test normalize("julia\u025B\u00B5\u00B7\u0387\u2212", chartransform=julia_chartransform) ==
"julia\u03B5\u03BC\u22C5\u22C5\u002D"
@test julia_chartransform('\u00B5') === '\u03BC'
end

@testset "unicode sa#15" begin
Expand Down Expand Up @@ -428,4 +433,5 @@ end
@test !isequal_normalized("no\u00EBl", "noel")
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
end

0 comments on commit 50fcb03

Please sign in to comment.