add Unicode.julia_chartransform Julia-parser normalization (JuliaLang#42561)
shirodkara · Oct 18, 2021 · 50fcb03 · 50fcb03
1 parent 1b64755
commit 50fcb03
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 15 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -124,6 +124,11 @@ Standard library changes
 #### Unicode
 * Added function `isequal_normalized` to check for Unicode equivalence without
  explicitly constructing normalized strings ([#42493]).
+* The `Unicode.normalize` function now accepts a `chartransform` keyword that can
+ be used to supply custom character mappings, and a `Unicode.julia_chartransform`
+ function is provided to reproduce the mapping used in identifier normalization
+ by the Julia parser ([#42561]).
+
 
 Deprecated or removed
 ---------------------

diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
@@ -145,20 +145,43 @@ const UTF8PROC_STRIPMARK = (1<<13)
 
 utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)))
 
-function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
- nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
- str, sizeof(str), C_NULL, 0, options)
- nwords < 0 && utf8proc_error(nwords)
+# static wrapper around user callback function
+utf8proc_custom_func(codepoint::UInt32, callback::Any) =
+ UInt32(callback(codepoint))::UInt32
+
+function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity))
+ ret = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
+ str, sizeof(str), buffer, nwords, options)
+ ret < 0 && utf8proc_error(ret)
+ return ret
+end
+function utf8proc_decompose(str, options, buffer, nwords, chartransform::T) where T
+ ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{T}),
+ str, sizeof(str), buffer, nwords, options,
+ @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{T})), chartransform)
+ ret < 0 && utf8proc_error(ret)
+ return ret
+end
+
+function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
+ nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform)
  buffer = Base.StringVector(nwords*4)
- nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
- str, sizeof(str), buffer, nwords, options)
- nwords < 0 && utf8proc_error(nwords)
+ nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform)
  nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
  nbytes < 0 && utf8proc_error(nbytes)
  return String(resize!(buffer, nbytes))
 end
 
-utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
+# from julia_charmap.h, used by julia_chartransform in the Unicode stdlib
+const _julia_charmap = Dict{UInt32,UInt32}(
+ 0x025B => 0x03B5,
+ 0x00B5 => 0x03BC,
+ 0x00B7 => 0x22C5,
+ 0x0387 => 0x22C5,
+ 0x2212 => 0x002D,
+)
+
+utf8proc_map(s::AbstractString, flags::Integer, chartransform=identity) = utf8proc_map(String(s), flags, chartransform)
 
 # Documented in Unicode module
 function normalize(
@@ -176,6 +199,7 @@ function normalize(
  casefold::Bool=false,
  lump::Bool=false,
  stripmark::Bool=false,
+ chartransform=identity,
 )
  flags = 0
  stable && (flags = flags | UTF8PROC_STABLE)
@@ -198,7 +222,7 @@ function normalize(
  casefold && (flags = flags | UTF8PROC_CASEFOLD)
  lump && (flags = flags | UTF8PROC_LUMP)
  stripmark && (flags = flags | UTF8PROC_STRIPMARK)
- utf8proc_map(s, flags)
+ utf8proc_map(s, flags, chartransform)
 end
 
 function normalize(s::AbstractString, nf::Symbol)

diff --git a/src/flisp/julia_charmap.h b/src/flisp/julia_charmap.h
@@ -1,6 +1,9 @@
 /* Array of {original codepoint, replacement codepoint} normalizations
  to perform on Julia identifiers, to canonicalize characters that
- are both easily confused and easily inputted by accident. */
+ are both easily confused and easily inputted by accident.
+
+ Important: when this table is updated, also update the corresponding table
+ in base/strings/unicode.jl */
 static const uint32_t charmap[][2] = {
  { 0x025B, 0x03B5 }, // latin small letter open e -> greek small letter epsilon
  { 0x00B5, 0x03BC }, // micro sign -> greek small letter mu

diff --git a/stdlib/Unicode/docs/src/index.md b/stdlib/Unicode/docs/src/index.md
@@ -1,6 +1,7 @@
 # Unicode
 
 ```@docs
+Unicode.julia_chartransform
 Unicode.isassigned
 Unicode.isequal_normalized
 Unicode.normalize

diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -4,6 +4,50 @@ module Unicode
 
 export graphemes, isequal_normalized
 
+"""
+ Unicode.julia_chartransform(c::Union{Char,Integer})
+
+Map the Unicode character (`Char`) or codepoint (`Integer`) `c` to the corresponding
+"equivalent" character or codepoint, respectively, according to the custom equivalence
+used within the Julia parser (in addition to NFC normalization).
+
+For example, `'µ'` (U+00B5 micro) is treated as equivalent to `'μ'` (U+03BC mu) by
+Julia's parser, so `julia_chartransform` performs this transformation while leaving
+other characters unchanged:
+```jldoctest
+julia> Unicode.julia_chartransform('\u00B5')
+'μ': Unicode U+03BC (category Ll: Letter, lowercase)
+
+julia> Unicode.julia_chartransform('x')
+'x': ASCII/Unicode U+0078 (category Ll: Letter, lowercase)
+```
+
+`julia_chartransform` is mainly useful for passing to the [`Unicode.normalize`](@ref)
+function in order to mimic the normalization used by the Julia parser:
+```jl
+julia> s = "\u00B5o\u0308"
+"µö"
+
+julia> s2 = Unicode.normalize(s, compose=true, stable=true, chartransform=Unicode.julia_chartransform)
+"μö"
+
+julia> collect(s2)
+2-element Vector{Char}:
+ 'μ': Unicode U+03BC (category Ll: Letter, lowercase)
+ 'ö': Unicode U+00F6 (category Ll: Letter, lowercase)
+
+julia> s2 == string(Meta.parse(s))
+true
+```
+
+!!! compat "Julia 1.8"
+ This function was introduced in Julia 1.8.
+"""
+function julia_chartransform end
+julia_chartransform(codepoint::UInt32) = get(Base.Unicode._julia_charmap, codepoint, codepoint)
+julia_chartransform(codepoint::Integer) = julia_chartransform(UInt32(codepoint))
+julia_chartransform(char::Char) = Char(julia_chartransform(UInt32(char)))
+
 """
  Unicode.normalize(s::AbstractString; keywords...)
  Unicode.normalize(s::AbstractString, normalform::Symbol)
@@ -42,6 +86,13 @@ options (which all default to `false` except for `compose`) are specified:
 * `rejectna=true`: throw an error if unassigned code points are found
 * `stable=true`: enforce Unicode versioning stability (never introduce characters missing from earlier Unicode versions)
 
+You can also use the `chartransform` keyword (which defaults to `identity`) to pass an arbitrary
+*function* mapping `Integer` codepoints to codepoints, which is called on each
+character in `s` as it is processed, in order to perform arbitrary additional normalizations.
+For example, by passing `chartransform=Unicode.julia_chartransform`, you can apply a few Julia-specific
+character normalizations that are performed by Julia when parsing identifiers (in addition to
+NFC normalization: `compose=true, stable=true`).
+
 For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
 
 # Examples
@@ -58,6 +109,9 @@ julia> Unicode.normalize("JuLiA", casefold=true)
 julia> Unicode.normalize("JúLiA", stripmark=true)
 "JuLiA"
 ```
+
+!!! compat "Julia 1.8"
+ The `chartransform` keyword argument requires Julia 1.8.
 """
 function normalize end
 normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
@@ -98,12 +152,16 @@ function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32},
 end
 
 """
- isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)
+ isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
 
 Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
 ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
 and other combining characters.
 
+As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
+function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
+to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
+
 # Examples
 
 For example, the string `"noël"` can be constructed in two canonically equivalent ways
@@ -130,7 +188,7 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
 true
 ```
 """
-function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
+function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
  function decompose_next_char!(c, state, d, options, s)
  n = _decompose_char!(c, d, options)
  if n > length(d) # may be possible in future Unicode versions?
@@ -148,11 +206,11 @@ function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bo
  while true
  if j1 > n1
  i1 === nothing && return i2 === nothing && j2 > n2
- j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1)
+ j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
  end
  if j2 > n2
  i2 === nothing && return false
- j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2)
+ j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
  end
  d1[j1] == d2[j2] || return false
  j1 += 1; j2 += 1

diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
@@ -2,7 +2,7 @@
 
 using Test
 using Unicode
-using Unicode: normalize, isassigned
+using Unicode: normalize, isassigned, julia_chartransform
 
 @testset "string normalization" begin
  # normalize (Unicode normalization etc.):
@@ -25,6 +25,11 @@ using Unicode: normalize, isassigned
  @test normalize("\t\r", stripcc=true) == " "
  @test normalize("\t\r", stripcc=true, newline2ls=true) == " \u2028"
  @test normalize("\u0072\u0307\u0323", :NFC) == "\u1E5B\u0307" #26917
+
+ # julia_chartransform identifier normalization
+ @test normalize("julia\u025B\u00B5\u00B7\u0387\u2212", chartransform=julia_chartransform) ==
+ "julia\u03B5\u03BC\u22C5\u22C5\u002D"
+ @test julia_chartransform('\u00B5') === '\u03BC'
 end
 
 @testset "unicode sa#15" begin
@@ -428,4 +433,5 @@ end
  @test !isequal_normalized("no\u00EBl", "noel")
  @test isequal_normalized("no\u00EBl", "noel", stripmark=true)
  @test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
+ @test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
 end