Move docstrings to Unicode stdlib module (#25902)

Functions need to be defined in the Unicode module itself, because merely reexporting Base.Unicode functions makes them appear in the manual with the `Base.Unicode` prefix. Also fix the examples by fully qualifying Unicode.normalize().
JuliaLang · Feb 12, 2018 · f9f11a3 · f9f11a3 · nanosoldier · Feb 13, 2018
1 parent 131813f
commit f9f11a3
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 81 deletions.
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
@@ -107,7 +107,7 @@ pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
 Reverses a string. Technically, this function reverses the codepoints in a string and its
 main utility is for reversed-order string processing, especially for reversed
 regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to
-indices in `reverse(s)` and vice-versa, and [`Unicode.graphemes`](@ref Base.Unicode.graphemes) to
+indices in `reverse(s)` and vice-versa, and `graphemes` from module `Unicode` to
 operate on user-visible "characters" (graphemes) rather than codepoints.
 See also [`Iterators.reverse`](@ref) for
 reverse-order iteration without making a copy. Custom string types must implement the

diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
@@ -150,6 +150,7 @@ end
 
 utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
 
+# Documented in Unicode module
 function normalize(
  s::AbstractString;
  stable::Bool=false,
@@ -190,55 +191,6 @@ function normalize(
  utf8proc_map(s, flags)
 end
 
-"""
- Unicode.normalize(s::AbstractString, normalform::Symbol)
-
-Normalize the string `s` according to one of the four "normal forms" of the Unicode
-standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
-(canonical composition) and D (canonical decomposition) convert different visually identical
-representations of the same abstract string into a single canonical form, with form C being
-more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
-they convert characters that are abstractly similar but visually distinct into a single
-canonical choice (e.g. they expand ligatures into the individual characters), with form KC
-being more compact.
-
-Alternatively, finer control and additional transformations may be be obtained by calling
-`Unicode.normalize(s; keywords...)`, where any number of the following boolean keywords
-options (which all default to `false` except for `compose`) are specified:
-
-* `compose=false`: do not perform canonical composition
-* `decompose=true`: do canonical decomposition instead of canonical composition
- (`compose=true` is ignored if present)
-* `compat=true`: compatibility equivalents are canonicalized
-* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
-* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
- sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
- paragraph-separation (PS) character, respectively
-* `stripmark=true`: strip diacritical marks (e.g. accents)
-* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
- or the left-to-right marker)
-* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
- spaces; newlines are also converted to spaces unless a newline-conversion flag was
- specified
-* `rejectna=true`: throw an error if unassigned code points are found
-* `stable=true`: enforce Unicode Versioning Stability
-
-For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
-
-# Examples
-```jldoctest
-julia> using Unicode
-
-julia> "μ" == normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
-true
-
-julia> normalize("JuLiA", casefold=true)
-"julia"
-
-julia> normalize("JúLiA", stripmark=true)
-"JuLiA"
-```
-"""
 function normalize(s::AbstractString, nf::Symbol)
  utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) :
  nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) :
@@ -311,22 +263,6 @@ end
 
 category_string(c) = category_strings[category_code(c)+1]
 
-"""
- Unicode.isassigned(c) -> Bool
-
-Returns `true` if the given char or integer is an assigned Unicode code point.
-
-# Examples
-```jldoctest
-julia> using Unicode
-
-julia> Unicode.isassigned(101)
-true
-
-julia> Unicode.isassigned('\\x01')
-true
-```
-"""
 isassigned(c) = UTF8PROC_CATEGORY_CN < category_code(c) <= UTF8PROC_CATEGORY_CO
 
 ## libc character class predicates ##
@@ -378,11 +314,7 @@ function isupper(c::Char)
  cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
 end
 
-"""
- iscased(c::Char) -> Bool
-
-Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
-"""
+# Documented in Unicode module
 function iscased(c::Char)
  cat = category_code(c)
  return cat == UTF8PROC_CATEGORY_LU ||
@@ -696,14 +628,7 @@ struct GraphemeIterator{S<:AbstractString}
  s::S # original string (for generation of SubStrings)
 end
 
-"""
- graphemes(s::AbstractString) -> GraphemeIterator
-
-Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
-string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
-single characters, even though they may contain more than one codepoint; for example a
-letter combined with an accent mark is a single grapheme.)
-"""
+# Documented in Unicode module
 graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
 
 eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}

diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -4,10 +4,96 @@ __precompile__(true)
 
 module Unicode
 
-using Base.Unicode: normalize, graphemes, isassigned, iscased
-
 export graphemes
 
+"""
+ Unicode.normalize(s::AbstractString, normalform::Symbol)
+
+Normalize the string `s` according to one of the four "normal forms" of the Unicode
+standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
+(canonical composition) and D (canonical decomposition) convert different visually identical
+representations of the same abstract string into a single canonical form, with form C being
+more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
+they convert characters that are abstractly similar but visually distinct into a single
+canonical choice (e.g. they expand ligatures into the individual characters), with form KC
+being more compact.
+
+Alternatively, finer control and additional transformations may be obtained by calling
+`Unicode.normalize(s; keywords...)`, where any number of the following boolean keyword
+options (which all default to `false` except for `compose`) are specified:
+
+* `compose=false`: do not perform canonical composition
+* `decompose=true`: do canonical decomposition instead of canonical composition
+ (`compose=true` is ignored if present)
+* `compat=true`: compatibility equivalents are canonicalized
+* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
+* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
+ sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
+ paragraph-separation (PS) character, respectively
+* `stripmark=true`: strip diacritical marks (e.g. accents)
+* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
+ or the left-to-right marker)
+* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
+ spaces; newlines are also converted to spaces unless a newline-conversion flag was
+ specified
+* `rejectna=true`: throw an error if unassigned code points are found
+* `stable=true`: enforce Unicode Versioning Stability
+
+For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
+
+# Examples
+```jldoctest
+julia> using Unicode
+
+julia> "μ" == Unicode.normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
+true
+
+julia> Unicode.normalize("JuLiA", casefold=true)
+"julia"
+
+julia> Unicode.normalize("JúLiA", stripmark=true)
+"JuLiA"
+```
+"""
+function normalize end
+normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
+normalize(s::AbstractString; kwargs...) = Base.Unicode.normalize(s; kwargs...)
+
+"""
+ Unicode.isassigned(c) -> Bool
+
+Returns `true` if the given char or integer is an assigned Unicode code point.
+
+# Examples
+```jldoctest
+julia> using Unicode
+
+julia> Unicode.isassigned(101)
+true
+
+julia> Unicode.isassigned('\\x01')
+true
+```
+"""
+isassigned(c) = Base.Unicode.isassigned(c)
+
+"""
+ iscased(c::Char) -> Bool
+
+Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
+"""
+iscased(c::Char) = Base.Unicode.iscased(c)
+
+"""
+ graphemes(s::AbstractString) -> GraphemeIterator
+
+Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
+string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
+single characters, even though they may contain more than one codepoint; for example a
+letter combined with an accent mark is a single grapheme.)
+"""
+graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
+
 # BEGIN 0.7 deprecations
 
 @deprecate is_assigned_char(c::Char) Unicode.isassigned(c)