Add isvalid(Type, value) methods, to replace is_valid_*

pwl · May 19, 2015 · ca88515 · ca88515
1 parent eb5da26
commit ca88515
Show file tree

Hide file tree

Showing 13 changed files with 151 additions and 69 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -367,6 +367,13 @@ Deprecated or removed
 
  * Instead of `linrange`, use `linspace` ([#9666]).
 
+ * The functions `is_valid_char`, `is_valid_ascii`, `is_valid_utf8`, `is_valid_utf16`, and
+ `is_valid_utf32` have been replaced by generic `isvalid` methods.
+ The single argument form `isvalid(value)` can now be used for values of type `Char`, `ASCIIString`,
+ `UTF8String`, `UTF16String` and `UTF32String`.
+ The two argument form `isvalid(type, value)` accepts any of the above types as `type`; `value` may be
+ of that type or a `Vector{UInt8}`, `Vector{UInt16}`, `Vector{UInt32}`, or `Vector{Char}` ([#11241]).
+
 Julia v0.3.0 Release Notes
 ==========================
 
@@ -1379,6 +1386,7 @@ Too numerous to mention.
 [#9779]: https://github.com/JuliaLang/julia/issues/9779
 [#9862]: https://github.com/JuliaLang/julia/issues/9862
 [#9957]: https://github.com/JuliaLang/julia/issues/9957
+[#10008]: https://github.com/JuliaLang/julia/issues/10008
 [#10024]: https://github.com/JuliaLang/julia/issues/10024
 [#10031]: https://github.com/JuliaLang/julia/issues/10031
 [#10075]: https://github.com/JuliaLang/julia/issues/10075
@@ -1406,5 +1414,9 @@ Too numerous to mention.
 [#10888]: https://github.com/JuliaLang/julia/issues/10888
 [#10893]: https://github.com/JuliaLang/julia/issues/10893
 [#10914]: https://github.com/JuliaLang/julia/issues/10914
+[#10955]: https://github.com/JuliaLang/julia/issues/10955
 [#10994]: https://github.com/JuliaLang/julia/issues/10994
+[#11105]: https://github.com/JuliaLang/julia/issues/11105
 [#11145]: https://github.com/JuliaLang/julia/issues/11145
+[#11171]: https://github.com/JuliaLang/julia/issues/11171
+[#11241]: https://github.com/JuliaLang/julia/issues/11241
diff --git a/base/ascii.jl b/base/ascii.jl
@@ -100,7 +100,7 @@ ascii(x) = convert(ASCIIString, x)
 convert(::Type{ASCIIString}, s::ASCIIString) = s
 convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data)
 convert(::Type{ASCIIString}, a::Vector{UInt8}) = begin
- is_valid_ascii(a) || throw(ArgumentError("invalid ASCII sequence"))
+ isvalid(ASCIIString,a) || throw(ArgumentError("invalid ASCII sequence"))
  return ASCIIString(a)
 end
 

diff --git a/base/deprecated.jl b/base/deprecated.jl
@@ -443,3 +443,17 @@ export float32_isvalid, float64_isvalid
 @deprecate (&)(x::Char, y::Char) Char(UInt32(x) & UInt32(y))
 @deprecate (|)(x::Char, y::Char) Char(UInt32(x) | UInt32(y))
 @deprecate ($)(x::Char, y::Char) Char(UInt32(x) $ UInt32(y))
+
+# 11241
+
+@deprecate is_valid_char(ch::Char) isvalid(ch)
+@deprecate is_valid_ascii(str::ASCIIString) isvalid(str)
+@deprecate is_valid_utf8(str::UTF8String) isvalid(str)
+@deprecate is_valid_utf16(str::UTF16String) isvalid(str)
+@deprecate is_valid_utf32(str::UTF32String) isvalid(str)
+
+@deprecate is_valid_char(ch) isvalid(Char, ch)
+@deprecate is_valid_ascii(str) isvalid(ASCIIString, str)
+@deprecate is_valid_utf8(str) isvalid(UTF8String, str)
+@deprecate is_valid_utf16(str) isvalid(UTF16String, str)
+@deprecate is_valid_utf32(str) isvalid(UTF32String, str)
diff --git a/base/exports.jl b/base/exports.jl
@@ -820,11 +820,6 @@ export
  ind2chr,
  info,
  is_assigned_char,
- is_valid_ascii,
- is_valid_char,
- is_valid_utf8,
- is_valid_utf16,
- is_valid_utf32,
  isalnum,
  isalpha,
  isascii,

diff --git a/base/io.jl b/base/io.jl
@@ -246,7 +246,7 @@ end
 
 function readall(s::IO)
  b = readbytes(s)
- return is_valid_ascii(b) ? ASCIIString(b) : UTF8String(b)
+ return isvalid(ASCIIString, b) ? ASCIIString(b) : UTF8String(b)
 end
 readall(filename::AbstractString) = open(readall, filename)
 

diff --git a/base/string.jl b/base/string.jl
@@ -968,8 +968,8 @@ byte_string_classify(s::ByteString) = byte_string_classify(s.data)
  # 1: valid ASCII
  # 2: valid UTF-8
 
-is_valid_ascii(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
-is_valid_utf8(s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
+isvalid(::Type{ASCIIString}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
+isvalid(::Type{UTF8String}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
 
 ## multiline strings ##
 

diff --git a/base/utf16.jl b/base/utf16.jl
@@ -95,7 +95,7 @@ sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
 unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
  convert(Ptr{T}, pointer(s))
 
-function is_valid_utf16(data::AbstractArray{UInt16})
+function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
  i = 1
  n = length(data) # this may include NULL termination; that's okay
  while i < n # check for unpaired surrogates
@@ -110,10 +110,8 @@ function is_valid_utf16(data::AbstractArray{UInt16})
  return i > n || !utf16_is_surrogate(data[i])
 end
 
-is_valid_utf16(s::UTF16String) = is_valid_utf16(s.data)
-
 function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
- !is_valid_utf16(data) && throw(ArgumentError("invalid UTF16 data"))
+ !isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
  len = length(data)
  d = Array(UInt16, len + 1)
  d[end] = 0 # NULL terminate
@@ -144,7 +142,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
  copy!(d,1, data,1, length(data)) # assume native byte order
  end
  d[end] = 0 # NULL terminate
- !is_valid_utf16(d) && throw(ArgumentError("invalid UTF16 data"))
+ !isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
  UTF16String(d)
 end
 

diff --git a/base/utf32.jl b/base/utf32.jl
@@ -92,13 +92,14 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
  UTF32String(d)
 end
 
-function is_valid_utf32(s::Union(Vector{Char}, Vector{UInt32}))
- for i=1:length(s)
- @inbounds if !is_valid_char(reinterpret(UInt32, s[i])) ; return false ; end
+function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
+ for i=1:length(str)
+ @inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
  end
  return true
 end
-is_valid_utf32(s::UTF32String) = is_valid_utf32(s.data)
+isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
+isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
 
 utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
 utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)

diff --git a/base/utf8.jl b/base/utf8.jl
@@ -212,7 +212,7 @@ write(io::IO, s::UTF8String) = write(io, s.data)
 utf8(x) = convert(UTF8String, x)
 convert(::Type{UTF8String}, s::UTF8String) = s
 convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
-convert(::Type{UTF8String}, a::Array{UInt8,1}) = is_valid_utf8(a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
+convert(::Type{UTF8String}, a::Array{UInt8,1}) = isvalid(UTF8String, a) ? UTF8String(a) : throw(ArgumentError("invalid UTF-8 sequence"))
 function convert(::Type{UTF8String}, a::Array{UInt8,1}, invalids_as::AbstractString)
  l = length(a)
  idx = 1

diff --git a/base/utf8proc.jl b/base/utf8proc.jl
@@ -3,19 +3,21 @@
 # Various Unicode functionality from the utf8proc library
 module UTF8proc
 
-import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert
+import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert, isvalid
 
 export isgraphemebreak
 
 # also exported by Base:
-export normalize_string, graphemes, is_valid_char, is_assigned_char, charwidth,
+export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
  islower, isupper, isalpha, isdigit, isnumber, isalnum,
  iscntrl, ispunct, isspace, isprint, isgraph, isblank
 
 # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
-is_valid_char(ch::Unsigned) = !Bool((ch-0xd800<0x800)|(ch>0x10ffff))
-is_valid_char(ch::Integer) = is_valid_char(Unsigned(ch))
-is_valid_char(ch::Char) = is_valid_char(UInt32(ch))
+isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
+isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch))
+isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch))
+
+isvalid(ch::Char) = isvalid(Char, ch)
 
 # utf8 category constants
 const UTF8PROC_CATEGORY_CN = 0

diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
@@ -99,14 +99,14 @@ convert an integer value back to a :obj:`Char` just as easily:
 Not all integer values are valid Unicode code points, but for
 performance, the :func:`Char` conversion does not check that every character
 value is valid. If you want to check that each converted value is a
-valid code point, use the :func:`is_valid_char` function:
+valid code point, use the :func:`isvalid` function:
 
 .. doctest::
 
  julia> Char(0x110000)
  '\U110000'
 
- julia> is_valid_char(0x110000)
+ julia> isvalid(Char, 0x110000)
  false
 
 As of this writing, the valid Unicode code points are ``U+00`` through

diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst
@@ -109,17 +109,19 @@
  even though they may contain more than one codepoint; for example
  a letter combined with an accent mark is a single grapheme.)
 
-.. function:: is_valid_ascii(s) -> Bool
+.. function:: isvalid(value) -> Bool
 
- Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
+ Returns true if the given value is valid for its type,
+ which currently can be one of ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
 
-.. function:: is_valid_utf8(s) -> Bool
+.. function:: isvalid(T, value) -> Bool
 
- Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid UTF-8, false otherwise.
-
-.. function:: is_valid_char(c) -> Bool
-
- Returns true if the given char or integer is a valid Unicode code point.
+ Returns true if the given value is valid for that type.
+ Types currently can be ``Char``, ``ASCIIString``, ``UTF8String``, ``UTF16String``, or ``UTF32String``.
+ Values for ``Char`` can be of type ``Char`` or an integer type such as ``UInt32``.
+ Values for ``ASCIIString`` and ``UTF8String`` can be of that type, or ``Vector{UInt8}``.
+ Values for ``UTF16String`` can be ``UTF16String`` or ``Vector{UInt16}``.
+ Values for ``UTF32String`` can be ``UTF32String``, ``Vector{Char}`` or ``Vector{UInt32}``.
 
 .. function:: is_assigned_char(c) -> Bool
 
@@ -379,10 +381,6 @@
 
  Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated.
 
-.. function:: is_valid_utf16(s) -> Bool
-
- Returns true if the argument (``UTF16String`` or ``UInt16`` array) is valid UTF-16.
-
 .. function:: utf32(s)
 
  Create a UTF-32 string from a byte array, array of ``UInt32``, or