Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wip: string overhaul #24439

Closed
wants to merge 39 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1917811
strings: some cosmetic tweaks
StefanKarpinski Nov 10, 2017
80cb480
`Char` representation: UTF-8 bytes with most significant padding
StefanKarpinski Nov 3, 2017
40ac089
`Char` representation: UTF-8 bytes with least significant padding
StefanKarpinski Nov 3, 2017
8ba3bba
iswellformed(c::Char) to test if `c` represents a code point
StefanKarpinski Nov 9, 2017
2aa2c8c
remove internal chomp! function
StefanKarpinski Nov 11, 2017
806441e
convert(String, ::Vector{Char}): don't normalize surrogate pairs
StefanKarpinski Nov 11, 2017
5ab5eef
delete unused `unescape_chars` function
StefanKarpinski Nov 12, 2017
08656a3
malformed chars are always grapheme breaks
StefanKarpinski Nov 13, 2017
b8f7306
iswellformed => !ismalformed; make test stricter
StefanKarpinski Nov 13, 2017
747ce23
slightly more efficient character checking and decoding
StefanKarpinski Nov 13, 2017
d1e83e8
wip
StefanKarpinski Nov 15, 2017
1972d46
wip [ci skip]
StefanKarpinski Nov 15, 2017
0024056
wip
StefanKarpinski Nov 27, 2017
a84e666
wip
StefanKarpinski Nov 27, 2017
358ce5d
Revert "wip"
StefanKarpinski Nov 28, 2017
912779e
wip
StefanKarpinski Nov 29, 2017
cbbee08
wip
StefanKarpinski Nov 29, 2017
8a22a96
wip
StefanKarpinski Nov 29, 2017
829aba2
wip
StefanKarpinski Nov 29, 2017
b2d231b
wip
StefanKarpinski Nov 29, 2017
f82c793
wip
StefanKarpinski Nov 29, 2017
c55cca0
wip
StefanKarpinski Nov 29, 2017
68467ad
my dirty laundry, you filthy voyeur [ci skip]
StefanKarpinski Nov 29, 2017
29dc1c3
wip [ci skip]
StefanKarpinski Dec 5, 2017
d68eb07
wip: added a doc string for AbstractString
StefanKarpinski Dec 6, 2017
2fadfb0
wip
StefanKarpinski Dec 7, 2017
5aad731
wip
StefanKarpinski Dec 7, 2017
cad41c5
wip
StefanKarpinski Dec 7, 2017
f849593
wip
StefanKarpinski Dec 7, 2017
61dbb90
wip
StefanKarpinski Dec 7, 2017
931b289
wip
StefanKarpinski Dec 8, 2017
0794487
wip
StefanKarpinski Dec 8, 2017
b674bc1
wip
StefanKarpinski Dec 8, 2017
b802606
wip
StefanKarpinski Dec 8, 2017
8d414fd
fix [ci skip]
StefanKarpinski Dec 8, 2017
120f9ca
docstring typo fix [ci skip]
StefanKarpinski Dec 8, 2017
1861238
test for more method errors [ci skip]
StefanKarpinski Dec 8, 2017
1c722f1
cosmetic tweaks
StefanKarpinski Dec 8, 2017
e54e4c0
wip
StefanKarpinski Dec 8, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
wip
  • Loading branch information
StefanKarpinski committed Dec 8, 2017
commit 8a22a96e87545f2e3757b3113e8343feffce9028
4 changes: 2 additions & 2 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ end
# Return `true` when the raw bits of `c` fail any of the three checks combined below.
# NOTE(review): this text is a rendered diff hunk without +/- markers; the two
# consecutive `t0 = ...` lines look like the removed (`& 24`) and added (`& 56`) sides
# of the change — confirm against the commit before treating both as live code.
function ismalformed(c::Char)
# Raw 32-bit pattern of the character.
u = reinterpret(UInt32, c)
# Count of leading one bits, scaled by 8 (`<< 3`).
l1 = leading_ones(u) << 3
# Trailing-zero count masked with 24: keeps only the 8 and 16 bits of the count,
# so a count of 32 (all-zero `u`) collapses to 0.
t0 = trailing_zeros(u) & 24
# Trailing-zero count masked with 56: additionally keeps the 32 bit, so a count of 32
# stays 32. As written, this assignment overrides the one above.
t0 = trailing_zeros(u) & 56
# Malformed if: exactly one leading one bit (`l1 == 8`), or the scaled leading-ones
# count plus the padding exceeds 32 bits, or — after shifting out the `t0` padding
# bits — any remaining one of the low three bytes does not have `10` as its top two bits.
(l1 == 8) | (l1 + t0 > 32) |
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
end
Expand All @@ -22,7 +22,7 @@ function convert(::Type{UInt32}, c::Char)
u = reinterpret(UInt32, c)
u < 0x80000000 && return reinterpret(UInt32, u >> 24)
l1 = leading_ones(u)
t0 = trailing_zeros(u) & 24
t0 = trailing_zeros(u) & 56
(l1 == 1) | (8l1 + t0 > 32) |
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
malformed_char(c)::Union{}
Expand Down
2 changes: 1 addition & 1 deletion base/repl/REPLCompletions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ const sorted_keywords = [
"primitive type", "quote", "return", "struct",
"true", "try", "using", "while"]

function complete_keyword(s::String)
function complete_keyword(s::Union{String,SubString{String}})
r = searchsorted(sorted_keywords, s)
i = first(r)
n = length(sorted_keywords)
Expand Down
14 changes: 7 additions & 7 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ end

## thisind, prevind and nextind ##

# TODO: these need updating

function thisind(s::String, i::Integer)
j = Int(i)
j < 1 && return 0
Expand All @@ -131,8 +133,6 @@ function thisind(s::String, i::Integer)
j
end

# TODO: these need updating

function prevind(s::String, i::Integer)
j = Int(i)
e = sizeof(s)
Expand Down Expand Up @@ -244,22 +244,22 @@ function next(s::String, i::Int)
end

# Fold up to three further code units of `s`, starting after index `i`, into `u`, and
# return `(reinterpret(Char, u), i)` with `i` at the position where processing stopped.
# Each stage jumps to `ret` when the index runs past the end of `s`, when `u` is below
# the stage's threshold, or when the code unit does not have `10` as its top two bits.
# NOTE(review): presumably `u` arrives with the lead byte in its top 8 bits, which is
# what the `0xe0000000` / `0xf0000000` thresholds would then test — confirm in the caller.
# NOTE(review): this text is a rendered diff hunk without +/- markers; adjacent
# near-duplicate lines (`sizeof`/`ncodeunits`, parenthesized/unparenthesized checks,
# the two final `u |= UInt32(b)` lines) look like removed/added pairs — confirm
# against the commit before treating both as live code.
@noinline function next_continued(s::String, i::Int, u::UInt32)
# `z` is the upper bound used for every index check below.
z = sizeof(s)
z = ncodeunits(s)
# first continuation byte
(i += 1) > z && @goto ret
@inbounds b = codeunit(s, i)
(b & 0xc0 == 0x80) || @goto ret
b & 0xc0 == 0x80 || @goto ret
# Place the byte in bits 16-23 of `u`.
u |= UInt32(b) << 16
# second continuation byte
((i += 1) > z) | (u < 0xe0000000) && @goto ret
@inbounds b = codeunit(s, i)
(b & 0xc0 == 0x80) || @goto ret
b & 0xc0 == 0x80 || @goto ret
# Place the byte in bits 8-15 of `u`.
u |= UInt32(b) << 8
# third continuation byte
((i += 1) > z) | (u < 0xf0000000) && @goto ret
@inbounds b = codeunit(s, i)
(b & 0xc0 == 0x80) || @goto ret
u |= UInt32(b)
b & 0xc0 == 0x80 || @goto ret
# Place the byte in the low 8 bits and step `i` past it; the early exits above
# leave `i` pointing at the code unit that stopped the scan instead.
u |= UInt32(b); i += 1
@label ret
return reinterpret(Char, u), i
end
Expand Down
3 changes: 1 addition & 2 deletions test/strings/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,7 @@ myio = IOBuffer()
join(myio, "", "", 1)
@test isempty(take!(myio))

@testset "unescape_chars" begin
@test Base.unescape_chars("\\t","t") == "t"
@testset "unescape_string ArgumentErrors" begin
@test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"xZ"))
@test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"777"))
end
Expand Down
17 changes: 3 additions & 14 deletions test/unicode/utf8.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,8 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

# For every (high, low) surrogate pair, build a String from the two Chars, round-trip
# it through Vector{UInt8}, and expect it to equal the single character with code
# point `ch`, which starts at 0x10000 and advances by one per pair.
# NOTE(review): this testset appears to be on the removed side of the diff (the hunk
# header shrinks the start of the file from 19 lines to 8) — confirm before relying on it.
@testset "cesu8 input" begin
let ch = 0x10000
# High surrogates in the outer loop, low surrogates in the inner loop.
for hi = 0xd800:0xdbff
for lo = 0xdc00:0xdfff
@test String(Vector{UInt8}(String(Char[hi, lo]))) == string(Char(ch))
ch += 1
end
end
end
end

@testset "string indexing" begin
let str = String(b"this is a test\xed\x80")
@test next(str, 15) == ('\ufffd', 16)
@test next(str, 15) == (reinterpret(Char, 0xed800000), 17)
@test_throws BoundsError getindex(str, 0:3)
@test_throws BoundsError getindex(str, 17:18)
@test_throws BoundsError getindex(str, 2:17)
Expand All @@ -36,12 +25,12 @@ end
b"xyz\xf0\x80" => b"\xf0\x80zyx",
b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx",
]
@test_broken reverse(String(s)) == String(r)
@test reverse(String(s)) == String(r)
end
end

# Check that byte literals containing three-byte sequences with lead byte 0xed convert
# to the expected strings (`\xed\x80\x80` is U+D000, `\xed\x9f\xbf` is U+D7FF).
# NOTE(review): the two "Specifically check" comment lines below look like the removed
# (`##`) and added (`#`) sides of a one-character diff — confirm against the commit.
@testset "string convert" begin
@test String(b"this is a test\xed\x80\x80") == "this is a test\ud000"
## Specifically check UTF-8 string whose lead byte is same as a surrogate
# Specifically check UTF-8 string whose lead byte is same as a surrogate
@test String(b"\xed\x9f\xbf") == "\ud7ff"
end