Skip to content

Commit

Permalink
reverse iteration for eachline (JuliaLang#42225)
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed Dec 20, 2021
1 parent f835c24 commit fda5769
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 1 deletion.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ Standard library changes

* `range` accepts either `stop` or `length` as a sole keyword argument ([#39241])
* `precision` and `setprecision` now accept a `base` keyword ([#42428]).
* `Iterators.reverse` (and hence `last`) now supports `eachline` iterators ([#42225]).
* The `length` function on certain ranges of certain specific element types no longer checks for integer
overflow in most cases. The new function `checked_length` is now available, which will try to use checked
arithmetic to error if the result may be wrapping. Or use a package such as SaferIntegers.jl when
Expand Down
117 changes: 117 additions & 0 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,11 @@ closed when the `EachLine` object is garbage collected.
To iterate over each line of a `String`, `eachline(IOBuffer(str))` can be used.
[`Iterators.reverse`](@ref) can be used on an `EachLine` object to read the lines
in reverse order (for files, buffers, and other I/O streams supporting [`seek`](@ref)),
and [`first`](@ref) or [`last`](@ref) can be used to extract the initial or final
lines, respectively.
# Examples
```jldoctest
julia> open("my_file.txt", "w") do io
Expand All @@ -1032,6 +1037,9 @@ JuliaLang is a GitHub organization. It has many members.
julia> rm("my_file.txt");
```
!!! compat "Julia 1.8"
Julia 1.8 is required to use `Iterators.reverse` or `last` with `eachline` iterators.
"""
function eachline(stream::IO=stdin; keep::Bool=false)
EachLine(stream, keep=keep)::EachLine
Expand All @@ -1053,6 +1061,115 @@ IteratorSize(::Type{<:EachLine}) = SizeUnknown()

# An `EachLine` has nothing left to yield once its stream is at end-of-file;
# any iteration state passed along is ignored.
function isdone(itr::EachLine, state...)
    return eof(itr.stream)
end

# Reverse-order iteration over an `EachLine` whose stream is seekable: data is read
# backwards from the end of the stream in blocks of at most 4096 bytes.
#
# This method performs the setup: it records the starting position (iteration never
# goes before it), buffers blocks from the end until a line boundary is located or
# the starting position is reached, and then delegates to the two-argument method.
function iterate(r::Iterators.Reverse{<:EachLine})
    io = r.itr.stream
    isnewline = ==(UInt8('\n'))
    startpos = position(io)
    seekend(io) # may throw if io is non-seekable
    pos = position(io)
    # circular buffer of blocks read from the end of the stream; capacity for two
    # buffers is reserved up front since that is the common case
    blocks = empty!(Vector{Vector{UInt8}}(undef, 2))
    linestart = 0 # index of the newline preceding the final line (0 = not found yet)
    lineend = 0   # index of a newline terminating the stream (0 = none seen)
    while pos > startpos && linestart == 0
        block = Vector{UInt8}(undef, min(4096, pos - startpos))
        pos -= length(block)
        readbytes!(seek(io, pos), block)
        pushfirst!(blocks, block)
        hit = findlast(isnewline, block)
        linestart = hit === nothing ? 0 : hit
        if length(blocks) == 1 && linestart == length(block)
            # the stream ends with a newline: remember it as the end of the final
            # line and look for the newline before it
            lineend = linestart
            hit = findprev(isnewline, block, linestart - 1)
            linestart = hit === nothing ? 0 : hit
        end
    end
    if lineend == 0 && !isempty(blocks)
        lineend = length(blocks[end]) # no trailing newline: final line ends with the data
    end
    # field names and order must match what the two-argument method destructures
    state = (; p0 = startpos, p = pos, chunks = blocks, ichunk = 1,
             inewline = linestart, jchunk = length(blocks), jnewline = lineend)
    return iterate(r, state)
end
# Produce the next line (moving towards the start of the stream) of a reversed
# `EachLine`.
#
# `state` is the named tuple created by the one-argument method and by earlier calls
# of this method, destructured positionally as
#   p0                 stream position at which reverse iteration stops
#   p                  stream position of the earliest byte buffered so far
#   chunks             circular array of chunk buffers
#   ichunk, inewline   chunk / byte index of the newline preceding the current line
#                      (`inewline == 0`: no earlier newline, the line starts with the
#                      first byte of `chunks[ichunk]`)
#   jchunk, jnewline   chunk / byte index of the last byte of the current line
#
# Returns `(line, newstate)`; returns `nothing` after calling `r.itr.ondone()` once
# `chunks` has been emptied by the call that produced the final line.
function iterate(r::Iterators.Reverse{<:EachLine}, state)
    function _stripnewline(keep, pos, data)
        # strip \n or \r\n from data[pos] by decrementing pos
        if !keep && pos > 0 && data[pos] == UInt8('\n')
            pos -= 1
            pos -= pos > 0 && data[pos] == UInt8('\r')
        end
        return pos
    end
    p0, p, chunks, ichunk, inewline, jchunk, jnewline = state
    if inewline == 0 # no newline found, remaining line = rest of chunks (if any)
        isempty(chunks) && return (r.itr.ondone(); nothing)
        buf = IOBuffer(sizehint = ichunk==jchunk ? jnewline : 4096)
        # copy whole chunks from ichunk up to (not including) jchunk, wrapping around
        while ichunk != jchunk
            write(buf, chunks[ichunk])
            ichunk = ichunk == length(chunks) ? 1 : ichunk + 1
        end
        chunk = chunks[jchunk]
        write(buf, view(chunk, 1:jnewline))
        buf.size = _stripnewline(r.itr.keep, buf.size, buf.data)
        empty!(chunks) # will cause next iteration to terminate
        seekend(r.itr.stream) # reposition to end of stream for isdone
        s = String(take!(buf))
    else
        # extract the string from chunks[ichunk][inewline+1] to chunks[jchunk][jnewline]
        if ichunk == jchunk # common case: current and previous newline in same chunk
            chunk = chunks[ichunk]
            s = String(view(chunk, inewline+1:_stripnewline(r.itr.keep, jnewline, chunk)))
        else
            buf = IOBuffer(sizehint=max(128, length(chunks[ichunk])-inewline+jnewline))
            write(buf, view(chunks[ichunk], inewline+1:length(chunks[ichunk])))
            i = ichunk
            while true
                i = i == length(chunks) ? 1 : i + 1
                i == jchunk && break
                write(buf, chunks[i])
            end
            write(buf, view(chunks[jchunk], 1:jnewline))
            buf.size = _stripnewline(r.itr.keep, buf.size, buf.data)
            s = String(take!(buf))

            # overwrite obsolete chunks (ichunk+1:jchunk) with data preceding `p`;
            # a chunk is resized to length 0 once `p` has reached `p0`
            i = jchunk
            while i != ichunk
                chunk = chunks[i]
                p -= length(resize!(chunk, min(4096, p-p0)))
                readbytes!(seek(r.itr.stream, p), chunk)
                i = i == 1 ? length(chunks) : i - 1
            end
        end

        # find the newline previous to inewline
        jchunk = ichunk
        jnewline = inewline
        while true
            inewline = something(findprev(==(UInt8('\n')), chunks[ichunk], inewline-1), 0)
            inewline > 0 && break
            ichunk = ichunk == 1 ? length(chunks) : ichunk - 1
            ichunk == jchunk && break # found nothing — may need to read more chunks
            inewline = length(chunks[ichunk])+1 # start for next findprev
        end

        if inewline == 0
            if p > p0
                # read more chunks to look for a newline (should rarely happen);
                # each new chunk is inserted right after jchunk, i.e. as the oldest
                ichunk = jchunk + 1
                while true
                    chunk = Vector{UInt8}(undef, min(4096, p-p0))
                    p -= length(chunk)
                    readbytes!(seek(r.itr.stream, p), chunk)
                    insert!(chunks, ichunk, chunk)
                    inewline = something(findlast(==(UInt8('\n')), chunk), 0)
                    (p == p0 || inewline > 0) && break
                end
            else
                # Everything back to p0 is already buffered and holds no further
                # newline, so the final line starts in the oldest chunk: the circular
                # successor of jchunk. The search above wrapped around and left
                # ichunk == jchunk, which would make the next call drop the earlier
                # chunks of that line.
                ichunk = jchunk == length(chunks) ? 1 : jchunk + 1
            end
        end
    end
    return (s, (; p0, p, chunks, ichunk, inewline, jchunk, jnewline))
end
# With a state in hand, reverse iteration is finished once the state's chunk
# buffer is empty.
function isdone(r::Iterators.Reverse{<:EachLine}, state)
    return isempty(state.chunks)
end

# Without a state, defer to the wrapped `EachLine` iterator.
function isdone(r::Iterators.Reverse{<:EachLine})
    return isdone(r.itr)
end

# The last line of an `EachLine` is the first element produced by its reversed
# iterator (available only where reverse iteration is possible).
function last(itr::EachLine)
    return first(Iterators.reverse(itr))
end

# Iterator wrapper around an I/O stream, parameterized by `T` and by the concrete
# stream type `IOT`. `T` is not used by any field — presumably it is the type of the
# values read on each step; confirm against the methods that consume this type.
struct ReadEachIterator{T, IOT <: IO}
stream::IOT
end
Expand Down
1 change: 0 additions & 1 deletion base/iterators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ size(r::Reverse) = size(r.itr)
# `Reverse{T}` reports the same size and eltype traits as the wrapped iterator type `T`.
IteratorSize(::Type{Reverse{T}}) where {T} = IteratorSize(T)
IteratorEltype(::Type{Reverse{T}}) where {T} = IteratorEltype(T)
# The last element of a reversed iterator is the first element of the wrapped one.
last(r::Reverse) = first(r.itr) # the first shall be last
# NOTE(review): the summary for base/iterators.jl reports 0 additions and 1 deletion;
# this appears to be the deleted line. Together with
# `last(itr::EachLine) = first(Iterators.reverse(itr))` from base/io.jl the two
# methods would call each other for `EachLine` — confirm against the raw diff.
first(r::Reverse) = last(r.itr) # and the last shall be first

# reverse-order array iterators: assumes more-specialized Reverse for eachindex
@propagate_inbounds function iterate(A::Reverse{<:AbstractArray}, state=(reverse(eachindex(A.itr)),))
Expand Down
38 changes: 38 additions & 0 deletions test/read.jl
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,14 @@ for (name, f) in l
@test collect(eachline(io(), keep=true)) == collect(eachline(filename, keep=true))
@test collect(eachline(io())) == collect(eachline(IOBuffer(text)))
@test collect(@inferred(eachline(io()))) == collect(@inferred(eachline(filename))) #20351
# Probe whether the stream returned by `io()` can be seeked to its end; the checks
# below are skipped when `seekend` throws.
if try; seekend(io()); true; catch; false; end # reverse iteration only supports seekable streams
for keep in (true, false)
# reference result: all lines read eagerly, with or without line terminators
lines = readlines(io(); keep)
# `last` on the lazy iterator must agree with `last` on the materialized lines,
# for a single line and for the final two lines
@test last(lines) == last(eachline(io(); keep))
@test last(lines,2) == last(eachline(io(); keep),2)
# reversed iteration over `io()` and over an in-memory copy of `text` must both equal
# the reversed reference (note that `reverse!` mutates `lines` in place)
@test reverse!(lines) == collect(Iterators.reverse(eachline(io(); keep))) == collect(Iterators.reverse(eachline(IOBuffer(text); keep)))
end
end

cleanup()

Expand Down Expand Up @@ -621,3 +629,33 @@ end
first(itr) # consume the iterator
@test isempty(itr) # now it is empty
end

# more tests for reverse(eachline)
@testset "reverse(eachline)" begin
# short lines plus six lines of more than 50000 bytes each, so that single lines
# span many of the 4096-byte chunks used by the reverse iterator
lines = vcat(repr.(1:4), ' '^50000 .* repr.(5:10), repr.(11:10^5))
# every combination of line order, trailing terminator present/absent, and "\n" vs "\r\n"
for lines in (lines, reverse(lines)), finalnewline in (true, false), eol in ("\n", "\r\n")
buf = IOBuffer(join(lines, eol) * (finalnewline ? eol : ""))
# full reverse iteration from the start of the buffer recovers every line
@test reverse!(collect(Iterators.reverse(eachline(seekstart(buf))))) == lines
# `last` with no count, with a count, and with a count exceeding the number of lines
@test last(eachline(seekstart(buf))) == last(lines)
@test last(eachline(seekstart(buf)),10^4) == last(lines,10^4)
@test last(eachline(seekstart(buf)),length(lines)*2) == lines
# start from just past the first 100 lines: reverse iteration must stop at that position
@test reverse!(collect(Iterators.reverse(eachline(seek(buf, sum(sizeof, lines[1:100]) + 100*sizeof(eol)))))) == lines[101:end]
# `buf` is not rewound here; the step that yields the final line calls `seekend`,
# so the buffer is expected to be at its end and nothing remains to iterate
@test isempty(Iterators.reverse(eachline(buf)))
end

# an empty stream yields no lines
let rempty = Iterators.reverse(eachline(IOBuffer()))
@test isempty(rempty)
@test isempty(collect(rempty))
end

# reverse iteration starts from the current stream position: after "foo" has been
# consumed only "bar" is produced, and the iterator then reports being done
let buf = IOBuffer("foo\nbar")
@test readline(buf) == "foo"
r = Iterators.reverse(eachline(buf))
line, state = iterate(r)
@test line == "bar"
@test Base.isdone(r, state)
@test Base.isdone(r)
@test isempty(r) && isempty(collect(r))
end
end

0 comments on commit fda5769

Please sign in to comment.