From ed9d6022776a1a7c95acf4d02a5dc4c33773dffa Mon Sep 17 00:00:00 2001 From: Jeffrey Lin Date: Wed, 8 Sep 2021 12:21:41 -0400 Subject: [PATCH] add `eachsplit` for iterative splitting (#39245) This moves the existing splitting implementation into an iterator named `eachsplit` and changes the definition of `split(...)` to `collect(eachsplit(...))`, plus a few edge cases. --- NEWS.md | 1 + base/binaryplatforms.jl | 2 +- base/cmd.jl | 4 +- base/exports.jl | 1 + base/initdefs.jl | 4 +- base/logging.jl | 9 +-- base/mpfr.jl | 4 +- base/path.jl | 4 +- base/strings/util.jl | 119 +++++++++++++++++++++++++++------------- base/sysinfo.jl | 2 +- base/util.jl | 2 +- base/version.jl | 2 +- 12 files changed, 99 insertions(+), 55 deletions(-) diff --git a/NEWS.md b/NEWS.md index e46de4367cac2..b1442f0192350 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ New language features * `@inline` and `@noinline` annotations can now be applied to a function callsite or block to enforce the involved function calls to be (or not to be) inlined. ([#41312]) * The default behavior of observing `@inbounds` declarations is now an option via `auto` in `--check-bounds=yes|no|auto` ([#41551]) +* New function `eachsplit(str)` for iteratively performing `split(str)` ([#39245]). 
Language changes ---------------- diff --git a/base/binaryplatforms.jl b/base/binaryplatforms.jl index aff1de4a80993..cfd7412faf656 100644 --- a/base/binaryplatforms.jl +++ b/base/binaryplatforms.jl @@ -706,7 +706,7 @@ function Base.parse(::Type{Platform}, triplet::AbstractString; validate_strict:: libstdcxx_version = get_field(m, libstdcxx_version_mapping) cxxstring_abi = get_field(m, cxxstring_abi_mapping) function split_tags(tagstr) - tag_fields = filter(!isempty, split(tagstr, "-")) + tag_fields = split(tagstr, "-"; keepempty=false) if isempty(tag_fields) return Pair{String,String}[] end diff --git a/base/cmd.jl b/base/cmd.jl index 0c2a22e6cf852..70d22857522b7 100644 --- a/base/cmd.jl +++ b/base/cmd.jl @@ -269,7 +269,7 @@ function addenv(cmd::Cmd, env::Dict; inherit::Bool = true) merge!(new_env, ENV) end else - for (k, v) in split.(cmd.env, "=") + for (k, v) in eachsplit.(cmd.env, "=") new_env[string(k)::String] = string(v)::String end end @@ -284,7 +284,7 @@ function addenv(cmd::Cmd, pairs::Pair{<:AbstractString}...; inherit::Bool = true end function addenv(cmd::Cmd, env::Vector{<:AbstractString}; inherit::Bool = true) - return addenv(cmd, Dict(k => v for (k, v) in split.(env, "=")); inherit) + return addenv(cmd, Dict(k => v for (k, v) in eachsplit.(env, "=")); inherit) end (&)(left::AbstractCmd, right::AbstractCmd) = AndCmds(left, right) diff --git a/base/exports.jl b/base/exports.jl index 36baa386d5510..ba454936cb7f3 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -578,6 +578,7 @@ export codeunits, digits, digits!, + eachsplit, escape_string, hex2bytes, hex2bytes!, diff --git a/base/initdefs.jl b/base/initdefs.jl index 2cac786cfd194..402e90a0445b4 100644 --- a/base/initdefs.jl +++ b/base/initdefs.jl @@ -100,7 +100,7 @@ function init_depot_path() if haskey(ENV, "JULIA_DEPOT_PATH") str = ENV["JULIA_DEPOT_PATH"] isempty(str) && return - for path in split(str, Sys.iswindows() ? ';' : ':') + for path in eachsplit(str, Sys.iswindows() ? 
';' : ':') if isempty(path) append_default_depot_path!(DEPOT_PATH) else @@ -198,7 +198,7 @@ end function parse_load_path(str::String) envs = String[] isempty(str) && return envs - for env in split(str, Sys.iswindows() ? ';' : ':') + for env in eachsplit(str, Sys.iswindows() ? ';' : ':') if isempty(env) for env′ in DEFAULT_LOAD_PATH env′ in envs || push!(envs, env′) diff --git a/base/logging.jl b/base/logging.jl index 712fe19c23699..731b203a950ba 100644 --- a/base/logging.jl +++ b/base/logging.jl @@ -674,10 +674,11 @@ function handle_message(logger::SimpleLogger, level::LogLevel, message, _module, end iob = IOContext(buf, stream) levelstr = level == Warn ? "Warning" : string(level) - msglines = split(chomp(string(message)::String), '\n') - println(iob, "┌ ", levelstr, ": ", msglines[1]) - for i in 2:length(msglines) - println(iob, "│ ", msglines[i]) + msglines = eachsplit(chomp(string(message)::String), '\n') + msg1, rest = Iterators.peel(msglines) + println(iob, "┌ ", levelstr, ": ", msg1) + for msg in rest + println(iob, "│ ", msg) end for (key, val) in kwargs key === :maxlog && continue diff --git a/base/mpfr.jl b/base/mpfr.jl index 0ffb0f50b1034..c1cdcfb583497 100644 --- a/base/mpfr.jl +++ b/base/mpfr.jl @@ -962,7 +962,7 @@ function string_mpfr(x::BigFloat, fmt::String) end function _prettify_bigfloat(s::String)::String - mantissa, exponent = split(s, 'e') + mantissa, exponent = eachsplit(s, 'e') if !occursin('.', mantissa) mantissa = string(mantissa, '.') end @@ -973,7 +973,7 @@ function _prettify_bigfloat(s::String)::String expo = parse(Int, exponent) if -5 < expo < 6 expo == 0 && return mantissa - int, frac = split(mantissa, '.') + int, frac = eachsplit(mantissa, '.') if expo > 0 expo < length(frac) ? 
string(int, frac[1:expo], '.', frac[expo+1:end]) : diff --git a/base/path.jl b/base/path.jl index 6f74bff25ba26..253bd81684d2f 100644 --- a/base/path.jl +++ b/base/path.jl @@ -368,8 +368,8 @@ function normpath(path::String) isabs = isabspath(path) isdir = isdirpath(path) drive, path = splitdrive(path) - parts = split(path, path_separator_re) - filter!(x->!isempty(x) && x!=".", parts) + parts = split(path, path_separator_re; keepempty=false) + filter!(!=("."), parts) while true clean = true for j = 1:length(parts)-1 diff --git a/base/strings/util.jl b/base/strings/util.jl index b2e9e2e4588f5..ad61a01677a70 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -383,6 +383,83 @@ function rpad( r == 0 ? string(s, p^q) : string(s, p^q, first(p, r)) end +""" + eachsplit(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true) + eachsplit(str::AbstractString; limit::Integer=0, keepempty::Bool=false) + +Split `str` on occurrences of the delimiter(s) `dlm` and return an iterator over the +substrings. `dlm` can be any of the formats allowed by [`findnext`](@ref)'s first argument +(i.e. as a string, regular expression or a function), or as a single character or collection +of characters. + +If `dlm` is omitted, it defaults to [`isspace`](@ref). + +The iterator will return a maximum of `limit` results if the keyword argument is supplied. +The default of `limit=0` implies no maximum. Empty substrings are skipped when `keepempty=false`. + +See also [`split`](@ref). 
+ +# Examples +```jldoctest +julia> a = "Ma.rch" +"Ma.rch" + +julia> collect(eachsplit(a, ".")) +2-element Vector{SubString}: + "Ma" + "rch" +``` +""" +function eachsplit end + +# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime) +# and prevents a major invalidation risk (1550 MethodInstances) +struct SplitIterator{S<:AbstractString,F} + str::S + splitter::F + limit::Int + keepempty::Bool +end + +eltype(::Type{<:SplitIterator}) = SubString + +IteratorSize(::Type{<:SplitIterator}) = SizeUnknown() + +# i: the starting index of the substring to be extracted +# k: the index from which the search for the next delimiter starts +# n: the number of substrings returned so far; splitting stops once n == iter.limit - 1 (the rest is returned whole) +function iterate(iter::SplitIterator, (i, k, n)=(firstindex(iter.str), firstindex(iter.str), 0)) + i - 1 > ncodeunits(iter.str)::Int && return nothing + r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}} + while r !== nothing && n != iter.limit - 1 && first(r) <= ncodeunits(iter.str) + j, k = first(r), nextind(iter.str, last(r))::Int + k_ = k <= j ? 
nextind(iter.str, j) : k + if i < k + substr = @inbounds SubString(iter.str, i, prevind(iter.str, j)::Int) + (iter.keepempty || i < j) && return (substr, (k, k_, n + 1)) + i = k + end + k = k_ + r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}} + end + iter.keepempty || i <= ncodeunits(iter.str) || return nothing + @inbounds SubString(iter.str, i), (ncodeunits(iter.str) + 2, k, n + 1) +end + +eachsplit(str::T, splitter; limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString} = + SplitIterator(str, splitter, limit, keepempty) + +eachsplit(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}}; + limit::Integer=0, keepempty=true) where {T<:AbstractString} = + eachsplit(str, in(splitter); limit, keepempty) + +eachsplit(str::T, splitter::AbstractChar; limit::Integer=0, keepempty=true) where {T<:AbstractString} = + eachsplit(str, isequal(splitter); limit, keepempty) + +# a bit oddball, but standard behavior in Perl, Ruby & Python: +eachsplit(str::AbstractString; limit::Integer=0, keepempty=false) = + eachsplit(str, isspace; limit, keepempty) + """ split(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true) split(str::AbstractString; limit::Integer=0, keepempty::Bool=false) @@ -412,52 +489,16 @@ julia> split(a, ".") "rch" ``` """ -function split end - function split(str::T, splitter; limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString} - _split(str, splitter, limit, keepempty, T <: SubString ? T[] : SubString{T}[]) -end -function split(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}}; - limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString} - _split(str, in(splitter), limit, keepempty, T <: SubString ? 
T[] : SubString{T}[]) -end -function split(str::T, splitter::AbstractChar; - limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString} - _split(str, isequal(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[]) -end - -function _split(str::AbstractString, splitter::F, limit::Integer, keepempty::Bool, strs::Vector) where F - # Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime) - # and prevents a major invalidation risk (1550 MethodInstances) - i = 1 # firstindex(str) - n = lastindex(str)::Int - r = findfirst(splitter,str)::Union{Nothing,Int,UnitRange{Int}} - if r !== nothing - j, k = first(r), nextind(str,last(r))::Int - while 0 < j <= n && length(strs) != limit-1 - if i < k - if keepempty || i < j - push!(strs, @inbounds SubString(str,i,prevind(str,j)::Int)) - end - i = k - end - (k <= j) && (k = nextind(str,j)::Int) - r = findnext(splitter,str,k)::Union{Nothing,Int,UnitRange{Int}} - r === nothing && break - j, k = first(r), nextind(str,last(r))::Int - end - end - if keepempty || i <= ncodeunits(str)::Int - push!(strs, @inbounds SubString(str,i)) - end - return strs + itr = eachsplit(str, splitter; limit, keepempty) + collect(T <: SubString ? T : SubString{T}, itr) end # a bit oddball, but standard behavior in Perl, Ruby & Python: split(str::AbstractString; limit::Integer=0, keepempty::Bool=false) = - split(str, isspace; limit=limit, keepempty=keepempty) + split(str, isspace; limit, keepempty) """ rsplit(s::AbstractString; limit::Integer=0, keepempty::Bool=false) diff --git a/base/sysinfo.jl b/base/sysinfo.jl index cdcb304271b5d..6df8cdc56d20a 100644 --- a/base/sysinfo.jl +++ b/base/sysinfo.jl @@ -499,7 +499,7 @@ function which(program_name::String) # If we have been given just a program name (not a relative or absolute # path) then we should search `PATH` for it here: pathsep = iswindows() ? 
';' : ':' - path_dirs = abspath.(split(get(ENV, "PATH", ""), pathsep)) + path_dirs = map(abspath, eachsplit(get(ENV, "PATH", ""), pathsep)) # On windows we always check the current directory as well if iswindows() diff --git a/base/util.jl b/base/util.jl index f26823cc69ad8..f9f0a02376faf 100644 --- a/base/util.jl +++ b/base/util.jl @@ -97,7 +97,7 @@ function with_output_color(@nospecialize(f::Function), color::Union{Int, Symbol} (bold ? disable_text_style[:bold] : "") * get(disable_text_style, color, text_colors[:default]) first = true - for line in split(str, '\n') + for line in eachsplit(str, '\n') first || print(buf, '\n') first = false isempty(line) && continue diff --git a/base/version.jl b/base/version.jl index 77676f80e3676..2ff1842b79caf 100644 --- a/base/version.jl +++ b/base/version.jl @@ -100,7 +100,7 @@ const VERSION_REGEX = r"^ $"ix function split_idents(s::AbstractString) - idents = split(s, '.') + idents = eachsplit(s, '.') pidents = Union{UInt64,String}[occursin(r"^\d+$", ident) ? parse(UInt64, ident) : String(ident) for ident in idents] return tuple(pidents...)::VerTuple end