## core string functions ## length{T<:String}(s::T) = error("you must implement length(",T,")") next{T<:String}(s::T, i::Int) = error("you must implement next(",T,",Int)") next(s::DirectIndexString, i::Int) = (s[i],i+1) ## generic supplied functions ## start(s::String) = 1 done(s::String,i) = (i > length(s)) numel(s::String) = length(s) isempty(s::String) = done(s,start(s)) ref(s::String, i::Index) = next(s,i)[1] ref(s::String, x::Real) = s[iround(x)] ref(s::String, r::Range1) = s[iround(r.start):iround(r.stop)] symbol(s::String) = symbol(cstring(s)) string(s::String) = s print(s::String) = for c=s; print(c); end print(x...) = for i=x; print(i); end println(args...) = print(args..., '\n') show(s::String) = print_quoted(s) (*)(s::String...) = strcat(s...) (^)(s::String, r::Int) = repeat(s,r) size(s::String) = (length(s),) size(s::String, d::Index) = d == 1 ? length(s) : error("in size: tupleref: index ",d," out of range") strlen(s::DirectIndexString) = length(s) function strlen(s::String) i = start(s) if done(s,i) return 0 end n = 1 while true c, j = next(s,i) if done(s,j) return n end n += 1 i = j end end isvalid(s::DirectIndexString, i::Int) = true function isvalid(s::String, i::Int) try next(s,i) true catch false end end thisind(s::DirectIndexString, i::Int) = i nextind(s::DirectIndexString, i::Int) = i+1 function thisind(s::String, i::Int) for j = i:-1:1 if isvalid(s,j) return j end end return 0 # out of range end function nextind(s::String, i::Int) for j = i+1:length(s) if isvalid(s,j) return j end end length(s)+1 # out of range end ind2chr(s::DirectIndexString, i::Int) = i chr2ind(s::DirectIndexString, i::Int) = i function ind2chr(s::String, i::Int) s[i] # throws error if invalid j = 1 k = start(s) while true c, l = next(s,k) if i <= k return j end j += 1 k = l end end function chr2ind(s::String, i::Int) if i < 1 return i end j = 1 k = start(s) while true c, l = next(s,k) if i == j return k end j += 1 k = l end end function strchr(s::String, c::Char, i::Int) i = nextind(s,i) while !done(s,i) d, j = next(s,i) if c == d return i end i = j end return 0 end strchr(s::String, c::Char) = strchr(s, c, start(s)) contains(s::String, c::Char) = (strchr(s,c)!=0) function chars(s::String) cx = Array(Char,strlen(s)) i = 0 for c = s cx[i += 1] = c end return cx end function cmp(a::String, b::String) i = start(a) j = start(b) while !done(a,i) && !done(b,i) c, i = next(a,i) d, j = next(b,j) if c != d return c < d ? -1 : +1 end end done(a,i) && !done(b,j) ? -1 : !done(a,i) && done(b,j) ? +1 : 0 end (< )(a::String, b::String) = cmp(a,b) < 0 (<=)(a::String, b::String) = cmp(a,b) <= 0 isequal(a::String, b::String) = cmp(a,b) == 0 # faster comparisons for byte strings cmp(a::ByteString, b::ByteString) = lexcmp(a.data, b.data) isequal(a::ByteString, b::ByteString) = length(a)==length(b) && cmp(a,b)==0 ## generic string uses only length and next ## type GenericString <: String string::String end length(s::GenericString) = length(s.string) next(s::GenericString, i::Int) = next(s.string, i) ## plain old character arrays ## type CharString <: String chars::Array{Char,1} CharString(a::Array{Char,1}) = new(a) CharString(c::Char...) = new([ c[i] | i=1:length(c) ]) end CharString(x...) = CharString(map(char,x)...) next(s::CharString, i::Index) = (s.chars[i], i+1) length(s::CharString) = length(s.chars) strlen(s::CharString) = length(s) string(c::Char) = CharString(c) string(c::Char, x::Char...) = CharString(c, x...) ## substrings reference original strings ## type SubString <: String string::String offset::Index length::Index SubString(s::String, i::Index, j::Index) = new(s, i-1, j-i+1) SubString(s::SubString, i::Index, j::Index) = new(s.string, i-1+s.offset, j-i+1) end function next(s::SubString, i::Index) if i < 1 || i > s.length error("string index out of bounds") end c, i = next(s.string, i+s.offset) c, i-s.offset end length(s::SubString) = s.length # TODO: strlen(s::SubString) = ?? # default implementation will work but it's slow # can this be delegated efficiently somehow? # that may require additional string interfaces function ref(s::String, r::Range1{Index}) if r.start < 1 || length(s) < r.stop error("in substring slice: index out of range") end SubString(s, r.start, r.stop) end ## efficient representation of repeated strings ## type RepString <: String string::String repeat::Int end length(s::RepString) = length(s.string)*s.repeat strlen(s::RepString) = strlen(s.string)*s.repeat function next(s::RepString, i::Index) if i < 1 || i > length(s) error("string index out of bounds") end j = mod1(i,length(s.string)) c, k = next(s.string, j) c, k-j+i end function repeat(s::String, r::Int) r < 0 ? error("can't repeat a string ",r," times") : r == 0 ? "" : r == 1 ? s : RepString(s,r) end ## reversed strings without data movement ## type RevString <: String string::String end length(s::RevString) = length(s.string) strlen(s::RevString) = strlen(s.string) start(s::RevString) = (n=length(s); n-thisind(s.string,n)+1) function next(s::RevString, i::Index) n = length(s); j = n-i+1 (s.string[j], n-thisind(s.string,j-1)+1) end reverse(s::String) = RevString(s) reverse(s::RevString) = s.string ## ropes for efficient concatenation, etc. ## # Idea: instead of this standard binary tree structure, # how about we keep an array of substrings, with an # offset array. We can do binary search on the offset # array so we get O(log(n)) indexing time still, but we # can compute the offsets lazily and avoid all the # futzing around while the string is being constructed. type RopeString <: String head::String tail::String depth::Int32 length::Index RopeString(h::RopeString, t::RopeString) = depth(h.tail) + depth(t) < depth(h.head) ? RopeString(h.head, RopeString(h.tail, t)) : new(h, t, max(h.depth,t.depth)+1, length(h)+length(t)) RopeString(h::RopeString, t::String) = depth(h.tail) < depth(h.head) ? RopeString(h.head, RopeString(h.tail, t)) : new(h, t, h.depth+1, length(h)+length(t)) RopeString(h::String, t::RopeString) = depth(t.head) < depth(t.tail) ? RopeString(RopeString(h, t.head), t.tail) : new(h, t, t.depth+1, length(h)+length(t)) RopeString(h::String, t::String) = new(h, t, 1, length(h)+length(t)) end depth(s::String) = 0 depth(s::RopeString) = s.depth function next(s::RopeString, i::Index) if i <= length(s.head) return next(s.head, i) else c, j = next(s.tail, i-length(s.head)) return c, j+length(s.head) end end length(s::RopeString) = s.length strlen(s::RopeString) = strlen(s.head) + strlen(s.tail) strcat() = "" strcat(s::String) = s strcat(x...) = strcat(map(string,x)...) strcat(s::String, t::String...) = (t = strcat(t...); isempty(s) ? t : isempty(t) ? s : RopeString(s, t)) print(s::RopeString) = print(s.head, s.tail) ## transformed strings ## type TransformedString <: String transform::Function string::String end length(s::TransformedString) = length(s.string) strlen(s::TransformedString) = strlen(s.string) function next(s::TransformedString, i::Index) c, j = next(s.string,i) c = s.transform(c, i) return c, j end ## uppercase and lowercase transformations ## uc(c::Char) = ccall(dlsym(libc, :towupper), Char, (Char,), c) lc(c::Char) = ccall(dlsym(libc, :towlower), Char, (Char,), c) uc(s::String) = TransformedString((c,i)->uc(c), s) lc(s::String) = TransformedString((c,i)->lc(c), s) ucfirst(s::String) = TransformedString((c,i)->i==1 ? uc(c) : c, s) lcfirst(s::String) = TransformedString((c,i)->i==1 ? lc(c) : c, s) ## conversion of general objects to strings ## string(x) = print_to_string(show, x) cstring(x...) = print_to_string(print, x...) function cstring(p::Ptr{Uint8}) p == C_NULL ? error("cannot convert NULL to string") : ccall(:jl_cstr_to_string, Any, (Ptr{Uint8},), p)::String end ## printing literal quoted string data ## # TODO: this is really the inverse of print_unbackslashed function print_quoted_literal(s::String) print('"') for c = s; c == '"' ? print("\\\"") : print(c); end print('"') end ## string escaping & unescaping ## escape_nul(s::String, i::Index) = !done(s,i) && '0' <= next(s,i)[1] <= '7' ? L"\x00" : L"\0" is_hex_digit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' need_full_hex(s::String, i::Index) = !done(s,i) && is_hex_digit(next(s,i)[1]) function print_escaped(s::String, esc::String) i = start(s) while !done(s,i) c, j = next(s,i) c == '\0' ? print(escape_nul(s,j)) : c == '\e' ? print(L"\e") : c == '\\' ? print("\\\\") : contains(esc,c) ? print('\\', c) : iswprint(c) ? print(c) : 7 <= c <= 13 ? print('\\', "abtnvfr"[c-6]) : c <= '\x7f' ? print(L"\x", hex(c, 2)) : c <= '\uffff' ? print(L"\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : print(L"\U", hex(c, need_full_hex(s,j) ? 8 : 4)) i = j end end escape_string(s::String) = print_to_string(length(s), print_escaped, s, "\"") print_quoted(s::String) = (print('"'); print_escaped(s, "\"\$"); print('"')) #" # work around syntax highlighting problem quote_string(s::String) = print_to_string(length(s)+2, print_quoted, s) # bare minimum unescaping function unescapes only given characters function print_unescaped_chars(s::String, esc::String) if !contains(esc,'\\') esc = strcat("\\", esc) end i = start(s) while !done(s,i) c, i = next(s,i) if c == '\\' && !done(s,i) && contains(esc,s[i]) c, i = next(s,i) end print(c) end end unescape_chars(s::String, esc::String) = print_to_string(length(s), print_unescaped_chars, s, esc) # general unescaping of traditional C and Unicode escape sequences function print_unescaped(s::String) i = start(s) while !done(s,i) c, i = next(s,i) if !done(s,i) && c == '\\' c, i = next(s,i) if c == 'x' || c == 'u' || c == 'U' n = k = 0 m = c == 'x' ? 2 : c == 'u' ? 4 : 8 while (k+=1) <= m && !done(s,i) c, j = next(s,i) n = '0' <= c <= '9' ? n<<4 + c-'0' : 'a' <= c <= 'f' ? n<<4 + c-'a'+10 : 'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break i = j end if k == 1 error("\\x used with no following hex digits") end if m == 2 # \x escape sequence write(uint8(n)) else print(char(n)) end elseif '0' <= c <= '7' k = 1 n = c-'0' while (k+=1) <= 3 && !done(s,i) c, j = next(s,i) n = '0' <= c <= '7' ? n<<3 + c-'0' : break i = j end if n > 255 error("octal escape sequence out of range") end write(uint8(n)) else print(c == 'a' ? '\a' : c == 'b' ? '\b' : c == 't' ? '\t' : c == 'n' ? '\n' : c == 'v' ? '\v' : c == 'f' ? '\f' : c == 'r' ? '\r' : c == 'e' ? '\e' : c) end else print(c) end end end unescape_string(s::String) = print_to_string(length(s), print_unescaped, s) ## checking UTF-8 & ACSII validity ## byte_string_classify(s::ByteString) = ccall(:u8_isvalid, Int32, (Ptr{Uint8}, Size), s.data, length(s)) # 0: neither valid ASCII nor UTF-8 # 1: valid ASCII # 2: valid UTF-8 is_valid_ascii(s::ByteString) = byte_string_classify(s) == 1 is_valid_utf8 (s::ByteString) = byte_string_classify(s) != 0 check_ascii(s::ByteString) = is_valid_ascii(s) ? s : error("invalid ASCII sequence") check_utf8 (s::ByteString) = is_valid_utf8(s) ? s : error("invalid UTF-8 sequence") ## string interpolation parsing ## function _jl_interp_parse(s::String, unescape::Function, printer::Function) sx = {} i = j = start(s) while !done(s,j) c, k = next(s,j) if c == '$' if !isempty(s[i:j-1]) push(sx, unescape(s[i:j-1])) end ex, j = parseatom(s,k) push(sx, ex) i = j elseif c == '\\' && !done(s,k) if s[k] == '$' if !isempty(s[i:j-1]) push(sx, unescape(s[i:j-1])) end i = k end c, j = next(s,k) else j = k end end if !isempty(s[i:]) push(sx, unescape(s[i:j-1])) end length(sx) == 1 && isa(sx[1],ByteString) ? sx[1] : expr(:call, :print_to_string, printer, sx...) end _jl_interp_parse(s::String, u::Function) = _jl_interp_parse(s, u, print) _jl_interp_parse(s::String) = _jl_interp_parse(s, x->check_utf8(unescape_string(x))) function _jl_interp_parse_bytes(s::String) writer(x...) = for w=x; write(w); end _jl_interp_parse(s, unescape_string, writer) end ## core string macros ## macro str(s); _jl_interp_parse(s); end macro S_str(s); _jl_interp_parse(s); end macro I_str(s); _jl_interp_parse(s, x->unescape_chars(x,"\"")); end macro E_str(s); check_utf8(unescape_string(s)); end macro B_str(s); _jl_interp_parse_bytes(s); end macro b_str(s); ex = _jl_interp_parse_bytes(s); :(($ex).data); end ## shell-like command parsing ## function _jl_shell_parse(s::String, interp::Bool) in_single_quotes = false in_double_quotes = false args = {} arg = {} i = start(s) j = i function update_arg(x) if !isa(x,String) || !isempty(x) push(arg, x) end end function append_arg() if isempty(arg); arg = {"",}; end push(args, arg) arg = {} end while !done(s,j) c, k = next(s,j) if !in_single_quotes && !in_double_quotes && iswspace(c) update_arg(s[i:j-1]) append_arg() j = k while !done(s,j) c, k = next(s,j) if !iswspace(c) i = j break end j = k end elseif interp && !in_single_quotes && c == '$' update_arg(s[i:j-1]); i = k j = k ex, j = parseatom(s,j) update_arg(ex); i = j else if !in_double_quotes && c == '\'' in_single_quotes = !in_single_quotes update_arg(s[i:j-1]); i = k elseif !in_single_quotes && c == '"' in_double_quotes = !in_double_quotes update_arg(s[i:j-1]); i = k elseif c == '\\' if in_double_quotes # TODO: handle \$ in double quotes if done(s,k) error("unterminated double quote") end if s[k] == '"' update_arg(s[i:j-1]); i = k c, k = next(s,k) end elseif !in_single_quotes if done(s,k) error("dangling backslash") end update_arg(s[i:j-1]); i = k c, k = next(s,k) end end j = k end end if in_single_quotes; error("unterminated single quote"); end if in_double_quotes; error("unterminated double quote"); end update_arg(s[i:]) append_arg() if !interp return args end # construct an expression exprs = {} for arg = args push(exprs, expr(:tuple, arg)) end expr(:tuple,exprs) end _jl_shell_parse(s::String) = _jl_shell_parse(s,true) function shell_split(s::String) parsed = _jl_shell_parse(s,false) args = empty(String) for arg = parsed push(args, strcat(arg...)) end args end function print_shell_word(word::String) if isempty(word) print("''") end has_single = false has_special = false for c = word if iswspace(c) || c=='\\' || c=='\'' || c=='"' || c=='$' has_special = true if c == '\'' has_single = true end end end if !has_special print(word) elseif !has_single print('\'', word, '\'') else print('"') for c = word if c == '"' || c == '$' print('\\') end print(c) end print('"') end end function print_shell_escaped(cmd::String, args::String...) print_shell_word(cmd) for arg = args print(' ') print_shell_word(arg) end end shell_escape(cmd::String, args::String...) = print_to_string(print_shell_escaped, cmd, args...) ## interface to parser ## function parse(s::String, pos, greedy) # returns (expr, end_pos). expr is () in case of parse error. ex, pos = ccall(:jl_parse_string, Any, (Ptr{Uint8},Int32,Int32), cstring(s), int32(pos-1), int32(greedy ? 1:0)) if isa(ex,Expr) && is(ex.head,:error) throw(ParseError(ex.args[1])) end if ex == (); throw(ParseError("end of input")); end ex, pos+1 # C is zero-based, Julia is 1-based end parse(s::String) = parse(s, 1, true) parse(s::String, pos) = parse(s, pos, true) parseatom(s::String) = parse(s, 1, false) parseatom(s::String, pos) = parse(s, pos, false) ## miscellaneous string functions ## function lpad(s::String, n::Int, p::String) m = n - strlen(s) if m <= 0; return s; end l = strlen(p) q = div(m,l) r = m - q*l p^q*p[1:chr2ind(p,r)]*s end function rpad(s::String, n::Int, p::String) m = n - strlen(s) if m <= 0; return s; end l = strlen(p) q = div(m,l) r = m - q*l s*p^q*p[1:chr2ind(p,r)] end lpad(s, n::Int, p) = lpad(string(s), n, string(p)) rpad(s, n::Int, p) = rpad(string(s), n, string(p)) lpad(s, n::Int) = lpad(string(s), n, " ") rpad(s, n::Int) = rpad(string(s), n, " ") function split(s::String, delims, include_empty::Bool) i = 1 strs = empty(String) len = length(s) while true tokstart = tokend = i while !done(s,i) (c,i) = next(s,i) if contains(delims, c) break end tokend = i end tok = s[tokstart:(tokend-1)] if include_empty || !isempty(tok) push(strs, tok) end if !((i <= len) || (i==len+1 && tokend!=i)) break end end strs end split(s::String, x) = split(s, x, true) split(s::String, x::Char, incl::Bool) = split(s, (x,), incl) function print_joined(strings, delim) i = start(strings) while !done(strings,i) str, i = next(strings,i) print(str) if !done(strings,i) print(delim) end end end function print_joined(strings, delim, last) i = start(strings) if done(strings,i) return end str, i = next(strings,i) print(str) while !done(strings,i) str, i = next(strings,i) print(done(strings,i) ? last : delim) print(str) end end join(strings, delim) = print_to_string(print_joined, strings, delim) join(strings, delim, last) = print_to_string(print_joined, strings, delim, last) chop(s::String) = s[1:thisind(s,length(s))-1] chomp(s::String) = (i=thisind(s,length(s)); s[i]=='\n' ? s[1:i-1] : s) chomp(s::ByteString) = s.data[end]==0x0a ? s[1:end-1] : s ## string to integer functions ## function parse_int{T<:Int}(::Type{T}, s::String, base::Int) n::T = 0 base = convert(T,base) for c = s d = '0' <= c <= '9' ? c-'0' : 'A' <= c <= 'Z' ? c-'A'+int32(10) : 'a' <= c <= 'z' ? c-'a'+int32(10) : error(c, " is not an alphanumeric digit") if base <= d error(c, " is not a valid digit in base ", base) end n = n*base + d end return n end parse_bin(s::String) = parse_int(Int64, s, 2) parse_oct(s::String) = parse_int(Int64, s, 8) parse_dec(s::String) = parse_int(Int64, s, 10) parse_hex(s::String) = parse_int(Int64, s, 16) int (s::String) = parse_dec(s) int8 (s::String) = int8(int(s)) uint8 (s::String) = uint8(int(s)) int16 (s::String) = int16(int(s)) uint16(s::String) = uint16(int(s)) int32 (s::String) = int32(int(s)) uint32(s::String) = uint32(int(s)) int64 (s::String) = int64(int(s)) uint64(s::String) = parse_int(Uint64, s, 10) ## integer to string functions ## function ndigits(n::Int, b::Int) nd = 1 ba = convert(typeof(n), b) while true n = div(n, ba) if n == 0 break end nd += 1 end return nd end function int2str(n::Int, b::Int) if b < 2 || b > 40; error("int2str: invalid base ", b); end neg = n0 data[1] = '-' end ASCIIString(data) end int2str(n::Int, b::Int, len::Int) = lpad(int2str(n,b),len,'0') # TODO: support signed Ints too bin(n::Int) = int2str(n, 2) oct(n::Int) = int2str(n, 8) dec(n::Int) = int2str(n, 10) hex(n::Int) = int2str(n, 16) bin(n::Int, l::Int) = lpad(bin(n), l, '0') oct(n::Int, l::Int) = lpad(oct(n), l, '0') dec(n::Int, l::Int) = lpad(dec(n), l, '0') hex(n::Int, l::Int) = lpad(hex(n), l, '0') ## string to float functions ## function float64_isvalid(s::String, out::Array{Float64,1}) s = cstring(s) return (ccall(:jl_strtod, Int32, (Ptr{Uint8},Ptr{Float64}), s, out)==0) end function float32_isvalid(s::String, out::Array{Float32,1}) s = cstring(s) return (ccall(:jl_strtof, Int32, (Ptr{Uint8},Ptr{Float32}), s, out)==0) end begin local tmp::Array{Float64,1} = Array(Float64,1) local tmpf::Array{Float32,1} = Array(Float32,1) global float64, float32 function float64(s::String) if !float64_isvalid(s, tmp) throw(ArgumentError("float64(String): invalid number format")) end return tmp[1] end function float32(s::String) if !float32_isvalid(s, tmpf) throw(ArgumentError("float32(String): invalid number format")) end return tmpf[1] end end float(x::String) = float64(x) # copying a byte string (generally not needed due to "immutability") strcpy{T<:ByteString}(s::T) = T(copy(s.data)) # lexicographically compare byte arrays (used by Latin-1 and UTF-8) function lexcmp(a::Array{Uint8,1}, b::Array{Uint8,1}) c = ccall(dlsym(libc, :memcmp), Int32, (Ptr{Uint8}, Ptr{Uint8}, Ulong), a, b, ulong(min(length(a),length(b)))) c < 0 ? -1 : c > 0 ? +1 : cmp(length(a),length(b)) end # find the index of a byte in a byte array function memchr(a::Array{Uint8,1}, b::Int) p = pointer(a) q = ccall(dlsym(libc, :memchr), Ptr{Uint8}, (Ptr{Uint8}, Int32, Ulong), p, int32(b), ulong(length(a))) q == C_NULL ? 0 : q - p + 1 end # concatenate byte arrays into a single array memcat() = Array(Uint8,0) memcat(a::Array{Uint8,1}) = copy(a) function memcat(arrays::Array{Uint8,1}...) n = 0 for a = arrays n += length(a) end arr = Array(Uint8, n) ptr = pointer(arr) offset = 0 for a = arrays ccall(dlsym(libc, :memcpy), Ptr{Uint8}, (Ptr{Uint8}, Ptr{Uint8}, Ulong), ptr + offset, pointer(a), ulong(length(a))) offset += length(a) end return arr end # concatenate the data fields of byte strings memcat(s::ByteString) = memcat(s.data) function memcat(strs::ByteString...) n = 0 for s = strs n += length(s) end data = Array(Uint8, n) ptr = pointer(data) offset = 0 for s = strs ccall(dlsym(libc, :memcpy), Ptr{Uint8}, (Ptr{Uint8}, Ptr{Uint8}, Ulong), ptr + offset, pointer(s.data), ulong(length(s))) offset += length(s) end data end