diff --git a/base/pcre.jl b/base/pcre.jl
index 39a9e8da693f6..edf4530cb52de 100644
--- a/base/pcre.jl
+++ b/base/pcre.jl
@@ -140,4 +140,28 @@ function substring_number_from_name(re, name)
         (Ptr{Void}, Cstring), re, name)
 end
 
+# Return a Dict mapping capture group index => capture group name for the
+# entries of the name table that `info` reports for the compiled pattern
+# `re`. Groups without a name have no entry in the result.
+function capture_names(re)
+    name_count = info(re, INFO_NAMECOUNT, UInt32)
+    name_entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32)
+    nametable_ptr = info(re, INFO_NAMETABLE, Ptr{UInt8})
+    names = Dict{Int, ASCIIString}()
+    for i=1:name_count
+        # 1-based position of the first byte of entry 'i', which is the
+        # indexing convention used by unsafe_load below.
+        offset = (i-1)*name_entry_size + 1
+        # The capture group index corresponding to name 'i' is stored as a
+        # big-endian 16-bit value.
+        high_byte = UInt16(unsafe_load(nametable_ptr, offset))
+        low_byte = UInt16(unsafe_load(nametable_ptr, offset+1))
+        idx = (high_byte << 8) | low_byte
+        # The capture group name is a null-terminated string located directly
+        # after the index.
+        names[idx] = bytestring(nametable_ptr+offset+1)
+    end
+    names
+end
+
 end # module
diff --git a/base/regex.jl b/base/regex.jl
index e0762d868fc53..2699c563ec349 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -16,7 +16,6 @@ type Regex
     ovec::Vector{Csize_t}
     match_data::Ptr{Void}
 
-
     function Regex(pattern::AbstractString, compile_options::Integer,
                    match_options::Integer)
         pattern = bytestring(pattern)
@@ -92,15 +91,20 @@ immutable RegexMatch
     captures::Vector{Union{Void,SubString{UTF8String}}}
     offset::Int
     offsets::Vector{Int}
+    regex::Regex
 end
 
 function show(io::IO, m::RegexMatch)
     print(io, "RegexMatch(")
     show(io, m.match)
     if !isempty(m.captures)
+        idx_to_capture_name = PCRE.capture_names(m.regex.regex)
         print(io, ", ")
         for i = 1:length(m.captures)
-            print(io, i, "=")
+            # If the capture group is named, show the name.
+            # Otherwise show its index.
+            capture_name = get(idx_to_capture_name, i, i)
+            print(io, capture_name, "=")
             show(io, m.captures[i])
             if i < length(m.captures)
                 print(io, ", ")
@@ -110,6 +114,19 @@ function show(io::IO, m::RegexMatch)
     print(io, ")")
 end
 
+# Capture group extraction
+# Integer index: return the corresponding entry of `m.captures`.
+getindex(m::RegexMatch, idx::Integer) = m.captures[idx]
+# Symbol name: ask PCRE for the group number belonging to `name` and index by
+# it; an error is raised when the lookup does not yield a positive number.
+function getindex(m::RegexMatch, name::Symbol)
+    idx = PCRE.substring_number_from_name(m.regex.regex, name)
+    idx <= 0 && error("no capture group named $name found in regex")
+    m[idx]
+end
+# String name: forwards to the Symbol method.
+getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)]
+
 function ismatch(r::Regex, s::AbstractString, offset::Integer=0)
     compile(r)
     return PCRE.exec(r.regex, bytestring(s), offset, r.match_options,
@@ -136,7 +153,7 @@ function match(re::Regex, str::UTF8String, idx::Integer, add_opts::UInt32=UInt32
     cap = Union{Void,SubString{UTF8String}}[
             ovec[2i+1] == PCRE.UNSET ? nothing : SubString(str, ovec[2i+1]+1, ovec[2i+2]) for i=1:n ]
     off = Int[ ovec[2i+1]+1 for i=1:n ]
-    RegexMatch(mat, cap, ovec[1]+1, off)
+    RegexMatch(mat, cap, ovec[1]+1, off, re)
 end
 
 match(re::Regex, str::Union{ByteString,SubString}, idx::Integer, add_opts::UInt32=UInt32(0)) =
diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
index 91085005aee8c..b51eef87b0352 100644
--- a/doc/manual/strings.rst
+++ b/doc/manual/strings.rst
@@ -697,6 +697,16 @@ use destructuring syntax to bind them to local variables::
     julia> first, second, third = m.captures; first
     "a"
 
+Captures can also be accessed by indexing the :obj:`RegexMatch` object
+with the number or name of the capture group::
+
+    julia> m=match(r"(?P<hour>\d+):(?P<minute>\d+)","12:45")
+    RegexMatch("12:45", hour="12", minute="45")
+    julia> m[:minute]
+    "45"
+    julia> m[2]
+    "45"
+
 You can modify the behavior of regular expressions by some combination of
 the flags ``i``, ``m``, ``s``, and ``x`` after the closing double quote
 mark. These flags have the same meaning as they do in Perl, as
diff --git a/test/regex.jl b/test/regex.jl
index 938abd8df70c6..aaf8eafa72a39 100644
--- a/test/regex.jl
+++ b/test/regex.jl
@@ -37,3 +37,8 @@ show(buf, r"")
 # regex match / search string must be a ByteString
 @test_throws ArgumentError match(r"test", utf32("this is a test"))
 @test_throws ArgumentError search(utf32("this is a test"), r"test")
+
+# Named subpatterns
+m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
+@test (m[:a], m[2], m["b"]) == ("x", "y", "z")
+@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"