From 14a02da08df84807e2c674b0736527c80d7997d2 Mon Sep 17 00:00:00 2001 From: Jon Malmaud Date: Wed, 3 Jun 2015 15:11:37 -0600 Subject: [PATCH 1/4] define getindex on regex matches to return captures. --- base/pcre.jl | 19 +++++++++++++++++++ base/regex.jl | 23 ++++++++++++++++++++--- test/regex.jl | 4 ++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/base/pcre.jl b/base/pcre.jl index 39a9e8da693f6..edf4530cb52de 100644 --- a/base/pcre.jl +++ b/base/pcre.jl @@ -140,4 +140,23 @@ function substring_number_from_name(re, name) (Ptr{Void}, Cstring), re, name) end +function capture_names(re) + name_count = info(re, INFO_NAMECOUNT, UInt32) + name_entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32) + nametable_ptr = info(re, INFO_NAMETABLE, Ptr{UInt8}) + names = Dict{Int, ASCIIString}() + for i=1:name_count + offset = (i-1)*name_entry_size + 1 + # The capture group index corresponding to name 'i' is stored as a + # big-endian 16-bit value. + high_byte = UInt16(unsafe_load(nametable_ptr, offset)) + low_byte = UInt16(unsafe_load(nametable_ptr, offset+1)) + idx = (high_byte << 8) | low_byte + # The capture group name is a null-terminated string located directly + # after the index. 
+ names[idx] = bytestring(nametable_ptr+offset+1) + end + names +end + end # module diff --git a/base/regex.jl b/base/regex.jl index d6b229f922494..d5a7bc859737c 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -15,6 +15,8 @@ type Regex extra::Ptr{Void} ovec::Vector{Csize_t} match_data::Ptr{Void} + capture_name_to_idx::Dict{ASCIIString, Int} + idx_to_capture_name::Dict{Int, ASCIIString} function Regex(pattern::AbstractString, compile_options::Integer, @@ -29,7 +31,8 @@ type Regex throw(ArgumentError("invalid regex match options: $match_options")) end re = compile(new(pattern, compile_options, match_options, C_NULL, - C_NULL, Csize_t[], C_NULL)) + C_NULL, Csize_t[], C_NULL, + Dict{ASCIIString, Int}(), Dict{Int, ASCIIString}())) finalizer(re, re->begin re.regex == C_NULL || PCRE.free_re(re.regex) re.match_data == C_NULL || PCRE.free_match_data(re.match_data) @@ -57,6 +60,10 @@ function compile(regex::Regex) PCRE.jit_compile(regex.regex) regex.match_data = PCRE.create_match_data(regex.regex) regex.ovec = PCRE.get_ovec(regex.match_data) + regex.idx_to_capture_name = PCRE.capture_names(regex.regex) + for (i, name) in regex.idx_to_capture_name + regex.capture_name_to_idx[name] = i + end end regex end @@ -92,6 +99,7 @@ immutable RegexMatch captures::Vector{Union(Void,SubString{UTF8String})} offset::Int offsets::Vector{Int} + regex::Regex end function show(io::IO, m::RegexMatch) @@ -100,7 +108,10 @@ function show(io::IO, m::RegexMatch) if !isempty(m.captures) print(io, ", ") for i = 1:length(m.captures) - print(io, i, "=") + # If the capture group is named, show the name. + # Otherwise show its index. 
+ capture_name = get(m.regex.idx_to_capture_name, i, i) + print(io, capture_name, "=") show(io, m.captures[i]) if i < length(m.captures) print(io, ", ") @@ -110,6 +121,12 @@ function show(io::IO, m::RegexMatch) print(io, ")") end +# Capture group extraction +getindex(m::RegexMatch, idx::Int) = m.captures[idx] +function getindex(m::RegexMatch, name::AbstractString) + m[m.regex.capture_name_to_idx[name]] +end + function ismatch(r::Regex, s::AbstractString, offset::Integer=0) compile(r) return PCRE.exec(r.regex, bytestring(s), offset, r.match_options, @@ -136,7 +153,7 @@ function match(re::Regex, str::UTF8String, idx::Integer, add_opts::UInt32=UInt32 cap = Union(Void,SubString{UTF8String})[ ovec[2i+1] == PCRE.UNSET ? nothing : SubString(str, ovec[2i+1]+1, ovec[2i+2]) for i=1:n ] off = Int[ ovec[2i+1]+1 for i=1:n ] - RegexMatch(mat, cap, ovec[1]+1, off) + RegexMatch(mat, cap, ovec[1]+1, off, re) end match(re::Regex, str::Union(ByteString,SubString), idx::Integer, add_opts::UInt32=UInt32(0)) = diff --git a/test/regex.jl b/test/regex.jl index 938abd8df70c6..fed738270dae9 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -37,3 +37,7 @@ show(buf, r"") # regex match / search string must be a ByteString @test_throws ArgumentError match(r"test", utf32("this is a test")) @test_throws ArgumentError search(utf32("this is a test"), r"test") + +# Named subpatterns +m = match(r"(?<a>.)(.)(?<b>.)", "xyz") +@test (m["a"], m[2], m["b"]) == ("x", "y", "z") From 0479e7a9a8ba612ce8c141e0a777b6eece09e3ed Mon Sep 17 00:00:00 2001 From: Jon Malmaud Date: Wed, 3 Jun 2015 16:16:06 -0600 Subject: [PATCH 2/4] Store capture names as symbols instead of strings --- base/regex.jl | 15 ++++++++------- test/regex.jl | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index d5a7bc859737c..150d7abd912b7 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -15,8 +15,8 @@ type Regex extra::Ptr{Void} ovec::Vector{Csize_t} match_data::Ptr{Void} -
capture_name_to_idx::Dict{ASCIIString, Int} - idx_to_capture_name::Dict{Int, ASCIIString} + capture_name_to_idx::Dict{Symbol, Int} + idx_to_capture_name::Dict{Int, Symbol} function Regex(pattern::AbstractString, compile_options::Integer, @@ -32,7 +32,7 @@ type Regex end re = compile(new(pattern, compile_options, match_options, C_NULL, C_NULL, Csize_t[], C_NULL, - Dict{ASCIIString, Int}(), Dict{Int, ASCIIString}())) + Dict{Symbol, Int}(), Dict{Int, Symbol}())) finalizer(re, re->begin re.regex == C_NULL || PCRE.free_re(re.regex) re.match_data == C_NULL || PCRE.free_match_data(re.match_data) @@ -60,9 +60,9 @@ function compile(regex::Regex) PCRE.jit_compile(regex.regex) regex.match_data = PCRE.create_match_data(regex.regex) regex.ovec = PCRE.get_ovec(regex.match_data) - regex.idx_to_capture_name = PCRE.capture_names(regex.regex) - for (i, name) in regex.idx_to_capture_name - regex.capture_name_to_idx[name] = i + for (idx, name) in PCRE.capture_names(regex.regex) + regex.capture_name_to_idx[Symbol(name)] = idx + regex.idx_to_capture_name[idx] = Symbol(name) end end regex @@ -123,9 +123,10 @@ end # Capture group extraction getindex(m::RegexMatch, idx::Int) = m.captures[idx] -function getindex(m::RegexMatch, name::AbstractString) +function getindex(m::RegexMatch, name::Symbol) m[m.regex.capture_name_to_idx[name]] end +getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)] function ismatch(r::Regex, s::AbstractString, offset::Integer=0) compile(r) diff --git a/test/regex.jl b/test/regex.jl index fed738270dae9..e76776b9692ae 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -40,4 +40,4 @@ show(buf, r"") # Named subpatterns m = match(r"(?<a>.)(.)(?<b>.)", "xyz") -@test (m["a"], m[2], m["b"]) == ("x", "y", "z") +@test (m[:a], m[2], m["b"]) == ("x", "y", "z") From 1b0127cd7af78b5e9a769a667bb17739c19641fd Mon Sep 17 00:00:00 2001 From: Jon Malmaud Date: Wed, 24 Jun 2015 10:54:21 -0400 Subject: [PATCH 3/4] Don't cache capture names in regex object --- base/regex.jl | 19
+++++++------------ test/regex.jl | 1 + 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index 150d7abd912b7..3e7e2cd93ac31 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -15,9 +15,6 @@ type Regex extra::Ptr{Void} ovec::Vector{Csize_t} match_data::Ptr{Void} - capture_name_to_idx::Dict{Symbol, Int} - idx_to_capture_name::Dict{Int, Symbol} - function Regex(pattern::AbstractString, compile_options::Integer, match_options::Integer) @@ -31,8 +28,7 @@ type Regex throw(ArgumentError("invalid regex match options: $match_options")) end re = compile(new(pattern, compile_options, match_options, C_NULL, - C_NULL, Csize_t[], C_NULL, - Dict{Symbol, Int}(), Dict{Int, Symbol}())) + C_NULL, Csize_t[], C_NULL)) finalizer(re, re->begin re.regex == C_NULL || PCRE.free_re(re.regex) re.match_data == C_NULL || PCRE.free_match_data(re.match_data) @@ -60,10 +56,6 @@ function compile(regex::Regex) PCRE.jit_compile(regex.regex) regex.match_data = PCRE.create_match_data(regex.regex) regex.ovec = PCRE.get_ovec(regex.match_data) - for (idx, name) in PCRE.capture_names(regex.regex) - regex.capture_name_to_idx[Symbol(name)] = idx - regex.idx_to_capture_name[idx] = Symbol(name) - end end regex end @@ -105,12 +97,13 @@ end function show(io::IO, m::RegexMatch) print(io, "RegexMatch(") show(io, m.match) + idx_to_capture_name = PCRE.capture_names(m.regex.regex) if !isempty(m.captures) print(io, ", ") for i = 1:length(m.captures) # If the capture group is named, show the name. # Otherwise show its index. 
- capture_name = get(m.regex.idx_to_capture_name, i, i) + capture_name = get(idx_to_capture_name, i, i) print(io, capture_name, "=") show(io, m.captures[i]) if i < length(m.captures) @@ -122,9 +115,11 @@ function show(io::IO, m::RegexMatch) end # Capture group extraction -getindex(m::RegexMatch, idx::Int) = m.captures[idx] +getindex(m::RegexMatch, idx::Integer) = m.captures[idx] function getindex(m::RegexMatch, name::Symbol) - m[m.regex.capture_name_to_idx[name]] + idx = PCRE.substring_number_from_name(m.regex.regex, name) + idx <= 0 && error("no capture group named $name found in regex") + m[idx] end getindex(m::RegexMatch, name::AbstractString) = m[Symbol(name)] diff --git a/test/regex.jl b/test/regex.jl index e76776b9692ae..aaf8eafa72a39 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -41,3 +41,4 @@ show(buf, r"") # Named subpatterns m = match(r"(?<a>.)(.)(?<b>.)", "xyz") @test (m[:a], m[2], m["b"]) == ("x", "y", "z") +@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")" From 1b8d47aec9b6d3faae2a31c2f119c11ca2892dc4 Mon Sep 17 00:00:00 2001 From: Jon Malmaud Date: Thu, 2 Jul 2015 15:34:49 -0400 Subject: [PATCH 4/4] Added manual section on accessing groups --- doc/manual/strings.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst index 5e24be59235ee..d43cefaeb1153 100644 --- a/doc/manual/strings.rst +++ b/doc/manual/strings.rst @@ -697,6 +697,16 @@ use destructuring syntax to bind them to local variables:: julia> first, second, third = m.captures; first "a" +Captures can also be accessed by indexing the :obj:`RegexMatch` object +with the number or name of the capture group:: + + julia> m=match(r"(?P<hour>\d+):(?P<minute>\d+)","12:45") + RegexMatch("12:45", hour="12", minute="45") + julia> m[:minute] + "45" + julia> m[2] + "45" + You can modify the behavior of regular expressions by some combination of the flags ``i``, ``m``, ``s``, and ``x`` after the closing double quote mark.
These flags have the same meaning as they do in Perl, as