Skip to content

Commit

Permalink
Merge pull request JuliaLang#11566 from malmaud/regex_named_subpattern
Browse files Browse the repository at this point in the history
define getindex on regex matches to return captures.
  • Loading branch information
ivarne committed Jul 2, 2015
2 parents 8eaf177 + 1b8d47a commit 95bf20d
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 3 deletions.
19 changes: 19 additions & 0 deletions base/pcre.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,23 @@ function substring_number_from_name(re, name)
(Ptr{Void}, Cstring), re, name)
end

# Return a Dict mapping capture-group number => capture-group name for the
# named groups of the compiled pattern `re`, read out of PCRE's name table.
function capture_names(re)
    entry_count = info(re, INFO_NAMECOUNT, UInt32)
    entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32)
    table = info(re, INFO_NAMETABLE, Ptr{UInt8})
    group_names = Dict{Int, ASCIIString}()
    for entry=1:entry_count
        # 1-based position (as used by unsafe_load) of this entry's first byte.
        first_byte = (entry-1)*entry_size + 1
        # An entry begins with the group number, stored as two bytes with the
        # most significant byte first (big-endian 16-bit value).
        group = (UInt16(unsafe_load(table, first_byte)) << 8) |
                UInt16(unsafe_load(table, first_byte+1))
        # The null-terminated group name starts immediately after those two
        # bytes; `table+first_byte+1` is the 0-based address of its first char.
        group_names[group] = bytestring(table+first_byte+1)
    end
    return group_names
end

end # module
19 changes: 16 additions & 3 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ type Regex
ovec::Vector{Csize_t}
match_data::Ptr{Void}


function Regex(pattern::AbstractString, compile_options::Integer,
match_options::Integer)
pattern = bytestring(pattern)
Expand Down Expand Up @@ -92,15 +91,20 @@ immutable RegexMatch
captures::Vector{Union{Void,SubString{UTF8String}}}
offset::Int
offsets::Vector{Int}
regex::Regex
end

function show(io::IO, m::RegexMatch)
print(io, "RegexMatch(")
show(io, m.match)
idx_to_capture_name = PCRE.capture_names(m.regex.regex)
if !isempty(m.captures)
print(io, ", ")
for i = 1:length(m.captures)
print(io, i, "=")
# If the capture group is named, show the name.
# Otherwise show its index.
capture_name = get(idx_to_capture_name, i, i)
print(io, capture_name, "=")
show(io, m.captures[i])
if i < length(m.captures)
print(io, ", ")
Expand All @@ -110,6 +114,15 @@ function show(io::IO, m::RegexMatch)
print(io, ")")
end

# Capture group extraction.
# A RegexMatch can be indexed by capture-group number, or by capture-group
# name given either as a Symbol or as a string.

# Positional lookup: forwards directly to the `captures` vector.
function getindex(m::RegexMatch, idx::Integer)
    return m.captures[idx]
end

# Lookup by name: resolve the name to a group number via PCRE, then reuse the
# positional method. A non-positive group number is treated as "no such name".
function getindex(m::RegexMatch, name::Symbol)
    group = PCRE.substring_number_from_name(m.regex.regex, name)
    if group <= 0
        error("no capture group named $name found in regex")
    end
    return m[group]
end

# String names are converted to a Symbol and handled by the method above.
function getindex(m::RegexMatch, name::AbstractString)
    return m[Symbol(name)]
end

function ismatch(r::Regex, s::AbstractString, offset::Integer=0)
compile(r)
return PCRE.exec(r.regex, bytestring(s), offset, r.match_options,
Expand All @@ -136,7 +149,7 @@ function match(re::Regex, str::UTF8String, idx::Integer, add_opts::UInt32=UInt32
cap = Union{Void,SubString{UTF8String}}[
ovec[2i+1] == PCRE.UNSET ? nothing : SubString(str, ovec[2i+1]+1, ovec[2i+2]) for i=1:n ]
off = Int[ ovec[2i+1]+1 for i=1:n ]
RegexMatch(mat, cap, ovec[1]+1, off)
RegexMatch(mat, cap, ovec[1]+1, off, re)
end

match(re::Regex, str::Union{ByteString,SubString}, idx::Integer, add_opts::UInt32=UInt32(0)) =
Expand Down
10 changes: 10 additions & 0 deletions doc/manual/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,16 @@ use destructuring syntax to bind them to local variables::
julia> first, second, third = m.captures; first
"a"

Captures can also be accessed by indexing the :obj:`RegexMatch` object
with the number of the capture group, or with its name given as a symbol or a string::

julia> m=match(r"(?P<hour>\d+):(?P<minute>\d+)","12:45")
RegexMatch("12:45", hour="12", minute="45")
julia> m[:minute]
"45"
julia> m[2]
"45"

You can modify the behavior of regular expressions by some combination
of the flags ``i``, ``m``, ``s``, and ``x`` after the closing double
quote mark. These flags have the same meaning as they do in Perl, as
Expand Down
5 changes: 5 additions & 0 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@ show(buf, r"")
# regex match / search string must be a ByteString
@test_throws ArgumentError match(r"test", utf32("this is a test"))
@test_throws ArgumentError search(utf32("this is a test"), r"test")

# Named subpatterns: captures are reachable by Symbol, by integer index and by
# string name, and `show` labels named groups with their name and unnamed
# groups with their index.
m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
@test m[:a] == "x"
@test m[2] == "y"
@test m["b"] == "z"
@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"

0 comments on commit 95bf20d

Please sign in to comment.