Skip to content

Commit

Permalink
Fix two C++ wrapper bugs, unnoticed for years.
Browse files Browse the repository at this point in the history
git-svn-id: svn:https://vcs.exim.org/pcre/code/trunk@1735 2f5784b3-3f2a-0410-8824-cb99058d5e15
  • Loading branch information
ph10 committed Jun 26, 2018
1 parent 9f6ffa7 commit 2ede5a4
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 10 deletions.
18 changes: 18 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@ Note that the PCRE 8.xx series (PCRE1) is now in a bugfix-only state. All
development is happening in the PCRE2 10.xx series.


Version 8.43 25-June-2018
-------------------------

1. Some time ago the config macro SUPPORT_UTF8 was changed to SUPPORT_UTF
because it also applies to UTF-16 and UTF-32. However, this change was not made
in the pcrecpp files; consequently the C++ wrapper has since then been compiled
with a bug in it, which would have been picked up by the unit test except that
it also had its UTF8 code cut out. The bug was in a global replace when moving
forward after matching an empty string.

2. The C++ wrapper got broken a long time ago (version 7.3, August 2007) when
(*CR) was invented (assuming it was the first such start-of-pattern option).
The wrapper could never handle such patterns because it wraps patterns in
(?:...)\z in order to support end anchoring. I have hacked in some code to fix
this, that is, move the wrapping till after any existing start-of-pattern
special settings.


Version 8.42 20-March-2018
--------------------------

Expand Down
6 changes: 3 additions & 3 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ dnl The PCRE_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be empty.

m4_define(pcre_major, [8])
m4_define(pcre_minor, [42])
m4_define(pcre_prerelease, [])
m4_define(pcre_date, [2018-03-20])
m4_define(pcre_minor, [43])
m4_define(pcre_prerelease, [-RC1])
m4_define(pcre_date, [2018-06-25])

# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.
Expand Down
64 changes: 62 additions & 2 deletions pcrecpp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,24 @@ static const string empty_string;
// If the user doesn't ask for any options, we just use this one
static RE_Options default_options;

// Option settings that may appear at the very start of a pattern. RE::Compile
// scans this table to move such items in front of the (?:...)\z wrapper that
// it adds for end anchoring. (PH June 2018)
//
// Rules the lookup loop in RE::Compile relies on:
//  - Entries are listed in descending byte order. The loop compares the
//    pattern against each entry with strncmp (over the entry's length) and
//    stops at the first result that is >= 0, so an entry placed out of order
//    can cause a later, genuine match to be missed.
//  - An entry ending in "=" is followed in the pattern by decimal digits and
//    a closing ")".
//  - The empty string marks the end of the table and must stay last.
//
// Both the strings and the pointers are const: the table is read-only data.
static const char* const start_options[] = {
  "(*UTF8)",
  "(*UTF)",
  "(*UCP)",
  "(*NO_START_OPT)",
  "(*NO_AUTO_POSSESS)",
  "(*LIMIT_RECURSION=",
  "(*LIMIT_MATCH=",
  "(*CRLF)",
  "(*CR)",
  "(*BSR_UNICODE)",
  "(*BSR_ANYCRLF)",
  "(*ANYCRLF)",
  "(*ANY)",
  "" };

void RE::Init(const string& pat, const RE_Options* options) {
pattern_ = pat;
if (options == NULL) {
Expand Down Expand Up @@ -135,7 +153,49 @@ pcre* RE::Compile(Anchor anchor) {
} else {
// Tack a '\z' at the end of RE. Parenthesize it first so that
// the '\z' applies to all top-level alternatives in the regexp.
string wrapped = "(?:"; // A non-counting grouping operator

/* When this code was written (for PCRE 6.0) it was enough just to
parenthesize the entire pattern. Unfortunately, when the feature of
starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
this code was never updated. This bug was not noticed till 2018, long after
PCRE became obsolescent and its maintainer no longer around. Since PCRE is
frozen, I have added a hack to check for all the existing "start of
pattern" specials - knowing that no new ones will ever be added. I am not a
C++ programmer, so the code style is no doubt crude. It is also
inefficient, but is only run when the pattern starts with "(*".
PH June 2018. */

string wrapped = "";

if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
int kk, klen, kmat;
for (;;) { // Loop for any number of leading items

for (kk = 0; start_options[kk][0] != 0; kk++) {
klen = strlen(start_options[kk]);
kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
if (kmat >= 0) break;
}
if (kmat != 0) break; // Not found

// If the item ended in "=" we must copy digits up to ")".

if (start_options[kk][klen-1] == '=') {
while (isdigit(pattern_.c_str()[klen])) klen++;
if (pattern_.c_str()[klen] != ')') break; // Syntax error
klen++;
}

// Move the item from the pattern to the start of the wrapped string.

wrapped += pattern_.substr(0, klen);
pattern_.erase(0, klen);
}
}

// Wrap the rest of the pattern.

wrapped += "(?:"; // A non-counting grouping operator
wrapped += pattern_;
wrapped += ")\\z";
re = pcre_compile(wrapped.c_str(), pcre_options,
Expand Down Expand Up @@ -415,7 +475,7 @@ int RE::GlobalReplace(const StringPiece& rewrite,
matchend++;
}
// We also need to advance more than one char if we're in utf8 mode.
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UTF
if (options_.utf8()) {
while (matchend < static_cast<int>(str->length()) &&
((*str)[matchend] & 0xc0) == 0x80)
Expand Down
28 changes: 23 additions & 5 deletions pcrecpp_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ static void TestReplace() {
"@aa",
"@@@",
3 },
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UTF
{ "b*",
"bb",
"\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", // utf8
Expand All @@ -327,7 +327,7 @@ static void TestReplace() {
{ "", NULL, NULL, NULL, NULL, 0 }
};

#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UTF
const bool support_utf8 = true;
#else
const bool support_utf8 = false;
Expand Down Expand Up @@ -535,7 +535,7 @@ static void TestQuoteMetaLatin1() {
}

static void TestQuoteMetaUtf8() {
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UTF
TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8());
TestQuoteMeta("xyz", pcrecpp::UTF8()); // No fancy utf8
TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8()); // 2-byte utf8 (degree symbol)
Expand Down Expand Up @@ -1178,7 +1178,7 @@ int main(int argc, char** argv) {
CHECK(re.error().empty()); // Must have no error
}

#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UTF
// Check UTF-8 handling
{
printf("Testing UTF-8 handling\n");
Expand All @@ -1202,6 +1202,24 @@ int main(int argc, char** argv) {
CHECK(re_test1.FullMatch(utf8_string));
RE re_test2("...", pcrecpp::UTF8());
CHECK(re_test2.FullMatch(utf8_string));

// PH added these tests for leading option settings

RE re_testZ1("(*UTF8)...");
CHECK(re_testZ1.FullMatch(utf8_string));

RE re_testZ2("(*UTF)...");
CHECK(re_testZ2.FullMatch(utf8_string));

RE re_testZ3("(*UCP)(*UTF)...");
CHECK(re_testZ3.FullMatch(utf8_string));

RE re_testZ4("(*UCP)(*LIMIT_MATCH=1000)(*UTF)...");
CHECK(re_testZ4.FullMatch(utf8_string));

RE re_testZ5("(*UCP)(*LIMIT_MATCH=1000)(*ANY)(*UTF)...");
CHECK(re_testZ5.FullMatch(utf8_string));


// Check that '.' matches one byte or UTF-8 character
// according to the mode.
Expand Down Expand Up @@ -1248,7 +1266,7 @@ int main(int argc, char** argv) {
CHECK(!match_sentence.FullMatch(target));
CHECK(!match_sentence_re.FullMatch(target));
}
#endif /* def SUPPORT_UTF8 */
#endif /* def SUPPORT_UTF */

printf("Testing error reporting\n");

Expand Down

0 comments on commit 2ede5a4

Please sign in to comment.