Fix two C++ wrapper bugs, unnoticed for years.

git-svn-id: svn:https://vcs.exim.org/pcre/code/trunk@1735 2f5784b3-3f2a-0410-8824-cb99058d5e15
svn2github · Jun 26, 2018 · 2ede5a4 · 2ede5a4
1 parent 9f6ffa7
commit 2ede5a4
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 10 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -5,6 +5,24 @@ Note that the PCRE 8.xx series (PCRE1) is now in a bugfix-only state. All
 development is happening in the PCRE2 10.xx series.
 
 
+Version 8.43 25-June-2018
+-------------------------
+
+1. Some time ago the config macro SUPPORT_UTF8 was changed to SUPPORT_UTF 
+because it also applies to UTF-16 and UTF-32. However, this change was not made 
+in the pcrecpp files; consequently the C++ wrapper has since then been compiled 
+with a bug in it, which would have been picked up by the unit test except that 
+it also had its UTF8 code cut out. The bug was in a global replace when moving 
+forward after matching an empty string.
+
+2. The C++ wrapper got broken a long time ago (version 7.3, August 2007) when 
+(*CR) was invented (assuming it was the first such start-of-pattern option). 
+The wrapper could never handle such patterns because it wraps patterns in 
+(?:...)\z in order to support end anchoring. I have hacked in some code to fix 
+this: any leading start-of-pattern special settings are now copied ahead of 
+the wrapping, so that they remain at the very start of the compiled pattern.
+
+
 Version 8.42 20-March-2018
 --------------------------
 

diff --git a/configure.ac b/configure.ac
@@ -9,9 +9,9 @@ dnl The PCRE_PRERELEASE feature is for identifying release candidates. It might
 dnl be defined as -RC2, for example. For real releases, it should be empty.
 
 m4_define(pcre_major, [8])
-m4_define(pcre_minor, [42])
-m4_define(pcre_prerelease, [])
-m4_define(pcre_date, [2018-03-20])
+m4_define(pcre_minor, [43])
+m4_define(pcre_prerelease, [-RC1])
+m4_define(pcre_date, [2018-06-25])
 
 # NOTE: The CMakeLists.txt file searches for the above variables in the first
 # 50 lines of this file. Please update that if the variables above are moved.

diff --git a/pcrecpp.cc b/pcrecpp.cc
@@ -80,6 +80,24 @@ static const string empty_string;
 // If the user doesn't ask for any options, we just use this one
 static RE_Options default_options;
 
+// Specials for the start of patterns. See comments where start_options is used
+// below in RE::Compile(), which scans this table in order. (PH June 2018)
+static const char *start_options[] = {
+ "(*UTF8)",             // Entries are in descending strncmp() order, and must
+ "(*UTF)",              // stay so: the scan stops at the first entry that does
+ "(*UCP)",              // not compare greater than the start of the pattern.
+ "(*NO_START_OPT)",
+ "(*NO_AUTO_POSSESS)",
+ "(*LIMIT_RECURSION=",  // Ends in "=": digits and ")" follow in the pattern.
+ "(*LIMIT_MATCH=",      // Ends in "=": digits and ")" follow in the pattern.
+ "(*CRLF)",
+ "(*CR)",
+ "(*BSR_UNICODE)",
+ "(*BSR_ANYCRLF)",
+ "(*ANYCRLF)",
+ "(*ANY)",
+ "" };                  // Empty string terminates the table.
+
 void RE::Init(const string& pat, const RE_Options* options) {
  pattern_ = pat;
  if (options == NULL) {
@@ -135,7 +153,49 @@ pcre* RE::Compile(Anchor anchor) {
  } else {
  // Tack a '\z' at the end of RE. Parenthesize it first so that
  // the '\z' applies to all top-level alternatives in the regexp.
- string wrapped = "(?:"; // A non-counting grouping operator
+
+ /* When this code was written (for PCRE 6.0) it was enough just to
+ parenthesize the entire pattern. Unfortunately, when the feature of
+ starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
+ this code was never updated. This bug was not noticed till 2018, long after
+ PCRE became obsolescent and its maintainer no longer around. Since PCRE is
+ frozen, I have added a hack to check for all the existing "start of
+ pattern" specials - knowing that no new ones will ever be added. I am not a
+ C++ programmer, so the code style is no doubt crude. It is also
+ inefficient, but is only run when the pattern starts with "(*".
+ PH June 2018. */
+
+ string wrapped = "";
+
+ if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
+ int kk, klen, kmat;
+ for (;;) { // Loop for any number of leading items
+
+ for (kk = 0; start_options[kk][0] != 0; kk++) {
+ klen = strlen(start_options[kk]);
+ kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
+ if (kmat >= 0) break;
+ }
+ if (kmat != 0) break; // Not found
+
+ // If the item ended in "=" we must copy digits up to ")".
+
+ if (start_options[kk][klen-1] == '=') {
+ while (isdigit(pattern_.c_str()[klen])) klen++;
+ if (pattern_.c_str()[klen] != ')') break; // Syntax error
+ klen++;
+ }
+
+ // Move the item from the pattern to the start of the wrapped string.
+
+ wrapped += pattern_.substr(0, klen);
+ pattern_.erase(0, klen);
+ }
+ }
+
+ // Wrap the rest of the pattern.
+
+ wrapped += "(?:"; // A non-counting grouping operator
  wrapped += pattern_;
  wrapped += ")\\z";
  re = pcre_compile(wrapped.c_str(), pcre_options,
@@ -415,7 +475,7 @@ int RE::GlobalReplace(const StringPiece& rewrite,
  matchend++;
  }
  // We also need to advance more than one char if we're in utf8 mode.
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
  if (options_.utf8()) {
  while (matchend < static_cast<int>(str->length()) &&
  ((*str)[matchend] & 0xc0) == 0x80)

diff --git a/pcrecpp_unittest.cc b/pcrecpp_unittest.cc
@@ -309,7 +309,7 @@ static void TestReplace() {
  "@aa",
  "@@@",
  3 },
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
  { "b*",
  "bb",
  "\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", // utf8
@@ -327,7 +327,7 @@ static void TestReplace() {
  { "", NULL, NULL, NULL, NULL, 0 }
  };
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
  const bool support_utf8 = true;
 #else
  const bool support_utf8 = false;
@@ -535,7 +535,7 @@ static void TestQuoteMetaLatin1() {
 }
 
 static void TestQuoteMetaUtf8() {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
  TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8());
  TestQuoteMeta("xyz", pcrecpp::UTF8()); // No fancy utf8
  TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8()); // 2-byte utf8 (degree symbol)
@@ -1178,7 +1178,7 @@ int main(int argc, char** argv) {
  CHECK(re.error().empty()); // Must have no error
  }
 
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
  // Check UTF-8 handling
  {
  printf("Testing UTF-8 handling\n");
@@ -1202,6 +1202,24 @@ int main(int argc, char** argv) {
  CHECK(re_test1.FullMatch(utf8_string));
  RE re_test2("...", pcrecpp::UTF8());
  CHECK(re_test2.FullMatch(utf8_string));
+
+ // PH added these tests for leading option settings
+
+ RE re_testZ1("(*UTF8)...");
+ CHECK(re_testZ1.FullMatch(utf8_string));
+
+ RE re_testZ2("(*UTF)...");
+ CHECK(re_testZ2.FullMatch(utf8_string));
+
+ RE re_testZ3("(*UCP)(*UTF)...");
+ CHECK(re_testZ3.FullMatch(utf8_string));
+
+ RE re_testZ4("(*UCP)(*LIMIT_MATCH=1000)(*UTF)...");
+ CHECK(re_testZ4.FullMatch(utf8_string));
+
+ RE re_testZ5("(*UCP)(*LIMIT_MATCH=1000)(*ANY)(*UTF)...");
+ CHECK(re_testZ5.FullMatch(utf8_string));
+
 
  // Check that '.' matches one byte or UTF-8 character
  // according to the mode.
@@ -1248,7 +1266,7 @@ int main(int argc, char** argv) {
  CHECK(!match_sentence.FullMatch(target));
  CHECK(!match_sentence_re.FullMatch(target));
  }
-#endif /* def SUPPORT_UTF8 */
+#endif /* def SUPPORT_UTF */
 
  printf("Testing error reporting\n");