Merge pull request #252 from LeszekSwirski/parse-error

Record parse failure reason and location
fastfloat · Aug 3, 2024 · 0e7a10a · 0e7a10a
2 parents 3838b00 + b6ce2c4
commit 0e7a10a
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 15 deletions.
diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
@@ -234,6 +234,25 @@ void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t
  }
 }
 
+enum class parse_error { // reason attached to a failed parse_number_string result
+ no_error, // default value of parsed_number_string_t::error
+ // [JSON-only] The minus sign must be followed by an integer.
+ missing_integer_after_sign,
+ // A sign must be followed by an integer or dot.
+ missing_integer_or_dot_after_sign,
+ // [JSON-only] The integer part must not have leading zeros.
+ leading_zeros_in_integer_part,
+ // [JSON-only] The integer part must have at least one digit.
+ no_digits_in_integer_part,
+ // [JSON-only] If there is a decimal point, there must be digits in the
+ // fractional part.
+ no_digits_in_fractional_part,
+ // The mantissa must have at least one digit.
+ no_digits_in_mantissa,
+ // Scientific notation requires an exponential part.
+ missing_exponential_part, // reported only when fixed notation is not also enabled
+};
+
 template <typename UC>
 struct parsed_number_string_t {
  int64_t exponent{0};
@@ -245,11 +264,22 @@ struct parsed_number_string_t {
  // contains the range of the significant digits
  span<const UC> integer{}; // non-nullable
  span<const UC> fraction{}; // nullable
+ parse_error error{parse_error::no_error};
 };
 
 using byte_span = span<const char>;
 using parsed_number_string = parsed_number_string_t<char>;
 
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+report_parse_error(UC const* p, parse_error error) { // builds the result returned on a parse failure
+ parsed_number_string_t<UC> answer; // all other fields keep their default member initializers
+ answer.valid = false; // marks the result as a rejected parse
+ answer.lastmatch = p; // location the caller associates with the failure
+ answer.error = error; // reason the input was rejected
+ return answer;
+}
+
 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
 template <typename UC>
@@ -269,15 +299,16 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
 #endif
  ++p;
  if (p == pend) {
- return answer;
+ return report_parse_error<UC>(
+ p, parse_error::missing_integer_or_dot_after_sign);
  }
  if (fmt & FASTFLOAT_JSONFMT) {
  if (!is_integer(*p)) { // a sign must be followed by an integer
- return answer;
+ return report_parse_error<UC>(p, parse_error::missing_integer_after_sign);
  } 
  } else {
  if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot
- return answer;
+ return report_parse_error<UC>(p, parse_error::missing_integer_or_dot_after_sign);
  }
  }
  }
@@ -297,8 +328,12 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
  answer.integer = span<const UC>(start_digits, size_t(digit_count));
  if (fmt & FASTFLOAT_JSONFMT) {
  // at least 1 digit in integer part, without leading zeros
- if (digit_count == 0 || (start_digits[0] == UC('0') && digit_count > 1)) {
- return answer;
+ if (digit_count == 0) {
+ return report_parse_error<UC>(p, parse_error::no_digits_in_integer_part);
+ }
+ if ((start_digits[0] == UC('0') && digit_count > 1)) {
+ return report_parse_error<UC>(start_digits,
+ parse_error::leading_zeros_in_integer_part);
  }
  }
 
@@ -323,11 +358,10 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
  if (fmt & FASTFLOAT_JSONFMT) {
  // at least 1 digit in fractional part
  if (has_decimal_point && exponent == 0) {
- return answer;
+ return report_parse_error<UC>(p, parse_error::no_digits_in_fractional_part);
  }
- } 
- else if (digit_count == 0) { // we must have encountered at least one integer!
- return answer;
+ } else if (digit_count == 0) { // we must have encountered at least one integer!
+ return report_parse_error<UC>(p, parse_error::no_digits_in_mantissa);
  }
  int64_t exp_number = 0; // explicit exponential part
  if ( ((fmt & chars_format::scientific) &&
@@ -350,8 +384,10 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
  }
  if ((p == pend) || !is_integer(*p)) {
  if(!(fmt & chars_format::fixed)) {
- // We are in error.
- return answer;
+ // The exponential part is invalid for scientific notation, so it must
+ // be a trailing token for fixed notation. However, fixed notation is
+ // disabled, so report a scientific notation error.
+ return report_parse_error<UC>(p, parse_error::missing_exponential_part);
  }
  // Otherwise, we will be ignoring the 'e'.
  p = location_of_e;
@@ -368,7 +404,9 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
  }
  } else {
  // If it is scientific and not fixed, we have to bail out.
- if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; }
+ if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) {
+ return report_parse_error<UC>(p, parse_error::missing_exponential_part);
+ }
  }
  answer.lastmatch = p;
  answer.valid = true;

diff --git a/tests/json_fmt.cpp b/tests/json_fmt.cpp
@@ -45,6 +45,15 @@ struct AcceptedValue {
  ExpectedResult expected;
 };
 
+struct RejectReason { // expected failure details for one rejected input
+ fast_float::parse_error error; // compared against the parse result's error field
+ intptr_t location_offset; // expected value of (lastmatch - start of input)
+};
+struct RejectedValue { // one test case that JSON-format parsing must reject
+ std::string input; // text handed to from_chars / parse_number_string
+ RejectReason reason; // error and location the parser is expected to report
+};
+
 int main() {
  const std::vector<AcceptedValue> accept{
  {"-0.2", {-0.2, ""}},
@@ -55,8 +64,18 @@ int main() {
  {"1e", {1., "e"}},
  {"1e+", {1., "e+"}},
  {"inf", {std::numeric_limits<double>::infinity(), ""}}};
- const std::vector<std::string> reject{"-.2", "00.02", "0.e+1", "00.e+1",
- ".25", "+0.25", "inf", "nan(snan)"};
+ const std::vector<RejectedValue> reject{
+ {"-.2", {fast_float::parse_error::missing_integer_after_sign, 1}},
+ {"00.02", {fast_float::parse_error::leading_zeros_in_integer_part, 0}},
+ {"0.e+1", {fast_float::parse_error::no_digits_in_fractional_part, 2}},
+ {"00.e+1", {fast_float::parse_error::leading_zeros_in_integer_part, 0}},
+ {".25", {fast_float::parse_error::no_digits_in_integer_part, 0}},
+ // The following cases already start as invalid JSON, so they are
+ // handled as trailing junk and the error is for not having digits in the
+ // empty string before the invalid token.
+ {"+0.25", {fast_float::parse_error::no_digits_in_integer_part, 0}},
+ {"inf", {fast_float::parse_error::no_digits_in_integer_part, 0}},
+ {"nan(snan)", {fast_float::parse_error::no_digits_in_integer_part, 0}}};
 
  for (std::size_t i = 0; i < accept.size(); ++i)
  {
@@ -80,7 +99,7 @@ int main() {
 
  for (std::size_t i = 0; i < reject.size(); ++i)
  {
- const auto& s = reject[i];
+ const auto& s = reject[i].input;
  double result;
  auto answer = fast_float::from_chars(s.data(), s.data() + s.size(), result, fast_float::chars_format::json);
  if (answer.ec == std::errc()) {
@@ -89,6 +108,31 @@ int main() {
  }
  }
 
+ for (std::size_t i = 0; i < reject.size(); ++i)
+ {
+ const auto& f = reject[i].input;
+ const auto& expected_reason = reject[i].reason;
+ auto answer = fast_float::parse_number_string(
+ f.data(), f.data() + f.size(),
+ fast_float::parse_options(fast_float::chars_format::json));
+ if (answer.valid) {
+ std::cerr << "json parse accepted invalid json " << f << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (answer.error != expected_reason.error) {
+ std::cerr << "json parse failure had invalid error reason " << f
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ intptr_t error_location = answer.lastmatch - f.data();
+ if (error_location != expected_reason.location_offset) {
+ std::cerr << "json parse failure had invalid error location " << f
+ << " (expected " << expected_reason.location_offset << " got "
+ << error_location << ")" << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+
  if(main_readme() != EXIT_SUCCESS) { return EXIT_FAILURE; }
  if(main_readme2() != EXIT_SUCCESS) { return EXIT_FAILURE; }