From f9379435244a3c6e3c79f763e084c13efaad4a9e Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Sat, 27 Apr 2019 18:59:53 +0000 Subject: [PATCH 1/9] Use logical annotations in Arrow Parquet reader/writer --- .../parquet/arrow/arrow-reader-writer-test.cc | 159 ++++++--- cpp/src/parquet/arrow/arrow-schema-test.cc | 227 +++++++++++- cpp/src/parquet/arrow/reader.cc | 6 +- cpp/src/parquet/arrow/schema.cc | 332 ++++++++++++------ cpp/src/parquet/arrow/writer.cc | 49 ++- python/pyarrow/tests/test_parquet.py | 125 ++++--- 6 files changed, 663 insertions(+), 235 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 21f6f0405f73b..c352a62942913 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -87,58 +87,84 @@ static constexpr int LARGE_SIZE = 10000; static constexpr uint32_t kDefaultSeed = 0; -LogicalType::type get_logical_type(const ::DataType& type) { +std::shared_ptr get_logical_annotation(const ::DataType& type, + int32_t precision, + int32_t scale) { switch (type.id()) { case ArrowId::UINT8: - return LogicalType::UINT_8; + return LogicalAnnotation::Int(8, false); case ArrowId::INT8: - return LogicalType::INT_8; + return LogicalAnnotation::Int(8, true); case ArrowId::UINT16: - return LogicalType::UINT_16; + return LogicalAnnotation::Int(16, false); case ArrowId::INT16: - return LogicalType::INT_16; + return LogicalAnnotation::Int(16, true); case ArrowId::UINT32: - return LogicalType::UINT_32; + return LogicalAnnotation::Int(32, false); case ArrowId::INT32: - return LogicalType::INT_32; + return LogicalAnnotation::Int(32, true); case ArrowId::UINT64: - return LogicalType::UINT_64; + return LogicalAnnotation::Int(64, false); case ArrowId::INT64: - return LogicalType::INT_64; + return LogicalAnnotation::Int(64, true); case ArrowId::STRING: - return LogicalType::UTF8; + return LogicalAnnotation::String(); case ArrowId::DATE32: - return LogicalType::DATE; + return LogicalAnnotation::Date(); case ArrowId::DATE64: - return LogicalType::DATE; + return LogicalAnnotation::Date(); case ArrowId::TIMESTAMP: { const auto& ts_type = static_cast(type); + const bool adjusted_to_utc = (ts_type.timezone() == "UTC"); switch (ts_type.unit()) { case TimeUnit::MILLI: - return LogicalType::TIMESTAMP_MILLIS; + return LogicalAnnotation::Timestamp(adjusted_to_utc, + LogicalAnnotation::TimeUnit::MILLIS); case TimeUnit::MICRO: - return LogicalType::TIMESTAMP_MICROS; + return LogicalAnnotation::Timestamp(adjusted_to_utc, + LogicalAnnotation::TimeUnit::MICROS); + case TimeUnit::NANO: + return LogicalAnnotation::Timestamp(adjusted_to_utc, + LogicalAnnotation::TimeUnit::NANOS); default: - DCHECK(false) << "Only MILLI and MICRO units supported for Arrow timestamps " - "with Parquet."; + DCHECK(false) + << "Only MILLI, MICRO, and NANO units supported for Arrow TIMESTAMP."; } break; } case ArrowId::TIME32: - return LogicalType::TIME_MILLIS; - case ArrowId::TIME64: - return LogicalType::TIME_MICROS; + return LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MILLIS); + case ArrowId::TIME64: { + const auto& tm_type = static_cast(type); + switch (tm_type.unit()) { + case TimeUnit::MICRO: + return LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MICROS); + case TimeUnit::NANO: + return LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::NANOS); + default: + DCHECK(false) << "Only MICRO and NANO units supported for Arrow TIME64."; + } + break; + } case ArrowId::DICTIONARY: { const ::arrow::DictionaryType& dict_type = static_cast(type); - return get_logical_type(*dict_type.value_type()); + const ::DataType& ty = *dict_type.value_type(); + int32_t pr = -1; + int32_t sc = -1; + if (ty.id() == ArrowId::DECIMAL) { + const auto& dt = static_cast(ty); + pr = dt.precision(); + sc = dt.scale(); + } + return get_logical_annotation(ty, pr, sc); } case ArrowId::DECIMAL: - return LogicalType::DECIMAL; + return LogicalAnnotation::Decimal(precision, scale); default: break; } - return LogicalType::NONE; + return LogicalAnnotation::None(); } ParquetType::type get_physical_type(const ::DataType& type) { @@ -383,6 +409,8 @@ void CheckSimpleRoundtrip(const std::shared_ptr& table, int64_t row_group std::shared_ptr
result; DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result, arrow_properties); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result, false)); } @@ -424,8 +452,9 @@ static std::shared_ptr MakeSimpleSchema(const ::DataType& type, default: break; } - auto pnode = PrimitiveNode::Make("column1", repetition, get_physical_type(type), - get_logical_type(type), byte_width, precision, scale); + auto pnode = PrimitiveNode::Make("column1", repetition, + get_logical_annotation(type, precision, scale), + get_physical_type(type), byte_width); NodePtr node_ = GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); return std::static_pointer_cast(node_); @@ -1195,7 +1224,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4)); } -void MakeDateTimeTypesTable(std::shared_ptr
* out, bool nanos_as_micros = false) { +void MakeDateTimeTypesTable(std::shared_ptr
* out) { using ::arrow::ArrayFromVector; std::vector is_valid = {true, true, true, false, true, true}; @@ -1204,12 +1233,13 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out, bool nanos_as_micros = auto f0 = field("f0", ::arrow::date32()); auto f1 = field("f1", ::arrow::timestamp(TimeUnit::MILLI)); auto f2 = field("f2", ::arrow::timestamp(TimeUnit::MICRO)); - auto f3_unit = nanos_as_micros ? TimeUnit::MICRO : TimeUnit::NANO; - auto f3 = field("f3", ::arrow::timestamp(f3_unit)); + auto f3 = field("f3", ::arrow::timestamp(TimeUnit::NANO)); auto f4 = field("f4", ::arrow::time32(TimeUnit::MILLI)); auto f5 = field("f5", ::arrow::time64(TimeUnit::MICRO)); + auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO)); - std::shared_ptr<::arrow::Schema> schema(new ::arrow::Schema({f0, f1, f2, f3, f4, f5})); + std::shared_ptr<::arrow::Schema> schema( + new ::arrow::Schema({f0, f1, f2, f3, f4, f5, f6})); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; @@ -1220,34 +1250,37 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out, bool nanos_as_micros = std::vector t64_ms_values = {1489269, 1489270, 1489271, 1489272, 1489272, 1489273}; - std::shared_ptr a0, a1, a2, a3, a4, a5; + std::shared_ptr a0, a1, a2, a3, a4, a5, a6; ArrayFromVector<::arrow::Date32Type, int32_t>(f0->type(), is_valid, t32_values, &a0); ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_ms_values, &a1); ArrayFromVector<::arrow::TimestampType, int64_t>(f2->type(), is_valid, t64_us_values, &a2); - auto f3_data = nanos_as_micros ? t64_us_values : t64_ns_values; - ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, f3_data, &a3); + ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_ns_values, + &a3); ArrayFromVector<::arrow::Time32Type, int32_t>(f4->type(), is_valid, t32_values, &a4); ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); + ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6); std::vector> columns = { std::make_shared("f0", a0), std::make_shared("f1", a1), std::make_shared("f2", a2), std::make_shared("f3", a3), - std::make_shared("f4", a4), std::make_shared("f5", a5)}; + std::make_shared("f4", a4), std::make_shared("f5", a5), + std::make_shared("f6", a6)}; *out = Table::Make(schema, columns); } TEST(TestArrowReadWrite, DateTimeTypes) { std::shared_ptr
table, result; - MakeDateTimeTypesTable(&table); - // Cast nanaoseconds to microseconds and use INT64 physical type + MakeDateTimeTypesTable(&table); ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); - MakeDateTimeTypesTable(&table, true); + MakeDateTimeTypesTable(&table); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); } @@ -1300,6 +1333,8 @@ TEST(TestArrowReadWrite, UseDeprecatedInt96) { input, false /* use_threads */, input->num_rows(), {}, &result, ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build())); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*ex_result->schema(), *result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); // Ensure enable_deprecated_int96_timestamps as precedence over @@ -1311,6 +1346,8 @@ TEST(TestArrowReadWrite, UseDeprecatedInt96) { ->coerce_timestamps(TimeUnit::MILLI) ->build())); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*ex_result->schema(), *result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); } @@ -1318,7 +1355,6 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { using ::arrow::ArrayFromVector; using ::arrow::field; - // PARQUET-1078, coerce Arrow timestamps to either TIMESTAMP_MILLIS or TIMESTAMP_MICROS std::vector is_valid = {true, true, true, false, true, true}; auto t_s = ::arrow::timestamp(TimeUnit::SECOND); @@ -1363,6 +1399,8 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, &milli_result, ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build())); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*ex_milli_result->schema(), *milli_result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); // Result when coercing to microseconds @@ -1378,7 +1416,26 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, µ_result, ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MICRO)->build())); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*ex_micro_result->schema(), *micro_result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_micro_result, *micro_result)); + + // Result when coercing to nanoseconds + auto s4 = std::shared_ptr<::arrow::Schema>( + new ::arrow::Schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), + field("f_ns", t_ns)})); + auto ex_nano_result = Table::Make( + s4, + {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), + std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); + + std::shared_ptr
nano_result; + ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( + input, false /* use_threads */, input->num_rows(), {}, &nano_result, + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::NANO)->build())); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*ex_nano_result->schema(), *nano_result->schema())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_nano_result, *nano_result)); } TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) { @@ -1439,25 +1496,37 @@ TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) { ASSERT_RAISES(Invalid, WriteTable(*t4, ::arrow::default_memory_pool(), sink, 10, default_writer_properties(), coerce_millis)); - // OK to lose precision if we explicitly allow it - auto allow_truncation = (ArrowWriterProperties::Builder() - .coerce_timestamps(TimeUnit::MILLI) - ->allow_truncated_timestamps() - ->build()); + // OK to lose micros/nanos -> millis precision if we explicitly allow it + auto allow_truncation_to_millis = (ArrowWriterProperties::Builder() + .coerce_timestamps(TimeUnit::MILLI) + ->allow_truncated_timestamps() + ->build()); ASSERT_OK_NO_THROW(WriteTable(*t3, ::arrow::default_memory_pool(), sink, 10, - default_writer_properties(), allow_truncation)); + default_writer_properties(), allow_truncation_to_millis)); ASSERT_OK_NO_THROW(WriteTable(*t4, ::arrow::default_memory_pool(), sink, 10, - default_writer_properties(), allow_truncation)); + default_writer_properties(), allow_truncation_to_millis)); - // OK to write micros to micros + // OK to write to micros auto coerce_micros = (ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MICRO)->build()); + ASSERT_OK_NO_THROW(WriteTable(*t1, ::arrow::default_memory_pool(), sink, 10, + default_writer_properties(), coerce_micros)); + ASSERT_OK_NO_THROW(WriteTable(*t2, ::arrow::default_memory_pool(), sink, 10, + default_writer_properties(), coerce_micros)); ASSERT_OK_NO_THROW(WriteTable(*t3, ::arrow::default_memory_pool(), sink, 10, default_writer_properties(), coerce_micros)); // Loss of precision ASSERT_RAISES(Invalid, WriteTable(*t4, ::arrow::default_memory_pool(), sink, 10, default_writer_properties(), coerce_micros)); + + // OK to lose nanos -> micros precision if we explicitly allow it + auto allow_truncation_to_micros = (ArrowWriterProperties::Builder() + .coerce_timestamps(TimeUnit::MICRO) + ->allow_truncated_timestamps() + ->build()); + ASSERT_OK_NO_THROW(WriteTable(*t4, ::arrow::default_memory_pool(), sink, 10, + default_writer_properties(), allow_truncation_to_micros)); } TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { @@ -1515,6 +1584,8 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*ex_table->schema(), *result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_table, *result)); } diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index b806782a09dca..6424dd3f5def6 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -33,6 +33,7 @@ using arrow::Field; using arrow::TimeUnit; using ParquetType = parquet::Type; +using parquet::LogicalAnnotation; using parquet::LogicalType; using parquet::Repetition; using parquet::schema::GroupNode; @@ -115,12 +116,14 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); - arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_MS, false)); + arrow_fields.push_back(std::make_shared( + "timestamp", ::arrow::timestamp(TimeUnit::MILLI, "UTC"), false)); parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]", Repetition::REQUIRED, ParquetType::INT64, LogicalType::TIMESTAMP_MICROS)); - arrow_fields.push_back(std::make_shared("timestamp[us]", TIMESTAMP_US, false)); + arrow_fields.push_back(std::make_shared( + "timestamp[us]", ::arrow::timestamp(TimeUnit::MICRO, "UTC"), false)); parquet_fields.push_back(PrimitiveNode::Make("date", Repetition::REQUIRED, ParquetType::INT32, LogicalType::DATE)); @@ -168,6 +171,103 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } +TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { + struct FieldConstructionArguments { + std::string name; + std::shared_ptr annotation; + parquet::Type::type physical_type; + int physical_length; + std::shared_ptr<::arrow::DataType> datatype; + }; + + std::vector cases = { + {"string", LogicalAnnotation::String(), ParquetType::BYTE_ARRAY, -1, + ::arrow::utf8()}, + {"enum", LogicalAnnotation::Enum(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, + {"decimal(8, 2)", LogicalAnnotation::Decimal(8, 2), ParquetType::INT32, -1, + ::arrow::decimal(8, 2)}, + {"decimal(16, 4)", LogicalAnnotation::Decimal(16, 4), ParquetType::INT64, -1, + ::arrow::decimal(16, 4)}, + {"decimal(32, 8)", LogicalAnnotation::Decimal(32, 8), + ParquetType::FIXED_LEN_BYTE_ARRAY, 16, ::arrow::decimal(32, 8)}, + {"date", LogicalAnnotation::Date(), ParquetType::INT32, -1, ::arrow::date32()}, + {"time(ms)", LogicalAnnotation::Time(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT32, -1, ::arrow::time32(::arrow::TimeUnit::MILLI)}, + {"time(us)", LogicalAnnotation::Time(true, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1, ::arrow::time64(::arrow::TimeUnit::MICRO)}, + {"time(ns)", LogicalAnnotation::Time(true, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1, ::arrow::time64(::arrow::TimeUnit::NANO)}, + {"time(ms)", LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT32, -1, ::arrow::time32(::arrow::TimeUnit::MILLI)}, + {"time(us)", LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1, ::arrow::time64(::arrow::TimeUnit::MICRO)}, + {"time(ns)", LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1, ::arrow::time64(::arrow::TimeUnit::NANO)}, + {"timestamp(true, ms)", + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64, -1, ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC")}, + {"timestamp(true, us)", + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1, ::arrow::timestamp(::arrow::TimeUnit::MICRO, "UTC")}, + {"timestamp(true, ns)", + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1, ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC")}, + {"timestamp(false, ms)", + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64, -1, ::arrow::timestamp(::arrow::TimeUnit::MILLI)}, + {"timestamp(false, us)", + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1, ::arrow::timestamp(::arrow::TimeUnit::MICRO)}, + {"timestamp(false, ns)", + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1, ::arrow::timestamp(::arrow::TimeUnit::NANO)}, + {"int(8, false)", LogicalAnnotation::Int(8, false), ParquetType::INT32, -1, + ::arrow::uint8()}, + {"int(8, true)", LogicalAnnotation::Int(8, true), ParquetType::INT32, -1, + ::arrow::int8()}, + {"int(16, false)", LogicalAnnotation::Int(16, false), ParquetType::INT32, -1, + ::arrow::uint16()}, + {"int(16, true)", LogicalAnnotation::Int(16, true), ParquetType::INT32, -1, + ::arrow::int16()}, + {"int(32, false)", LogicalAnnotation::Int(32, false), ParquetType::INT32, -1, + ::arrow::uint32()}, + {"int(32, true)", LogicalAnnotation::Int(32, true), ParquetType::INT32, -1, + ::arrow::int32()}, + {"int(64, false)", LogicalAnnotation::Int(64, false), ParquetType::INT64, -1, + ::arrow::uint64()}, + {"int(64, true)", LogicalAnnotation::Int(64, true), ParquetType::INT64, -1, + ::arrow::int64()}, + {"json", LogicalAnnotation::JSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, + {"bson", LogicalAnnotation::BSON(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, + {"interval", LogicalAnnotation::Interval(), ParquetType::FIXED_LEN_BYTE_ARRAY, 12, + ::arrow::fixed_size_binary(12)}, + {"uuid", LogicalAnnotation::UUID(), ParquetType::FIXED_LEN_BYTE_ARRAY, 16, + ::arrow::fixed_size_binary(16)}, + {"none", LogicalAnnotation::None(), ParquetType::BOOLEAN, -1, ::arrow::boolean()}, + {"none", LogicalAnnotation::None(), ParquetType::INT32, -1, ::arrow::int32()}, + {"none", LogicalAnnotation::None(), ParquetType::INT64, -1, ::arrow::int64()}, + {"none", LogicalAnnotation::None(), ParquetType::FLOAT, -1, ::arrow::float32()}, + {"none", LogicalAnnotation::None(), ParquetType::DOUBLE, -1, ::arrow::float64()}, + {"none", LogicalAnnotation::None(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, + {"none", LogicalAnnotation::None(), ParquetType::FIXED_LEN_BYTE_ARRAY, 64, + ::arrow::fixed_size_binary(64)}, + {"null", LogicalAnnotation::Null(), ParquetType::BYTE_ARRAY, -1, ::arrow::null()}, + }; + + std::vector parquet_fields; + std::vector> arrow_fields; + + for (const FieldConstructionArguments& c : cases) { + parquet_fields.push_back(PrimitiveNode::Make( + c.name, Repetition::OPTIONAL, c.annotation, c.physical_type, c.physical_length)); + arrow_fields.push_back(std::make_shared(c.name, c.datatype)); + } + + ASSERT_OK(ConvertSchema(parquet_fields)); + auto arrow_schema = std::make_shared<::arrow::Schema>(arrow_fields); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); +} + TEST_F(TestConvertParquetSchema, DuplicateFieldNames) { std::vector parquet_fields; std::vector> arrow_fields; @@ -586,6 +686,7 @@ TEST_F(TestConvertParquetSchema, ParquetNestedSchemaPartialOrdering) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } + TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) { std::vector parquet_fields; std::vector> arrow_fields; @@ -686,12 +787,14 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); - arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_MS, false)); + arrow_fields.push_back(std::make_shared( + "timestamp", ::arrow::timestamp(TimeUnit::MILLI, "UTC"), false)); parquet_fields.push_back(PrimitiveNode::Make("timestamp[us]", Repetition::REQUIRED, ParquetType::INT64, LogicalType::TIMESTAMP_MICROS)); - arrow_fields.push_back(std::make_shared("timestamp[us]", TIMESTAMP_US, false)); + arrow_fields.push_back(std::make_shared( + "timestamp[us]", ::arrow::timestamp(TimeUnit::MICRO, "UTC"), false)); parquet_fields.push_back( PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); @@ -714,6 +817,113 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); } +TEST_F(TestConvertArrowSchema, ArrowFields) { + struct FieldConstructionArguments { + std::string name; + std::shared_ptr<::arrow::DataType> datatype; + std::shared_ptr annotation; + parquet::Type::type physical_type; + int physical_length; + }; + + std::vector cases = { + {"boolean", ::arrow::boolean(), LogicalAnnotation::None(), ParquetType::BOOLEAN, + -1}, + {"binary", ::arrow::binary(), LogicalAnnotation::None(), ParquetType::BYTE_ARRAY, + -1}, + {"fixed_size_binary", ::arrow::fixed_size_binary(64), LogicalAnnotation::None(), + ParquetType::FIXED_LEN_BYTE_ARRAY, 64}, + {"uint8", ::arrow::uint8(), LogicalAnnotation::Int(8, false), ParquetType::INT32, + -1}, + {"int8", ::arrow::int8(), LogicalAnnotation::Int(8, true), ParquetType::INT32, -1}, + {"uint16", ::arrow::uint16(), LogicalAnnotation::Int(16, false), ParquetType::INT32, + -1}, + {"int16", ::arrow::int16(), LogicalAnnotation::Int(16, true), ParquetType::INT32, + -1}, + {"uint32", ::arrow::uint32(), LogicalAnnotation::None(), ParquetType::INT64, + -1}, // Parquet 1.0 + {"int32", ::arrow::int32(), LogicalAnnotation::None(), ParquetType::INT32, -1}, + {"uint64", ::arrow::uint64(), LogicalAnnotation::Int(64, false), ParquetType::INT64, + -1}, + {"int64", ::arrow::int64(), LogicalAnnotation::None(), ParquetType::INT64, -1}, + {"float32", ::arrow::float32(), LogicalAnnotation::None(), ParquetType::FLOAT, -1}, + {"float64", ::arrow::float64(), LogicalAnnotation::None(), ParquetType::DOUBLE, -1}, + {"utf8", ::arrow::utf8(), LogicalAnnotation::String(), ParquetType::BYTE_ARRAY, -1}, + {"decimal(1, 0)", ::arrow::decimal(1, 0), LogicalAnnotation::Decimal(1, 0), + ParquetType::FIXED_LEN_BYTE_ARRAY, 1}, + {"decimal(8, 2)", ::arrow::decimal(8, 2), LogicalAnnotation::Decimal(8, 2), + ParquetType::FIXED_LEN_BYTE_ARRAY, 4}, + {"decimal(16, 4)", ::arrow::decimal(16, 4), LogicalAnnotation::Decimal(16, 4), + ParquetType::FIXED_LEN_BYTE_ARRAY, 7}, + {"decimal(32, 8)", ::arrow::decimal(32, 8), LogicalAnnotation::Decimal(32, 8), + ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, + {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI), + LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT32, -1}, + {"time64(microsecond)", ::arrow::time64(::arrow::TimeUnit::MICRO), + LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1}, + {"time64(nanosecond)", ::arrow::time64(::arrow::TimeUnit::NANO), + LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1}, + {"timestamp(millisecond)", ::arrow::timestamp(::arrow::TimeUnit::MILLI), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64, -1}, + {"timestamp(microsecond)", ::arrow::timestamp(::arrow::TimeUnit::MICRO), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1}, + {"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1}, + {"timestamp(millisecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64, -1}, + {"timestamp(microsecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::MICRO, "UTC"), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1}, + {"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1}, + {"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64, -1}, + {"timestamp(microsecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MICRO, "CET"), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MICROS), + ParquetType::INT64, -1}, + {"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::NANOS), + ParquetType::INT64, -1}, + {"null", ::arrow::null(), LogicalAnnotation::Null(), ParquetType::INT32, -1}}; + + std::vector> arrow_fields; + std::vector parquet_fields; + + for (const FieldConstructionArguments& c : cases) { + arrow_fields.push_back(std::make_shared(c.name, c.datatype, false)); + parquet_fields.push_back(PrimitiveNode::Make( + c.name, Repetition::REQUIRED, c.annotation, c.physical_type, c.physical_length)); + } + + ASSERT_OK(ConvertSchema(arrow_fields)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); +} + +TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { + struct FieldConstructionArguments { + std::string name; + std::shared_ptr<::arrow::DataType> datatype; + }; + + std::vector cases = { + {"float16", ::arrow::float16()}, + }; + + for (const FieldConstructionArguments& c : cases) { + auto field = std::make_shared(c.name, c.datatype); + ASSERT_RAISES(NotImplemented, ConvertSchema({field})); + } +} + TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) { std::vector parquet_fields; std::vector> arrow_fields; @@ -809,15 +1019,6 @@ TEST_F(TestConvertArrowSchema, ParquetLists) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); } -TEST_F(TestConvertArrowSchema, UnsupportedTypes) { - std::vector> unsupported_fields = { - ::arrow::field("f0", ::arrow::time64(TimeUnit::NANO))}; - - for (const auto& field : unsupported_fields) { - ASSERT_RAISES(NotImplemented, ConvertSchema({field})); - } -} - TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) { std::vector parquet_fields; std::vector> arrow_fields; diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index bdff716c193ad..56656038f81ef 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1613,7 +1613,11 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, TRANSFER_DATA(::arrow::TimestampType, Int64Type); } break; case ::arrow::TimeUnit::NANO: { - TRANSFER_DATA(::arrow::TimestampType, Int96Type); + if (descr_->physical_type() == ::parquet::Type::INT96) { + TRANSFER_DATA(::arrow::TimestampType, Int96Type); + } else { + TRANSFER_DATA(::arrow::TimestampType, Int64Type); + } } break; default: return Status::NotImplemented("TimeUnit not supported"); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 45b4b38fd4960..c7f71665ffda0 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -17,6 +17,7 @@ #include "parquet/arrow/schema.h" +#include #include #include #include @@ -25,6 +26,7 @@ #include "arrow/array.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "parquet/arrow/writer.h" @@ -35,6 +37,7 @@ using arrow::Field; using arrow::Status; +using arrow::internal::checked_cast; using ArrowType = arrow::DataType; using ArrowTypeId = arrow::Type; @@ -46,6 +49,7 @@ using parquet::schema::NodePtr; using parquet::schema::PrimitiveNode; using ParquetType = parquet::Type; +using parquet::LogicalAnnotation; using parquet::LogicalType; namespace parquet { @@ -56,117 +60,202 @@ const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(::arrow::TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO); -std::shared_ptr MakeDecimal128Type(const PrimitiveNode& node) { - const auto& metadata = node.decimal_metadata(); - return ::arrow::decimal(metadata.precision, metadata.scale); +static Status MakeArrowDecimal(const std::shared_ptr& annotation, + std::shared_ptr* out) { + const auto& decimal = checked_cast(*annotation); + *out = ::arrow::decimal(decimal.precision(), decimal.scale()); + return Status::OK(); } -static Status FromByteArray(const PrimitiveNode& node, std::shared_ptr* out) { - switch (node.logical_type()) { - case LogicalType::UTF8: - *out = ::arrow::utf8(); +static Status MakeArrowInt(const std::shared_ptr& annotation, + std::shared_ptr* out) { + const auto& integer = checked_cast(*annotation); + switch (integer.bit_width()) { + case 8: + *out = integer.is_signed() ? ::arrow::int8() : ::arrow::uint8(); break; - case LogicalType::DECIMAL: - *out = MakeDecimal128Type(node); + case 16: + *out = integer.is_signed() ? ::arrow::int16() : ::arrow::uint16(); break; - default: - // BINARY - *out = ::arrow::binary(); + case 32: + *out = integer.is_signed() ? ::arrow::int32() : ::arrow::uint32(); break; + default: + return Status::TypeError(annotation->ToString(), + " can not annotate physical type Int32"); } return Status::OK(); } -static Status FromFLBA(const PrimitiveNode& node, std::shared_ptr* out) { - switch (node.logical_type()) { - case LogicalType::NONE: - *out = ::arrow::fixed_size_binary(node.type_length()); +static Status MakeArrowInt64(const std::shared_ptr& annotation, + std::shared_ptr* out) { + const auto& integer = checked_cast(*annotation); + switch (integer.bit_width()) { + case 64: + *out = integer.is_signed() ? ::arrow::int64() : ::arrow::uint64(); break; - case LogicalType::DECIMAL: - *out = MakeDecimal128Type(node); + default: + return Status::TypeError(annotation->ToString(), + " can not annotate physical type Int64"); + } + return Status::OK(); +} + +static Status MakeArrowTime32(const std::shared_ptr& annotation, + std::shared_ptr* out) { + const auto& time = checked_cast(*annotation); + switch (time.time_unit()) { + case LogicalAnnotation::TimeUnit::MILLIS: + *out = ::arrow::time32(::arrow::TimeUnit::MILLI); break; default: - return Status::NotImplemented("Unhandled logical type ", - LogicalTypeToString(node.logical_type()), - " for fixed-length binary array"); + return Status::TypeError(annotation->ToString(), + " can not annotate physical type Time32"); } + return Status::OK(); +} +static Status MakeArrowTime64(const std::shared_ptr& annotation, + std::shared_ptr* out) { + const auto& time = checked_cast(*annotation); + switch (time.time_unit()) { + case LogicalAnnotation::TimeUnit::MICROS: + *out = ::arrow::time64(::arrow::TimeUnit::MICRO); + break; + case LogicalAnnotation::TimeUnit::NANOS: + *out = ::arrow::time64(::arrow::TimeUnit::NANO); + break; + default: + return Status::TypeError(annotation->ToString(), + " can not annotate physical type Time64"); + } return Status::OK(); } -static Status FromInt32(const PrimitiveNode& node, std::shared_ptr* out) { - switch (node.logical_type()) { - case LogicalType::NONE: - *out = ::arrow::int32(); +static Status MakeArrowTimestamp( + const std::shared_ptr& annotation, + std::shared_ptr* out) { + static constexpr auto utc = "UTC"; + const auto& timestamp = checked_cast(*annotation); + switch (timestamp.time_unit()) { + case LogicalAnnotation::TimeUnit::MILLIS: + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc) + : ::arrow::timestamp(::arrow::TimeUnit::MILLI)); + break; + case LogicalAnnotation::TimeUnit::MICROS: + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc) + : ::arrow::timestamp(::arrow::TimeUnit::MICRO)); + break; + case LogicalAnnotation::TimeUnit::NANOS: + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc) + : ::arrow::timestamp(::arrow::TimeUnit::NANO)); break; - case LogicalType::UINT_8: - *out = ::arrow::uint8(); + default: + return Status::TypeError("Unrecognized time unit in timestamp annotation: ", + annotation->ToString()); + } + return Status::OK(); +} + +static Status FromByteArray(const std::shared_ptr& annotation, + std::shared_ptr* out) { + switch (annotation->type()) { + case LogicalAnnotation::Type::STRING: + *out = ::arrow::utf8(); break; - case LogicalType::INT_8: - *out = ::arrow::int8(); + case LogicalAnnotation::Type::DECIMAL: + RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); break; - case LogicalType::UINT_16: - *out = ::arrow::uint16(); + case LogicalAnnotation::Type::NONE: + case LogicalAnnotation::Type::ENUM: + case LogicalAnnotation::Type::JSON: + case LogicalAnnotation::Type::BSON: + *out = ::arrow::binary(); break; - case LogicalType::INT_16: - *out = ::arrow::int16(); + default: + return Status::NotImplemented("Unhandled logical annotation ", + annotation->ToString(), " for binary array"); + } + return Status::OK(); +} + +static Status FromFLBA(const std::shared_ptr& annotation, + int32_t physical_length, std::shared_ptr* out) { + switch (annotation->type()) { + case LogicalAnnotation::Type::DECIMAL: + RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); break; - case LogicalType::INT_32: - *out = ::arrow::int32(); + case LogicalAnnotation::Type::NONE: + case LogicalAnnotation::Type::INTERVAL: + case LogicalAnnotation::Type::UUID: + *out = ::arrow::fixed_size_binary(physical_length); break; - case LogicalType::UINT_32: - *out = ::arrow::uint32(); + default: + return Status::NotImplemented("Unhandled logical annotation ", + annotation->ToString(), + " for fixed-length binary array"); + } + + return Status::OK(); +} + +static Status FromInt32(const std::shared_ptr& annotation, + std::shared_ptr* out) { + switch (annotation->type()) { + case LogicalAnnotation::Type::INT: + RETURN_NOT_OK(MakeArrowInt(annotation, out)); break; - case LogicalType::DATE: + case LogicalAnnotation::Type::DATE: *out = ::arrow::date32(); break; - case LogicalType::TIME_MILLIS: - *out = ::arrow::time32(::arrow::TimeUnit::MILLI); + case LogicalAnnotation::Type::TIME: + RETURN_NOT_OK(MakeArrowTime32(annotation, out)); break; - case LogicalType::DECIMAL: - *out = MakeDecimal128Type(node); + case LogicalAnnotation::Type::DECIMAL: + RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); + break; + case LogicalAnnotation::Type::NONE: + *out = ::arrow::int32(); break; default: - return Status::NotImplemented("Unhandled logical type ", - LogicalTypeToString(node.logical_type()), + return Status::NotImplemented("Unhandled logical type ", annotation->ToString(), " for INT32"); } return Status::OK(); } -static Status FromInt64(const PrimitiveNode& node, std::shared_ptr* out) { - switch (node.logical_type()) { - case LogicalType::NONE: - *out = ::arrow::int64(); - break; - case LogicalType::INT_64: - *out = ::arrow::int64(); +static Status FromInt64(const std::shared_ptr& annotation, + std::shared_ptr* out) { + switch (annotation->type()) { + case LogicalAnnotation::Type::INT: + RETURN_NOT_OK(MakeArrowInt64(annotation, out)); break; - case LogicalType::UINT_64: - *out = ::arrow::uint64(); + case LogicalAnnotation::Type::DECIMAL: + RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); break; - case LogicalType::DECIMAL: - *out = MakeDecimal128Type(node); + case LogicalAnnotation::Type::TIMESTAMP: + RETURN_NOT_OK(MakeArrowTimestamp(annotation, out)); break; - case LogicalType::TIMESTAMP_MILLIS: - *out = TIMESTAMP_MS; + case LogicalAnnotation::Type::TIME: + RETURN_NOT_OK(MakeArrowTime64(annotation, out)); break; - case LogicalType::TIMESTAMP_MICROS: - *out = TIMESTAMP_US; - break; - case LogicalType::TIME_MICROS: - *out = ::arrow::time64(::arrow::TimeUnit::MICRO); + case LogicalAnnotation::Type::NONE: + *out = ::arrow::int64(); break; default: - return Status::NotImplemented("Unhandled logical type ", - LogicalTypeToString(node.logical_type()), + return Status::NotImplemented("Unhandled logical type ", annotation->ToString(), " for INT64"); } return Status::OK(); } Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* out) { - if (primitive.logical_type() == LogicalType::NA) { + const std::shared_ptr& annotation = + primitive.logical_annotation(); + if (annotation->is_invalid() || annotation->is_null()) { *out = ::arrow::null(); return Status::OK(); } @@ -176,10 +265,10 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* *out = ::arrow::boolean(); break; case ParquetType::INT32: - RETURN_NOT_OK(FromInt32(primitive, out)); + RETURN_NOT_OK(FromInt32(annotation, out)); break; case ParquetType::INT64: - RETURN_NOT_OK(FromInt64(primitive, out)); + RETURN_NOT_OK(FromInt64(annotation, out)); break; case ParquetType::INT96: *out = TIMESTAMP_NS; @@ -191,10 +280,10 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* *out = ::arrow::float64(); break; case ParquetType::BYTE_ARRAY: - RETURN_NOT_OK(FromByteArray(primitive, out)); + RETURN_NOT_OK(FromByteArray(annotation, out)); break; case ParquetType::FIXED_LEN_BYTE_ARRAY: - RETURN_NOT_OK(FromFLBA(primitive, out)); + RETURN_NOT_OK(FromFLBA(annotation, primitive.type_length(), out)); break; default: { // PARQUET-1565: This can occur if the file is corrupt @@ -321,7 +410,7 @@ Status NodeToFieldInternal(const Node& node, } } else if (node.is_group()) { const auto& group = static_cast(node); - if (node.logical_type() == LogicalType::LIST) { + if (node.logical_annotation()->is_list()) { RETURN_NOT_OK(NodeToList(group, included_leaf_nodes, &type)); } else { RETURN_NOT_OK(StructFromGroup(group, included_leaf_nodes, &type)); @@ -411,7 +500,7 @@ Status ListToNode(const std::shared_ptr<::arrow::ListType>& type, const std::str RETURN_NOT_OK(FieldToNode(type->value_field(), properties, arrow_properties, &element)); NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element}); - *out = GroupNode::Make(name, repetition, {list}, LogicalType::LIST); + *out = GroupNode::Make(name, repetition, {list}, LogicalAnnotation::List()); return Status::OK(); } @@ -431,27 +520,35 @@ Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, return Status::OK(); } -static LogicalType::type LogicalTypeFromArrowTimeUnit(::arrow::TimeUnit::type time_unit) { +static bool HasUTCTimezone(const std::string& timezone) { + static const std::vector utczones{"UTC", "utc"}; + return std::any_of(utczones.begin(), utczones.end(), + [timezone](const std::string& utc) { return timezone == utc; }); +} + +static std::shared_ptr TimestampAnnotationFromArrowTimestamp( + const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) { + const bool utc = HasUTCTimezone(timestamp_type.timezone()); switch (time_unit) { case ::arrow::TimeUnit::MILLI: - return LogicalType::TIMESTAMP_MILLIS; + return LogicalAnnotation::Timestamp(utc, LogicalAnnotation::TimeUnit::MILLIS); case ::arrow::TimeUnit::MICRO: - return LogicalType::TIMESTAMP_MICROS; - case ::arrow::TimeUnit::SECOND: + return LogicalAnnotation::Timestamp(utc, LogicalAnnotation::TimeUnit::MICROS); case ::arrow::TimeUnit::NANO: + return LogicalAnnotation::Timestamp(utc, LogicalAnnotation::TimeUnit::NANOS); + case ::arrow::TimeUnit::SECOND: // No equivalent parquet logical type. break; } - - return LogicalType::NONE; + return LogicalAnnotation::None(); } static Status GetTimestampMetadata(const ::arrow::TimestampType& type, const ArrowWriterProperties& properties, ParquetType::type* physical_type, - LogicalType::type* logical_type) { + std::shared_ptr* annotation) { const bool coerce = properties.coerce_timestamps_enabled(); - const auto unit = coerce ? properties.coerce_timestamps_unit() : type.unit(); + const auto target_unit = coerce ? properties.coerce_timestamps_unit() : type.unit(); // The user is explicitly asking for Impala int96 encoding, there is no // logical type. @@ -461,41 +558,39 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, } *physical_type = ParquetType::INT64; - *logical_type = LogicalTypeFromArrowTimeUnit(unit); + PARQUET_CATCH_NOT_OK(*annotation = + TimestampAnnotationFromArrowTimestamp(type, target_unit)); - // The user is requesting that all timestamp columns are casted to a specific - // type. Only 2 TimeUnit are supported by arrow-parquet. + // The user is explicitly asking for timestamp data to be converted to the + // specified units (target_unit). if (coerce) { - switch (unit) { + switch (target_unit) { case ::arrow::TimeUnit::MILLI: case ::arrow::TimeUnit::MICRO: - break; case ::arrow::TimeUnit::NANO: + break; case ::arrow::TimeUnit::SECOND: return Status::NotImplemented( - "Can only coerce Arrow timestamps to milliseconds" - " or microseconds"); + "Can only coerce Arrow timestamps to milliseconds, microseconds, or " + "nanoseconds"); } - return Status::OK(); } - // Until ARROW-3729 is resolved, nanoseconds are explicitly converted to - // int64 microseconds when deprecated int96 is not requested. - if (type.unit() == ::arrow::TimeUnit::NANO) - *logical_type = LogicalType::TIMESTAMP_MICROS; - else if (type.unit() == ::arrow::TimeUnit::SECOND) + // The user implicitly wants timestamp data to retain its original time units, + // however the Arrow seconds time unit can not be represented (annotated) in Parquet. + if (type.unit() == ::arrow::TimeUnit::SECOND) { return Status::NotImplemented( - "Only MILLI, MICRO, and NANOS units supported for Arrow timestamps with " - "Parquet."); + "Only MILLI, MICRO, and NANO units supported for Arrow timestamps with Parquet."); + } return Status::OK(); -} // namespace arrow +} Status FieldToNode(const std::shared_ptr& field, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, NodePtr* out) { - LogicalType::type logical_type = LogicalType::NONE; + std::shared_ptr annotation = LogicalAnnotation::None(); ParquetType::type type; Repetition::type repetition = field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED; @@ -507,33 +602,33 @@ Status FieldToNode(const std::shared_ptr& field, switch (field->type()->id()) { case ArrowTypeId::NA: type = ParquetType::INT32; - logical_type = LogicalType::NA; + annotation = LogicalAnnotation::Null(); break; case ArrowTypeId::BOOL: type = ParquetType::BOOLEAN; break; case ArrowTypeId::UINT8: type = ParquetType::INT32; - logical_type = LogicalType::UINT_8; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(8, false)); break; case ArrowTypeId::INT8: type = ParquetType::INT32; - logical_type = LogicalType::INT_8; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(8, true)); break; case ArrowTypeId::UINT16: type = ParquetType::INT32; - logical_type = LogicalType::UINT_16; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(16, false)); break; case ArrowTypeId::INT16: type = ParquetType::INT32; - logical_type = LogicalType::INT_16; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(16, true)); break; case ArrowTypeId::UINT32: if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) { type = ParquetType::INT64; } else { type = ParquetType::INT32; - logical_type = LogicalType::UINT_32; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(32, false)); } break; case ArrowTypeId::INT32: @@ -541,7 +636,7 @@ Status FieldToNode(const std::shared_ptr& field, break; case ArrowTypeId::UINT64: type = ParquetType::INT64; - logical_type = LogicalType::UINT_64; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(64, false)); break; case ArrowTypeId::INT64: type = ParquetType::INT64; @@ -554,7 +649,7 @@ Status FieldToNode(const std::shared_ptr& field, break; case ArrowTypeId::STRING: type = ParquetType::BYTE_ARRAY; - logical_type = LogicalType::UTF8; + annotation = LogicalAnnotation::String(); break; case ArrowTypeId::BINARY: type = ParquetType::BYTE_ARRAY; @@ -567,37 +662,41 @@ Status FieldToNode(const std::shared_ptr& field, } break; case ArrowTypeId::DECIMAL: { type = ParquetType::FIXED_LEN_BYTE_ARRAY; - logical_type = LogicalType::DECIMAL; const auto& decimal_type = static_cast(*field->type()); precision = decimal_type.precision(); scale = decimal_type.scale(); length = DecimalSize(precision); + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Decimal(precision, scale)); } break; case ArrowTypeId::DATE32: type = ParquetType::INT32; - logical_type = LogicalType::DATE; + annotation = LogicalAnnotation::Date(); break; case ArrowTypeId::DATE64: type = ParquetType::INT32; - logical_type = LogicalType::DATE; + annotation = LogicalAnnotation::Date(); break; case ArrowTypeId::TIMESTAMP: RETURN_NOT_OK( GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()), - arrow_properties, &type, &logical_type)); + arrow_properties, &type, &annotation)); break; case ArrowTypeId::TIME32: type = ParquetType::INT32; - logical_type = LogicalType::TIME_MILLIS; + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Time( + false, LogicalAnnotation::TimeUnit::MILLIS)); break; case ArrowTypeId::TIME64: { + type = ParquetType::INT64; auto time_type = static_cast<::arrow::Time64Type*>(field->type().get()); if (time_type->unit() == ::arrow::TimeUnit::NANO) { - return Status::NotImplemented("Nanosecond time not supported in Parquet."); + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Time( + false, LogicalAnnotation::TimeUnit::NANOS)); + } else { + PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Time( + false, LogicalAnnotation::TimeUnit::MICROS)); } - type = ParquetType::INT64; - logical_type = LogicalType::TIME_MICROS; } break; case ArrowTypeId::STRUCT: { auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type()); @@ -625,9 +724,10 @@ Status FieldToNode(const std::shared_ptr& field, field->type()->ToString()); } } - PARQUET_CATCH_NOT_OK(*out = - PrimitiveNode::Make(field->name(), repetition, type, - logical_type, length, precision, scale)); + + PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(field->name(), repetition, annotation, + type, length)); + return Status::OK(); } @@ -725,7 +825,7 @@ int32_t DecimalSize(int32_t precision) { } DCHECK(false); return -1; -} // namespace arrow +} } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 96db68b3ec0d4..e31c947519079 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -652,18 +652,14 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level const int16_t* rep_levels) { const auto& type = static_cast(*values.type()); - const bool is_nanosecond = type.unit() == TimeUnit::NANO; - if (ctx_->properties->support_deprecated_int96_timestamps()) { // The user explicitly required to use Int96 storage. return TypedWriteBatch(values, num_levels, def_levels, rep_levels); - } else if (is_nanosecond || - (ctx_->properties->coerce_timestamps_enabled() && - (type.unit() != ctx_->properties->coerce_timestamps_unit()))) { - // Casting is required. This covers several cases - // * Nanoseconds -> cast to microseconds (until ARROW-3729 is resolved) - // * coerce_timestamps_enabled_, cast all timestamps to requested unit + } else if (ctx_->properties->coerce_timestamps_enabled() && + (type.unit() != ctx_->properties->coerce_timestamps_unit())) { + // Casting is required: coerce_timestamps_enabled_, cast all timestamps to requested + // unit return WriteTimestampsCoerce(ctx_->properties->truncated_timestamps_allowed(), values, num_levels, def_levels, rep_levels); } else { @@ -683,7 +679,8 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ const auto& data = static_cast(array); auto values = data.raw_values(); - const auto& type = static_cast(*array.type()); + const auto& source_type = static_cast(*array.type()); + auto source_unit = source_type.unit(); TimeUnit::type target_unit = ctx_->properties->coerce_timestamps_enabled() ? ctx_->properties->coerce_timestamps_unit() @@ -693,7 +690,7 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ auto DivideBy = [&](const int64_t factor) { for (int64_t i = 0; i < array.length(); i++) { if (!truncated_timestamps_allowed && !data.IsNull(i) && (values[i] % factor != 0)) { - return Status::Invalid("Casting from ", type.ToString(), " to ", + return Status::Invalid("Casting from ", source_type.ToString(), " to ", target_type->ToString(), " would lose data: ", values[i]); } buffer[i] = values[i] / factor; @@ -708,21 +705,37 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ return Status::OK(); }; - if (type.unit() == TimeUnit::NANO) { + if (source_unit == TimeUnit::NANO) { if (target_unit == TimeUnit::MICRO) { RETURN_NOT_OK(DivideBy(1000)); } else { DCHECK_EQ(TimeUnit::MILLI, target_unit); RETURN_NOT_OK(DivideBy(1000000)); } - } else if (type.unit() == TimeUnit::SECOND) { - RETURN_NOT_OK(MultiplyBy(target_unit == TimeUnit::MICRO ? 1000000 : 1000)); - } else if (type.unit() == TimeUnit::MILLI) { - DCHECK_EQ(TimeUnit::MICRO, target_unit); - RETURN_NOT_OK(MultiplyBy(1000)); + } else if (source_unit == TimeUnit::MICRO) { + if (target_unit == TimeUnit::NANO) { + RETURN_NOT_OK(MultiplyBy(1000)); + } else { + DCHECK_EQ(TimeUnit::MILLI, target_unit); + RETURN_NOT_OK(DivideBy(1000)); + } + } else if (source_unit == TimeUnit::MILLI) { + if (target_unit == TimeUnit::MICRO) { + RETURN_NOT_OK(MultiplyBy(1000)); + } else { + DCHECK_EQ(TimeUnit::NANO, target_unit); + RETURN_NOT_OK(MultiplyBy(1000000)); + } } else { - DCHECK_EQ(TimeUnit::MILLI, target_unit); - RETURN_NOT_OK(DivideBy(1000)); + DCHECK_EQ(TimeUnit::SECOND, source_unit); + if (target_unit == TimeUnit::MILLI) { + RETURN_NOT_OK(MultiplyBy(1000)); + } else if (target_unit == TimeUnit::MICRO) { + RETURN_NOT_OK(MultiplyBy(1000000)); + } else { + DCHECK_EQ(TimeUnit::NANO, target_unit); + RETURN_NOT_OK(MultiplyBy(INT64_C(1000000000))); + } } if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0)) { diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 4598bb9ab9ea0..b58dedd825894 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -179,9 +179,6 @@ def test_chunked_table_write(): # ARROW-232 df = alltypes_sample(size=10) - # The nanosecond->ms conversion is a nuisance, so we just avoid it here - del df['datetime'] - batch = pa.RecordBatch.from_pandas(df) table = pa.Table.from_batches([batch] * 3) _check_roundtrip(table, version='2.0') @@ -195,8 +192,6 @@ def test_chunked_table_write(): @pytest.mark.pandas def test_no_memory_map(tempdir): df = alltypes_sample(size=10) - # The nanosecond->us conversion is a nuisance, so we just avoid it here - del df['datetime'] table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'memory_map': False}, @@ -223,8 +218,6 @@ def test_special_chars_filename(tempdir): @pytest.mark.pandas def test_empty_table_roundtrip(): df = alltypes_sample(size=10) - # The nanosecond->us conversion is a nuisance, so we just avoid it here - del df['datetime'] # Create a non-empty table to infer the types correctly, then slice to 0 table = pa.Table.from_pandas(df) @@ -896,7 +889,7 @@ def test_column_of_lists(tempdir): @pytest.mark.pandas -def test_date_time_types(): +def test_date_time_types(tempdir): t1 = pa.date32() data1 = np.array([17259, 17260, 17261], dtype='int32') a1 = pa.array(data1, type=t1) @@ -929,12 +922,6 @@ def test_date_time_types(): dtype='int64') a7 = pa.array(data7, type=t7) - t7_us = pa.timestamp('us') - start = pd.Timestamp('2001-01-01').value - data7_us = np.array([start, start + 1000, start + 2000], - dtype='int64') // 1000 - a7_us = pa.array(data7_us, type=t7_us) - table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', @@ -943,8 +930,7 @@ def test_date_time_types(): # date64 as date32 # time32[s] to time32[ms] - # 'timestamp[ns]' to 'timestamp[us]' - expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], + expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', 'time32_from64[s]', @@ -952,35 +938,62 @@ def test_date_time_types(): _check_roundtrip(table, expected=expected, version='2.0') - # date64 as date32 - # time32[s] to time32[ms] - # 'timestamp[ms]' is saved as INT96 timestamp - # 'timestamp[ns]' is saved as INT96 timestamp - expected = pa.Table.from_arrays([a1, a1, a7, a4, a5, ex_a6, a7], - ['date32', 'date64', 'timestamp[us]', - 'time32[s]', 'time64[us]', - 'time32_from64[s]', - 'timestamp[ns]']) - - _check_roundtrip(table, expected=expected, version='2.0', - use_deprecated_int96_timestamps=True) - - # Check that setting flavor to 'spark' uses int96 timestamps - _check_roundtrip(table, expected=expected, version='2.0', - flavor='spark') - - # Unsupported stuff - def _assert_unsupported(array): - table = pa.Table.from_arrays([array], ['unsupported']) - buf = io.BytesIO() + t0 = pa.timestamp('ms') + data0 = np.arange(4, dtype='int64') + a0 = pa.array(data0, type=t0) - with pytest.raises(NotImplementedError): - _write_table(table, buf, version="2.0") + t1 = pa.timestamp('us') + data1 = np.arange(4, dtype='int64') + a1 = pa.array(data1, type=t1) - t7 = pa.time64('ns') - a7 = pa.array(data4.astype('int64'), type=t7) + t2 = pa.timestamp('ns') + data2 = np.arange(4, dtype='int64') + a2 = pa.array(data2, type=t2) - _assert_unsupported(a7) + table = pa.Table.from_arrays([a0, a1, a2], + ['ts[ms]', 'ts[us]', 'ts[ns]']) + expected = pa.Table.from_arrays([a0, a1, a2], + ['ts[ms]', 'ts[us]', 'ts[ns]']) + + # int64 for all timestamps supported by default + filename = tempdir / 'int64_timestamps.parquet' + _write_table(table, filename, version='2.0') + parquet_schema = pq.ParquetFile(filename).schema + for i in range(3): + assert parquet_schema.column(i).physical_type == 'INT64' + read_table = _read_table(filename) + assert read_table.equals(expected) + + t0_ns = pa.timestamp('ns') + data0_ns = np.array(data0 * 1000000, dtype='int64') + a0_ns = pa.array(data0_ns, type=t0_ns) + + t1_ns = pa.timestamp('ns') + data1_ns = np.array(data1 * 1000, dtype='int64') + a1_ns = pa.array(data1_ns, type=t1_ns) + + expected = pa.Table.from_arrays([a0_ns, a1_ns, a2], + ['ts[ms]', 'ts[us]', 'ts[ns]']) + + # int96 nanosecond timestamps produced upon request + filename = tempdir / 'explicit_int96_timestamps.parquet' + _write_table(table, filename, version='2.0', + use_deprecated_int96_timestamps=True) + parquet_schema = pq.ParquetFile(filename).schema + for i in range(3): + assert parquet_schema.column(i).physical_type == 'INT96' + read_table = _read_table(filename) + assert read_table.equals(expected) + + # int96 nanosecond timestamps implied by flavor 'spark' + filename = tempdir / 'spark_int96_timestamps.parquet' + _write_table(table, filename, version='2.0', + flavor='spark') + parquet_schema = pq.ParquetFile(filename).schema + for i in range(3): + assert parquet_schema.column(i).physical_type == 'INT96' + read_table = _read_table(filename) + assert read_table.equals(expected) @pytest.mark.pandas @@ -2028,13 +2041,39 @@ def test_write_error_deletes_incomplete_file(tempdir): filename = tempdir / 'tmp_file' try: - _write_table(pdf, filename) + _write_table(pdf, filename, coerce_timestamps='ms') except pa.ArrowException: pass assert not filename.exists() +@pytest.mark.pandas +def test_noncoerced_nanoseconds_written_without_exception(tempdir): + # ARROW-1957 + n = 9 + df = pd.DataFrame({'x': range(n)}, + index=pd.DatetimeIndex(start='2017-01-01', + freq='1n', + periods=n)) + tb = pa.Table.from_pandas(df) + + filename = tempdir / 'written.parquet' + try: + pq.write_table(tb, filename) + except Exception: + pass + assert filename.exists() + + recovered_table = _simple_table_roundtrip(tb) + assert tb.equals(recovered_table) + + # Loss of data thru coercion (without explicit override) still an error + filename = tempdir / 'not_written.parquet' + with pytest.raises(ValueError): + pq.write_table(tb, filename, coerce_timestamps='ms') + + def test_read_non_existent_file(tempdir): path = 'non-existent-file.parquet' try: From 0336eee5f0a78abfbe6bca0b2618e43c9be4902b Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Fri, 7 Jun 2019 22:26:31 +0000 Subject: [PATCH 2/9] Preserve Arrow timestamp timezones using Parquet file metadata --- cpp/src/parquet/arrow/arrow-schema-test.cc | 280 +++++++++++++++- cpp/src/parquet/arrow/schema.cc | 359 +++++++++++++++++---- cpp/src/parquet/arrow/schema.h | 26 +- cpp/src/parquet/arrow/writer.cc | 13 +- 4 files changed, 596 insertions(+), 82 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index 6424dd3f5def6..878832686d26e 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -89,7 +89,16 @@ class TestConvertParquetSchema : public ::testing::Test { const std::shared_ptr& key_value_metadata) { NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); descr_.Init(schema); - return FromParquetSchema(&descr_, {}, key_value_metadata, &result_schema_); + return FromParquetSchema(&descr_, key_value_metadata, &result_schema_); + } + + ::arrow::Status ConvertSchema( + const std::vector& nodes, const std::vector& column_indices, + const std::shared_ptr& key_value_metadata) { + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + descr_.Init(schema); + return FromParquetSchema(&descr_, column_indices, key_value_metadata, + &result_schema_); } protected: @@ -311,7 +320,7 @@ TEST_F(TestConvertParquetSchema, ParquetKeyValueMetadata) { auto key_value_metadata = std::make_shared(); key_value_metadata->Append("foo", "bar"); key_value_metadata->Append("biz", "baz"); - ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, {}, key_value_metadata)); auto arrow_metadata = result_schema_->metadata(); ASSERT_EQ("foo", arrow_metadata->key(0)); @@ -329,12 +338,178 @@ TEST_F(TestConvertParquetSchema, ParquetEmptyKeyValueMetadata) { arrow_fields.push_back(std::make_shared("int32", INT32, false)); std::shared_ptr key_value_metadata = nullptr; - ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, {}, key_value_metadata)); auto arrow_metadata = result_schema_->metadata(); ASSERT_EQ(arrow_metadata, nullptr); } +TEST_F(TestConvertParquetSchema, ParquetSimpleTimezoneNodes) { + std::vector parquet_fields; + std::vector> arrow_fields; + auto parquet_file_metadata = std::make_shared(); + + parquet_fields.push_back(PrimitiveNode::Make( + "ts", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + arrow_fields.push_back( + std::make_shared("ts", ::arrow::timestamp(TimeUnit::MILLI), false)); + + parquet_fields.push_back(PrimitiveNode::Make( + "ts:UTC", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + arrow_fields.push_back(std::make_shared( + "ts:UTC", ::arrow::timestamp(TimeUnit::MILLI, "UTC"), false)); + + parquet_fields.push_back(PrimitiveNode::Make( + "ts:CET", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + parquet_file_metadata->Append("org.apache.arrow.field[ts:CET].timestamp.timezone", + "CET"); + arrow_fields.push_back(std::make_shared( + "ts:CET", ::arrow::timestamp(TimeUnit::MILLI, "CET"), false)); + + parquet_file_metadata->Append("application.key", "application.value"); + + ASSERT_OK(ConvertSchema(parquet_fields, parquet_file_metadata)); + ASSERT_NO_FATAL_FAILURE( + CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); + // Confirm arrow timezone metadata removed, application metadata retained + auto arrow_metadata = result_schema_->metadata(); + ASSERT_EQ(1, arrow_metadata->size()); + ASSERT_EQ("application.key", arrow_metadata->key(0)); + ASSERT_EQ("application.value", arrow_metadata->value(0)); +} + +static void MakeNestedTimezoneSchemas(std::vector>& arrow_fields, + std::vector& parquet_fields, + std::shared_ptr& parquet_metadata, + bool unpacked_dictionaries = false) { + { + auto arrow_element = std::make_shared( + "timestamp:CET", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), false); + auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); + arrow_fields.push_back(std::make_shared("some_list", arrow_list, false)); + + auto parquet_element = PrimitiveNode::Make( + "timestamp:CET", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64); + auto parquet_list = GroupNode::Make("list", Repetition::REPEATED, {parquet_element}); + parquet_fields.push_back(GroupNode::Make("some_list", Repetition::REQUIRED, + {parquet_list}, LogicalType::LIST)); + parquet_metadata->Append( + "org.apache.arrow.field[some_list.list.timestamp:CET].timestamp.timezone", "CET"); + } + + { + auto arrow_element_1 = std::make_shared( + "timestamp", ::arrow::timestamp(::arrow::TimeUnit::MILLI), false); + auto arrow_element_2 = std::make_shared( + "timestamp:UTC", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), false); + auto arrow_element_3 = std::make_shared( + "timestamp:CET", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), false); + auto arrow_elements = {arrow_element_1, arrow_element_2, arrow_element_3}; + auto arrow_struct = std::make_shared<::arrow::StructType>(arrow_elements); + auto arrow_group = std::make_shared("some_struct", arrow_struct, false); + auto arrow_field = std::make_shared( + "timestamp:NZST", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "NZST"), false); + arrow_fields.push_back(arrow_group); + arrow_fields.push_back(arrow_field); + + auto parquet_element_1 = PrimitiveNode::Make( + "timestamp", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64); + auto parquet_element_2 = PrimitiveNode::Make( + "timestamp:UTC", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64); + auto parquet_element_3 = PrimitiveNode::Make( + "timestamp:CET", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64); + auto parquet_group = + GroupNode::Make("some_struct", Repetition::REQUIRED, + {parquet_element_1, parquet_element_2, parquet_element_3}); + auto parquet_field = PrimitiveNode::Make( + "timestamp:NZST", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64); + parquet_fields.push_back(parquet_group); + parquet_fields.push_back(parquet_field); + parquet_metadata->Append( + "org.apache.arrow.field[some_struct.timestamp:CET].timestamp.timezone", "CET"); + parquet_metadata->Append("org.apache.arrow.field[timestamp:NZST].timestamp.timezone", + "NZST"); + } + + { + if (unpacked_dictionaries) { + // The Parquet nodes immediately below are converted to these non-dictionary Arrow + // fields + arrow_fields.push_back(std::make_shared( + "dictionary_timestamp", ::arrow::timestamp(::arrow::TimeUnit::MILLI), false)); + arrow_fields.push_back(std::make_shared( + "dictionary_timestamp:UTC", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), + false)); + arrow_fields.push_back(std::make_shared( + "dictionary_timestamp:CET", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), + false)); + } else { + // When converted, these Arrow dictionaries are unpacked and stored as the Parquet + // nodes immediately below + arrow_fields.push_back(std::make_shared( + "dictionary_timestamp", + ::arrow::dictionary(::arrow::int16(), ::arrow::timestamp(TimeUnit::MILLI)), + false)); + arrow_fields.push_back(std::make_shared( + "dictionary_timestamp:UTC", + ::arrow::dictionary(::arrow::int16(), + ::arrow::timestamp(TimeUnit::MILLI, "UTC")), + false)); + arrow_fields.push_back(std::make_shared( + "dictionary_timestamp:CET", + ::arrow::dictionary(::arrow::int16(), + ::arrow::timestamp(TimeUnit::MILLI, "CET")), + false)); + } + + parquet_fields.push_back(PrimitiveNode::Make( + "dictionary_timestamp", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + parquet_fields.push_back(PrimitiveNode::Make( + "dictionary_timestamp:UTC", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + parquet_fields.push_back(PrimitiveNode::Make( + "dictionary_timestamp:CET", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + parquet_metadata->Append( + "org.apache.arrow.field[dictionary_timestamp:CET].timestamp.timezone", "CET"); + } + + return; +} + +TEST_F(TestConvertParquetSchema, ParquetNestedTimezoneNodes) { + std::vector parquet_fields; + std::shared_ptr parquet_metadata = + std::make_shared(); + std::vector> arrow_fields; + + MakeNestedTimezoneSchemas(arrow_fields, parquet_fields, parquet_metadata, true); + + ASSERT_OK(ConvertSchema(parquet_fields, parquet_metadata)); + ASSERT_NO_FATAL_FAILURE( + CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); +} + TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { std::vector parquet_fields; std::vector> arrow_fields; @@ -733,7 +908,10 @@ class TestConvertArrowSchema : public ::testing::Test { public: virtual void SetUp() {} - void CheckFlatSchema(const std::vector& nodes) { + void CheckFlatSchema( + const std::vector& nodes, + const std::shared_ptr& expected_metadata = nullptr, + bool expect_strict_metadata_equality = true) { NodePtr schema_node = GroupNode::Make("schema", Repetition::REPEATED, nodes); const GroupNode* expected_schema_node = static_cast(schema_node.get()); @@ -746,18 +924,35 @@ class TestConvertArrowSchema : public ::testing::Test { auto rhs = expected_schema_node->field(i); EXPECT_TRUE(lhs->Equals(rhs.get())); } + + if (expected_metadata) { + if (expect_strict_metadata_equality) { + ASSERT_TRUE(result_metadata_->Equals(*expected_metadata)); + } else { + // check for expected keys and values, indifferent to order in container + ASSERT_EQ(result_metadata_->size(), expected_metadata->size()); + for (int i = 0; i < expected_metadata->size(); ++i) { + int index = result_metadata_->FindKey(expected_metadata->key(i)); + ASSERT_NE(index, -1) << "key '" << expected_metadata->key(i) + << "' unexpectedly missing from result metadata"; + ASSERT_EQ(result_metadata_->value(index), expected_metadata->value(i)); + } + } + } } ::arrow::Status ConvertSchema(const std::vector>& fields) { arrow_schema_ = std::make_shared<::arrow::Schema>(fields); std::shared_ptr<::parquet::WriterProperties> properties = ::parquet::default_writer_properties(); - return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_schema_); + return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_metadata_, + &result_schema_); } protected: std::shared_ptr<::arrow::Schema> arrow_schema_; std::shared_ptr result_schema_; + std::shared_ptr result_metadata_; }; TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { @@ -924,6 +1119,72 @@ TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { } } +TEST_F(TestConvertArrowSchema, ArrowSimpleTimezoneFields) { + struct TimezoneFieldConstructionArguments { + std::string name; + std::shared_ptr<::arrow::DataType> datatype; + bool is_adjusted_to_utc; + bool inserts_metadata; + std::pair key_value_metadata; + }; + + std::vector cases = { + {"timestamp", ::arrow::timestamp(::arrow::TimeUnit::MILLI), false, false, {"", ""}}, + {"timestamp:UTC", + ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), + true, + false, + {"", ""}}, + {"timestamp:utc", + ::arrow::timestamp(::arrow::TimeUnit::MILLI, "utc"), + true, + false, + {"", ""}}, + {"timestamp:CET", + ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), + false, + true, + {"org.apache.arrow.field[timestamp:CET].timestamp.timezone", "CET"}}, + {"timestamp:NZST", + ::arrow::timestamp(::arrow::TimeUnit::MILLI, "NZST"), + false, + true, + {"org.apache.arrow.field[timestamp:NZST].timestamp.timezone", "NZST"}}, + }; + + std::vector> arrow_fields; + std::vector parquet_fields; + auto parquet_file_metadata = std::make_shared(); + + for (const TimezoneFieldConstructionArguments& c : cases) { + arrow_fields.push_back(std::make_shared(c.name, c.datatype, false)); + parquet_fields.push_back(PrimitiveNode::Make( + c.name, Repetition::REQUIRED, + LogicalAnnotation::Timestamp(c.is_adjusted_to_utc, + LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + if (c.inserts_metadata) { + parquet_file_metadata->Append(c.key_value_metadata.first, + c.key_value_metadata.second); + } + } + + ASSERT_OK(ConvertSchema(arrow_fields)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields, parquet_file_metadata, false)); +} + +TEST_F(TestConvertArrowSchema, ArrowNestedTimezoneFields) { + std::vector> arrow_fields; + std::vector parquet_fields; + std::shared_ptr parquet_metadata = + std::make_shared(); + + MakeNestedTimezoneSchemas(arrow_fields, parquet_fields, parquet_metadata); + + ASSERT_OK(ConvertSchema(arrow_fields)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields, parquet_metadata, false)); +} + TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) { std::vector parquet_fields; std::vector> arrow_fields; @@ -949,6 +1210,15 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) { arrow_fields.push_back(std::make_shared( "date64", ::arrow::dictionary(::arrow::int8(), ::arrow::date64()), false)); + parquet_fields.push_back(PrimitiveNode::Make( + "timestamp", Repetition::REQUIRED, + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), + ParquetType::INT64)); + arrow_fields.push_back(std::make_shared( + "timestamp", + ::arrow::dictionary(::arrow::int8(), ::arrow::timestamp(TimeUnit::MILLI, "UTC")), + false)); + parquet_fields.push_back( PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); arrow_fields.push_back(std::make_shared( diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c7f71665ffda0..1e1c7580de879 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -18,6 +18,8 @@ #include "parquet/arrow/schema.h" #include +#include +#include #include #include #include @@ -33,6 +35,7 @@ #include "parquet/exception.h" #include "parquet/properties.h" #include "parquet/schema-internal.h" +#include "parquet/schema.h" #include "parquet/types.h" using arrow::Field; @@ -43,6 +46,7 @@ using ArrowType = arrow::DataType; using ArrowTypeId = arrow::Type; using parquet::Repetition; +using parquet::schema::ColumnPath; using parquet::schema::GroupNode; using parquet::schema::Node; using parquet::schema::NodePtr; @@ -56,6 +60,9 @@ namespace parquet { namespace arrow { +static const char* key_prefix = "org.apache.arrow.field["; +static const char* key_suffix = "].timestamp.timezone"; + const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(::arrow::TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO); @@ -132,31 +139,46 @@ static Status MakeArrowTime64(const std::shared_ptr& an return Status::OK(); } +static int FetchMetadataTimezoneIndex( + const std::string& path, const std::shared_ptr& metadata) { + int index = -1; + if (metadata) { + const std::string key(key_prefix + path + key_suffix); + index = metadata->FindKey(key); + } + return index; +} + static Status MakeArrowTimestamp( - const std::shared_ptr& annotation, + const std::shared_ptr& annotation, const std::string& path, + const std::shared_ptr& metadata, std::shared_ptr* out) { - static constexpr auto utc = "UTC"; const auto& timestamp = checked_cast(*annotation); + ::arrow::TimeUnit::type time_unit; + switch (timestamp.time_unit()) { case LogicalAnnotation::TimeUnit::MILLIS: - *out = (timestamp.is_adjusted_to_utc() - ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc) - : ::arrow::timestamp(::arrow::TimeUnit::MILLI)); + time_unit = ::arrow::TimeUnit::MILLI; break; case LogicalAnnotation::TimeUnit::MICROS: - *out = (timestamp.is_adjusted_to_utc() - ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc) - : ::arrow::timestamp(::arrow::TimeUnit::MICRO)); + time_unit = ::arrow::TimeUnit::MICRO; break; case LogicalAnnotation::TimeUnit::NANOS: - *out = (timestamp.is_adjusted_to_utc() - ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc) - : ::arrow::timestamp(::arrow::TimeUnit::NANO)); + time_unit = ::arrow::TimeUnit::NANO; break; default: return Status::TypeError("Unrecognized time unit in timestamp annotation: ", annotation->ToString()); } + + // Attempt to recover optional timezone for this field from file metadata + int index = FetchMetadataTimezoneIndex(path, metadata); + + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(time_unit, "UTC") + : (index == -1 ? ::arrow::timestamp(time_unit) + : ::arrow::timestamp(time_unit, metadata->value(index)))); + return Status::OK(); } @@ -228,6 +250,8 @@ static Status FromInt32(const std::shared_ptr& annotati } static Status FromInt64(const std::shared_ptr& annotation, + const std::string& path, + const std::shared_ptr& metadata, std::shared_ptr* out) { switch (annotation->type()) { case LogicalAnnotation::Type::INT: @@ -237,7 +261,7 @@ static Status FromInt64(const std::shared_ptr& annotati RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); break; case LogicalAnnotation::Type::TIMESTAMP: - RETURN_NOT_OK(MakeArrowTimestamp(annotation, out)); + RETURN_NOT_OK(MakeArrowTimestamp(annotation, path, metadata, out)); break; case LogicalAnnotation::Type::TIME: RETURN_NOT_OK(MakeArrowTime64(annotation, out)); @@ -252,7 +276,9 @@ static Status FromInt64(const std::shared_ptr& annotati return Status::OK(); } -Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* out) { +Status FromPrimitive(const PrimitiveNode& primitive, + const std::shared_ptr& metadata, + std::shared_ptr* out) { const std::shared_ptr& annotation = primitive.logical_annotation(); if (annotation->is_invalid() || annotation->is_null()) { @@ -268,7 +294,8 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* RETURN_NOT_OK(FromInt32(annotation, out)); break; case ParquetType::INT64: - RETURN_NOT_OK(FromInt64(annotation, out)); + RETURN_NOT_OK(FromInt64(annotation, ColumnPath::FromNode(primitive)->ToDotString(), + metadata, out)); break; case ParquetType::INT96: *out = TIMESTAMP_NS; @@ -297,6 +324,7 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* // Forward declaration Status NodeToFieldInternal(const Node& node, const std::unordered_set* included_leaf_nodes, + const std::shared_ptr& metadata, std::shared_ptr* out); /* @@ -314,6 +342,7 @@ inline bool IsIncludedLeaf(const Node& node, Status StructFromGroup(const GroupNode& group, const std::unordered_set* included_leaf_nodes, + const std::shared_ptr& metadata, std::shared_ptr* out) { std::vector> fields; std::shared_ptr field; @@ -321,7 +350,8 @@ Status StructFromGroup(const GroupNode& group, *out = nullptr; for (int i = 0; i < group.field_count(); i++) { - RETURN_NOT_OK(NodeToFieldInternal(*group.field(i), included_leaf_nodes, &field)); + RETURN_NOT_OK( + NodeToFieldInternal(*group.field(i), included_leaf_nodes, metadata, &field)); if (field != nullptr) { fields.push_back(field); } @@ -334,6 +364,7 @@ Status StructFromGroup(const GroupNode& group, Status NodeToList(const GroupNode& group, const std::unordered_set* included_leaf_nodes, + const std::shared_ptr& metadata, std::shared_ptr* out) { *out = nullptr; if (group.field_count() == 1) { @@ -347,8 +378,8 @@ Status NodeToList(const GroupNode& group, if (list_group.field_count() == 1 && !schema::HasStructListName(list_group)) { // List of primitive type std::shared_ptr item_field; - RETURN_NOT_OK( - NodeToFieldInternal(*list_group.field(0), included_leaf_nodes, &item_field)); + RETURN_NOT_OK(NodeToFieldInternal(*list_group.field(0), included_leaf_nodes, + metadata, &item_field)); if (item_field != nullptr) { *out = ::arrow::list(item_field); @@ -356,7 +387,8 @@ Status NodeToList(const GroupNode& group, } else { // List of struct std::shared_ptr inner_type; - RETURN_NOT_OK(StructFromGroup(list_group, included_leaf_nodes, &inner_type)); + RETURN_NOT_OK( + StructFromGroup(list_group, included_leaf_nodes, metadata, &inner_type)); if (inner_type != nullptr) { auto item_field = std::make_shared(list_node.name(), inner_type, false); *out = ::arrow::list(item_field); @@ -366,8 +398,8 @@ Status NodeToList(const GroupNode& group, // repeated primitive node std::shared_ptr inner_type; if (IsIncludedLeaf(static_cast(list_node), included_leaf_nodes)) { - RETURN_NOT_OK( - FromPrimitive(static_cast(list_node), &inner_type)); + RETURN_NOT_OK(FromPrimitive(static_cast(list_node), + metadata, &inner_type)); auto item_field = std::make_shared(list_node.name(), inner_type, false); *out = ::arrow::list(item_field); } @@ -383,11 +415,12 @@ Status NodeToList(const GroupNode& group, } Status NodeToField(const Node& node, std::shared_ptr* out) { - return NodeToFieldInternal(node, nullptr, out); + return NodeToFieldInternal(node, nullptr, nullptr, out); } Status NodeToFieldInternal(const Node& node, const std::unordered_set* included_leaf_nodes, + const std::shared_ptr& metadata, std::shared_ptr* out) { std::shared_ptr type = nullptr; bool nullable = !node.is_required(); @@ -399,9 +432,10 @@ Status NodeToFieldInternal(const Node& node, std::shared_ptr inner_type; if (node.is_group()) { RETURN_NOT_OK(StructFromGroup(static_cast(node), - included_leaf_nodes, &inner_type)); + included_leaf_nodes, metadata, &inner_type)); } else if (IsIncludedLeaf(node, included_leaf_nodes)) { - RETURN_NOT_OK(FromPrimitive(static_cast(node), &inner_type)); + RETURN_NOT_OK( + FromPrimitive(static_cast(node), metadata, &inner_type)); } if (inner_type != nullptr) { auto item_field = std::make_shared(node.name(), inner_type, false); @@ -411,14 +445,15 @@ Status NodeToFieldInternal(const Node& node, } else if (node.is_group()) { const auto& group = static_cast(node); if (node.logical_annotation()->is_list()) { - RETURN_NOT_OK(NodeToList(group, included_leaf_nodes, &type)); + RETURN_NOT_OK(NodeToList(group, included_leaf_nodes, metadata, &type)); } else { - RETURN_NOT_OK(StructFromGroup(group, included_leaf_nodes, &type)); + RETURN_NOT_OK(StructFromGroup(group, included_leaf_nodes, metadata, &type)); } } else { // Primitive (leaf) node if (IsIncludedLeaf(node, included_leaf_nodes)) { - RETURN_NOT_OK(FromPrimitive(static_cast(node), &type)); + RETURN_NOT_OK( + FromPrimitive(static_cast(node), metadata, &type)); } } if (type != nullptr) { @@ -427,26 +462,56 @@ Status NodeToFieldInternal(const Node& node, return Status::OK(); } -Status FromParquetSchema( - const SchemaDescriptor* parquet_schema, - const std::shared_ptr& key_value_metadata, - std::shared_ptr<::arrow::Schema>* out) { +static std::shared_ptr FilterParquetMetadata( + const std::shared_ptr& metadata) { + // Return a copy of the input metadata stripped of any key-values pairs + // that held Arrow/Parquet storage specific information + std::shared_ptr filtered_metadata = + std::make_shared(); + if (metadata) { + for (int i = 0; i < metadata->size(); ++i) { + const std::string& key = metadata->key(i); + const std::string& value = metadata->value(i); + // Discard keys like "org.apache.arrow.field[*].timestamp.timezone" + size_t key_size = key.size(); + size_t key_prefix_size = strlen(key_prefix); + size_t key_suffix_size = strlen(key_suffix); + bool timezone_storage_key = + (key_size >= (key_prefix_size + key_suffix_size)) && + (key.compare(0, key_prefix_size, key_prefix) == 0) && + (key.compare(key_size - key_suffix_size, key_suffix_size, key_suffix) == 0); + if (!timezone_storage_key) { + filtered_metadata->Append(key, value); + } + } + } + if (filtered_metadata->size() == 0) { + filtered_metadata.reset(); + } + return filtered_metadata; +} + +Status FromParquetSchema(const SchemaDescriptor* parquet_schema, + const std::shared_ptr& parquet_metadata, + std::shared_ptr<::arrow::Schema>* out) { const GroupNode& schema_node = *parquet_schema->group_node(); int num_fields = static_cast(schema_node.field_count()); std::vector> fields(num_fields); for (int i = 0; i < num_fields; i++) { - RETURN_NOT_OK(NodeToField(*schema_node.field(i), &fields[i])); + RETURN_NOT_OK(NodeToFieldInternal(*schema_node.field(i), nullptr, parquet_metadata, + &fields[i])); } - *out = std::make_shared<::arrow::Schema>(fields, key_value_metadata); + *out = + std::make_shared<::arrow::Schema>(fields, FilterParquetMetadata(parquet_metadata)); return Status::OK(); } -Status FromParquetSchema( - const SchemaDescriptor* parquet_schema, const std::vector& column_indices, - const std::shared_ptr& key_value_metadata, - std::shared_ptr<::arrow::Schema>* out) { +Status FromParquetSchema(const SchemaDescriptor* parquet_schema, + const std::vector& column_indices, + const std::shared_ptr& parquet_metadata, + std::shared_ptr<::arrow::Schema>* out) { // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes // from the root Parquet node @@ -470,13 +535,15 @@ Status FromParquetSchema( std::vector> fields; std::shared_ptr field; for (auto node : base_nodes) { - RETURN_NOT_OK(NodeToFieldInternal(*node, &included_leaf_nodes, &field)); + RETURN_NOT_OK( + NodeToFieldInternal(*node, &included_leaf_nodes, parquet_metadata, &field)); if (field != nullptr) { fields.push_back(field); } } - *out = std::make_shared<::arrow::Schema>(fields, key_value_metadata); + *out = + std::make_shared<::arrow::Schema>(fields, FilterParquetMetadata(parquet_metadata)); return Status::OK(); } @@ -491,13 +558,73 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema, return FromParquetSchema(parquet_schema, nullptr, out); } +class PresumedColumnPath { + // Pre-constructs the eventual column path dot string for a Parquet node + // before it is fully constructed from Arrow fields + public: + PresumedColumnPath() = default; + + void Extend(const std::shared_ptr& field) { + switch (field->type()->id()) { + case ArrowTypeId::DICTIONARY: + path_.push_back({false, ""}); + break; + case ArrowTypeId::LIST: + path_.push_back({true, field->name() + ".list"}); + break; + default: + path_.push_back({true, field->name()}); + break; + } + return; + } + + void Retract() { + DCHECK(!path_.empty()); + path_.pop_back(); + return; + } + + std::string ToDotString() const { + std::stringstream ss; + int i = 0; + for (auto c = path_.cbegin(); c != path_.cend(); ++c) { + if (c->include) { + if (i > 0) { + ss << "."; + } + ss << c->component; + ++i; + } + } + return ss.str(); + } + + private: + struct Component { + bool include; + std::string component; + }; + std::deque path_; +}; + +Status FieldToNodeInternal(const std::shared_ptr& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + PresumedColumnPath& path, + std::unordered_map& metadata, + NodePtr* out); + Status ListToNode(const std::shared_ptr<::arrow::ListType>& type, const std::string& name, bool nullable, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, NodePtr* out) { - Repetition::type repetition = nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; + const ArrowWriterProperties& arrow_properties, PresumedColumnPath& path, + std::unordered_map& metadata, NodePtr* out) { + const Repetition::type repetition = + nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; NodePtr element; - RETURN_NOT_OK(FieldToNode(type->value_field(), properties, arrow_properties, &element)); + RETURN_NOT_OK(FieldToNodeInternal(type->value_field(), properties, arrow_properties, + path, metadata, &element)); NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element}); *out = GroupNode::Make(name, repetition, {list}, LogicalAnnotation::List()); @@ -507,13 +634,17 @@ Status ListToNode(const std::shared_ptr<::arrow::ListType>& type, const std::str Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, const std::string& name, bool nullable, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, NodePtr* out) { - Repetition::type repetition = nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; + const ArrowWriterProperties& arrow_properties, + PresumedColumnPath& path, + std::unordered_map& metadata, + NodePtr* out) { + const Repetition::type repetition = + nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; std::vector children(type->num_children()); for (int i = 0; i < type->num_children(); i++) { - RETURN_NOT_OK( - FieldToNode(type->child(i), properties, arrow_properties, &children[i])); + RETURN_NOT_OK(FieldToNodeInternal(type->child(i), properties, arrow_properties, path, + metadata, &children[i])); } *out = GroupNode::Make(name, repetition, children); @@ -526,9 +657,32 @@ static bool HasUTCTimezone(const std::string& timezone) { [timezone](const std::string& utc) { return timezone == utc; }); } +static void StoreMetadataTimezone(const std::string& path, + std::unordered_map& metadata, + const std::string& timezone) { + const std::string key(key_prefix + path + key_suffix); + if (metadata.find(key) != metadata.end()) { + std::stringstream ms; + ms << "Duplicate field name '" << path + << "' for timestamp with timezone field in Arrow schema."; + throw ParquetException(ms.str()); + } + metadata.insert(std::make_pair(key, timezone)); + return; +} + static std::shared_ptr TimestampAnnotationFromArrowTimestamp( - const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) { - const bool utc = HasUTCTimezone(timestamp_type.timezone()); + const std::string& path, const ::arrow::TimestampType& timestamp_type, + ::arrow::TimeUnit::type time_unit, + std::unordered_map& metadata) { + const std::string& timezone = timestamp_type.timezone(); + const bool utc = HasUTCTimezone(timezone); + + if (!utc && !timezone.empty()) { + // Attempt to preserve timezone for this field as file metadata + StoreMetadataTimezone(path, metadata, timezone); + } + switch (time_unit) { case ::arrow::TimeUnit::MILLI: return LogicalAnnotation::Timestamp(utc, LogicalAnnotation::TimeUnit::MILLIS); @@ -544,7 +698,10 @@ static std::shared_ptr TimestampAnnotationFromArrowTime } static Status GetTimestampMetadata(const ::arrow::TimestampType& type, + const std::string& name, const ArrowWriterProperties& properties, + const std::string& path, + std::unordered_map& metadata, ParquetType::type* physical_type, std::shared_ptr* annotation) { const bool coerce = properties.coerce_timestamps_enabled(); @@ -558,8 +715,8 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, } *physical_type = ParquetType::INT64; - PARQUET_CATCH_NOT_OK(*annotation = - TimestampAnnotationFromArrowTimestamp(type, target_unit)); + PARQUET_CATCH_NOT_OK(*annotation = TimestampAnnotationFromArrowTimestamp( + path, type, target_unit, metadata)); // The user is explicitly asking for timestamp data to be converted to the // specified units (target_unit). @@ -587,18 +744,23 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, return Status::OK(); } -Status FieldToNode(const std::shared_ptr& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, NodePtr* out) { +Status FieldToNodeInternal(const std::shared_ptr& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + PresumedColumnPath& path, + std::unordered_map& metadata, + NodePtr* out) { + const Repetition::type repetition = + field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED; std::shared_ptr annotation = LogicalAnnotation::None(); ParquetType::type type; - Repetition::type repetition = - field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED; int length = -1; int precision = -1; int scale = -1; + path.Extend(field); + switch (field->type()->id()) { case ArrowTypeId::NA: type = ParquetType::INT32; @@ -678,9 +840,9 @@ Status FieldToNode(const std::shared_ptr& field, annotation = LogicalAnnotation::Date(); break; case ArrowTypeId::TIMESTAMP: - RETURN_NOT_OK( - GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()), - arrow_properties, &type, &annotation)); + RETURN_NOT_OK(GetTimestampMetadata( + static_cast<::arrow::TimestampType&>(*field->type()), field->name(), + arrow_properties, path.ToDotString(), metadata, &type, &annotation)); break; case ArrowTypeId::TIME32: type = ParquetType::INT32; @@ -701,12 +863,12 @@ Status FieldToNode(const std::shared_ptr& field, case ArrowTypeId::STRUCT: { auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type()); return StructToNode(struct_type, field->name(), field->nullable(), properties, - arrow_properties, out); + arrow_properties, path, metadata, out); } case ArrowTypeId::LIST: { auto list_type = std::static_pointer_cast<::arrow::ListType>(field->type()); return ListToNode(list_type, field->name(), field->nullable(), properties, - arrow_properties, out); + arrow_properties, path, metadata, out); } case ArrowTypeId::DICTIONARY: { // Parquet has no Dictionary type, dictionary-encoded is handled on @@ -715,7 +877,8 @@ Status FieldToNode(const std::shared_ptr& field, static_cast(*field->type()); std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field( field->name(), dict_type.value_type(), field->nullable(), field->metadata()); - return FieldToNode(unpacked_field, properties, arrow_properties, out); + return FieldToNodeInternal(unpacked_field, properties, arrow_properties, path, + metadata, out); } default: { // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR @@ -725,34 +888,92 @@ Status FieldToNode(const std::shared_ptr& field, } } + path.Retract(); + PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(field->name(), repetition, annotation, type, length)); return Status::OK(); } +Status FieldToNode(const std::shared_ptr& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + std::unordered_map& parquet_metadata_map, + NodePtr* out) { + PresumedColumnPath parquet_column_path; + return FieldToNodeInternal(field, properties, arrow_properties, parquet_column_path, + parquet_metadata_map, out); +} + +Status FieldToNode(const std::shared_ptr& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, NodePtr* out) { + PresumedColumnPath parquet_column_path; + std::unordered_map dummy_metadata_map; + return FieldToNodeInternal(field, properties, arrow_properties, parquet_column_path, + dummy_metadata_map, out); +} + Status ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, - std::shared_ptr* out) { - std::vector nodes(arrow_schema->num_fields()); + std::shared_ptr* parquet_metadata_out, + std::shared_ptr* parquet_schema_out) { + std::unordered_map parquet_metadata_map; + std::vector parquet_nodes(arrow_schema->num_fields()); + + // TODO(tpboudreau): consider making construction of metadata map in + // FieldToNodeInternal() conditional on whether caller asked for metadata + // (parquet_metadata_out != NULL); metadata construction (even if + // unnecessary) can cause FieldToNodeInternal() to fail for (int i = 0; i < arrow_schema->num_fields(); i++) { - RETURN_NOT_OK( - FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i])); + PresumedColumnPath parquet_column_path; + RETURN_NOT_OK(FieldToNodeInternal(arrow_schema->field(i), properties, + arrow_properties, parquet_column_path, + parquet_metadata_map, &parquet_nodes[i])); } - NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes); - *out = std::make_shared<::parquet::SchemaDescriptor>(); - PARQUET_CATCH_NOT_OK((*out)->Init(schema)); + if (parquet_metadata_out) { + if (arrow_schema->HasMetadata()) { + // merge Arrow application metadata with Arrow/Parquet storage specific metadata + auto arrow_metadata = arrow_schema->metadata(); + std::unordered_map arrow_metadata_map; + arrow_metadata->ToUnorderedMap(&arrow_metadata_map); + parquet_metadata_map.insert(arrow_metadata_map.cbegin(), arrow_metadata_map.cend()); + } + *parquet_metadata_out = + std::make_shared(parquet_metadata_map); + } + + NodePtr parquet_schema = GroupNode::Make("schema", Repetition::REQUIRED, parquet_nodes); + *parquet_schema_out = std::make_shared<::parquet::SchemaDescriptor>(); + PARQUET_CATCH_NOT_OK((*parquet_schema_out)->Init(parquet_schema)); return Status::OK(); } Status ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - std::shared_ptr* out) { + std::shared_ptr* parquet_metadata_out, + std::shared_ptr* parquet_schema_out) { + return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(), + parquet_metadata_out, parquet_schema_out); +} + +Status ToParquetSchema(const ::arrow::Schema* arrow_schema, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + std::shared_ptr* parquet_schema_out) { + return ToParquetSchema(arrow_schema, properties, arrow_properties, nullptr, + parquet_schema_out); +} + +Status ToParquetSchema(const ::arrow::Schema* arrow_schema, + const WriterProperties& properties, + std::shared_ptr* parquet_schema_out) { return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(), - out); + nullptr, parquet_schema_out); } /// \brief Compute the number of bytes required to represent a decimal of a diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h index 52fb843e6c6c2..e3047d634c4d9 100644 --- a/cpp/src/parquet/arrow/schema.h +++ b/cpp/src/parquet/arrow/schema.h @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include "parquet/metadata.h" @@ -76,6 +78,11 @@ ::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet ::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out); +::arrow::Status PARQUET_EXPORT FieldToNode( + const std::shared_ptr<::arrow::Field>& field, const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + std::unordered_map& metadata_map, schema::NodePtr* out); + ::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, @@ -84,11 +91,22 @@ ::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field> ::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, - std::shared_ptr* out); + std::shared_ptr* parquet_metadata_out, + std::shared_ptr* parquet_schema_out); + +::arrow::Status PARQUET_EXPORT +ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, + std::shared_ptr* parquet_metadata_out, + std::shared_ptr* parquet_schema_out); + +::arrow::Status PARQUET_EXPORT +ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + std::shared_ptr* parquet_schema_out); -::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, - const WriterProperties& properties, - std::shared_ptr* out); +::arrow::Status PARQUET_EXPORT +ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, + std::shared_ptr* parquet_schema_out); PARQUET_EXPORT int32_t DecimalSize(int32_t precision); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index e31c947519079..7ffeb4375107a 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -1128,16 +1128,21 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool const std::shared_ptr& properties, const std::shared_ptr& arrow_properties, std::unique_ptr* writer) { + std::shared_ptr parquet_metadata; std::shared_ptr parquet_schema; - RETURN_NOT_OK( - ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema)); + RETURN_NOT_OK(ToParquetSchema(&schema, *properties, *arrow_properties, + &parquet_metadata, &parquet_schema)); + + if (parquet_metadata->size() == 0) { + parquet_metadata.reset(); + } auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); std::unique_ptr base_writer = - ParquetFileWriter::Open(sink, schema_node, properties, schema.metadata()); + ParquetFileWriter::Open(sink, schema_node, properties, parquet_metadata); - auto schema_ptr = std::make_shared<::arrow::Schema>(schema); + auto schema_ptr = schema.AddMetadata(parquet_metadata); writer->reset( new FileWriter(pool, std::move(base_writer), schema_ptr, arrow_properties)); return Status::OK(); From c5de4209204b0a5193f0289f2375ddaec3829527 Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Mon, 10 Jun 2019 15:31:47 +0000 Subject: [PATCH 3/9] Revert "Preserve Arrow timestamp timezones using Parquet file metadata" This reverts commit f98a1086c802297b2b70612d6d3f2327618200d3. --- cpp/src/parquet/arrow/arrow-schema-test.cc | 280 +--------------- cpp/src/parquet/arrow/schema.cc | 359 ++++----------------- cpp/src/parquet/arrow/schema.h | 26 +- cpp/src/parquet/arrow/writer.cc | 13 +- 4 files changed, 82 insertions(+), 596 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index 878832686d26e..6424dd3f5def6 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -89,16 +89,7 @@ class TestConvertParquetSchema : public ::testing::Test { const std::shared_ptr& key_value_metadata) { NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); descr_.Init(schema); - return FromParquetSchema(&descr_, key_value_metadata, &result_schema_); - } - - ::arrow::Status ConvertSchema( - const std::vector& nodes, const std::vector& column_indices, - const std::shared_ptr& key_value_metadata) { - NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); - descr_.Init(schema); - return FromParquetSchema(&descr_, column_indices, key_value_metadata, - &result_schema_); + return FromParquetSchema(&descr_, {}, key_value_metadata, &result_schema_); } protected: @@ -320,7 +311,7 @@ TEST_F(TestConvertParquetSchema, ParquetKeyValueMetadata) { auto key_value_metadata = std::make_shared(); key_value_metadata->Append("foo", "bar"); key_value_metadata->Append("biz", "baz"); - ASSERT_OK(ConvertSchema(parquet_fields, {}, key_value_metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata)); auto arrow_metadata = result_schema_->metadata(); ASSERT_EQ("foo", arrow_metadata->key(0)); @@ -338,178 +329,12 @@ TEST_F(TestConvertParquetSchema, ParquetEmptyKeyValueMetadata) { arrow_fields.push_back(std::make_shared("int32", INT32, false)); std::shared_ptr key_value_metadata = nullptr; - ASSERT_OK(ConvertSchema(parquet_fields, {}, key_value_metadata)); + ASSERT_OK(ConvertSchema(parquet_fields, key_value_metadata)); auto arrow_metadata = result_schema_->metadata(); ASSERT_EQ(arrow_metadata, nullptr); } -TEST_F(TestConvertParquetSchema, ParquetSimpleTimezoneNodes) { - std::vector parquet_fields; - std::vector> arrow_fields; - auto parquet_file_metadata = std::make_shared(); - - parquet_fields.push_back(PrimitiveNode::Make( - "ts", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - arrow_fields.push_back( - std::make_shared("ts", ::arrow::timestamp(TimeUnit::MILLI), false)); - - parquet_fields.push_back(PrimitiveNode::Make( - "ts:UTC", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - arrow_fields.push_back(std::make_shared( - "ts:UTC", ::arrow::timestamp(TimeUnit::MILLI, "UTC"), false)); - - parquet_fields.push_back(PrimitiveNode::Make( - "ts:CET", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - parquet_file_metadata->Append("org.apache.arrow.field[ts:CET].timestamp.timezone", - "CET"); - arrow_fields.push_back(std::make_shared( - "ts:CET", ::arrow::timestamp(TimeUnit::MILLI, "CET"), false)); - - parquet_file_metadata->Append("application.key", "application.value"); - - ASSERT_OK(ConvertSchema(parquet_fields, parquet_file_metadata)); - ASSERT_NO_FATAL_FAILURE( - CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); - // Confirm arrow timezone metadata removed, application metadata retained - auto arrow_metadata = result_schema_->metadata(); - ASSERT_EQ(1, arrow_metadata->size()); - ASSERT_EQ("application.key", arrow_metadata->key(0)); - ASSERT_EQ("application.value", arrow_metadata->value(0)); -} - -static void MakeNestedTimezoneSchemas(std::vector>& arrow_fields, - std::vector& parquet_fields, - std::shared_ptr& parquet_metadata, - bool unpacked_dictionaries = false) { - { - auto arrow_element = std::make_shared( - "timestamp:CET", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), false); - auto arrow_list = std::make_shared<::arrow::ListType>(arrow_element); - arrow_fields.push_back(std::make_shared("some_list", arrow_list, false)); - - auto parquet_element = PrimitiveNode::Make( - "timestamp:CET", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64); - auto parquet_list = GroupNode::Make("list", Repetition::REPEATED, {parquet_element}); - parquet_fields.push_back(GroupNode::Make("some_list", Repetition::REQUIRED, - {parquet_list}, LogicalType::LIST)); - parquet_metadata->Append( - "org.apache.arrow.field[some_list.list.timestamp:CET].timestamp.timezone", "CET"); - } - - { - auto arrow_element_1 = std::make_shared( - "timestamp", ::arrow::timestamp(::arrow::TimeUnit::MILLI), false); - auto arrow_element_2 = std::make_shared( - "timestamp:UTC", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), false); - auto arrow_element_3 = std::make_shared( - "timestamp:CET", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), false); - auto arrow_elements = {arrow_element_1, arrow_element_2, arrow_element_3}; - auto arrow_struct = std::make_shared<::arrow::StructType>(arrow_elements); - auto arrow_group = std::make_shared("some_struct", arrow_struct, false); - auto arrow_field = std::make_shared( - "timestamp:NZST", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "NZST"), false); - arrow_fields.push_back(arrow_group); - arrow_fields.push_back(arrow_field); - - auto parquet_element_1 = PrimitiveNode::Make( - "timestamp", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64); - auto parquet_element_2 = PrimitiveNode::Make( - "timestamp:UTC", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64); - auto parquet_element_3 = PrimitiveNode::Make( - "timestamp:CET", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64); - auto parquet_group = - GroupNode::Make("some_struct", Repetition::REQUIRED, - {parquet_element_1, parquet_element_2, parquet_element_3}); - auto parquet_field = PrimitiveNode::Make( - "timestamp:NZST", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64); - parquet_fields.push_back(parquet_group); - parquet_fields.push_back(parquet_field); - parquet_metadata->Append( - "org.apache.arrow.field[some_struct.timestamp:CET].timestamp.timezone", "CET"); - parquet_metadata->Append("org.apache.arrow.field[timestamp:NZST].timestamp.timezone", - "NZST"); - } - - { - if (unpacked_dictionaries) { - // The Parquet nodes immediately below are converted to these non-dictionary Arrow - // fields - arrow_fields.push_back(std::make_shared( - "dictionary_timestamp", ::arrow::timestamp(::arrow::TimeUnit::MILLI), false)); - arrow_fields.push_back(std::make_shared( - "dictionary_timestamp:UTC", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), - false)); - arrow_fields.push_back(std::make_shared( - "dictionary_timestamp:CET", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), - false)); - } else { - // When converted, these Arrow dictionaries are unpacked and stored as the Parquet - // nodes immediately below - arrow_fields.push_back(std::make_shared( - "dictionary_timestamp", - ::arrow::dictionary(::arrow::int16(), ::arrow::timestamp(TimeUnit::MILLI)), - false)); - arrow_fields.push_back(std::make_shared( - "dictionary_timestamp:UTC", - ::arrow::dictionary(::arrow::int16(), - ::arrow::timestamp(TimeUnit::MILLI, "UTC")), - false)); - arrow_fields.push_back(std::make_shared( - "dictionary_timestamp:CET", - ::arrow::dictionary(::arrow::int16(), - ::arrow::timestamp(TimeUnit::MILLI, "CET")), - false)); - } - - parquet_fields.push_back(PrimitiveNode::Make( - "dictionary_timestamp", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - parquet_fields.push_back(PrimitiveNode::Make( - "dictionary_timestamp:UTC", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - parquet_fields.push_back(PrimitiveNode::Make( - "dictionary_timestamp:CET", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - parquet_metadata->Append( - "org.apache.arrow.field[dictionary_timestamp:CET].timestamp.timezone", "CET"); - } - - return; -} - -TEST_F(TestConvertParquetSchema, ParquetNestedTimezoneNodes) { - std::vector parquet_fields; - std::shared_ptr parquet_metadata = - std::make_shared(); - std::vector> arrow_fields; - - MakeNestedTimezoneSchemas(arrow_fields, parquet_fields, parquet_metadata, true); - - ASSERT_OK(ConvertSchema(parquet_fields, parquet_metadata)); - ASSERT_NO_FATAL_FAILURE( - CheckFlatSchema(std::make_shared<::arrow::Schema>(arrow_fields))); -} - TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { std::vector parquet_fields; std::vector> arrow_fields; @@ -908,10 +733,7 @@ class TestConvertArrowSchema : public ::testing::Test { public: virtual void SetUp() {} - void CheckFlatSchema( - const std::vector& nodes, - const std::shared_ptr& expected_metadata = nullptr, - bool expect_strict_metadata_equality = true) { + void CheckFlatSchema(const std::vector& nodes) { NodePtr schema_node = GroupNode::Make("schema", Repetition::REPEATED, nodes); const GroupNode* expected_schema_node = static_cast(schema_node.get()); @@ -924,35 +746,18 @@ class TestConvertArrowSchema : public ::testing::Test { auto rhs = expected_schema_node->field(i); EXPECT_TRUE(lhs->Equals(rhs.get())); } - - if (expected_metadata) { - if (expect_strict_metadata_equality) { - ASSERT_TRUE(result_metadata_->Equals(*expected_metadata)); - } else { - // check for expected keys and values, indifferent to order in container - ASSERT_EQ(result_metadata_->size(), expected_metadata->size()); - for (int i = 0; i < expected_metadata->size(); ++i) { - int index = result_metadata_->FindKey(expected_metadata->key(i)); - ASSERT_NE(index, -1) << "key '" << expected_metadata->key(i) - << "' unexpectedly missing from result metadata"; - ASSERT_EQ(result_metadata_->value(index), expected_metadata->value(i)); - } - } - } } ::arrow::Status ConvertSchema(const std::vector>& fields) { arrow_schema_ = std::make_shared<::arrow::Schema>(fields); std::shared_ptr<::parquet::WriterProperties> properties = ::parquet::default_writer_properties(); - return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_metadata_, - &result_schema_); + return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_schema_); } protected: std::shared_ptr<::arrow::Schema> arrow_schema_; std::shared_ptr result_schema_; - std::shared_ptr result_metadata_; }; TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { @@ -1119,72 +924,6 @@ TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { } } -TEST_F(TestConvertArrowSchema, ArrowSimpleTimezoneFields) { - struct TimezoneFieldConstructionArguments { - std::string name; - std::shared_ptr<::arrow::DataType> datatype; - bool is_adjusted_to_utc; - bool inserts_metadata; - std::pair key_value_metadata; - }; - - std::vector cases = { - {"timestamp", ::arrow::timestamp(::arrow::TimeUnit::MILLI), false, false, {"", ""}}, - {"timestamp:UTC", - ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), - true, - false, - {"", ""}}, - {"timestamp:utc", - ::arrow::timestamp(::arrow::TimeUnit::MILLI, "utc"), - true, - false, - {"", ""}}, - {"timestamp:CET", - ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), - false, - true, - {"org.apache.arrow.field[timestamp:CET].timestamp.timezone", "CET"}}, - {"timestamp:NZST", - ::arrow::timestamp(::arrow::TimeUnit::MILLI, "NZST"), - false, - true, - {"org.apache.arrow.field[timestamp:NZST].timestamp.timezone", "NZST"}}, - }; - - std::vector> arrow_fields; - std::vector parquet_fields; - auto parquet_file_metadata = std::make_shared(); - - for (const TimezoneFieldConstructionArguments& c : cases) { - arrow_fields.push_back(std::make_shared(c.name, c.datatype, false)); - parquet_fields.push_back(PrimitiveNode::Make( - c.name, Repetition::REQUIRED, - LogicalAnnotation::Timestamp(c.is_adjusted_to_utc, - LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - if (c.inserts_metadata) { - parquet_file_metadata->Append(c.key_value_metadata.first, - c.key_value_metadata.second); - } - } - - ASSERT_OK(ConvertSchema(arrow_fields)); - ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields, parquet_file_metadata, false)); -} - -TEST_F(TestConvertArrowSchema, ArrowNestedTimezoneFields) { - std::vector> arrow_fields; - std::vector parquet_fields; - std::shared_ptr parquet_metadata = - std::make_shared(); - - MakeNestedTimezoneSchemas(arrow_fields, parquet_fields, parquet_metadata); - - ASSERT_OK(ConvertSchema(arrow_fields)); - ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields, parquet_metadata, false)); -} - TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) { std::vector parquet_fields; std::vector> arrow_fields; @@ -1210,15 +949,6 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) { arrow_fields.push_back(std::make_shared( "date64", ::arrow::dictionary(::arrow::int8(), ::arrow::date64()), false)); - parquet_fields.push_back(PrimitiveNode::Make( - "timestamp", Repetition::REQUIRED, - LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), - ParquetType::INT64)); - arrow_fields.push_back(std::make_shared( - "timestamp", - ::arrow::dictionary(::arrow::int8(), ::arrow::timestamp(TimeUnit::MILLI, "UTC")), - false)); - parquet_fields.push_back( PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); arrow_fields.push_back(std::make_shared( diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 1e1c7580de879..c7f71665ffda0 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -18,8 +18,6 @@ #include "parquet/arrow/schema.h" #include -#include -#include #include #include #include @@ -35,7 +33,6 @@ #include "parquet/exception.h" #include "parquet/properties.h" #include "parquet/schema-internal.h" -#include "parquet/schema.h" #include "parquet/types.h" using arrow::Field; @@ -46,7 +43,6 @@ using ArrowType = arrow::DataType; using ArrowTypeId = arrow::Type; using parquet::Repetition; -using parquet::schema::ColumnPath; using parquet::schema::GroupNode; using parquet::schema::Node; using parquet::schema::NodePtr; @@ -60,9 +56,6 @@ namespace parquet { namespace arrow { -static const char* key_prefix = "org.apache.arrow.field["; -static const char* key_suffix = "].timestamp.timezone"; - const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(::arrow::TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO); @@ -139,46 +132,31 @@ static Status MakeArrowTime64(const std::shared_ptr& an return Status::OK(); } -static int FetchMetadataTimezoneIndex( - const std::string& path, const std::shared_ptr& metadata) { - int index = -1; - if (metadata) { - const std::string key(key_prefix + path + key_suffix); - index = metadata->FindKey(key); - } - return index; -} - static Status MakeArrowTimestamp( - const std::shared_ptr& annotation, const std::string& path, - const std::shared_ptr& metadata, + const std::shared_ptr& annotation, std::shared_ptr* out) { + static constexpr auto utc = "UTC"; const auto& timestamp = checked_cast(*annotation); - ::arrow::TimeUnit::type time_unit; - switch (timestamp.time_unit()) { case LogicalAnnotation::TimeUnit::MILLIS: - time_unit = ::arrow::TimeUnit::MILLI; + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc) + : ::arrow::timestamp(::arrow::TimeUnit::MILLI)); break; case LogicalAnnotation::TimeUnit::MICROS: - time_unit = ::arrow::TimeUnit::MICRO; + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc) + : ::arrow::timestamp(::arrow::TimeUnit::MICRO)); break; case LogicalAnnotation::TimeUnit::NANOS: - time_unit = ::arrow::TimeUnit::NANO; + *out = (timestamp.is_adjusted_to_utc() + ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc) + : ::arrow::timestamp(::arrow::TimeUnit::NANO)); break; default: return Status::TypeError("Unrecognized time unit in timestamp annotation: ", annotation->ToString()); } - - // Attempt to recover optional timezone for this field from file metadata - int index = FetchMetadataTimezoneIndex(path, metadata); - - *out = (timestamp.is_adjusted_to_utc() - ? ::arrow::timestamp(time_unit, "UTC") - : (index == -1 ? ::arrow::timestamp(time_unit) - : ::arrow::timestamp(time_unit, metadata->value(index)))); - return Status::OK(); } @@ -250,8 +228,6 @@ static Status FromInt32(const std::shared_ptr& annotati } static Status FromInt64(const std::shared_ptr& annotation, - const std::string& path, - const std::shared_ptr& metadata, std::shared_ptr* out) { switch (annotation->type()) { case LogicalAnnotation::Type::INT: @@ -261,7 +237,7 @@ static Status FromInt64(const std::shared_ptr& annotati RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); break; case LogicalAnnotation::Type::TIMESTAMP: - RETURN_NOT_OK(MakeArrowTimestamp(annotation, path, metadata, out)); + RETURN_NOT_OK(MakeArrowTimestamp(annotation, out)); break; case LogicalAnnotation::Type::TIME: RETURN_NOT_OK(MakeArrowTime64(annotation, out)); @@ -276,9 +252,7 @@ static Status FromInt64(const std::shared_ptr& annotati return Status::OK(); } -Status FromPrimitive(const PrimitiveNode& primitive, - const std::shared_ptr& metadata, - std::shared_ptr* out) { +Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* out) { const std::shared_ptr& annotation = primitive.logical_annotation(); if (annotation->is_invalid() || annotation->is_null()) { @@ -294,8 +268,7 @@ Status FromPrimitive(const PrimitiveNode& primitive, RETURN_NOT_OK(FromInt32(annotation, out)); break; case ParquetType::INT64: - RETURN_NOT_OK(FromInt64(annotation, ColumnPath::FromNode(primitive)->ToDotString(), - metadata, out)); + RETURN_NOT_OK(FromInt64(annotation, out)); break; case ParquetType::INT96: *out = TIMESTAMP_NS; @@ -324,7 +297,6 @@ Status FromPrimitive(const PrimitiveNode& primitive, // Forward declaration Status NodeToFieldInternal(const Node& node, const std::unordered_set* included_leaf_nodes, - const std::shared_ptr& metadata, std::shared_ptr* out); /* @@ -342,7 +314,6 @@ inline bool IsIncludedLeaf(const Node& node, Status StructFromGroup(const GroupNode& group, const std::unordered_set* included_leaf_nodes, - const std::shared_ptr& metadata, std::shared_ptr* out) { std::vector> fields; std::shared_ptr field; @@ -350,8 +321,7 @@ Status StructFromGroup(const GroupNode& group, *out = nullptr; for (int i = 0; i < group.field_count(); i++) { - RETURN_NOT_OK( - NodeToFieldInternal(*group.field(i), included_leaf_nodes, metadata, &field)); + RETURN_NOT_OK(NodeToFieldInternal(*group.field(i), included_leaf_nodes, &field)); if (field != nullptr) { fields.push_back(field); } @@ -364,7 +334,6 @@ Status StructFromGroup(const GroupNode& group, Status NodeToList(const GroupNode& group, const std::unordered_set* included_leaf_nodes, - const std::shared_ptr& metadata, std::shared_ptr* out) { *out = nullptr; if (group.field_count() == 1) { @@ -378,8 +347,8 @@ Status NodeToList(const GroupNode& group, if (list_group.field_count() == 1 && !schema::HasStructListName(list_group)) { // List of primitive type std::shared_ptr item_field; - RETURN_NOT_OK(NodeToFieldInternal(*list_group.field(0), included_leaf_nodes, - metadata, &item_field)); + RETURN_NOT_OK( + NodeToFieldInternal(*list_group.field(0), included_leaf_nodes, &item_field)); if (item_field != nullptr) { *out = ::arrow::list(item_field); @@ -387,8 +356,7 @@ Status NodeToList(const GroupNode& group, } else { // List of struct std::shared_ptr inner_type; - RETURN_NOT_OK( - StructFromGroup(list_group, included_leaf_nodes, metadata, &inner_type)); + RETURN_NOT_OK(StructFromGroup(list_group, included_leaf_nodes, &inner_type)); if (inner_type != nullptr) { auto item_field = std::make_shared(list_node.name(), inner_type, false); *out = ::arrow::list(item_field); @@ -398,8 +366,8 @@ Status NodeToList(const GroupNode& group, // repeated primitive node std::shared_ptr inner_type; if (IsIncludedLeaf(static_cast(list_node), included_leaf_nodes)) { - RETURN_NOT_OK(FromPrimitive(static_cast(list_node), - metadata, &inner_type)); + RETURN_NOT_OK( + FromPrimitive(static_cast(list_node), &inner_type)); auto item_field = std::make_shared(list_node.name(), inner_type, false); *out = ::arrow::list(item_field); } @@ -415,12 +383,11 @@ Status NodeToList(const GroupNode& group, } Status NodeToField(const Node& node, std::shared_ptr* out) { - return NodeToFieldInternal(node, nullptr, nullptr, out); + return NodeToFieldInternal(node, nullptr, out); } Status NodeToFieldInternal(const Node& node, const std::unordered_set* included_leaf_nodes, - const std::shared_ptr& metadata, std::shared_ptr* out) { std::shared_ptr type = nullptr; bool nullable = !node.is_required(); @@ -432,10 +399,9 @@ Status NodeToFieldInternal(const Node& node, std::shared_ptr inner_type; if (node.is_group()) { RETURN_NOT_OK(StructFromGroup(static_cast(node), - included_leaf_nodes, metadata, &inner_type)); + included_leaf_nodes, &inner_type)); } else if (IsIncludedLeaf(node, included_leaf_nodes)) { - RETURN_NOT_OK( - FromPrimitive(static_cast(node), metadata, &inner_type)); + RETURN_NOT_OK(FromPrimitive(static_cast(node), &inner_type)); } if (inner_type != nullptr) { auto item_field = std::make_shared(node.name(), inner_type, false); @@ -445,15 +411,14 @@ Status NodeToFieldInternal(const Node& node, } else if (node.is_group()) { const auto& group = static_cast(node); if (node.logical_annotation()->is_list()) { - RETURN_NOT_OK(NodeToList(group, included_leaf_nodes, metadata, &type)); + RETURN_NOT_OK(NodeToList(group, included_leaf_nodes, &type)); } else { - RETURN_NOT_OK(StructFromGroup(group, included_leaf_nodes, metadata, &type)); + RETURN_NOT_OK(StructFromGroup(group, included_leaf_nodes, &type)); } } else { // Primitive (leaf) node if (IsIncludedLeaf(node, included_leaf_nodes)) { - RETURN_NOT_OK( - FromPrimitive(static_cast(node), metadata, &type)); + RETURN_NOT_OK(FromPrimitive(static_cast(node), &type)); } } if (type != nullptr) { @@ -462,56 +427,26 @@ Status NodeToFieldInternal(const Node& node, return Status::OK(); } -static std::shared_ptr FilterParquetMetadata( - const std::shared_ptr& metadata) { - // Return a copy of the input metadata stripped of any key-values pairs - // that held Arrow/Parquet storage specific information - std::shared_ptr filtered_metadata = - std::make_shared(); - if (metadata) { - for (int i = 0; i < metadata->size(); ++i) { - const std::string& key = metadata->key(i); - const std::string& value = metadata->value(i); - // Discard keys like "org.apache.arrow.field[*].timestamp.timezone" - size_t key_size = key.size(); - size_t key_prefix_size = strlen(key_prefix); - size_t key_suffix_size = strlen(key_suffix); - bool timezone_storage_key = - (key_size >= (key_prefix_size + key_suffix_size)) && - (key.compare(0, key_prefix_size, key_prefix) == 0) && - (key.compare(key_size - key_suffix_size, key_suffix_size, key_suffix) == 0); - if (!timezone_storage_key) { - filtered_metadata->Append(key, value); - } - } - } - if (filtered_metadata->size() == 0) { - filtered_metadata.reset(); - } - return filtered_metadata; -} - -Status FromParquetSchema(const SchemaDescriptor* parquet_schema, - const std::shared_ptr& parquet_metadata, - std::shared_ptr<::arrow::Schema>* out) { +Status FromParquetSchema( + const SchemaDescriptor* parquet_schema, + const std::shared_ptr& key_value_metadata, + std::shared_ptr<::arrow::Schema>* out) { const GroupNode& schema_node = *parquet_schema->group_node(); int num_fields = static_cast(schema_node.field_count()); std::vector> fields(num_fields); for (int i = 0; i < num_fields; i++) { - RETURN_NOT_OK(NodeToFieldInternal(*schema_node.field(i), nullptr, parquet_metadata, - &fields[i])); + RETURN_NOT_OK(NodeToField(*schema_node.field(i), &fields[i])); } - *out = - std::make_shared<::arrow::Schema>(fields, FilterParquetMetadata(parquet_metadata)); + *out = std::make_shared<::arrow::Schema>(fields, key_value_metadata); return Status::OK(); } -Status FromParquetSchema(const SchemaDescriptor* parquet_schema, - const std::vector& column_indices, - const std::shared_ptr& parquet_metadata, - std::shared_ptr<::arrow::Schema>* out) { +Status FromParquetSchema( + const SchemaDescriptor* parquet_schema, const std::vector& column_indices, + const std::shared_ptr& key_value_metadata, + std::shared_ptr<::arrow::Schema>* out) { // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes // from the root Parquet node @@ -535,15 +470,13 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema, std::vector> fields; std::shared_ptr field; for (auto node : base_nodes) { - RETURN_NOT_OK( - NodeToFieldInternal(*node, &included_leaf_nodes, parquet_metadata, &field)); + RETURN_NOT_OK(NodeToFieldInternal(*node, &included_leaf_nodes, &field)); if (field != nullptr) { fields.push_back(field); } } - *out = - std::make_shared<::arrow::Schema>(fields, FilterParquetMetadata(parquet_metadata)); + *out = std::make_shared<::arrow::Schema>(fields, key_value_metadata); return Status::OK(); } @@ -558,73 +491,13 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema, return FromParquetSchema(parquet_schema, nullptr, out); } -class PresumedColumnPath { - // Pre-constructs the eventual column path dot string for a Parquet node - // before it is fully constructed from Arrow fields - public: - PresumedColumnPath() = default; - - void Extend(const std::shared_ptr& field) { - switch (field->type()->id()) { - case ArrowTypeId::DICTIONARY: - path_.push_back({false, ""}); - break; - case ArrowTypeId::LIST: - path_.push_back({true, field->name() + ".list"}); - break; - default: - path_.push_back({true, field->name()}); - break; - } - return; - } - - void Retract() { - DCHECK(!path_.empty()); - path_.pop_back(); - return; - } - - std::string ToDotString() const { - std::stringstream ss; - int i = 0; - for (auto c = path_.cbegin(); c != path_.cend(); ++c) { - if (c->include) { - if (i > 0) { - ss << "."; - } - ss << c->component; - ++i; - } - } - return ss.str(); - } - - private: - struct Component { - bool include; - std::string component; - }; - std::deque path_; -}; - -Status FieldToNodeInternal(const std::shared_ptr& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - PresumedColumnPath& path, - std::unordered_map& metadata, - NodePtr* out); - Status ListToNode(const std::shared_ptr<::arrow::ListType>& type, const std::string& name, bool nullable, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, PresumedColumnPath& path, - std::unordered_map& metadata, NodePtr* out) { - const Repetition::type repetition = - nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; + const ArrowWriterProperties& arrow_properties, NodePtr* out) { + Repetition::type repetition = nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; NodePtr element; - RETURN_NOT_OK(FieldToNodeInternal(type->value_field(), properties, arrow_properties, - path, metadata, &element)); + RETURN_NOT_OK(FieldToNode(type->value_field(), properties, arrow_properties, &element)); NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element}); *out = GroupNode::Make(name, repetition, {list}, LogicalAnnotation::List()); @@ -634,17 +507,13 @@ Status ListToNode(const std::shared_ptr<::arrow::ListType>& type, const std::str Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, const std::string& name, bool nullable, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - PresumedColumnPath& path, - std::unordered_map& metadata, - NodePtr* out) { - const Repetition::type repetition = - nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; + const ArrowWriterProperties& arrow_properties, NodePtr* out) { + Repetition::type repetition = nullable ? Repetition::OPTIONAL : Repetition::REQUIRED; std::vector children(type->num_children()); for (int i = 0; i < type->num_children(); i++) { - RETURN_NOT_OK(FieldToNodeInternal(type->child(i), properties, arrow_properties, path, - metadata, &children[i])); + RETURN_NOT_OK( + FieldToNode(type->child(i), properties, arrow_properties, &children[i])); } *out = GroupNode::Make(name, repetition, children); @@ -657,32 +526,9 @@ static bool HasUTCTimezone(const std::string& timezone) { [timezone](const std::string& utc) { return timezone == utc; }); } -static void StoreMetadataTimezone(const std::string& path, - std::unordered_map& metadata, - const std::string& timezone) { - const std::string key(key_prefix + path + key_suffix); - if (metadata.find(key) != metadata.end()) { - std::stringstream ms; - ms << "Duplicate field name '" << path - << "' for timestamp with timezone field in Arrow schema."; - throw ParquetException(ms.str()); - } - metadata.insert(std::make_pair(key, timezone)); - return; -} - static std::shared_ptr TimestampAnnotationFromArrowTimestamp( - const std::string& path, const ::arrow::TimestampType& timestamp_type, - ::arrow::TimeUnit::type time_unit, - std::unordered_map& metadata) { - const std::string& timezone = timestamp_type.timezone(); - const bool utc = HasUTCTimezone(timezone); - - if (!utc && !timezone.empty()) { - // Attempt to preserve timezone for this field as file metadata - StoreMetadataTimezone(path, metadata, timezone); - } - + const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) { + const bool utc = HasUTCTimezone(timestamp_type.timezone()); switch (time_unit) { case ::arrow::TimeUnit::MILLI: return LogicalAnnotation::Timestamp(utc, LogicalAnnotation::TimeUnit::MILLIS); @@ -698,10 +544,7 @@ static std::shared_ptr TimestampAnnotationFromArrowTime } static Status GetTimestampMetadata(const ::arrow::TimestampType& type, - const std::string& name, const ArrowWriterProperties& properties, - const std::string& path, - std::unordered_map& metadata, ParquetType::type* physical_type, std::shared_ptr* annotation) { const bool coerce = properties.coerce_timestamps_enabled(); @@ -715,8 +558,8 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, } *physical_type = ParquetType::INT64; - PARQUET_CATCH_NOT_OK(*annotation = TimestampAnnotationFromArrowTimestamp( - path, type, target_unit, metadata)); + PARQUET_CATCH_NOT_OK(*annotation = + TimestampAnnotationFromArrowTimestamp(type, target_unit)); // The user is explicitly asking for timestamp data to be converted to the // specified units (target_unit). @@ -744,23 +587,18 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, return Status::OK(); } -Status FieldToNodeInternal(const std::shared_ptr& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - PresumedColumnPath& path, - std::unordered_map& metadata, - NodePtr* out) { - const Repetition::type repetition = - field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED; +Status FieldToNode(const std::shared_ptr& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, NodePtr* out) { std::shared_ptr annotation = LogicalAnnotation::None(); ParquetType::type type; + Repetition::type repetition = + field->nullable() ? Repetition::OPTIONAL : Repetition::REQUIRED; int length = -1; int precision = -1; int scale = -1; - path.Extend(field); - switch (field->type()->id()) { case ArrowTypeId::NA: type = ParquetType::INT32; @@ -840,9 +678,9 @@ Status FieldToNodeInternal(const std::shared_ptr& field, annotation = LogicalAnnotation::Date(); break; case ArrowTypeId::TIMESTAMP: - RETURN_NOT_OK(GetTimestampMetadata( - static_cast<::arrow::TimestampType&>(*field->type()), field->name(), - arrow_properties, path.ToDotString(), metadata, &type, &annotation)); + RETURN_NOT_OK( + GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()), + arrow_properties, &type, &annotation)); break; case ArrowTypeId::TIME32: type = ParquetType::INT32; @@ -863,12 +701,12 @@ Status FieldToNodeInternal(const std::shared_ptr& field, case ArrowTypeId::STRUCT: { auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type()); return StructToNode(struct_type, field->name(), field->nullable(), properties, - arrow_properties, path, metadata, out); + arrow_properties, out); } case ArrowTypeId::LIST: { auto list_type = std::static_pointer_cast<::arrow::ListType>(field->type()); return ListToNode(list_type, field->name(), field->nullable(), properties, - arrow_properties, path, metadata, out); + arrow_properties, out); } case ArrowTypeId::DICTIONARY: { // Parquet has no Dictionary type, dictionary-encoded is handled on @@ -877,8 +715,7 @@ Status FieldToNodeInternal(const std::shared_ptr& field, static_cast(*field->type()); std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field( field->name(), dict_type.value_type(), field->nullable(), field->metadata()); - return FieldToNodeInternal(unpacked_field, properties, arrow_properties, path, - metadata, out); + return FieldToNode(unpacked_field, properties, arrow_properties, out); } default: { // TODO: DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR @@ -888,92 +725,34 @@ Status FieldToNodeInternal(const std::shared_ptr& field, } } - path.Retract(); - PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(field->name(), repetition, annotation, type, length)); return Status::OK(); } -Status FieldToNode(const std::shared_ptr& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - std::unordered_map& parquet_metadata_map, - NodePtr* out) { - PresumedColumnPath parquet_column_path; - return FieldToNodeInternal(field, properties, arrow_properties, parquet_column_path, - parquet_metadata_map, out); -} - -Status FieldToNode(const std::shared_ptr& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, NodePtr* out) { - PresumedColumnPath parquet_column_path; - std::unordered_map dummy_metadata_map; - return FieldToNodeInternal(field, properties, arrow_properties, parquet_column_path, - dummy_metadata_map, out); -} - Status ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, - std::shared_ptr* parquet_metadata_out, - std::shared_ptr* parquet_schema_out) { - std::unordered_map parquet_metadata_map; - std::vector parquet_nodes(arrow_schema->num_fields()); - - // TODO(tpboudreau): consider making construction of metadata map in - // FieldToNodeInternal() conditional on whether caller asked for metadata - // (parquet_metadata_out != NULL); metadata construction (even if - // unnecessary) can cause FieldToNodeInternal() to fail + std::shared_ptr* out) { + std::vector nodes(arrow_schema->num_fields()); for (int i = 0; i < arrow_schema->num_fields(); i++) { - PresumedColumnPath parquet_column_path; - RETURN_NOT_OK(FieldToNodeInternal(arrow_schema->field(i), properties, - arrow_properties, parquet_column_path, - parquet_metadata_map, &parquet_nodes[i])); - } - - if (parquet_metadata_out) { - if (arrow_schema->HasMetadata()) { - // merge Arrow application metadata with Arrow/Parquet storage specific metadata - auto arrow_metadata = arrow_schema->metadata(); - std::unordered_map arrow_metadata_map; - arrow_metadata->ToUnorderedMap(&arrow_metadata_map); - parquet_metadata_map.insert(arrow_metadata_map.cbegin(), arrow_metadata_map.cend()); - } - *parquet_metadata_out = - std::make_shared(parquet_metadata_map); + RETURN_NOT_OK( + FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i])); } - NodePtr parquet_schema = GroupNode::Make("schema", Repetition::REQUIRED, parquet_nodes); - *parquet_schema_out = std::make_shared<::parquet::SchemaDescriptor>(); - PARQUET_CATCH_NOT_OK((*parquet_schema_out)->Init(parquet_schema)); + NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes); + *out = std::make_shared<::parquet::SchemaDescriptor>(); + PARQUET_CATCH_NOT_OK((*out)->Init(schema)); return Status::OK(); } Status ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - std::shared_ptr* parquet_metadata_out, - std::shared_ptr* parquet_schema_out) { - return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(), - parquet_metadata_out, parquet_schema_out); -} - -Status ToParquetSchema(const ::arrow::Schema* arrow_schema, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - std::shared_ptr* parquet_schema_out) { - return ToParquetSchema(arrow_schema, properties, arrow_properties, nullptr, - parquet_schema_out); -} - -Status ToParquetSchema(const ::arrow::Schema* arrow_schema, - const WriterProperties& properties, - std::shared_ptr* parquet_schema_out) { + std::shared_ptr* out) { return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(), - nullptr, parquet_schema_out); + out); } /// \brief Compute the number of bytes required to represent a decimal of a diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h index e3047d634c4d9..52fb843e6c6c2 100644 --- a/cpp/src/parquet/arrow/schema.h +++ b/cpp/src/parquet/arrow/schema.h @@ -20,8 +20,6 @@ #include #include -#include -#include #include #include "parquet/metadata.h" @@ -78,11 +76,6 @@ ::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet ::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out); -::arrow::Status PARQUET_EXPORT FieldToNode( - const std::shared_ptr<::arrow::Field>& field, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - std::unordered_map& metadata_map, schema::NodePtr* out); - ::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, @@ -91,22 +84,11 @@ ::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field> ::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, - std::shared_ptr* parquet_metadata_out, - std::shared_ptr* parquet_schema_out); - -::arrow::Status PARQUET_EXPORT -ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - std::shared_ptr* parquet_metadata_out, - std::shared_ptr* parquet_schema_out); - -::arrow::Status PARQUET_EXPORT -ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - std::shared_ptr* parquet_schema_out); + std::shared_ptr* out); -::arrow::Status PARQUET_EXPORT -ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - std::shared_ptr* parquet_schema_out); +::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, + const WriterProperties& properties, + std::shared_ptr* out); PARQUET_EXPORT int32_t DecimalSize(int32_t precision); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 7ffeb4375107a..e31c947519079 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -1128,21 +1128,16 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool const std::shared_ptr& properties, const std::shared_ptr& arrow_properties, std::unique_ptr* writer) { - std::shared_ptr parquet_metadata; std::shared_ptr parquet_schema; - RETURN_NOT_OK(ToParquetSchema(&schema, *properties, *arrow_properties, - &parquet_metadata, &parquet_schema)); - - if (parquet_metadata->size() == 0) { - parquet_metadata.reset(); - } + RETURN_NOT_OK( + ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema)); auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); std::unique_ptr base_writer = - ParquetFileWriter::Open(sink, schema_node, properties, parquet_metadata); + ParquetFileWriter::Open(sink, schema_node, properties, schema.metadata()); - auto schema_ptr = schema.AddMetadata(parquet_metadata); + auto schema_ptr = std::make_shared<::arrow::Schema>(schema); writer->reset( new FileWriter(pool, std::move(base_writer), schema_ptr, arrow_properties)); return Status::OK(); From b5ebdbdb7f26ab86817c0303ba9fe576a62df095 Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Mon, 10 Jun 2019 17:43:01 +0000 Subject: [PATCH 4/9] Set Parquet isAdjustedToUTC to true for timestamps with non-empty timezones --- cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 2 +- cpp/src/parquet/arrow/arrow-schema-test.cc | 6 +++--- cpp/src/parquet/arrow/schema.cc | 10 ++-------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index c352a62942913..2c7801504c56d 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -115,7 +115,7 @@ std::shared_ptr get_logical_annotation(const ::DataType return LogicalAnnotation::Date(); case ArrowId::TIMESTAMP: { const auto& ts_type = static_cast(type); - const bool adjusted_to_utc = (ts_type.timezone() == "UTC"); + const bool adjusted_to_utc = !(ts_type.timezone().empty()); switch (ts_type.unit()) { case TimeUnit::MILLI: return LogicalAnnotation::Timestamp(adjusted_to_utc, diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index 6424dd3f5def6..7a8d3d038c993 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -885,13 +885,13 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::NANOS), ParquetType::INT64, -1}, {"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MILLIS), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), ParquetType::INT64, -1}, {"timestamp(microsecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MICRO, "CET"), - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MICROS), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"), - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::NANOS), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::NANOS), ParquetType::INT64, -1}, {"null", ::arrow::null(), LogicalAnnotation::Null(), ParquetType::INT32, -1}}; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index c7f71665ffda0..e854953b3dcc7 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -135,7 +135,7 @@ static Status MakeArrowTime64(const std::shared_ptr& an static Status MakeArrowTimestamp( const std::shared_ptr& annotation, std::shared_ptr* out) { - static constexpr auto utc = "UTC"; + static const char* utc = "UTC"; const auto& timestamp = checked_cast(*annotation); switch (timestamp.time_unit()) { case LogicalAnnotation::TimeUnit::MILLIS: @@ -520,15 +520,9 @@ Status StructToNode(const std::shared_ptr<::arrow::StructType>& type, return Status::OK(); } -static bool HasUTCTimezone(const std::string& timezone) { - static const std::vector utczones{"UTC", "utc"}; - return std::any_of(utczones.begin(), utczones.end(), - [timezone](const std::string& utc) { return timezone == utc; }); -} - static std::shared_ptr TimestampAnnotationFromArrowTimestamp( const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) { - const bool utc = HasUTCTimezone(timestamp_type.timezone()); + const bool utc = !(timestamp_type.timezone().empty()); switch (time_unit) { case ::arrow::TimeUnit::MILLI: return LogicalAnnotation::Timestamp(utc, LogicalAnnotation::TimeUnit::MILLIS); From be613c47cd8d0a734524a9832e51cc333fa01aed Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Tue, 11 Jun 2019 21:40:13 +0000 Subject: [PATCH 5/9] Smallish code review fixes --- .../parquet/arrow/arrow-reader-writer-test.cc | 64 ++++-------- cpp/src/parquet/arrow/schema.cc | 95 +++++++++--------- cpp/src/parquet/arrow/writer.cc | 98 +++++++++---------- cpp/src/parquet/types.cc | 3 + 4 files changed, 118 insertions(+), 142 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 2c7801504c56d..474fa48420989 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -87,9 +87,7 @@ static constexpr int LARGE_SIZE = 10000; static constexpr uint32_t kDefaultSeed = 0; -std::shared_ptr get_logical_annotation(const ::DataType& type, - int32_t precision, - int32_t scale) { +std::shared_ptr get_logical_annotation(const ::DataType& type) { switch (type.id()) { case ArrowId::UINT8: return LogicalAnnotation::Int(8, false); @@ -149,18 +147,12 @@ std::shared_ptr get_logical_annotation(const ::DataType case ArrowId::DICTIONARY: { const ::arrow::DictionaryType& dict_type = static_cast(type); - const ::DataType& ty = *dict_type.value_type(); - int32_t pr = -1; - int32_t sc = -1; - if (ty.id() == ArrowId::DECIMAL) { - const auto& dt = static_cast(ty); - pr = dt.precision(); - sc = dt.scale(); - } - return get_logical_annotation(ty, pr, sc); + return get_logical_annotation(*dict_type.value_type()); + } + case ArrowId::DECIMAL: { + const auto& dec_type = static_cast(type); + return LogicalAnnotation::Decimal(dec_type.precision(), dec_type.scale()); } - case ArrowId::DECIMAL: - return LogicalAnnotation::Decimal(precision, scale); default: break; } @@ -417,8 +409,6 @@ void CheckSimpleRoundtrip(const std::shared_ptr
& table, int64_t row_group static std::shared_ptr MakeSimpleSchema(const ::DataType& type, Repetition::type repetition) { int32_t byte_width = -1; - int32_t precision = -1; - int32_t scale = -1; switch (type.id()) { case ::arrow::Type::DICTIONARY: { @@ -432,9 +422,7 @@ static std::shared_ptr MakeSimpleSchema(const ::DataType& type, case ::arrow::Type::DECIMAL: { const auto& decimal_type = static_cast(values_type); - precision = decimal_type.precision(); - scale = decimal_type.scale(); - byte_width = DecimalSize(precision); + byte_width = DecimalSize(decimal_type.precision()); } break; default: break; @@ -445,15 +433,12 @@ static std::shared_ptr MakeSimpleSchema(const ::DataType& type, break; case ::arrow::Type::DECIMAL: { const auto& decimal_type = static_cast(type); - precision = decimal_type.precision(); - scale = decimal_type.scale(); - byte_width = DecimalSize(precision); + byte_width = DecimalSize(decimal_type.precision()); } break; default: break; } - auto pnode = PrimitiveNode::Make("column1", repetition, - get_logical_annotation(type, precision, scale), + auto pnode = PrimitiveNode::Make("column1", repetition, get_logical_annotation(type), get_physical_type(type), byte_width); NodePtr node_ = GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); @@ -1378,23 +1363,20 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, is_valid, ns_values, &a_ns); // Input table, all data as is - auto s1 = std::shared_ptr<::arrow::Schema>( - new ::arrow::Schema({field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), - field("f_ns", t_ns)})); + auto s1 = ::arrow::schema( + {field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), field("f_ns", t_ns)}); auto input = Table::Make( s1, {std::make_shared("f_s", a_s), std::make_shared("f_ms", a_ms), std::make_shared("f_us", a_us), std::make_shared("f_ns", a_ns)}); // Result when coercing to milliseconds - auto s2 = std::shared_ptr<::arrow::Schema>( - new ::arrow::Schema({field("f_s", t_ms), field("f_ms", t_ms), field("f_us", t_ms), - field("f_ns", t_ms)})); + auto s2 = ::arrow::schema({field("f_s", t_ms), field("f_ms", t_ms), field("f_us", t_ms), + field("f_ns", t_ms)}); auto ex_milli_result = Table::Make( s2, {std::make_shared("f_s", a_ms), std::make_shared("f_ms", a_ms), std::make_shared("f_us", a_ms), std::make_shared("f_ns", a_ms)}); - std::shared_ptr
milli_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, &milli_result, @@ -1404,14 +1386,12 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_milli_result, *milli_result)); // Result when coercing to microseconds - auto s3 = std::shared_ptr<::arrow::Schema>( - new ::arrow::Schema({field("f_s", t_us), field("f_ms", t_us), field("f_us", t_us), - field("f_ns", t_us)})); + auto s3 = ::arrow::schema({field("f_s", t_us), field("f_ms", t_us), field("f_us", t_us), + field("f_ns", t_us)}); auto ex_micro_result = Table::Make( s3, {std::make_shared("f_s", a_us), std::make_shared("f_ms", a_us), std::make_shared("f_us", a_us), std::make_shared("f_ns", a_us)}); - std::shared_ptr
micro_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, µ_result, @@ -1421,14 +1401,12 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_micro_result, *micro_result)); // Result when coercing to nanoseconds - auto s4 = std::shared_ptr<::arrow::Schema>( - new ::arrow::Schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), - field("f_ns", t_ns)})); + auto s4 = ::arrow::schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), + field("f_ns", t_ns)}); auto ex_nano_result = Table::Make( s4, {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); - std::shared_ptr
nano_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, &nano_result, @@ -1465,10 +1443,10 @@ TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) { ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, is_valid, us_values, &a_us); ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, is_valid, ns_values, &a_ns); - auto s1 = std::shared_ptr<::arrow::Schema>(new ::arrow::Schema({field("f_s", t_s)})); - auto s2 = std::shared_ptr<::arrow::Schema>(new ::arrow::Schema({field("f_ms", t_ms)})); - auto s3 = std::shared_ptr<::arrow::Schema>(new ::arrow::Schema({field("f_us", t_us)})); - auto s4 = std::shared_ptr<::arrow::Schema>(new ::arrow::Schema({field("f_ns", t_ns)})); + auto s1 = ::arrow::schema({field("f_s", t_s)}); + auto s2 = ::arrow::schema({field("f_ms", t_ms)}); + auto s3 = ::arrow::schema({field("f_us", t_us)}); + auto s4 = ::arrow::schema({field("f_ns", t_ns)}); auto c1 = std::make_shared("f_s", a_s); auto c2 = std::make_shared("f_ms", a_ms); diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index e854953b3dcc7..16e565c5bebe3 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -60,16 +60,16 @@ const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(::arrow::TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO); -static Status MakeArrowDecimal(const std::shared_ptr& annotation, +static Status MakeArrowDecimal(const LogicalAnnotation& annotation, std::shared_ptr* out) { - const auto& decimal = checked_cast(*annotation); + const auto& decimal = checked_cast(annotation); *out = ::arrow::decimal(decimal.precision(), decimal.scale()); return Status::OK(); } -static Status MakeArrowInt(const std::shared_ptr& annotation, +static Status MakeArrowInt(const LogicalAnnotation& annotation, std::shared_ptr* out) { - const auto& integer = checked_cast(*annotation); + const auto& integer = checked_cast(annotation); switch (integer.bit_width()) { case 8: *out = integer.is_signed() ? ::arrow::int8() : ::arrow::uint8(); @@ -81,43 +81,43 @@ static Status MakeArrowInt(const std::shared_ptr& annot *out = integer.is_signed() ? ::arrow::int32() : ::arrow::uint32(); break; default: - return Status::TypeError(annotation->ToString(), + return Status::TypeError(annotation.ToString(), " can not annotate physical type Int32"); } return Status::OK(); } -static Status MakeArrowInt64(const std::shared_ptr& annotation, +static Status MakeArrowInt64(const LogicalAnnotation& annotation, std::shared_ptr* out) { - const auto& integer = checked_cast(*annotation); + const auto& integer = checked_cast(annotation); switch (integer.bit_width()) { case 64: *out = integer.is_signed() ? ::arrow::int64() : ::arrow::uint64(); break; default: - return Status::TypeError(annotation->ToString(), + return Status::TypeError(annotation.ToString(), " can not annotate physical type Int64"); } return Status::OK(); } -static Status MakeArrowTime32(const std::shared_ptr& annotation, +static Status MakeArrowTime32(const LogicalAnnotation& annotation, std::shared_ptr* out) { - const auto& time = checked_cast(*annotation); + const auto& time = checked_cast(annotation); switch (time.time_unit()) { case LogicalAnnotation::TimeUnit::MILLIS: *out = ::arrow::time32(::arrow::TimeUnit::MILLI); break; default: - return Status::TypeError(annotation->ToString(), + return Status::TypeError(annotation.ToString(), " can not annotate physical type Time32"); } return Status::OK(); } -static Status MakeArrowTime64(const std::shared_ptr& annotation, +static Status MakeArrowTime64(const LogicalAnnotation& annotation, std::shared_ptr* out) { - const auto& time = checked_cast(*annotation); + const auto& time = checked_cast(annotation); switch (time.time_unit()) { case LogicalAnnotation::TimeUnit::MICROS: *out = ::arrow::time64(::arrow::TimeUnit::MICRO); @@ -126,17 +126,16 @@ static Status MakeArrowTime64(const std::shared_ptr& an *out = ::arrow::time64(::arrow::TimeUnit::NANO); break; default: - return Status::TypeError(annotation->ToString(), + return Status::TypeError(annotation.ToString(), " can not annotate physical type Time64"); } return Status::OK(); } -static Status MakeArrowTimestamp( - const std::shared_ptr& annotation, - std::shared_ptr* out) { +static Status MakeArrowTimestamp(const LogicalAnnotation& annotation, + std::shared_ptr* out) { static const char* utc = "UTC"; - const auto& timestamp = checked_cast(*annotation); + const auto& timestamp = checked_cast(annotation); switch (timestamp.time_unit()) { case LogicalAnnotation::TimeUnit::MILLIS: *out = (timestamp.is_adjusted_to_utc() @@ -155,14 +154,14 @@ static Status MakeArrowTimestamp( break; default: return Status::TypeError("Unrecognized time unit in timestamp annotation: ", - annotation->ToString()); + annotation.ToString()); } return Status::OK(); } -static Status FromByteArray(const std::shared_ptr& annotation, +static Status FromByteArray(const LogicalAnnotation& annotation, std::shared_ptr* out) { - switch (annotation->type()) { + switch (annotation.type()) { case LogicalAnnotation::Type::STRING: *out = ::arrow::utf8(); break; @@ -177,14 +176,14 @@ static Status FromByteArray(const std::shared_ptr& anno break; default: return Status::NotImplemented("Unhandled logical annotation ", - annotation->ToString(), " for binary array"); + annotation.ToString(), " for binary array"); } return Status::OK(); } -static Status FromFLBA(const std::shared_ptr& annotation, - int32_t physical_length, std::shared_ptr* out) { - switch (annotation->type()) { +static Status FromFLBA(const LogicalAnnotation& annotation, int32_t physical_length, + std::shared_ptr* out) { + switch (annotation.type()) { case LogicalAnnotation::Type::DECIMAL: RETURN_NOT_OK(MakeArrowDecimal(annotation, out)); break; @@ -195,16 +194,16 @@ static Status FromFLBA(const std::shared_ptr& annotatio break; default: return Status::NotImplemented("Unhandled logical annotation ", - annotation->ToString(), + annotation.ToString(), " for fixed-length binary array"); } return Status::OK(); } -static Status FromInt32(const std::shared_ptr& annotation, +static Status FromInt32(const LogicalAnnotation& annotation, std::shared_ptr* out) { - switch (annotation->type()) { + switch (annotation.type()) { case LogicalAnnotation::Type::INT: RETURN_NOT_OK(MakeArrowInt(annotation, out)); break; @@ -221,15 +220,15 @@ static Status FromInt32(const std::shared_ptr& annotati *out = ::arrow::int32(); break; default: - return Status::NotImplemented("Unhandled logical type ", annotation->ToString(), + return Status::NotImplemented("Unhandled logical type ", annotation.ToString(), " for INT32"); } return Status::OK(); } -static Status FromInt64(const std::shared_ptr& annotation, +static Status FromInt64(const LogicalAnnotation& annotation, std::shared_ptr* out) { - switch (annotation->type()) { + switch (annotation.type()) { case LogicalAnnotation::Type::INT: RETURN_NOT_OK(MakeArrowInt64(annotation, out)); break; @@ -246,7 +245,7 @@ static Status FromInt64(const std::shared_ptr& annotati *out = ::arrow::int64(); break; default: - return Status::NotImplemented("Unhandled logical type ", annotation->ToString(), + return Status::NotImplemented("Unhandled logical type ", annotation.ToString(), " for INT64"); } return Status::OK(); @@ -265,10 +264,10 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* *out = ::arrow::boolean(); break; case ParquetType::INT32: - RETURN_NOT_OK(FromInt32(annotation, out)); + RETURN_NOT_OK(FromInt32(*annotation, out)); break; case ParquetType::INT64: - RETURN_NOT_OK(FromInt64(annotation, out)); + RETURN_NOT_OK(FromInt64(*annotation, out)); break; case ParquetType::INT96: *out = TIMESTAMP_NS; @@ -280,10 +279,10 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* *out = ::arrow::float64(); break; case ParquetType::BYTE_ARRAY: - RETURN_NOT_OK(FromByteArray(annotation, out)); + RETURN_NOT_OK(FromByteArray(*annotation, out)); break; case ParquetType::FIXED_LEN_BYTE_ARRAY: - RETURN_NOT_OK(FromFLBA(annotation, primitive.type_length(), out)); + RETURN_NOT_OK(FromFLBA(*annotation, primitive.type_length(), out)); break; default: { // PARQUET-1565: This can occur if the file is corrupt @@ -552,8 +551,7 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, } *physical_type = ParquetType::INT64; - PARQUET_CATCH_NOT_OK(*annotation = - TimestampAnnotationFromArrowTimestamp(type, target_unit)); + *annotation = TimestampAnnotationFromArrowTimestamp(type, target_unit); // The user is explicitly asking for timestamp data to be converted to the // specified units (target_unit). @@ -603,26 +601,26 @@ Status FieldToNode(const std::shared_ptr& field, break; case ArrowTypeId::UINT8: type = ParquetType::INT32; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(8, false)); + annotation = LogicalAnnotation::Int(8, false); break; case ArrowTypeId::INT8: type = ParquetType::INT32; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(8, true)); + annotation = LogicalAnnotation::Int(8, true); break; case ArrowTypeId::UINT16: type = ParquetType::INT32; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(16, false)); + annotation = LogicalAnnotation::Int(16, false); break; case ArrowTypeId::INT16: type = ParquetType::INT32; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(16, true)); + annotation = LogicalAnnotation::Int(16, true); break; case ArrowTypeId::UINT32: if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) { type = ParquetType::INT64; } else { type = ParquetType::INT32; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(32, false)); + annotation = LogicalAnnotation::Int(32, false); } break; case ArrowTypeId::INT32: @@ -630,7 +628,7 @@ Status FieldToNode(const std::shared_ptr& field, break; case ArrowTypeId::UINT64: type = ParquetType::INT64; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Int(64, false)); + annotation = LogicalAnnotation::Int(64, false); break; case ArrowTypeId::INT64: type = ParquetType::INT64; @@ -678,18 +676,15 @@ Status FieldToNode(const std::shared_ptr& field, break; case ArrowTypeId::TIME32: type = ParquetType::INT32; - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Time( - false, LogicalAnnotation::TimeUnit::MILLIS)); + annotation = LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MILLIS); break; case ArrowTypeId::TIME64: { type = ParquetType::INT64; auto time_type = static_cast<::arrow::Time64Type*>(field->type().get()); if (time_type->unit() == ::arrow::TimeUnit::NANO) { - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Time( - false, LogicalAnnotation::TimeUnit::NANOS)); + annotation = LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::NANOS); } else { - PARQUET_CATCH_NOT_OK(annotation = LogicalAnnotation::Time( - false, LogicalAnnotation::TimeUnit::MICROS)); + annotation = LogicalAnnotation::Time(false, LogicalAnnotation::TimeUnit::MICROS); } } break; case ArrowTypeId::STRUCT: { diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index e31c947519079..c1b3b0c049309 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -383,9 +383,8 @@ class ArrowColumnWriter { Status WriteTimestamps(const Array& data, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels); - Status WriteTimestampsCoerce(const bool truncated_timestamps_allowed, const Array& data, - int64_t num_levels, const int16_t* def_levels, - const int16_t* rep_levels); + Status WriteTimestampsCoerce(const Array& data, int64_t num_levels, + const int16_t* def_levels, const int16_t* rep_levels); template Status WriteNonNullableBatch(const ArrowType& type, int64_t num_values, @@ -650,18 +649,16 @@ Status ArrowColumnWriter::WriteNonNullableBatch(*values.type()); + const auto& source_type = static_cast(*values.type()); if (ctx_->properties->support_deprecated_int96_timestamps()) { - // The user explicitly required to use Int96 storage. + // User explicitly requested Int96 timestamps return TypedWriteBatch(values, num_levels, def_levels, rep_levels); } else if (ctx_->properties->coerce_timestamps_enabled() && - (type.unit() != ctx_->properties->coerce_timestamps_unit())) { - // Casting is required: coerce_timestamps_enabled_, cast all timestamps to requested - // unit - return WriteTimestampsCoerce(ctx_->properties->truncated_timestamps_allowed(), values, - num_levels, def_levels, rep_levels); + (source_type.unit() != ctx_->properties->coerce_timestamps_unit())) { + // Convert timestamps to requested unit + return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels); } else { // No casting of timestamps is required, take the fast path return TypedWriteBatch(values, num_levels, @@ -669,27 +666,51 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level } } -Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_allowed, - const Array& array, int64_t num_levels, +#define COERCE_DIVIDE -1 +#define COERCE_INVALID 0 +#define COERCE_MULTIPLY +1 + +static std::pair kTimestampCoercionFactors[4][4] = { + // from seconds ... + {{COERCE_INVALID, 0}, // ... to seconds + {COERCE_MULTIPLY, 1000}, // ... to millis + {COERCE_MULTIPLY, 1000000}, // ... to micros + {COERCE_MULTIPLY, INT64_C(1000000000)}}, // ... to nanos + // from millis ... + {{COERCE_INVALID, 0}, + {COERCE_MULTIPLY, 1}, + {COERCE_MULTIPLY, 1000}, + {COERCE_MULTIPLY, 1000000}}, + // from micros ... + {{COERCE_INVALID, 0}, + {COERCE_DIVIDE, 1000}, + {COERCE_MULTIPLY, 1}, + {COERCE_MULTIPLY, 1000}}, + // from nanos ... + {{COERCE_INVALID, 0}, + {COERCE_DIVIDE, 1000000}, + {COERCE_DIVIDE, 1000}, + {COERCE_MULTIPLY, 1}}}; + +Status ArrowColumnWriter::WriteTimestampsCoerce(const Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels) { int64_t* buffer; RETURN_NOT_OK(ctx_->GetScratchData(num_levels, &buffer)); const auto& data = static_cast(array); - auto values = data.raw_values(); + const auto& source_type = static_cast(*array.type()); auto source_unit = source_type.unit(); - TimeUnit::type target_unit = ctx_->properties->coerce_timestamps_enabled() - ? ctx_->properties->coerce_timestamps_unit() - : TimeUnit::MICRO; + TimeUnit::type target_unit = ctx_->properties->coerce_timestamps_unit(); auto target_type = ::arrow::timestamp(target_unit); + bool truncation_allowed = ctx_->properties->truncated_timestamps_allowed(); auto DivideBy = [&](const int64_t factor) { for (int64_t i = 0; i < array.length(); i++) { - if (!truncated_timestamps_allowed && !data.IsNull(i) && (values[i] % factor != 0)) { + if (!truncation_allowed && !data.IsNull(i) && (values[i] % factor != 0)) { return Status::Invalid("Casting from ", source_type.ToString(), " to ", target_type->ToString(), " would lose data: ", values[i]); } @@ -705,38 +726,12 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ return Status::OK(); }; - if (source_unit == TimeUnit::NANO) { - if (target_unit == TimeUnit::MICRO) { - RETURN_NOT_OK(DivideBy(1000)); - } else { - DCHECK_EQ(TimeUnit::MILLI, target_unit); - RETURN_NOT_OK(DivideBy(1000000)); - } - } else if (source_unit == TimeUnit::MICRO) { - if (target_unit == TimeUnit::NANO) { - RETURN_NOT_OK(MultiplyBy(1000)); - } else { - DCHECK_EQ(TimeUnit::MILLI, target_unit); - RETURN_NOT_OK(DivideBy(1000)); - } - } else if (source_unit == TimeUnit::MILLI) { - if (target_unit == TimeUnit::MICRO) { - RETURN_NOT_OK(MultiplyBy(1000)); - } else { - DCHECK_EQ(TimeUnit::NANO, target_unit); - RETURN_NOT_OK(MultiplyBy(1000000)); - } - } else { - DCHECK_EQ(TimeUnit::SECOND, source_unit); - if (target_unit == TimeUnit::MILLI) { - RETURN_NOT_OK(MultiplyBy(1000)); - } else if (target_unit == TimeUnit::MICRO) { - RETURN_NOT_OK(MultiplyBy(1000000)); - } else { - DCHECK_EQ(TimeUnit::NANO, target_unit); - RETURN_NOT_OK(MultiplyBy(INT64_C(1000000000))); - } - } + const auto& coercion = kTimestampCoercionFactors[static_cast(source_unit)] + [static_cast(target_unit)]; + // .first -> coercion operation; .second -> scale factor + DCHECK_NE(coercion.first, COERCE_INVALID); + RETURN_NOT_OK(coercion.first == COERCE_DIVIDE ? DivideBy(coercion.second) + : MultiplyBy(coercion.second)); if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0)) { // no nulls, just dump the data @@ -749,9 +744,14 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const bool truncated_timestamps_ static_cast(*target_type), array.length(), num_levels, def_levels, rep_levels, valid_bits, data.offset(), buffer))); } + return Status::OK(); } +#undef COERCE_DIVIDE +#undef COERCE_INVALID +#undef COERCE_MULTIPLY + // This specialization seems quite similar but it significantly differs in two points: // * offset is added at the most latest time to the pointer as we have sub-byte access // * Arrow data is stored bitwise thus we cannot use std::copy to transform from diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index db48b24688069..ee81af323a318 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -498,11 +498,13 @@ std::shared_ptr LogicalAnnotation::Date() { std::shared_ptr LogicalAnnotation::Time( bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit) { + DCHECK(time_unit != LogicalAnnotation::TimeUnit::UNKNOWN); return TimeAnnotation::Make(is_adjusted_to_utc, time_unit); } std::shared_ptr LogicalAnnotation::Timestamp( bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit) { + DCHECK(time_unit != LogicalAnnotation::TimeUnit::UNKNOWN); return TimestampAnnotation::Make(is_adjusted_to_utc, time_unit); } @@ -512,6 +514,7 @@ std::shared_ptr LogicalAnnotation::Interval() { std::shared_ptr LogicalAnnotation::Int(int bit_width, bool is_signed) { + DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8); return IntAnnotation::Make(bit_width, is_signed); } From fa0c4828f2599fe362e1821ae9d82cf0a16f012c Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Wed, 12 Jun 2019 01:59:59 +0000 Subject: [PATCH 6/9] Coerce Arrow second timestamps to Parquet millisecond timestamps by default --- .../parquet/arrow/arrow-reader-writer-test.cc | 35 +++++++++++++++++++ cpp/src/parquet/arrow/schema.cc | 6 ++-- cpp/src/parquet/arrow/writer.cc | 21 +++++++---- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 474fa48420989..73caa177ea603 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1507,6 +1507,41 @@ TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) { default_writer_properties(), allow_truncation_to_micros)); } +TEST(TestArrowReadWrite, ImplicitSecondToMillisecondTimestampCoercion) { + using ::arrow::ArrayFromVector; + using ::arrow::field; + using ::arrow::schema; + + std::vector is_valid = {true, true, true, false, true, true}; + + auto t_s = ::arrow::timestamp(TimeUnit::SECOND); + auto t_ms = ::arrow::timestamp(TimeUnit::MILLI); + + std::vector s_values = {1489269, 1489270, 1489271, 1489272, 1489272, 1489273}; + std::vector ms_values = {1489269000, 1489270000, 1489271000, + 1489272000, 1489272000, 1489273000}; + + std::shared_ptr a_s, a_ms; + ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, is_valid, s_values, &a_s); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, is_valid, ms_values, &a_ms); + + auto si = schema({field("timestamp", t_s)}); + auto sx = schema({field("timestamp", t_ms)}); + + auto ci = std::make_shared("timestamp", a_s); + auto cx = std::make_shared("timestamp", a_ms); + + auto ti = Table::Make(si, {ci}); // input + auto tx = Table::Make(sx, {cx}); // expected output + std::shared_ptr
to; // actual output + + // default properties (without explicit coercion instructions) used ... + ASSERT_NO_FATAL_FAILURE( + DoSimpleRoundtrip(ti, false /* use_threads */, ti->num_rows(), {}, &to)); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*tx->schema(), *to->schema())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*tx, *to)); +} + TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { using ::arrow::ArrayFromVector; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 16e565c5bebe3..10189ea2df059 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -570,10 +570,10 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, } // The user implicitly wants timestamp data to retain its original time units, - // however the Arrow seconds time unit can not be represented (annotated) in Parquet. + // however the Arrow seconds time unit can not be represented (annotated) in + // Parquet and must be converted to milliseconds. if (type.unit() == ::arrow::TimeUnit::SECOND) { - return Status::NotImplemented( - "Only MILLI, MICRO, and NANO units supported for Arrow timestamps with Parquet."); + *annotation = TimestampAnnotationFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI); } return Status::OK(); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index c1b3b0c049309..abb503b4c77b4 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -384,7 +384,8 @@ class ArrowColumnWriter { const int16_t* rep_levels); Status WriteTimestampsCoerce(const Array& data, int64_t num_levels, - const int16_t* def_levels, const int16_t* rep_levels); + const int16_t* def_levels, const int16_t* rep_levels, + const ArrowWriterProperties& properties); template Status WriteNonNullableBatch(const ArrowType& type, int64_t num_values, @@ -657,8 +658,15 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level def_levels, rep_levels); } else if (ctx_->properties->coerce_timestamps_enabled() && (source_type.unit() != ctx_->properties->coerce_timestamps_unit())) { - // Convert timestamps to requested unit - return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels); + // User explicitly requested conversion to specific units + return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels, + *(ctx_->properties)); + } else if (source_type.unit() == TimeUnit::SECOND) { + // Absent superseding user instructions, timestamps in seconds are implicitly + // converted to milliseconds + std::shared_ptr properties = + (ArrowWriterProperties::Builder()).coerce_timestamps(TimeUnit::MILLI)->build(); + return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels, *properties); } else { // No casting of timestamps is required, take the fast path return TypedWriteBatch(values, num_levels, @@ -694,7 +702,8 @@ static std::pair kTimestampCoercionFactors[4][4] = { Status ArrowColumnWriter::WriteTimestampsCoerce(const Array& array, int64_t num_levels, const int16_t* def_levels, - const int16_t* rep_levels) { + const int16_t* rep_levels, + const ArrowWriterProperties& properties) { int64_t* buffer; RETURN_NOT_OK(ctx_->GetScratchData(num_levels, &buffer)); @@ -704,9 +713,9 @@ Status ArrowColumnWriter::WriteTimestampsCoerce(const Array& array, int64_t num_ const auto& source_type = static_cast(*array.type()); auto source_unit = source_type.unit(); - TimeUnit::type target_unit = ctx_->properties->coerce_timestamps_unit(); + TimeUnit::type target_unit = properties.coerce_timestamps_unit(); auto target_type = ::arrow::timestamp(target_unit); - bool truncation_allowed = ctx_->properties->truncated_timestamps_allowed(); + bool truncation_allowed = properties.truncated_timestamps_allowed(); auto DivideBy = [&](const int64_t factor) { for (int64_t i = 0; i < array.length(); i++) { From 81dc742731958e41ff65f63e4c66ab205f12e3f0 Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Thu, 13 Jun 2019 00:33:30 +0000 Subject: [PATCH 7/9] Reintroduce Parquet version 1.0 behavior for timestamps --- .../parquet/arrow/arrow-reader-writer-test.cc | 226 ++++++++++++++++-- cpp/src/parquet/arrow/arrow-schema-test.cc | 6 +- cpp/src/parquet/arrow/schema.cc | 56 +++-- cpp/src/parquet/arrow/writer.cc | 31 ++- python/pyarrow/tests/test_parquet.py | 76 +++++- 5 files changed, 342 insertions(+), 53 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 73caa177ea603..dd8c46a363b5d 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -371,6 +371,49 @@ void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual } } +void DoConfiguredRoundtrip( + const std::shared_ptr
& table, int64_t row_group_size, + std::shared_ptr
* out, + const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = + default_writer_properties(), + const std::shared_ptr& arrow_properties = + default_arrow_writer_properties()) { + std::shared_ptr buffer; + + auto sink = CreateOutputStream(); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, + row_group_size, parquet_properties, arrow_properties)); + ASSERT_OK_NO_THROW(sink->Finish(&buffer)); + + std::unique_ptr reader; + ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool(), + ::parquet::default_reader_properties(), nullptr, &reader)); + ASSERT_OK_NO_THROW(reader->ReadTable(out)); +} + +void CheckConfiguredRoundtrip( + const std::shared_ptr
& input_table, + const std::shared_ptr
& expected_table = nullptr, + const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = + default_writer_properties(), + const std::shared_ptr& arrow_properties = + default_arrow_writer_properties()) { + std::shared_ptr
actual_table; + ASSERT_NO_FATAL_FAILURE(DoConfiguredRoundtrip(input_table, input_table->num_rows(), + &actual_table, parquet_properties, + arrow_properties)); + if (expected_table) { + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*actual_table->schema(), *expected_table->schema())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*actual_table, *expected_table)); + } else { + ASSERT_NO_FATAL_FAILURE( + ::arrow::AssertSchemaEqual(*actual_table->schema(), *input_table->schema())); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*actual_table, *input_table)); + } +} + void DoSimpleRoundtrip(const std::shared_ptr
& table, bool use_threads, int64_t row_group_size, const std::vector& column_subset, std::shared_ptr
* out, @@ -1209,7 +1252,7 @@ TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { ASSERT_NO_FATAL_FAILURE(this->CheckSingleColumnRequiredTableRead(4)); } -void MakeDateTimeTypesTable(std::shared_ptr
* out) { +void MakeDateTimeTypesTable(std::shared_ptr
* out, bool expected = false) { using ::arrow::ArrayFromVector; std::vector is_valid = {true, true, true, false, true, true}; @@ -1219,12 +1262,13 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out) { auto f1 = field("f1", ::arrow::timestamp(TimeUnit::MILLI)); auto f2 = field("f2", ::arrow::timestamp(TimeUnit::MICRO)); auto f3 = field("f3", ::arrow::timestamp(TimeUnit::NANO)); + auto f3_x = field("f3", ::arrow::timestamp(TimeUnit::MICRO)); auto f4 = field("f4", ::arrow::time32(TimeUnit::MILLI)); auto f5 = field("f5", ::arrow::time64(TimeUnit::MICRO)); auto f6 = field("f6", ::arrow::time64(TimeUnit::NANO)); std::shared_ptr<::arrow::Schema> schema( - new ::arrow::Schema({f0, f1, f2, f3, f4, f5, f6})); + new ::arrow::Schema({f0, f1, f2, (expected ? f3_x : f3), f4, f5, f6})); std::vector t32_values = {1489269000, 1489270000, 1489271000, 1489272000, 1489272000, 1489273000}; @@ -1235,7 +1279,7 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out) { std::vector t64_ms_values = {1489269, 1489270, 1489271, 1489272, 1489272, 1489273}; - std::shared_ptr a0, a1, a2, a3, a4, a5, a6; + std::shared_ptr a0, a1, a2, a3, a3_x, a4, a5, a6; ArrayFromVector<::arrow::Date32Type, int32_t>(f0->type(), is_valid, t32_values, &a0); ArrayFromVector<::arrow::TimestampType, int64_t>(f1->type(), is_valid, t64_ms_values, &a1); @@ -1243,14 +1287,19 @@ void MakeDateTimeTypesTable(std::shared_ptr
* out) { &a2); ArrayFromVector<::arrow::TimestampType, int64_t>(f3->type(), is_valid, t64_ns_values, &a3); + ArrayFromVector<::arrow::TimestampType, int64_t>(f3_x->type(), is_valid, t64_us_values, + &a3_x); ArrayFromVector<::arrow::Time32Type, int32_t>(f4->type(), is_valid, t32_values, &a4); ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6); std::vector> columns = { - std::make_shared("f0", a0), std::make_shared("f1", a1), - std::make_shared("f2", a2), std::make_shared("f3", a3), - std::make_shared("f4", a4), std::make_shared("f5", a5), + std::make_shared("f0", a0), + std::make_shared("f1", a1), + std::make_shared("f2", a2), + std::make_shared("f3", (expected ? a3_x : a3)), + std::make_shared("f4", a4), + std::make_shared("f5", a5), std::make_shared("f6", a6)}; *out = Table::Make(schema, columns); @@ -1263,7 +1312,7 @@ TEST(TestArrowReadWrite, DateTimeTypes) { ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, false /* use_threads */, table->num_rows(), {}, &result)); - MakeDateTimeTypesTable(&table); + MakeDateTimeTypesTable(&table, true); // build expected result ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*table->schema(), *result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); @@ -1399,21 +1448,6 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*ex_micro_result->schema(), *micro_result->schema())); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_micro_result, *micro_result)); - - // Result when coercing to nanoseconds - auto s4 = ::arrow::schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), - field("f_ns", t_ns)}); - auto ex_nano_result = Table::Make( - s4, - {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), - std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); - std::shared_ptr
nano_result; - ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( - input, false /* use_threads */, input->num_rows(), {}, &nano_result, - ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::NANO)->build())); - ASSERT_NO_FATAL_FAILURE( - ::arrow::AssertSchemaEqual(*ex_nano_result->schema(), *nano_result->schema())); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_nano_result, *nano_result)); } TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) { @@ -1542,6 +1576,154 @@ TEST(TestArrowReadWrite, ImplicitSecondToMillisecondTimestampCoercion) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*tx, *to)); } +TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { + using ::arrow::ArrayFromVector; + using ::arrow::field; + using ::arrow::schema; + + auto t_s = ::arrow::timestamp(TimeUnit::SECOND); + auto t_ms = ::arrow::timestamp(TimeUnit::MILLI); + auto t_us = ::arrow::timestamp(TimeUnit::MICRO); + auto t_ns = ::arrow::timestamp(TimeUnit::NANO); + + const int N = 10; + int64_t instant = INT64_C(1262304000); // 2010-01-01T00:00:00 seconds offset + std::vector mask; + std::vector d_s, d_ms, d_us, d_ns; + for (int i = 0; i < N; ++i) { + mask.push_back(i % 3 == 1); + d_s.push_back(instant); + d_ms.push_back(instant * INT64_C(1000)); + d_us.push_back(instant * INT64_C(1000000)); + d_ns.push_back(instant * INT64_C(1000000000)); + instant += 1; + } + + std::shared_ptr a_s, a_ms, a_us, a_ns; + ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, mask, d_s, &a_s); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, mask, d_ms, &a_ms); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, mask, d_us, &a_us); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, mask, d_ns, &a_ns); + + auto c_s = std::make_shared("ts:s", a_s); + auto c_ms = std::make_shared("ts:ms", a_ms); + auto c_us = std::make_shared("ts:us", a_us); + auto c_ns = std::make_shared("ts:ns", a_ns); + + auto input_schema = schema({field("ts:s", t_s), field("ts:ms", t_ms), + field("ts:us", t_us), field("ts:ns", t_ns)}); + auto input_table = Table::Make(input_schema, {c_s, c_ms, c_us, c_ns}); + + auto parquet_version_1_properties = default_writer_properties(); + auto parquet_version_2_properties = ::parquet::WriterProperties::Builder() + .version(ParquetVersion::PARQUET_2_0) + ->build(); + + { + // Using Parquet version 1.0 defaults, seconds should be coerced to milliseconds + // and nanoseconds should be coerced to microseconds + auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), + field("ts:us", t_us), field("ts:ns", t_us)}); + auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_us, c_us}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_1_properties)); + } + { + // Using Parquet version 2.0 defaults, seconds should be coerced to milliseconds + // and nanoseconds should be retained + auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), + field("ts:us", t_us), field("ts:ns", t_ns)}); + auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_us, c_ns}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_2_properties)); + } + + auto arrow_coerce_to_seconds_properties = + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::SECOND)->build(); + auto arrow_coerce_to_millis_properties = + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MILLI)->build(); + auto arrow_coerce_to_micros_properties = + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::MICRO)->build(); + auto arrow_coerce_to_nanos_properties = + ArrowWriterProperties::Builder().coerce_timestamps(TimeUnit::NANO)->build(); + { + // Neither Parquet version 1.0 nor 2.0 allow coercing to seconds + auto sink = CreateOutputStream(); + std::shared_ptr
actual_table; + ASSERT_RAISES(NotImplemented, WriteTable(*input_table, ::arrow::default_memory_pool(), + sink, N, parquet_version_1_properties, + arrow_coerce_to_seconds_properties)); + ASSERT_RAISES(NotImplemented, WriteTable(*input_table, ::arrow::default_memory_pool(), + sink, N, parquet_version_2_properties, + arrow_coerce_to_seconds_properties)); + } + { + // Using Parquet version 1.0, coercing to milliseconds or microseconds is allowed + auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), + field("ts:us", t_ms), field("ts:ns", t_ms)}); + auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_ms, c_ms}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_1_properties, + arrow_coerce_to_millis_properties)); + + expected_schema = schema({field("ts:s", t_us), field("ts:ms", t_us), + field("ts:us", t_us), field("ts:ns", t_us)}); + expected_table = Table::Make(expected_schema, {c_us, c_us, c_us, c_us}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_1_properties, + arrow_coerce_to_micros_properties)); + } + { + // Using Parquet version 2.0, coercing to milliseconds or microseconds is allowed + auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), + field("ts:us", t_ms), field("ts:ns", t_ms)}); + auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_ms, c_ms}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_2_properties, + arrow_coerce_to_millis_properties)); + + expected_schema = schema({field("ts:s", t_us), field("ts:ms", t_us), + field("ts:us", t_us), field("ts:ns", t_us)}); + expected_table = Table::Make(expected_schema, {c_us, c_us, c_us, c_us}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_2_properties, + arrow_coerce_to_micros_properties)); + } + { + // Using Parquet version 1.0, coercing to (int64) nanoseconds is not allowed + auto sink = CreateOutputStream(); + std::shared_ptr
actual_table; + ASSERT_RAISES(NotImplemented, WriteTable(*input_table, ::arrow::default_memory_pool(), + sink, N, parquet_version_1_properties, + arrow_coerce_to_nanos_properties)); + } + { + // Using Parquet version 2.0, coercing to (int64) nanoseconds is allowed + auto expected_schema = schema({field("ts:s", t_ns), field("ts:ms", t_ns), + field("ts:us", t_ns), field("ts:ns", t_ns)}); + auto expected_table = Table::Make(expected_schema, {c_ns, c_ns, c_ns, c_ns}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_2_properties, + arrow_coerce_to_nanos_properties)); + } + + auto arrow_enable_int96_properties = + ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build(); + { + // For either Parquet version, coercing to nanoseconds is allowed if Int96 + // storage is used + auto expected_schema = schema({field("ts:s", t_ns), field("ts:ms", t_ns), + field("ts:us", t_ns), field("ts:ns", t_ns)}); + auto expected_table = Table::Make(expected_schema, {c_ns, c_ns, c_ns, c_ns}); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_1_properties, + arrow_enable_int96_properties)); + ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, + parquet_version_2_properties, + arrow_enable_int96_properties)); + } +} + TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { using ::arrow::ArrayFromVector; diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index 7a8d3d038c993..cedabdb73aaad 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -873,7 +873,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(nanosecond)", ::arrow::timestamp(::arrow::TimeUnit::NANO), - LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::NANOS), + LogicalAnnotation::Timestamp(false, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(millisecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "UTC"), LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), @@ -882,7 +882,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(nanosecond, UTC)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "UTC"), - LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::NANOS), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(millisecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::MILLI, "CET"), LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MILLIS), @@ -891,7 +891,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"), - LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::NANOS), + LogicalAnnotation::Timestamp(true, LogicalAnnotation::TimeUnit::MICROS), ParquetType::INT64, -1}, {"null", ::arrow::null(), LogicalAnnotation::Null(), ParquetType::INT32, -1}}; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 10189ea2df059..22b82973fd17d 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -537,15 +537,17 @@ static std::shared_ptr TimestampAnnotationFromArrowTime } static Status GetTimestampMetadata(const ::arrow::TimestampType& type, - const ArrowWriterProperties& properties, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, ParquetType::type* physical_type, std::shared_ptr* annotation) { - const bool coerce = properties.coerce_timestamps_enabled(); - const auto target_unit = coerce ? properties.coerce_timestamps_unit() : type.unit(); + const bool coerce = arrow_properties.coerce_timestamps_enabled(); + const auto target_unit = + coerce ? arrow_properties.coerce_timestamps_unit() : type.unit(); // The user is explicitly asking for Impala int96 encoding, there is no // logical type. - if (properties.support_deprecated_int96_timestamps()) { + if (arrow_properties.support_deprecated_int96_timestamps()) { *physical_type = ParquetType::INT96; return Status::OK(); } @@ -556,24 +558,48 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, // The user is explicitly asking for timestamp data to be converted to the // specified units (target_unit). if (coerce) { - switch (target_unit) { - case ::arrow::TimeUnit::MILLI: - case ::arrow::TimeUnit::MICRO: - case ::arrow::TimeUnit::NANO: - break; - case ::arrow::TimeUnit::SECOND: - return Status::NotImplemented( - "Can only coerce Arrow timestamps to milliseconds, microseconds, or " - "nanoseconds"); + if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) { + switch (target_unit) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + break; + case ::arrow::TimeUnit::NANO: + case ::arrow::TimeUnit::SECOND: + return Status::NotImplemented( + "For Parquet version 1.0 files, can only coerce Arrow timestamps to " + "milliseconds or microseconds"); + } + } else { + switch (target_unit) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + case ::arrow::TimeUnit::NANO: + break; + case ::arrow::TimeUnit::SECOND: + return Status::NotImplemented( + "For Parquet files, can only coerce Arrow timestamps to milliseconds, " + "microseconds, or nanoseconds"); + } } return Status::OK(); } + // The user implicitly wants timestamp data to retain its original time units, + // however the ConvertedType field used to indicate logical types for Parquet + // version 1.0 fields does not allow for nanosecond time units and so nanoseconds + // must be coerced to microseconds. + if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0 && + type.unit() == ::arrow::TimeUnit::NANO) { + *annotation = TimestampAnnotationFromArrowTimestamp(type, ::arrow::TimeUnit::MICRO); + return Status::OK(); + } + // The user implicitly wants timestamp data to retain its original time units, // however the Arrow seconds time unit can not be represented (annotated) in - // Parquet and must be converted to milliseconds. + // any version of Parquet and so must be coerced to milliseconds. if (type.unit() == ::arrow::TimeUnit::SECOND) { *annotation = TimestampAnnotationFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI); + return Status::OK(); } return Status::OK(); @@ -672,7 +698,7 @@ Status FieldToNode(const std::shared_ptr& field, case ArrowTypeId::TIMESTAMP: RETURN_NOT_OK( GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()), - arrow_properties, &type, &annotation)); + properties, arrow_properties, &type, &annotation)); break; case ArrowTypeId::TIME32: type = ParquetType::INT32; diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index abb503b4c77b4..91811203f92cc 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -656,19 +656,34 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level // User explicitly requested Int96 timestamps return TypedWriteBatch(values, num_levels, def_levels, rep_levels); - } else if (ctx_->properties->coerce_timestamps_enabled() && - (source_type.unit() != ctx_->properties->coerce_timestamps_unit())) { - // User explicitly requested conversion to specific units - return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels, - *(ctx_->properties)); + } else if (ctx_->properties->coerce_timestamps_enabled()) { + // User explicitly requested coercion to specific unit + if (source_type.unit() == ctx_->properties->coerce_timestamps_unit()) { + // No data conversion necessary + return TypedWriteBatch(values, num_levels, + def_levels, rep_levels); + } else { + return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels, + *(ctx_->properties)); + } + } else if (writer_->properties()->version() == ParquetVersion::PARQUET_1_0 && + source_type.unit() == TimeUnit::NANO) { + // Absent superseding user instructions, when writing Parquet version 1.0 files, + // timestamps in nanoseconds are coerced to microseconds + std::shared_ptr properties = + (ArrowWriterProperties::Builder()) + .coerce_timestamps(TimeUnit::MICRO) + ->disallow_truncated_timestamps() + ->build(); + return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels, *properties); } else if (source_type.unit() == TimeUnit::SECOND) { - // Absent superseding user instructions, timestamps in seconds are implicitly - // converted to milliseconds + // Absent superseding user instructions, timestamps in seconds are coerced to + // milliseconds std::shared_ptr properties = (ArrowWriterProperties::Builder()).coerce_timestamps(TimeUnit::MILLI)->build(); return WriteTimestampsCoerce(values, num_levels, def_levels, rep_levels, *properties); } else { - // No casting of timestamps is required, take the fast path + // No data conversion necessary return TypedWriteBatch(values, num_levels, def_levels, rep_levels); } diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b58dedd825894..fd181607f249c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1005,6 +1005,71 @@ def test_list_of_datetime_time_roundtrip(): _roundtrip_pandas_dataframe(df, write_kwargs={}) +@pytest.mark.pandas +def test_parquet_version_timestamp_differences(): + i_s = pd.Timestamp('2010-01-01').value / 1000000000 # := 1262304000 + + d_s = np.arange(i_s, i_s + 10, 1, dtype='int64') + d_ms = d_s * 1000 + d_us = d_ms * 1000 + d_ns = d_us * 1000 + + a_s = pa.array(d_s, type=pa.timestamp('s')) + a_ms = pa.array(d_ms, type=pa.timestamp('ms')) + a_us = pa.array(d_us, type=pa.timestamp('us')) + a_ns = pa.array(d_ns, type=pa.timestamp('ns')) + + names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns'] + table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names) + + # Using Parquet version 1.0, seconds should be coerced to milliseconds + # and nanoseconds should be coerced to microseconds by default + expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names) + actual = _roundtrip_table(table) + assert actual.equals(expected) + + # Using Parquet version 2.0, seconds should be coerced to milliseconds + # and nanoseconds should be retained by default + expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names) + actual = _roundtrip_table(table, version='2.0') + assert actual.equals(expected) + + # Using Parquet version 1.0, coercing to milliseconds or microseconds + # is allowed + expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names) + actual = _roundtrip_table(table, coerce_timestamps='ms') + assert actual.equals(expected) + + # Using Parquet version 2.0, coercing to milliseconds or microseconds + # is allowed + expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names) + actual = _roundtrip_table(table, coerce_timestamps='us') + assert actual.equals(expected) + + # TODO: after pyarrow allows coerce_timestamps='ns', tests like the + # following should pass ... + + # Using Parquet version 1.0, coercing to nanoseconds is not allowed + # expected = None + # with pytest.raises(NotImplementedError): + # _roundtrip_table(table, coerce_timestamps='ns') + + # Using Parquet version 2.0, coercing to nanoseconds is allowed + # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names) + # actual = _roundtrip_table(table, version='2.0', coerce_timestamps='ns') + # assert actual.equals(expected) + + # For either Parquet version, coercing to nanoseconds is allowed + # if Int96 storage is used + expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names) + actual = _roundtrip_table(table, + use_deprecated_int96_timestamps=True) + assert actual.equals(expected) + actual = _roundtrip_table(table, version='2.0', + use_deprecated_int96_timestamps=True) + assert actual.equals(expected) + + def test_large_list_records(): # This was fixed in PARQUET-1100 @@ -2041,7 +2106,7 @@ def test_write_error_deletes_incomplete_file(tempdir): filename = tempdir / 'tmp_file' try: - _write_table(pdf, filename, coerce_timestamps='ms') + _write_table(pdf, filename) except pa.ArrowException: pass @@ -2050,7 +2115,8 @@ def test_write_error_deletes_incomplete_file(tempdir): @pytest.mark.pandas def test_noncoerced_nanoseconds_written_without_exception(tempdir): - # ARROW-1957 + # ARROW-1957: the Parquet version 2.0 writer preserves Arrow + # nanosecond timestamps by default n = 9 df = pd.DataFrame({'x': range(n)}, index=pd.DatetimeIndex(start='2017-01-01', @@ -2060,18 +2126,18 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): filename = tempdir / 'written.parquet' try: - pq.write_table(tb, filename) + pq.write_table(tb, filename, version='2.0') except Exception: pass assert filename.exists() - recovered_table = _simple_table_roundtrip(tb) + recovered_table = _roundtrip_table(tb, version='2.0') assert tb.equals(recovered_table) # Loss of data thru coercion (without explicit override) still an error filename = tempdir / 'not_written.parquet' with pytest.raises(ValueError): - pq.write_table(tb, filename, coerce_timestamps='ms') + pq.write_table(tb, filename, coerce_timestamps='ms', version='2.0') def test_read_non_existent_file(tempdir): From 4cef109d1bde1bf4247ccc3ea31617d259c11fa9 Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Tue, 18 Jun 2019 15:20:32 +0000 Subject: [PATCH 8/9] Temporarily remove validity masks from test --- .../parquet/arrow/arrow-reader-writer-test.cc | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index dd8c46a363b5d..5f4e21365b4d2 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -375,7 +375,7 @@ void DoConfiguredRoundtrip( const std::shared_ptr
& table, int64_t row_group_size, std::shared_ptr
* out, const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = - default_writer_properties(), + ::parquet::default_writer_properties(), const std::shared_ptr& arrow_properties = default_arrow_writer_properties()) { std::shared_ptr buffer; @@ -396,7 +396,7 @@ void CheckConfiguredRoundtrip( const std::shared_ptr
& input_table, const std::shared_ptr
& expected_table = nullptr, const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = - default_writer_properties(), + ::parquet::default_writer_properties(), const std::shared_ptr& arrow_properties = default_arrow_writer_properties()) { std::shared_ptr
actual_table; @@ -1586,24 +1586,22 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { auto t_us = ::arrow::timestamp(TimeUnit::MICRO); auto t_ns = ::arrow::timestamp(TimeUnit::NANO); - const int N = 10; + const int N = 24; int64_t instant = INT64_C(1262304000); // 2010-01-01T00:00:00 seconds offset - std::vector mask; std::vector d_s, d_ms, d_us, d_ns; for (int i = 0; i < N; ++i) { - mask.push_back(i % 3 == 1); d_s.push_back(instant); d_ms.push_back(instant * INT64_C(1000)); d_us.push_back(instant * INT64_C(1000000)); d_ns.push_back(instant * INT64_C(1000000000)); - instant += 1; + instant += 3600; } std::shared_ptr a_s, a_ms, a_us, a_ns; - ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, mask, d_s, &a_s); - ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, mask, d_ms, &a_ms); - ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, mask, d_us, &a_us); - ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, mask, d_ns, &a_ns); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_s, d_s, &a_s); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ms, d_ms, &a_ms); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, d_us, &a_us); + ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, d_ns, &a_ns); auto c_s = std::make_shared("ts:s", a_s); auto c_ms = std::make_shared("ts:ms", a_ms); @@ -1614,7 +1612,7 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { field("ts:us", t_us), field("ts:ns", t_ns)}); auto input_table = Table::Make(input_schema, {c_s, c_ms, c_us, c_ns}); - auto parquet_version_1_properties = default_writer_properties(); + auto parquet_version_1_properties = ::parquet::default_writer_properties(); auto parquet_version_2_properties = ::parquet::WriterProperties::Builder() .version(ParquetVersion::PARQUET_2_0) ->build(); @@ -1650,12 +1648,14 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // Neither Parquet version 1.0 nor 2.0 allow coercing to seconds auto sink = CreateOutputStream(); std::shared_ptr
actual_table; - ASSERT_RAISES(NotImplemented, WriteTable(*input_table, ::arrow::default_memory_pool(), - sink, N, parquet_version_1_properties, - arrow_coerce_to_seconds_properties)); - ASSERT_RAISES(NotImplemented, WriteTable(*input_table, ::arrow::default_memory_pool(), - sink, N, parquet_version_2_properties, - arrow_coerce_to_seconds_properties)); + ASSERT_RAISES(NotImplemented, + WriteTable(*input_table, ::arrow::default_memory_pool(), sink, + input_table->num_rows(), parquet_version_1_properties, + arrow_coerce_to_seconds_properties)); + ASSERT_RAISES(NotImplemented, + WriteTable(*input_table, ::arrow::default_memory_pool(), sink, + input_table->num_rows(), parquet_version_2_properties, + arrow_coerce_to_seconds_properties)); } { // Using Parquet version 1.0, coercing to milliseconds or microseconds is allowed @@ -1693,9 +1693,10 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // Using Parquet version 1.0, coercing to (int64) nanoseconds is not allowed auto sink = CreateOutputStream(); std::shared_ptr
actual_table; - ASSERT_RAISES(NotImplemented, WriteTable(*input_table, ::arrow::default_memory_pool(), - sink, N, parquet_version_1_properties, - arrow_coerce_to_nanos_properties)); + ASSERT_RAISES(NotImplemented, + WriteTable(*input_table, ::arrow::default_memory_pool(), sink, + input_table->num_rows(), parquet_version_1_properties, + arrow_coerce_to_nanos_properties)); } { // Using Parquet version 2.0, coercing to (int64) nanoseconds is allowed From 464f34817cb652e0a20daaba72ecd839ae8fb745 Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Tue, 18 Jun 2019 20:16:52 +0000 Subject: [PATCH 9/9] Fix failing python parquet tests --- python/pyarrow/tests/test_parquet.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index fd181607f249c..c7423c935f42d 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1025,26 +1025,22 @@ def test_parquet_version_timestamp_differences(): # Using Parquet version 1.0, seconds should be coerced to milliseconds # and nanoseconds should be coerced to microseconds by default expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names) - actual = _roundtrip_table(table) - assert actual.equals(expected) + _check_roundtrip(table, expected) # Using Parquet version 2.0, seconds should be coerced to milliseconds # and nanoseconds should be retained by default expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names) - actual = _roundtrip_table(table, version='2.0') - assert actual.equals(expected) + _check_roundtrip(table, expected, version='2.0') # Using Parquet version 1.0, coercing to milliseconds or microseconds # is allowed expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names) - actual = _roundtrip_table(table, coerce_timestamps='ms') - assert actual.equals(expected) + _check_roundtrip(table, expected, coerce_timestamps='ms') # Using Parquet version 2.0, coercing to milliseconds or microseconds # is allowed expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names) - actual = _roundtrip_table(table, coerce_timestamps='us') - assert actual.equals(expected) + _check_roundtrip(table, expected, version='2.0', coerce_timestamps='us') # TODO: after pyarrow allows coerce_timestamps='ns', tests like the # following should pass ... @@ -1056,18 +1052,15 @@ def test_parquet_version_timestamp_differences(): # Using Parquet version 2.0, coercing to nanoseconds is allowed # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names) - # actual = _roundtrip_table(table, version='2.0', coerce_timestamps='ns') - # assert actual.equals(expected) + # _check_roundtrip(table, expected, version='2.0', coerce_timestamps='ns') # For either Parquet version, coercing to nanoseconds is allowed # if Int96 storage is used expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names) - actual = _roundtrip_table(table, - use_deprecated_int96_timestamps=True) - assert actual.equals(expected) - actual = _roundtrip_table(table, version='2.0', - use_deprecated_int96_timestamps=True) - assert actual.equals(expected) + _check_roundtrip(table, expected, + use_deprecated_int96_timestamps=True) + _check_roundtrip(table, expected, version='2.0', + use_deprecated_int96_timestamps=True) def test_large_list_records(): @@ -2131,7 +2124,7 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): pass assert filename.exists() - recovered_table = _roundtrip_table(tb, version='2.0') + recovered_table = pq.read_table(filename) assert tb.equals(recovered_table) # Loss of data thru coercion (without explicit override) still an error