diff --git a/doc/src/content/xdocs/spec.xml b/doc/src/content/xdocs/spec.xml index 947ae60854b..20ce76ca144 100644 --- a/doc/src/content/xdocs/spec.xml +++ b/doc/src/content/xdocs/spec.xml @@ -1310,6 +1310,92 @@ +
+ Standard Canonical Form for Schemas + +

One defined way to normalize an Avro schema is the + Standard Canonical Form transformation. This involves + stripping unwanted properties and maintaining a consistent canonical + ordering. The canonical ordering places the Avro + reserved properties first, followed by any custom properties named when + transforming. Normalization helps to reduce the + total memory size of a schema (by removing unwanted properties and whitespace) + when transferring Avro schemas between two systems, and also reduces the parsing + time for compatibility checks and schema evolution. +

+ +

Standard Canonical Form is a transformation of a schema + into a standard canonical ordering. It contains only the Avro reserved + properties "name", "type", "fields", "symbols", "items", "values", + "logicalType", "size", "order", "doc", "aliases", "default", + and any custom schema properties requested for the transformation. +

+ +
+ Transforming into Standard Canonical Form + +

Assuming an input schema (in JSON form) that's already + UTF-8 text for a valid Avro schema (including all + quotes as required by JSON), the following transformations + will produce its Standard Canonical Form:

+ +
+ +
+ Transforming with Custom Properties + +

In addition to the standard canonical form transformation, + custom Schema or Field properties can be included by + passing the property names while transforming. + For example, if an object has format, type, + name, and size fields, then the + name field should appear first, followed by the + type, size and then format + (custom property) fields. +

+
+
+
Schema Fingerprints diff --git a/lang/java/avro/src/main/java/org/apache/avro/SchemaNormalization.java b/lang/java/avro/src/main/java/org/apache/avro/SchemaNormalization.java index 40da3bdee0b..fca5dd8e55f 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/SchemaNormalization.java +++ b/lang/java/avro/src/main/java/org/apache/avro/SchemaNormalization.java @@ -17,17 +17,22 @@ */ package org.apache.avro; +import org.apache.avro.util.internal.JacksonUtils; +import java.util.Arrays; +import java.util.LinkedHashSet; import java.util.Map; import java.util.HashMap; +import java.util.TreeSet; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; /** - * Collection of static methods for generating the canonical form of schemas - * (see {@link #toParsingForm}) -- and fingerprints of canonical forms - * ({@link #fingerprint}). + * Collection of static methods for generating the parser canonical form of + * schemas (see {@link #toParsingForm}), standard canonical form of schemas (see + * {@link #toCanonicalForm}) with user defined properties and fingerprints of + * canonical forms ({@link #fingerprint}). */ public class SchemaNormalization { @@ -38,9 +43,31 @@ private SchemaNormalization() { * Returns "Parsing Canonical Form" of a schema as defined by Avro spec. */ public static String toParsingForm(Schema s) { + return toNormalizedForm(s, true, new LinkedHashSet<>()); + } + + /** + * Returns "Standard Canonical Form" of a schema as defined by Avro spec. + */ + public static String toCanonicalForm(Schema s) { + return toCanonicalForm(s, new LinkedHashSet<>()); + } + + /** + * Returns "Standard Canonical Form" of a schema as defined by Avro spec with + * additional user standard properties. 
+ */ + public static String toCanonicalForm(Schema s, LinkedHashSet properties) { + LinkedHashSet reservedProperties = new LinkedHashSet<>(Arrays.asList("name", "type", "fields", "symbols", + "items", "values", "logicalType", "size", "order", "doc", "aliases", "default")); + properties.removeAll(reservedProperties); + return toNormalizedForm(s, false, properties); + } + + private static String toNormalizedForm(Schema s, Boolean ps, LinkedHashSet aps) { try { Map env = new HashMap<>(); - return build(env, s, new StringBuilder()).toString(); + return build(env, s, new StringBuilder(), ps, aps).toString(); } catch (IOException e) { // Shouldn't happen, b/c StringBuilder can't throw IOException throw new RuntimeException(e); @@ -103,12 +130,19 @@ public static long parsingFingerprint64(Schema s) { return fingerprint64(toParsingForm(s).getBytes(StandardCharsets.UTF_8)); } - private static Appendable build(Map env, Schema s, Appendable o) throws IOException { + private static Appendable build(Map env, Schema s, Appendable o, Boolean ps, + LinkedHashSet aps) throws IOException { boolean firstTime = true; Schema.Type st = s.getType(); + LogicalType lt = null; + if (!ps) + lt = s.getLogicalType(); switch (st) { default: // boolean, bytes, double, float, int, long, null, string - return o.append('"').append(st.getName()).append('"'); + if (!ps && lt != null) + return writeLogicalType(s, lt, o, aps); + else + return o.append('"').append(st.getName()).append('"'); case UNION: o.append('['); @@ -117,7 +151,7 @@ private static Appendable build(Map env, Schema s, Appendable o) o.append(','); else firstTime = false; - build(env, b, o); + build(env, b, o, ps, aps); } return o.append(']'); @@ -125,9 +159,11 @@ private static Appendable build(Map env, Schema s, Appendable o) case MAP: o.append("{\"type\":\"").append(st.getName()).append("\""); if (st == Schema.Type.ARRAY) - build(env, s.getElementType(), o.append(",\"items\":")); + build(env, s.getElementType(), 
o.append(",\"items\":"), ps, aps); else - build(env, s.getValueType(), o.append(",\"values\":")); + build(env, s.getValueType(), o.append(",\"values\":"), ps, aps); + if (!ps) + setSimpleProps(o, s.getObjectProps(), aps); // adding the reserved property if not parser canonical schema return o.append("}"); case ENUM: @@ -152,6 +188,10 @@ private static Appendable build(Map env, Schema s, Appendable o) o.append("]"); } else if (st == Schema.Type.FIXED) { o.append(",\"size\":").append(Integer.toString(s.getFixedSize())); + lt = s.getLogicalType(); + // adding the logical property + if (!ps && lt != null) + setLogicalProps(o, lt); } else { // st == Schema.Type.RECORD o.append(",\"fields\":["); for (Schema.Field f : s.getFields()) { @@ -160,14 +200,73 @@ private static Appendable build(Map env, Schema s, Appendable o) else firstTime = false; o.append("{\"name\":\"").append(f.name()).append("\""); - build(env, f.schema(), o.append(",\"type\":")).append("}"); + build(env, f.schema(), o.append(",\"type\":"), ps, aps); + if (!ps) + setFieldProps(o, f, aps); // if standard canonical form then add reserved properties + o.append("}"); } o.append("]"); } + if (!ps) { + setComplexProps(o, s); + setSimpleProps(o, s.getObjectProps(), aps); + } // adding the reserved property if not parser canonical schema return o.append("}"); } } + private static Appendable writeLogicalType(Schema s, LogicalType lt, Appendable o, LinkedHashSet aps) + throws IOException { + o.append("{\"type\":\"").append(s.getType().getName()).append("\""); + // adding the logical property + setLogicalProps(o, lt); + // adding the reserved property + setSimpleProps(o, s.getObjectProps(), aps); + return o.append("}"); + } + + private static void setLogicalProps(Appendable o, LogicalType lt) throws IOException { + o.append(",\"").append(LogicalType.LOGICAL_TYPE_PROP).append("\":\"").append(lt.getName()).append("\""); + if (lt.getName().equals("decimal")) { + LogicalTypes.Decimal dlt = (LogicalTypes.Decimal) lt; + 
o.append(",\"precision\":").append(Integer.toString(dlt.getPrecision())); + if (dlt.getScale() != 0) + o.append(",\"scale\":").append(Integer.toString(dlt.getScale())); + } + } + + private static void setSimpleProps(Appendable o, Map schemaProps, LinkedHashSet aps) + throws IOException { + for (String propKey : aps) { + if (schemaProps.containsKey(propKey)) { + String propValue = JacksonUtils.toJsonNode(schemaProps.get(propKey)).toString(); + o.append(",\"").append(propKey).append("\":").append(propValue); + } + } + } + + private static void setComplexProps(Appendable o, Schema s) throws IOException { + if (s.getDoc() != null && !s.getDoc().isEmpty()) + o.append(",\"doc\":\"").append(s.getDoc()).append("\""); + if (s.getAliases() != null && !s.getAliases().isEmpty()) + o.append(",\"aliases\":").append(JacksonUtils.toJsonNode(new TreeSet(s.getAliases())).toString()); + if (s.getType() == Schema.Type.ENUM && s.getEnumDefault() != null) { + o.append(",\"default\":").append(JacksonUtils.toJsonNode(s.getEnumDefault()).toString()); + } + } + + private static void setFieldProps(Appendable o, Schema.Field f, LinkedHashSet aps) throws IOException { + if (f.order() != null) + o.append(",\"order\":\"").append(f.order().toString()).append("\""); + if (f.doc() != null) + o.append(",\"doc\":\"").append(f.doc()).append("\""); + if (!f.aliases().isEmpty()) + o.append(",\"aliases\":").append(JacksonUtils.toJsonNode(new TreeSet(f.aliases())).toString()); + if (f.defaultVal() != null) + o.append(",\"default\":").append(JacksonUtils.toJsonNode(f.defaultVal()).toString()); + setSimpleProps(o, f.getObjectProps(), aps); + } + final static long EMPTY64 = 0xc15d213aa4d7a795L; /* An inner class ensures that FP_TABLE initialized only when needed. 
*/ diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestSchemaNormalization.java b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaNormalization.java index 97b7a7803ce..a9cf0974f3a 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/TestSchemaNormalization.java +++ b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaNormalization.java @@ -29,6 +29,8 @@ import java.util.Formatter; import java.util.List; import java.util.Locale; +import java.util.LinkedHashSet; +import java.util.Arrays; import org.apache.avro.util.CaseFinder; import org.junit.Test; @@ -40,18 +42,27 @@ @RunWith(Enclosed.class) public class TestSchemaNormalization { + private static String PARSER_DATA_FILE = (System.getProperty("share.dir", "../../../share") + + "/test/data/schema-tests.txt"); + + private static String STANDARD_CANONICAL_DATA_FILE = (System.getProperty("share.dir", "../../../share") + + "/test/data/standard-schema-tests.txt"); + + private static String CUSTOM_CANONICAL_DATA_FILE = (System.getProperty("share.dir", "../../../share") + + "/test/data/custom-schema-tests.txt"); + @RunWith(Parameterized.class) - public static class TestCanonical { + public static class TestParserCanonicalSchema { String input, expectedOutput; - public TestCanonical(String i, String o) { + public TestParserCanonicalSchema(String i, String o) { input = i; expectedOutput = o; } @Parameters public static List cases() throws IOException { - return CaseFinder.find(data(), "canonical", new ArrayList<>()); + return CaseFinder.find(data(PARSER_DATA_FILE), "canonical", new ArrayList<>()); } @Test @@ -60,6 +71,47 @@ public void testCanonicalization() throws Exception { } } + @RunWith(Parameterized.class) + public static class TestStandardCanonicalSchema { + String input, expectedOutput; + + public TestStandardCanonicalSchema(String i, String o) { + input = i; + expectedOutput = o; + } + + @Parameters + public static List cases() throws IOException { + return 
CaseFinder.find(data(STANDARD_CANONICAL_DATA_FILE), "canonical", new ArrayList<>()); + } + + @Test + public void testCanonicalization() throws Exception { + assertEquals(SchemaNormalization.toCanonicalForm(new Schema.Parser().parse(input)), expectedOutput); + } + } + + @RunWith(Parameterized.class) + public static class TestCustomCanonicalSchema { + String input, expectedOutput; + LinkedHashSet properties = new LinkedHashSet<>(Arrays.asList("format")); + + public TestCustomCanonicalSchema(String i, String o) { + input = i; + expectedOutput = o; + } + + @Parameters + public static List cases() throws IOException { + return CaseFinder.find(data(CUSTOM_CANONICAL_DATA_FILE), "canonical", new ArrayList<>()); + } + + @Test + public void testCanonicalization() throws Exception { + assertEquals(SchemaNormalization.toCanonicalForm(new Schema.Parser().parse(input), properties), expectedOutput); + } + } + @RunWith(Parameterized.class) public static class TestFingerprint { String input, expectedOutput; @@ -71,7 +123,7 @@ public TestFingerprint(String i, String o) { @Parameters public static List cases() throws IOException { - return CaseFinder.find(data(), "fingerprint", new ArrayList<>()); + return CaseFinder.find(data(PARSER_DATA_FILE), "fingerprint", new ArrayList<>()); } @Test @@ -95,7 +147,7 @@ public TestFingerprintInternationalization(String i, String o) { @Parameters public static List cases() throws IOException { - return CaseFinder.find(data(), "fingerprint", new ArrayList<>()); + return CaseFinder.find(data(PARSER_DATA_FILE), "fingerprint", new ArrayList<>()); } @Test @@ -110,10 +162,8 @@ public void testCanonicalization() throws Exception { } } - private static String DATA_FILE = (System.getProperty("share.dir", "../../../share") + "/test/data/schema-tests.txt"); - - private static BufferedReader data() throws IOException { - return Files.newBufferedReader(Paths.get(DATA_FILE), UTF_8); + private static BufferedReader data(String data_file) throws IOException { + 
return Files.newBufferedReader(Paths.get(data_file), UTF_8); } /** diff --git a/share/test/data/custom-schema-tests.txt b/share/test/data/custom-schema-tests.txt new file mode 100644 index 00000000000..4d3871f521b --- /dev/null +++ b/share/test/data/custom-schema-tests.txt @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +// NOTE: the Java implementation provides a slow-but-direct implementation +// of the fingerpriting algorithm which is used to cross-check the +// "fingerprint" values below. Thus, the Java unit-tests provide validation +// for these values, so other languages can just assume they are correct. + + +// 01 +<