apache · rumeshkrish · Feb 7, 2020 · Jun 14, 2020 · Jun 14, 2020 · Jun 14, 2020
diff --git a/doc/src/content/xdocs/spec.xml b/doc/src/content/xdocs/spec.xml
@@ -1310,6 +1310,92 @@
  </ul>
  </section>
 
+ <section>
+ <title>Standard Canonical Form for Schemas</title>
+
+ <p>One of defined way to normalize the avro schema using
+ <em>Standard Canonical Form Transformation</em>. This involves
+ stripping unwanted properties and maintain same canonical
+ ordering. The canonical ordering involves ordering avro
+ reserved properties followed by custom properties if mentioned while
+ transforming. Normalization schema which helps to reduce the
+ total memory size of schema (removed unwanted properties and whitespace)
+ while transfer avro schema between two system and also reduce the parsing
+ time for compatibility check and schema evolution.
+ </p>
+
+ <p><em>Standard Canonical Form</em> is a transformation of a schema
+ into standard canonical ordered. It contains only avro reserved
+ properties <code>"name", "type", "fields", "symbols", "items", "values",
+ "logicalType", "size", "order", "doc", "aliases", "default"</code>
+ and <em>other (custom properties)</em> schema properties.
+ </p>
+
+ <section>
+ <title>Transforming into Standard Canonical Form</title>
+
+ <p>Assuming an input schema (in JSON form) that's already
+ UTF-8 text for a <em>valid</em> Avro schema (including all
+ quotes as required by JSON), the following transformations
+ will produce its Standard Canonical Form:</p>
+ <ul>
+ <li> [PRIMITIVES] Convert primitive schemas to their simple
+ form (e.g., <code>int</code> instead of
+ <code>{"type":"int"}</code>).</li>
+
+ <li> [FULLNAMES] Replace short names with fullnames, using
+ applicable namespaces to do so. Then eliminate
+ <code>namespace</code> attributes, which are now redundant.</li>
+
+ <li> [STRIP] Keep only attributes that are relevant to
+ reserved properties, which are:
+ <code>type</code>, <code>name</code>,
+ <code>fields</code>, <code>symbols</code>,
+ <code>items</code>, <code>values</code>,
+ <code>logicalType</code>, <code>size</code>,
+ <code>order</code>, <code>doc</code>
+ <code>aliases</code> and <code>default</code>.
+ Strip all others user defined properties (e.g., <code>format</code>).</li>
+
+ <li> [ORDER] Order the appearance of fields of JSON objects
+ as follows: <code>name</code>, <code>type</code>,
+ <code>fields</code>, <code>symbols</code>,
+ <code>items</code>, <code>values</code>,
+ <code>logicalType</code>, <code>size</code>,
+ <code>order</code>, <code>doc</code>,
+ <code>aliases</code>, <code>default</code>.
+ For example, if an object has <code>type</code>,
+ <code>name</code>, and <code>size</code> fields, then the
+ <code>name</code> field should appear first, followed by the
+ <code>type</code> and then the <code>size</code> fields.</li>
+
+ <li> [STRINGS] For all JSON string literals in the schema
+ text, replace any escaped characters (e.g., \uXXXX escapes)
+ with their UTF-8 equivalents.</li>
+
+ <li> [INTEGERS] Eliminate quotes around and any leading
+ zeros in front of JSON integer literals (which appear in the
+ <code>size</code> attributes of <code>fixed</code> schemas).</li>
+
+ <li> [WHITESPACE] Eliminate all whitespace in JSON outside of string literals.</li>
+ </ul>
+ </section>
+
+ <section>
+ <title>Transforming with Custom Properties</title>
+
+ <p>In addition to the standard canonical form transformation, including
+ <em>custom</em> <code>Schema</code> or <code>Field</code> properties by
+ passing the properties names while transforming.
+ For example, if an object has <code>format</code>, <code>type</code>,
+ <code>name</code>, and <code>size</code> fields, then the
+ <code>name</code> field should appear first, followed by the
+ <code>type</code>, <code>size</code> and then <code>format</code>
+ (custom properties) fields.
+ </p>
+ </section>
+ </section>
+
  <section id="schema_fingerprints">
  <title>Schema Fingerprints</title>
 

diff --git a/lang/java/avro/src/main/java/org/apache/avro/SchemaNormalization.java b/lang/java/avro/src/main/java/org/apache/avro/SchemaNormalization.java
@@ -17,17 +17,22 @@
  */
 package org.apache.avro;
 
+import org.apache.avro.util.internal.JacksonUtils;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
 import java.util.Map;
 import java.util.HashMap;
+import java.util.TreeSet;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 
 /**
- * Collection of static methods for generating the canonical form of schemas
- * (see {@link #toParsingForm}) -- and fingerprints of canonical forms
- * ({@link #fingerprint}).
+ * Collection of static methods for generating the parser canonical form of
+ * schemas (see {@link #toParsingForm}), standard canonical form of schemas (see
+ * {@link #toCanonicalForm}) with user defined properties and fingerprints of
+ * canonical forms ({@link #fingerprint}).
  */
 public class SchemaNormalization {
 
@@ -38,9 +43,31 @@ private SchemaNormalization() {
  * Returns "Parsing Canonical Form" of a schema as defined by Avro spec.
  */
  public static String toParsingForm(Schema s) {
+ return toNormalizedForm(s, true, new LinkedHashSet<>());
+ }
+
+ /**
+ * Returns "Standard Canonical Form" of a schema as defined by Avro spec.
+ */
+ public static String toCanonicalForm(Schema s) {
+ return toCanonicalForm(s, new LinkedHashSet<>());
+ }
+
+ /**
+ * Returns "Standard Canonical Form" of a schema as defined by Avro spec with
+ * additional user standard properties.
+ */
+ public static String toCanonicalForm(Schema s, LinkedHashSet<String> properties) {
+ LinkedHashSet<String> reservedProperties = new LinkedHashSet<>(Arrays.asList("name", "type", "fields", "symbols",
+ "items", "values", "logicalType", "size", "order", "doc", "aliases", "default"));
+ properties.removeAll(reservedProperties);
+ return toNormalizedForm(s, false, properties);
+ }
+
+ private static String toNormalizedForm(Schema s, Boolean ps, LinkedHashSet<String> aps) {
  try {
  Map<String, String> env = new HashMap<>();
- return build(env, s, new StringBuilder()).toString();
+ return build(env, s, new StringBuilder(), ps, aps).toString();
  } catch (IOException e) {
  // Shouldn't happen, b/c StringBuilder can't throw IOException
  throw new RuntimeException(e);
@@ -103,12 +130,19 @@ public static long parsingFingerprint64(Schema s) {
  return fingerprint64(toParsingForm(s).getBytes(StandardCharsets.UTF_8));
  }
 
- private static Appendable build(Map<String, String> env, Schema s, Appendable o) throws IOException {
+ private static Appendable build(Map<String, String> env, Schema s, Appendable o, Boolean ps,
+ LinkedHashSet<String> aps) throws IOException {
  boolean firstTime = true;
  Schema.Type st = s.getType();
+ LogicalType lt = null;
+ if (!ps)
+ lt = s.getLogicalType();
  switch (st) {
  default: // boolean, bytes, double, float, int, long, null, string
- return o.append('"').append(st.getName()).append('"');
+ if (!ps && lt != null)
+ return writeLogicalType(s, lt, o, aps);
+ else
+ return o.append('"').append(st.getName()).append('"');
 
  case UNION:
  o.append('[');
@@ -117,17 +151,19 @@ private static Appendable build(Map<String, String> env, Schema s, Appendable o)
  o.append(',');
  else
  firstTime = false;
- build(env, b, o);
+ build(env, b, o, ps, aps);
  }
  return o.append(']');
 
  case ARRAY:
  case MAP:
  o.append("{\"type\":\"").append(st.getName()).append("\"");
  if (st == Schema.Type.ARRAY)
- build(env, s.getElementType(), o.append(",\"items\":"));
+ build(env, s.getElementType(), o.append(",\"items\":"), ps, aps);
  else
- build(env, s.getValueType(), o.append(",\"values\":"));
+ build(env, s.getValueType(), o.append(",\"values\":"), ps, aps);
+ if (!ps)
+ setSimpleProps(o, s.getObjectProps(), aps); // adding the reserved property if not parser canonical schema
  return o.append("}");
 
  case ENUM:
@@ -160,14 +196,65 @@ private static Appendable build(Map<String, String> env, Schema s, Appendable o)
  else
  firstTime = false;
  o.append("{\"name\":\"").append(f.name()).append("\"");
- build(env, f.schema(), o.append(",\"type\":")).append("}");
+ build(env, f.schema(), o.append(",\"type\":"), ps, aps);
+ if (!ps)
+ setFieldProps(o, f, aps); // if standard canonical form then add reserved properties
+ o.append("}");
  }
  o.append("]");
  }
+ if (!ps) {
+ setComplexProps(o, s);
+ setSimpleProps(o, s.getObjectProps(), aps);
+ } // adding the reserved property if not parser canonical schema
  return o.append("}");
  }
  }
 
+ private static Appendable writeLogicalType(Schema s, LogicalType lt, Appendable o, LinkedHashSet<String> aps)
+ throws IOException {
+ o.append("{\"type\":\"").append(s.getType().getName()).append("\"");
+ o.append("\"").append(LogicalType.LOGICAL_TYPE_PROP).append("\":\"").append(lt.getName()).append("\"");
+ if (lt.getName().equals("decimal")) {
+ LogicalTypes.Decimal dlt = (LogicalTypes.Decimal) lt;
+ o.append(",\"precision\":").append(Integer.toString(dlt.getPrecision()));
+ if (dlt.getScale() != 0)
+ o.append(",\"scale\":").append(Integer.toString(dlt.getScale()));
+ }
+ // adding the reserved property
+ setSimpleProps(o, s.getObjectProps(), aps);
+ return o.append("}");
+ }
+
+ private static void setSimpleProps(Appendable o, Map<String, Object> schemaProps, LinkedHashSet<String> aps)
+ throws IOException {
+ for (String propKey : aps) {
+ if (schemaProps.containsKey(propKey)) {
+ String propValue = JacksonUtils.toJsonNode(schemaProps.get(propKey)).toString();
+ o.append(",\"").append(propKey).append("\":").append(propValue);
+ }
+ }
+ }
+
+ private static void setComplexProps(Appendable o, Schema s) throws IOException {
+ if (s.getDoc() != null && !s.getDoc().isEmpty())
+ o.append(",\"doc\":\"").append(s.getDoc()).append("\"");
+ if (s.getAliases() != null && !s.getAliases().isEmpty())
+ o.append(",\"aliases\":").append(JacksonUtils.toJsonNode(new TreeSet<String>(s.getAliases())).toString());
+ }
+
+ private static void setFieldProps(Appendable o, Schema.Field f, LinkedHashSet<String> aps) throws IOException {
+ if (f.order() != null)
+ o.append(",\"order\":\"").append(f.order().toString()).append("\"");
+ if (f.doc() != null)
+ o.append(",\"doc\":\"").append(f.doc()).append("\"");
+ if (!f.aliases().isEmpty())
+ o.append(",\"aliases\":").append(JacksonUtils.toJsonNode(new TreeSet<String>(f.aliases())).toString());
+ if (f.defaultVal() != null)
+ o.append(",\"default\":").append(JacksonUtils.toJsonNode(f.defaultVal()).toString());
+ setSimpleProps(o, f.getObjectProps(), aps);
+ }
+
  final static long EMPTY64 = 0xc15d213aa4d7a795L;
 
  /* An inner class ensures that FP_TABLE initialized only when needed. */