canonical schema

* Huge thanks to Omid Aladini for the initial work for this feature! * Codec.CanonicalSchema() returns the parsing canonical form of the schema. https://avro.apache.org/docs/current/spec.html#Transforming+into+Parsing+Canonical+Form * This is considered work towards eventually having https://avro.apache.org/docs/1.8.2/spec.html#schema_fingerprints * The `fixed` type's `size` property allows either an integer or a quoted integer, due to implicit allowance in the _Transforming into Parsing Canonical Form_ section of the Avro Specification. Both of the following are permitted, although only the first is legal. {"type":"fixed","name":"example1","size":16} {"type":"fixed","name":"example1","size":"16"} * NewCodec returns an error when the provided Avro schema is an unquoted primitive type. The Avro Specification says that a schema may be a JSON string, naming a defined type, or a JSON object. A JSON string must have a starting and ending quote. This is an invalid schema: `int` This is a valid schema: `"int"` See: linkedin#35
wfscheper · Apr 27, 2018 · fa8f6a3 · fa8f6a3
1 parent e713d59
commit fa8f6a3
Show file tree

Hide file tree

Showing 20 changed files with 847 additions and 334 deletions.
diff --git a/binary_test.go b/binary_test.go
@@ -19,7 +19,7 @@ import (
 var morePositiveThanMaxBlockCount, morePositiveThanMaxBlockSize, moreNegativeThanMaxBlockCount, mostNegativeBlockCount []byte
 
 func init() {
-	c, err := NewCodec("long")
+	c, err := NewCodec(`"long"`)
 	if err != nil {
 		panic(err)
 	}

diff --git a/boolean_test.go b/boolean_test.go
@@ -12,21 +12,21 @@ package goavro
 import "testing"
 
 func TestSchemaPrimitiveCodecBoolean(t *testing.T) {
-	testSchemaPrimativeCodec(t, "boolean")
+	testSchemaPrimativeCodec(t, `"boolean"`)
 }
 
 func TestPrimitiveBooleanBinary(t *testing.T) {
-	testBinaryEncodeFailBadDatumType(t, "boolean", 0)
-	testBinaryEncodeFailBadDatumType(t, "boolean", 1)
-	testBinaryDecodeFailShortBuffer(t, "boolean", nil)
-	testBinaryCodecPass(t, "boolean", false, []byte{0})
-	testBinaryCodecPass(t, "boolean", true, []byte{1})
+	testBinaryEncodeFailBadDatumType(t, `"boolean"`, 0)
+	testBinaryEncodeFailBadDatumType(t, `"boolean"`, 1)
+	testBinaryDecodeFailShortBuffer(t, `"boolean"`, nil)
+	testBinaryCodecPass(t, `"boolean"`, false, []byte{0})
+	testBinaryCodecPass(t, `"boolean"`, true, []byte{1})
 }
 
 func TestPrimitiveBooleanText(t *testing.T) {
-	testTextEncodeFailBadDatumType(t, "boolean", 0)
-	testTextEncodeFailBadDatumType(t, "boolean", 1)
-	testTextDecodeFailShortBuffer(t, "boolean", nil)
-	testTextCodecPass(t, "boolean", false, []byte("false"))
-	testTextCodecPass(t, "boolean", true, []byte("true"))
+	testTextEncodeFailBadDatumType(t, `"boolean"`, 0)
+	testTextEncodeFailBadDatumType(t, `"boolean"`, 1)
+	testTextDecodeFailShortBuffer(t, `"boolean"`, nil)
+	testTextCodecPass(t, `"boolean"`, false, []byte("false"))
+	testTextCodecPass(t, `"boolean"`, true, []byte("true"))
 }
diff --git a/bytes.go b/bytes.go
@@ -14,6 +14,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"unicode"
 	"unicode/utf16"
 	"unicode/utf8"
@@ -440,3 +441,80 @@ var (
 	sliceTab            = []byte("\\t")
 	sliceUnicode        = []byte("\\u")
 )
+
+// decodedStringFromJSON decodes the double-quoted JSON string literal found at
+// the start of buf.
+//
+// DEBUG -- remove function prior to committing
+//
+// On success it returns the decoded string, the bytes of buf that follow the
+// closing double quote, and a nil error. On failure it returns an empty
+// string, either buf itself or the bytes after the position where decoding
+// stopped, and an error whose message begins with "cannot decode string".
+//
+// Every call also writes a trace line containing buf to os.Stderr.
+func decodedStringFromJSON(buf []byte) (string, []byte, error) {
+	// Debug trace; this is the only use of os in this function.
+	fmt.Fprintf(os.Stderr, "decodedStringFromJSON(%v)\n", buf)
+	buflen := len(buf)
+	// The shortest acceptable input is the two-byte empty string `""`.
+	if buflen < 2 {
+		return "", buf, fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
+	}
+	if buf[0] != '"' {
+		return "", buf, fmt.Errorf("cannot decode string: expected initial '\"'; found: %#U", buf[0])
+	}
+	var newBytes []byte
+	var escaped, ok bool
+	// Walk the bytes after the opening double quote. The loop returns as soon
+	// as it finds an unescaped double quote; falling out of the loop means the
+	// closing quote was never seen.
+	for i := 1; i < buflen; i++ {
+		b := buf[i]
+		if escaped {
+			// b is the byte immediately following a backslash.
+			escaped = false
+			// unescapeSpecialJSON is defined elsewhere in this package; when it
+			// reports ok, the byte it returns is what gets appended.
+			if b, ok = unescapeSpecialJSON(b); ok {
+				newBytes = append(newBytes, b)
+				continue
+			}
+			if b == 'u' {
+				// i indexes the 'u'. The four hexadecimal digits occupy
+				// buf[i+1:i+5] and at least one more byte (the closing quote)
+				// must follow them, so i may be at most buflen-6.
+				if i > buflen-6 {
+					return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
+				}
+				v, err := parseUint64FromHexSlice(buf[i+1 : i+5])
+				if err != nil {
+					return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", err)
+				}
+				i += 4 // now at the fourth hex digit; the loop's i++ steps past it
+
+				nbl := len(newBytes)
+				newBytes = append(newBytes, 0, 0, 0, 0) // grow to make room for UTF-8 encoded rune
+
+				r := rune(v)
+				if utf16.IsSurrogate(r) {
+					i++ // step past the final hexadecimal digit of the first value
+
+					// Expect the second half of the surrogate pair: i must
+					// index a backslash followed by 'u', with the four digits
+					// at buf[i+2:i+6] still inside the buffer.
+					if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' {
+						return "", buf[i+1:], errors.New("cannot decode string: missing second half of surrogate pair")
+					}
+
+					v, err = parseUint64FromHexSlice(buf[i+2 : i+6])
+					if err != nil {
+						return "", buf[i+1:], fmt.Errorf("cannot decode string: cannot decode second half of surrogate pair: %s", err)
+					}
+					i += 5 // now at the fourth hex digit of the second value; the loop's i++ steps past it
+
+					// Get code point by combining high and low surrogate bits
+					r = utf16.DecodeRune(r, rune(v))
+				}
+
+				width := utf8.EncodeRune(newBytes[nbl:], r) // write UTF-8 encoding of code point into the reserved bytes
+				newBytes = newBytes[:nbl+width]             // trim off excess bytes
+				continue
+			}
+			// Any other escaped byte is copied through unchanged.
+			newBytes = append(newBytes, b)
+			continue
+		}
+		if b == '\\' {
+			// Remember the backslash; the next byte is handled as an escape.
+			escaped = true
+			continue
+		}
+		if b == '"' {
+			// Unescaped closing quote: done. The remainder starts after it.
+			return string(newBytes), buf[i+1:], nil
+		}
+		newBytes = append(newBytes, b)
+	}
+	// Ran off the end of buf without finding the closing double quote.
+	return "", buf, fmt.Errorf("cannot decode string: expected final '\"'; found: %#U", buf[buflen-1])
+}
diff --git a/bytes_test.go b/bytes_test.go
@@ -10,113 +10,114 @@
 package goavro
 
 import (
+	"encoding/json"
 	"strings"
 	"testing"
 )
 
 func TestSchemaPrimitiveCodecBytes(t *testing.T) {
-	testSchemaPrimativeCodec(t, "bytes")
+	testSchemaPrimativeCodec(t, `"bytes"`)
 }
 
 func TestPrimitiveBytesBinary(t *testing.T) {
-	testBinaryEncodeFailBadDatumType(t, "bytes", 13)
-	testBinaryDecodeFailShortBuffer(t, "bytes", nil)
-	testBinaryDecodeFailShortBuffer(t, "bytes", []byte{2})
-	testBinaryCodecPass(t, "bytes", []byte(""), []byte("\x00"))
-	testBinaryCodecPass(t, "bytes", []byte("some bytes"), []byte("\x14some bytes"))
+	testBinaryEncodeFailBadDatumType(t, `"bytes"`, 13)
+	testBinaryDecodeFailShortBuffer(t, `"bytes"`, nil)
+	testBinaryDecodeFailShortBuffer(t, `"bytes"`, []byte{2})
+	testBinaryCodecPass(t, `"bytes"`, []byte(""), []byte("\x00"))
+	testBinaryCodecPass(t, `"bytes"`, []byte("some bytes"), []byte("\x14some bytes"))
 }
 
 func TestPrimitiveBytesText(t *testing.T) {
-	testTextEncodeFailBadDatumType(t, "bytes", 42)
-	testTextDecodeFailShortBuffer(t, "bytes", []byte(``))
-	testTextDecodeFailShortBuffer(t, "bytes", []byte(`"`))
-	testTextDecodeFail(t, "bytes", []byte(`..`), "expected initial \"")
-	testTextDecodeFail(t, "bytes", []byte(`".`), "expected final \"")
-
-	testTextCodecPass(t, "bytes", []byte(""), []byte("\"\""))
-	testTextCodecPass(t, "bytes", []byte("a"), []byte("\"a\""))
-	testTextCodecPass(t, "bytes", []byte("ab"), []byte("\"ab\""))
-	testTextCodecPass(t, "bytes", []byte("a\"b"), []byte("\"a\\\"b\""))
-	testTextCodecPass(t, "bytes", []byte("a\\b"), []byte("\"a\\\\b\""))
-	testTextCodecPass(t, "bytes", []byte("a/b"), []byte("\"a\\/b\""))
-
-	testTextCodecPass(t, "bytes", []byte("a\bb"), []byte(`"a\bb"`))
-	testTextCodecPass(t, "bytes", []byte("a\fb"), []byte(`"a\fb"`))
-	testTextCodecPass(t, "bytes", []byte("a\nb"), []byte(`"a\nb"`))
-	testTextCodecPass(t, "bytes", []byte("a\rb"), []byte(`"a\rb"`))
-	testTextCodecPass(t, "bytes", []byte("a\tb"), []byte(`"a\tb"`))
-	testTextCodecPass(t, "bytes", []byte("a	b"), []byte(`"a\tb"`)) // tab byte between a and b
-
-	testTextDecodeFail(t, "bytes", []byte("\"\\u\""), "short buffer")
-	testTextDecodeFail(t, "bytes", []byte("\"\\u.\""), "short buffer")
-	testTextDecodeFail(t, "bytes", []byte("\"\\u..\""), "short buffer")
-	testTextDecodeFail(t, "bytes", []byte("\"\\u...\""), "short buffer")
-
-	testTextDecodeFail(t, "bytes", []byte("\"\\u////\""), "invalid byte") // < '0'
-	testTextDecodeFail(t, "bytes", []byte("\"\\u::::\""), "invalid byte") // > '9'
-	testTextDecodeFail(t, "bytes", []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
-	testTextDecodeFail(t, "bytes", []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
-	testTextDecodeFail(t, "bytes", []byte("\"\\u````\""), "invalid byte") // < 'a'
-	testTextDecodeFail(t, "bytes", []byte("\"\\ugggg\""), "invalid byte") // > 'f'
-
-	testTextCodecPass(t, "bytes", []byte("⌘ "), []byte("\"\\u0001\\u00E2\\u008C\\u0098 \""))
-	testTextCodecPass(t, "bytes", []byte("😂"), []byte(`"\u00F0\u009F\u0098\u0082"`))
+	testTextEncodeFailBadDatumType(t, `"bytes"`, 42)
+	testTextDecodeFailShortBuffer(t, `"bytes"`, []byte(``))
+	testTextDecodeFailShortBuffer(t, `"bytes"`, []byte(`"`))
+	testTextDecodeFail(t, `"bytes"`, []byte(`..`), "expected initial \"")
+	testTextDecodeFail(t, `"bytes"`, []byte(`".`), "expected final \"")
+
+	testTextCodecPass(t, `"bytes"`, []byte(""), []byte("\"\""))
+	testTextCodecPass(t, `"bytes"`, []byte("a"), []byte("\"a\""))
+	testTextCodecPass(t, `"bytes"`, []byte("ab"), []byte("\"ab\""))
+	testTextCodecPass(t, `"bytes"`, []byte("a\"b"), []byte("\"a\\\"b\""))
+	testTextCodecPass(t, `"bytes"`, []byte("a\\b"), []byte("\"a\\\\b\""))
+	testTextCodecPass(t, `"bytes"`, []byte("a/b"), []byte("\"a\\/b\""))
+
+	testTextCodecPass(t, `"bytes"`, []byte("a\bb"), []byte(`"a\bb"`))
+	testTextCodecPass(t, `"bytes"`, []byte("a\fb"), []byte(`"a\fb"`))
+	testTextCodecPass(t, `"bytes"`, []byte("a\nb"), []byte(`"a\nb"`))
+	testTextCodecPass(t, `"bytes"`, []byte("a\rb"), []byte(`"a\rb"`))
+	testTextCodecPass(t, `"bytes"`, []byte("a\tb"), []byte(`"a\tb"`))
+	testTextCodecPass(t, `"bytes"`, []byte("a	b"), []byte(`"a\tb"`)) // tab byte between a and b
+
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u\""), "short buffer")
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u.\""), "short buffer")
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u..\""), "short buffer")
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u...\""), "short buffer")
+
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u////\""), "invalid byte") // < '0'
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u::::\""), "invalid byte") // > '9'
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\u````\""), "invalid byte") // < 'a'
+	testTextDecodeFail(t, `"bytes"`, []byte("\"\\ugggg\""), "invalid byte") // > 'f'
+
+	testTextCodecPass(t, `"bytes"`, []byte("⌘ "), []byte("\"\\u0001\\u00E2\\u008C\\u0098 \""))
+	testTextCodecPass(t, `"bytes"`, []byte("😂"), []byte(`"\u00F0\u009F\u0098\u0082"`))
 }
 
 func TestSchemaPrimitiveStringCodec(t *testing.T) {
-	testSchemaPrimativeCodec(t, "string")
+	testSchemaPrimativeCodec(t, `"string"`)
 }
 
 func TestPrimitiveStringBinary(t *testing.T) {
-	testBinaryEncodeFailBadDatumType(t, "string", 42)
-	testBinaryDecodeFailShortBuffer(t, "string", nil)
-	testBinaryDecodeFailShortBuffer(t, "string", []byte{2})
-	testBinaryCodecPass(t, "string", "", []byte("\x00"))
-	testBinaryCodecPass(t, "string", "some string", []byte("\x16some string"))
+	testBinaryEncodeFailBadDatumType(t, `"string"`, 42)
+	testBinaryDecodeFailShortBuffer(t, `"string"`, nil)
+	testBinaryDecodeFailShortBuffer(t, `"string"`, []byte{2})
+	testBinaryCodecPass(t, `"string"`, "", []byte("\x00"))
+	testBinaryCodecPass(t, `"string"`, "some string", []byte("\x16some string"))
 }
 
 func TestPrimitiveStringText(t *testing.T) {
-	testTextEncodeFailBadDatumType(t, "string", 42)
-	testTextDecodeFailShortBuffer(t, "string", []byte(``))
-	testTextDecodeFailShortBuffer(t, "string", []byte(`"`))
-	testTextDecodeFail(t, "string", []byte(`..`), "expected initial \"")
-	testTextDecodeFail(t, "string", []byte(`".`), "expected final \"")
-
-	testTextCodecPass(t, "string", "", []byte("\"\""))
-	testTextCodecPass(t, "string", "a", []byte("\"a\""))
-	testTextCodecPass(t, "string", "ab", []byte("\"ab\""))
-	testTextCodecPass(t, "string", "a\"b", []byte("\"a\\\"b\""))
-	testTextCodecPass(t, "string", "a\\b", []byte("\"a\\\\b\""))
-	testTextCodecPass(t, "string", "a/b", []byte("\"a\\/b\""))
-
-	testTextCodecPass(t, "string", "a\bb", []byte(`"a\bb"`))
-	testTextCodecPass(t, "string", "a\fb", []byte(`"a\fb"`))
-	testTextCodecPass(t, "string", "a\nb", []byte(`"a\nb"`))
-	testTextCodecPass(t, "string", "a\rb", []byte(`"a\rb"`))
-	testTextCodecPass(t, "string", "a\tb", []byte(`"a\tb"`))
-	testTextCodecPass(t, "string", "a	b", []byte(`"a\tb"`)) // tab byte between a and b
-
-	testTextDecodeFail(t, "string", []byte("\"\\u\""), "short buffer")
-	testTextDecodeFail(t, "string", []byte("\"\\u.\""), "short buffer")
-	testTextDecodeFail(t, "string", []byte("\"\\u..\""), "short buffer")
-	testTextDecodeFail(t, "string", []byte("\"\\u...\""), "short buffer")
-
-	testTextDecodeFail(t, "string", []byte("\"\\u////\""), "invalid byte") // < '0'
-	testTextDecodeFail(t, "string", []byte("\"\\u::::\""), "invalid byte") // > '9'
-	testTextDecodeFail(t, "string", []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
-	testTextDecodeFail(t, "string", []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
-	testTextDecodeFail(t, "string", []byte("\"\\u````\""), "invalid byte") // < 'a'
-	testTextDecodeFail(t, "string", []byte("\"\\ugggg\""), "invalid byte") // > 'f'
-
-	testTextCodecPass(t, "string", "⌘ ", []byte("\"\\u0001\\u2318 \""))
-	testTextCodecPass(t, "string", "😂 ", []byte("\"\\u0001\\uD83D\\uDE02 \""))
-
-	testTextDecodeFail(t, "string", []byte("\"\\"), "short buffer")
-	testTextDecodeFail(t, "string", []byte("\"\\uD83D\""), "surrogate pair")
-	testTextDecodeFail(t, "string", []byte("\"\\uD83D\\u\""), "surrogate pair")
-	testTextDecodeFail(t, "string", []byte("\"\\uD83D\\uD\""), "surrogate pair")
-	testTextDecodeFail(t, "string", []byte("\"\\uD83D\\uDE\""), "surrogate pair")
-	testTextDecodeFail(t, "string", []byte("\"\\uD83D\\uDE0\""), "invalid byte")
+	testTextEncodeFailBadDatumType(t, `"string"`, 42)
+	testTextDecodeFailShortBuffer(t, `"string"`, []byte(``))
+	testTextDecodeFailShortBuffer(t, `"string"`, []byte(`"`))
+	testTextDecodeFail(t, `"string"`, []byte(`..`), "expected initial \"")
+	testTextDecodeFail(t, `"string"`, []byte(`".`), "expected final \"")
+
+	testTextCodecPass(t, `"string"`, "", []byte("\"\""))
+	testTextCodecPass(t, `"string"`, "a", []byte("\"a\""))
+	testTextCodecPass(t, `"string"`, "ab", []byte("\"ab\""))
+	testTextCodecPass(t, `"string"`, "a\"b", []byte("\"a\\\"b\""))
+	testTextCodecPass(t, `"string"`, "a\\b", []byte("\"a\\\\b\""))
+	testTextCodecPass(t, `"string"`, "a/b", []byte("\"a\\/b\""))
+
+	testTextCodecPass(t, `"string"`, "a\bb", []byte(`"a\bb"`))
+	testTextCodecPass(t, `"string"`, "a\fb", []byte(`"a\fb"`))
+	testTextCodecPass(t, `"string"`, "a\nb", []byte(`"a\nb"`))
+	testTextCodecPass(t, `"string"`, "a\rb", []byte(`"a\rb"`))
+	testTextCodecPass(t, `"string"`, "a\tb", []byte(`"a\tb"`))
+	testTextCodecPass(t, `"string"`, "a	b", []byte(`"a\tb"`)) // tab byte between a and b
+
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u\""), "short buffer")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u.\""), "short buffer")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u..\""), "short buffer")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u...\""), "short buffer")
+
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u////\""), "invalid byte") // < '0'
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u::::\""), "invalid byte") // > '9'
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
+	testTextDecodeFail(t, `"string"`, []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
+	testTextDecodeFail(t, `"string"`, []byte("\"\\u````\""), "invalid byte") // < 'a'
+	testTextDecodeFail(t, `"string"`, []byte("\"\\ugggg\""), "invalid byte") // > 'f'
+
+	testTextCodecPass(t, `"string"`, "⌘ ", []byte("\"\\u0001\\u2318 \""))
+	testTextCodecPass(t, `"string"`, "😂 ", []byte("\"\\u0001\\uD83D\\uDE02 \""))
+
+	testTextDecodeFail(t, `"string"`, []byte("\"\\"), "short buffer")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\""), "surrogate pair")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\u\""), "surrogate pair")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\uD\""), "surrogate pair")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\uDE\""), "surrogate pair")
+	testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\uDE0\""), "invalid byte")
 }
 
 func TestUnescapeUnicode(t *testing.T) {
@@ -150,3 +151,29 @@ func TestUnescapeUnicode(t *testing.T) {
 	checkGood(t, "\u263a\ufe0f", "☺️")
 	checkGood(t, "\u65e5\u672c\u8a9e", "日本語")
 }
+
+// TestJSONUnmarshalStrings checks that encoding/json decodes quoted JSON
+// string literals, including \uXXXX escapes, into the expected Go strings.
+// Each case is checked independently, so one failure does not hide the rest.
+func TestJSONUnmarshalStrings(t *testing.T) {
+	cases := []struct {
+		arg  string // JSON text handed to json.Unmarshal
+		want string // expected decoded Go string
+	}{
+		{arg: `"A1"`, want: "A1"},
+		{arg: `"\u0042\u0032"`, want: "B2"}, // back-tick literal keeps the backslashes, so json sees the \u escapes
+	}
+
+	for _, c := range cases {
+		var raw interface{}
+		if err := json.Unmarshal([]byte(c.arg), &raw); err != nil {
+			t.Errorf("CASE: %s; ERROR: %s", c.arg, err)
+			continue // keep checking the remaining cases
+		}
+		got, ok := raw.(string)
+		if !ok {
+			// Report the dynamic type of raw: after a failed assertion got
+			// is the zero string, so formatting it with %T always says "string".
+			t.Errorf("CASE: %s; GOT: %T; WANT: string", c.arg, raw)
+			continue
+		}
+		if got != c.want {
+			t.Errorf("CASE: %s; GOT: %q; WANT: %q", c.arg, got, c.want)
+		}
+	}
+}