Skip to content

Commit

Permalink
canonical schema
Browse files Browse the repository at this point in the history
* Huge thanks to Omid Aladini for the initial work for this feature!

* Codec.CanonicalSchema() returns the parsing canonical form of the
  schema. https://avro.apache.org/docs/current/spec.html#Transforming+into+Parsing+Canonical+Form

* This is considered work towards eventually having
  https://avro.apache.org/docs/1.8.2/spec.html#schema_fingerprints

* The `fixed` type's `size` property allows either an integer or a
  quoted integer, due to implicit allowance in the _Transforming into
  Parsing Canonical Form_ section of the Avro Specification. Both of
  the following are permitted, although only the first is legal.

  {"type":"fixed","name":"example1","size":16}

  {"type":"fixed","name":"example1","size":"16"}

* NewCodec returns an error when the provided Avro schema is an
  unquoted primitive type. The Avro Specification says that a schema
  may be a JSON string, naming a defined type, or a JSON object. A
  JSON string must have a starting and ending quote.

  This is an invalid schema: `int`
  This is a valid schema:    `"int"`

See: linkedin#35
  • Loading branch information
Karrick S. McDermott committed Apr 27, 2018
1 parent e713d59 commit fa8f6a3
Show file tree
Hide file tree
Showing 20 changed files with 847 additions and 334 deletions.
2 changes: 1 addition & 1 deletion binary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
var morePositiveThanMaxBlockCount, morePositiveThanMaxBlockSize, moreNegativeThanMaxBlockCount, mostNegativeBlockCount []byte

func init() {
c, err := NewCodec("long")
c, err := NewCodec(`"long"`)
if err != nil {
panic(err)
}
Expand Down
22 changes: 11 additions & 11 deletions boolean_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,21 @@ package goavro
import "testing"

func TestSchemaPrimitiveCodecBoolean(t *testing.T) {
testSchemaPrimativeCodec(t, "boolean")
testSchemaPrimativeCodec(t, `"boolean"`)
}

func TestPrimitiveBooleanBinary(t *testing.T) {
testBinaryEncodeFailBadDatumType(t, "boolean", 0)
testBinaryEncodeFailBadDatumType(t, "boolean", 1)
testBinaryDecodeFailShortBuffer(t, "boolean", nil)
testBinaryCodecPass(t, "boolean", false, []byte{0})
testBinaryCodecPass(t, "boolean", true, []byte{1})
testBinaryEncodeFailBadDatumType(t, `"boolean"`, 0)
testBinaryEncodeFailBadDatumType(t, `"boolean"`, 1)
testBinaryDecodeFailShortBuffer(t, `"boolean"`, nil)
testBinaryCodecPass(t, `"boolean"`, false, []byte{0})
testBinaryCodecPass(t, `"boolean"`, true, []byte{1})
}

func TestPrimitiveBooleanText(t *testing.T) {
testTextEncodeFailBadDatumType(t, "boolean", 0)
testTextEncodeFailBadDatumType(t, "boolean", 1)
testTextDecodeFailShortBuffer(t, "boolean", nil)
testTextCodecPass(t, "boolean", false, []byte("false"))
testTextCodecPass(t, "boolean", true, []byte("true"))
testTextEncodeFailBadDatumType(t, `"boolean"`, 0)
testTextEncodeFailBadDatumType(t, `"boolean"`, 1)
testTextDecodeFailShortBuffer(t, `"boolean"`, nil)
testTextCodecPass(t, `"boolean"`, false, []byte("false"))
testTextCodecPass(t, `"boolean"`, true, []byte("true"))
}
78 changes: 78 additions & 0 deletions bytes.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"errors"
"fmt"
"io"
"os"
"unicode"
"unicode/utf16"
"unicode/utf8"
Expand Down Expand Up @@ -440,3 +441,80 @@ var (
sliceTab = []byte("\\t")
sliceUnicode = []byte("\\u")
)

// decodedStringFromJSON decodes the JSON string literal found at the start of
// buf. On success it returns the decoded string along with the bytes that
// follow the closing double quote. On failure it returns an empty string, a
// slice of the input, and an error whose message starts with "cannot decode
// string:".
//
// Recognized escapes are the single-byte ones accepted by unescapeSpecialJSON
// and \uXXXX sequences, including UTF-16 surrogate pairs, which are combined
// into a single code point and emitted as UTF-8. Any other escaped byte is
// copied through without its backslash.
//
// DEBUG -- remove function prior to committing
func decodedStringFromJSON(buf []byte) (string, []byte, error) {
	// Debug trace of the raw input; it goes away together with this function.
	fmt.Fprintf(os.Stderr, "decodedStringFromJSON(%v)\n", buf)
	buflen := len(buf)
	if buflen < 2 {
		return "", buf, fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
	}
	if buf[0] != '"' {
		return "", buf, fmt.Errorf("cannot decode string: expected initial '\"'; found: %#U", buf[0])
	}
	var newBytes []byte
	var escaped bool
	// Loop through bytes following initial double quote, but note we will
	// return immediately when find unescaped double quote.
	for i := 1; i < buflen; i++ {
		b := buf[i]
		if escaped {
			escaped = false
			// Use a separate variable for the translated byte so that b still
			// holds the original input byte for the 'u' check and the
			// pass-through append below, whatever the helper returns when it
			// does not recognize the byte.
			if special, ok := unescapeSpecialJSON(b); ok {
				newBytes = append(newBytes, special)
				continue
			}
			if b == 'u' {
				// NOTE: Need at least 4 more bytes to read uint16, but subtract
				// 1 because do not want to count the trailing quote and
				// subtract another 1 because already consumed u but have yet to
				// increment i.
				if i > buflen-6 {
					return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
				}
				v, err := parseUint64FromHexSlice(buf[i+1 : i+5])
				if err != nil {
					return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", err)
				}
				i += 4 // absorb 4 characters: one 'u' and three of the digits

				nbl := len(newBytes)
				newBytes = append(newBytes, 0, 0, 0, 0) // grow to make room for UTF-8 encoded rune

				r := rune(v)
				if utf16.IsSurrogate(r) {
					i++ // absorb final hexadecimal digit from previous value

					// Expect second half of surrogate pair
					if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' {
						return "", buf[i+1:], errors.New("cannot decode string: missing second half of surrogate pair")
					}

					v, err = parseUint64FromHexSlice(buf[i+2 : i+6])
					if err != nil {
						return "", buf[i+1:], fmt.Errorf("cannot decode string: cannot decode second half of surrogate pair: %s", err)
					}
					i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits

					// Get code point by combining high and low surrogate bits
					r = utf16.DecodeRune(r, rune(v))
				}

				width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point
				newBytes = newBytes[:nbl+width]             // trim off excess bytes
				continue
			}
			// Unrecognized escape: keep the byte, drop the backslash.
			newBytes = append(newBytes, b)
			continue
		}
		if b == '\\' {
			escaped = true
			continue
		}
		if b == '"' {
			return string(newBytes), buf[i+1:], nil
		}
		newBytes = append(newBytes, b)
	}
	if escaped {
		// Input ended right after a backslash: the escape sequence is
		// incomplete, so this is a truncated buffer rather than merely a
		// missing closing quote.
		return "", buf, fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
	}
	return "", buf, fmt.Errorf("cannot decode string: expected final '\"'; found: %#U", buf[buflen-1])
}
201 changes: 114 additions & 87 deletions bytes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,113 +10,114 @@
package goavro

import (
"encoding/json"
"strings"
"testing"
)

func TestSchemaPrimitiveCodecBytes(t *testing.T) {
testSchemaPrimativeCodec(t, "bytes")
testSchemaPrimativeCodec(t, `"bytes"`)
}

func TestPrimitiveBytesBinary(t *testing.T) {
testBinaryEncodeFailBadDatumType(t, "bytes", 13)
testBinaryDecodeFailShortBuffer(t, "bytes", nil)
testBinaryDecodeFailShortBuffer(t, "bytes", []byte{2})
testBinaryCodecPass(t, "bytes", []byte(""), []byte("\x00"))
testBinaryCodecPass(t, "bytes", []byte("some bytes"), []byte("\x14some bytes"))
testBinaryEncodeFailBadDatumType(t, `"bytes"`, 13)
testBinaryDecodeFailShortBuffer(t, `"bytes"`, nil)
testBinaryDecodeFailShortBuffer(t, `"bytes"`, []byte{2})
testBinaryCodecPass(t, `"bytes"`, []byte(""), []byte("\x00"))
testBinaryCodecPass(t, `"bytes"`, []byte("some bytes"), []byte("\x14some bytes"))
}

func TestPrimitiveBytesText(t *testing.T) {
testTextEncodeFailBadDatumType(t, "bytes", 42)
testTextDecodeFailShortBuffer(t, "bytes", []byte(``))
testTextDecodeFailShortBuffer(t, "bytes", []byte(`"`))
testTextDecodeFail(t, "bytes", []byte(`..`), "expected initial \"")
testTextDecodeFail(t, "bytes", []byte(`".`), "expected final \"")

testTextCodecPass(t, "bytes", []byte(""), []byte("\"\""))
testTextCodecPass(t, "bytes", []byte("a"), []byte("\"a\""))
testTextCodecPass(t, "bytes", []byte("ab"), []byte("\"ab\""))
testTextCodecPass(t, "bytes", []byte("a\"b"), []byte("\"a\\\"b\""))
testTextCodecPass(t, "bytes", []byte("a\\b"), []byte("\"a\\\\b\""))
testTextCodecPass(t, "bytes", []byte("a/b"), []byte("\"a\\/b\""))

testTextCodecPass(t, "bytes", []byte("a\bb"), []byte(`"a\bb"`))
testTextCodecPass(t, "bytes", []byte("a\fb"), []byte(`"a\fb"`))
testTextCodecPass(t, "bytes", []byte("a\nb"), []byte(`"a\nb"`))
testTextCodecPass(t, "bytes", []byte("a\rb"), []byte(`"a\rb"`))
testTextCodecPass(t, "bytes", []byte("a\tb"), []byte(`"a\tb"`))
testTextCodecPass(t, "bytes", []byte("a b"), []byte(`"a\tb"`)) // tab byte between a and b

testTextDecodeFail(t, "bytes", []byte("\"\\u\""), "short buffer")
testTextDecodeFail(t, "bytes", []byte("\"\\u.\""), "short buffer")
testTextDecodeFail(t, "bytes", []byte("\"\\u..\""), "short buffer")
testTextDecodeFail(t, "bytes", []byte("\"\\u...\""), "short buffer")

testTextDecodeFail(t, "bytes", []byte("\"\\u////\""), "invalid byte") // < '0'
testTextDecodeFail(t, "bytes", []byte("\"\\u::::\""), "invalid byte") // > '9'
testTextDecodeFail(t, "bytes", []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
testTextDecodeFail(t, "bytes", []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
testTextDecodeFail(t, "bytes", []byte("\"\\u````\""), "invalid byte") // < 'a'
testTextDecodeFail(t, "bytes", []byte("\"\\ugggg\""), "invalid byte") // > 'f'

testTextCodecPass(t, "bytes", []byte("⌘ "), []byte("\"\\u0001\\u00E2\\u008C\\u0098 \""))
testTextCodecPass(t, "bytes", []byte("😂"), []byte(`"\u00F0\u009F\u0098\u0082"`))
testTextEncodeFailBadDatumType(t, `"bytes"`, 42)
testTextDecodeFailShortBuffer(t, `"bytes"`, []byte(``))
testTextDecodeFailShortBuffer(t, `"bytes"`, []byte(`"`))
testTextDecodeFail(t, `"bytes"`, []byte(`..`), "expected initial \"")
testTextDecodeFail(t, `"bytes"`, []byte(`".`), "expected final \"")

testTextCodecPass(t, `"bytes"`, []byte(""), []byte("\"\""))
testTextCodecPass(t, `"bytes"`, []byte("a"), []byte("\"a\""))
testTextCodecPass(t, `"bytes"`, []byte("ab"), []byte("\"ab\""))
testTextCodecPass(t, `"bytes"`, []byte("a\"b"), []byte("\"a\\\"b\""))
testTextCodecPass(t, `"bytes"`, []byte("a\\b"), []byte("\"a\\\\b\""))
testTextCodecPass(t, `"bytes"`, []byte("a/b"), []byte("\"a\\/b\""))

testTextCodecPass(t, `"bytes"`, []byte("a\bb"), []byte(`"a\bb"`))
testTextCodecPass(t, `"bytes"`, []byte("a\fb"), []byte(`"a\fb"`))
testTextCodecPass(t, `"bytes"`, []byte("a\nb"), []byte(`"a\nb"`))
testTextCodecPass(t, `"bytes"`, []byte("a\rb"), []byte(`"a\rb"`))
testTextCodecPass(t, `"bytes"`, []byte("a\tb"), []byte(`"a\tb"`))
testTextCodecPass(t, `"bytes"`, []byte("a b"), []byte(`"a\tb"`)) // tab byte between a and b

testTextDecodeFail(t, `"bytes"`, []byte("\"\\u\""), "short buffer")
testTextDecodeFail(t, `"bytes"`, []byte("\"\\u.\""), "short buffer")
testTextDecodeFail(t, `"bytes"`, []byte("\"\\u..\""), "short buffer")
testTextDecodeFail(t, `"bytes"`, []byte("\"\\u...\""), "short buffer")

testTextDecodeFail(t, `"bytes"`, []byte("\"\\u////\""), "invalid byte") // < '0'
testTextDecodeFail(t, `"bytes"`, []byte("\"\\u::::\""), "invalid byte") // > '9'
testTextDecodeFail(t, `"bytes"`, []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
testTextDecodeFail(t, `"bytes"`, []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
testTextDecodeFail(t, `"bytes"`, []byte("\"\\u````\""), "invalid byte") // < 'a'
testTextDecodeFail(t, `"bytes"`, []byte("\"\\ugggg\""), "invalid byte") // > 'f'

testTextCodecPass(t, `"bytes"`, []byte("⌘ "), []byte("\"\\u0001\\u00E2\\u008C\\u0098 \""))
testTextCodecPass(t, `"bytes"`, []byte("😂"), []byte(`"\u00F0\u009F\u0098\u0082"`))
}

func TestSchemaPrimitiveStringCodec(t *testing.T) {
testSchemaPrimativeCodec(t, "string")
testSchemaPrimativeCodec(t, `"string"`)
}

func TestPrimitiveStringBinary(t *testing.T) {
testBinaryEncodeFailBadDatumType(t, "string", 42)
testBinaryDecodeFailShortBuffer(t, "string", nil)
testBinaryDecodeFailShortBuffer(t, "string", []byte{2})
testBinaryCodecPass(t, "string", "", []byte("\x00"))
testBinaryCodecPass(t, "string", "some string", []byte("\x16some string"))
testBinaryEncodeFailBadDatumType(t, `"string"`, 42)
testBinaryDecodeFailShortBuffer(t, `"string"`, nil)
testBinaryDecodeFailShortBuffer(t, `"string"`, []byte{2})
testBinaryCodecPass(t, `"string"`, "", []byte("\x00"))
testBinaryCodecPass(t, `"string"`, "some string", []byte("\x16some string"))
}

func TestPrimitiveStringText(t *testing.T) {
testTextEncodeFailBadDatumType(t, "string", 42)
testTextDecodeFailShortBuffer(t, "string", []byte(``))
testTextDecodeFailShortBuffer(t, "string", []byte(`"`))
testTextDecodeFail(t, "string", []byte(`..`), "expected initial \"")
testTextDecodeFail(t, "string", []byte(`".`), "expected final \"")

testTextCodecPass(t, "string", "", []byte("\"\""))
testTextCodecPass(t, "string", "a", []byte("\"a\""))
testTextCodecPass(t, "string", "ab", []byte("\"ab\""))
testTextCodecPass(t, "string", "a\"b", []byte("\"a\\\"b\""))
testTextCodecPass(t, "string", "a\\b", []byte("\"a\\\\b\""))
testTextCodecPass(t, "string", "a/b", []byte("\"a\\/b\""))

testTextCodecPass(t, "string", "a\bb", []byte(`"a\bb"`))
testTextCodecPass(t, "string", "a\fb", []byte(`"a\fb"`))
testTextCodecPass(t, "string", "a\nb", []byte(`"a\nb"`))
testTextCodecPass(t, "string", "a\rb", []byte(`"a\rb"`))
testTextCodecPass(t, "string", "a\tb", []byte(`"a\tb"`))
testTextCodecPass(t, "string", "a b", []byte(`"a\tb"`)) // tab byte between a and b

testTextDecodeFail(t, "string", []byte("\"\\u\""), "short buffer")
testTextDecodeFail(t, "string", []byte("\"\\u.\""), "short buffer")
testTextDecodeFail(t, "string", []byte("\"\\u..\""), "short buffer")
testTextDecodeFail(t, "string", []byte("\"\\u...\""), "short buffer")

testTextDecodeFail(t, "string", []byte("\"\\u////\""), "invalid byte") // < '0'
testTextDecodeFail(t, "string", []byte("\"\\u::::\""), "invalid byte") // > '9'
testTextDecodeFail(t, "string", []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
testTextDecodeFail(t, "string", []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
testTextDecodeFail(t, "string", []byte("\"\\u````\""), "invalid byte") // < 'a'
testTextDecodeFail(t, "string", []byte("\"\\ugggg\""), "invalid byte") // > 'f'

testTextCodecPass(t, "string", "⌘ ", []byte("\"\\u0001\\u2318 \""))
testTextCodecPass(t, "string", "😂 ", []byte("\"\\u0001\\uD83D\\uDE02 \""))

testTextDecodeFail(t, "string", []byte("\"\\"), "short buffer")
testTextDecodeFail(t, "string", []byte("\"\\uD83D\""), "surrogate pair")
testTextDecodeFail(t, "string", []byte("\"\\uD83D\\u\""), "surrogate pair")
testTextDecodeFail(t, "string", []byte("\"\\uD83D\\uD\""), "surrogate pair")
testTextDecodeFail(t, "string", []byte("\"\\uD83D\\uDE\""), "surrogate pair")
testTextDecodeFail(t, "string", []byte("\"\\uD83D\\uDE0\""), "invalid byte")
testTextEncodeFailBadDatumType(t, `"string"`, 42)
testTextDecodeFailShortBuffer(t, `"string"`, []byte(``))
testTextDecodeFailShortBuffer(t, `"string"`, []byte(`"`))
testTextDecodeFail(t, `"string"`, []byte(`..`), "expected initial \"")
testTextDecodeFail(t, `"string"`, []byte(`".`), "expected final \"")

testTextCodecPass(t, `"string"`, "", []byte("\"\""))
testTextCodecPass(t, `"string"`, "a", []byte("\"a\""))
testTextCodecPass(t, `"string"`, "ab", []byte("\"ab\""))
testTextCodecPass(t, `"string"`, "a\"b", []byte("\"a\\\"b\""))
testTextCodecPass(t, `"string"`, "a\\b", []byte("\"a\\\\b\""))
testTextCodecPass(t, `"string"`, "a/b", []byte("\"a\\/b\""))

testTextCodecPass(t, `"string"`, "a\bb", []byte(`"a\bb"`))
testTextCodecPass(t, `"string"`, "a\fb", []byte(`"a\fb"`))
testTextCodecPass(t, `"string"`, "a\nb", []byte(`"a\nb"`))
testTextCodecPass(t, `"string"`, "a\rb", []byte(`"a\rb"`))
testTextCodecPass(t, `"string"`, "a\tb", []byte(`"a\tb"`))
testTextCodecPass(t, `"string"`, "a b", []byte(`"a\tb"`)) // tab byte between a and b

testTextDecodeFail(t, `"string"`, []byte("\"\\u\""), "short buffer")
testTextDecodeFail(t, `"string"`, []byte("\"\\u.\""), "short buffer")
testTextDecodeFail(t, `"string"`, []byte("\"\\u..\""), "short buffer")
testTextDecodeFail(t, `"string"`, []byte("\"\\u...\""), "short buffer")

testTextDecodeFail(t, `"string"`, []byte("\"\\u////\""), "invalid byte") // < '0'
testTextDecodeFail(t, `"string"`, []byte("\"\\u::::\""), "invalid byte") // > '9'
testTextDecodeFail(t, `"string"`, []byte("\"\\u@@@@\""), "invalid byte") // < 'A'
testTextDecodeFail(t, `"string"`, []byte("\"\\uGGGG\""), "invalid byte") // > 'F'
testTextDecodeFail(t, `"string"`, []byte("\"\\u````\""), "invalid byte") // < 'a'
testTextDecodeFail(t, `"string"`, []byte("\"\\ugggg\""), "invalid byte") // > 'f'

testTextCodecPass(t, `"string"`, "⌘ ", []byte("\"\\u0001\\u2318 \""))
testTextCodecPass(t, `"string"`, "😂 ", []byte("\"\\u0001\\uD83D\\uDE02 \""))

testTextDecodeFail(t, `"string"`, []byte("\"\\"), "short buffer")
testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\""), "surrogate pair")
testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\u\""), "surrogate pair")
testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\uD\""), "surrogate pair")
testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\uDE\""), "surrogate pair")
testTextDecodeFail(t, `"string"`, []byte("\"\\uD83D\\uDE0\""), "invalid byte")
}

func TestUnescapeUnicode(t *testing.T) {
Expand Down Expand Up @@ -150,3 +151,29 @@ func TestUnescapeUnicode(t *testing.T) {
checkGood(t, "\u263a\ufe0f", "☺️")
checkGood(t, "\u65e5\u672c\u8a9e", "日本語")
}

// TestJSONUnmarshalStrings confirms that encoding/json decodes JSON string
// literals, both plain and written with \uXXXX escapes, into the expected Go
// strings when unmarshaling into an empty interface.
func TestJSONUnmarshalStrings(t *testing.T) {
	cases := []struct {
		arg  string
		want string
	}{
		{arg: `"A1"`, want: "A1"},
		{arg: `"\u0042\u0032"`, want: "B2"}, // backslashes have no meaning in back-tick string constant
	}

	for _, c := range cases {
		var raw interface{}
		if err := json.Unmarshal([]byte(c.arg), &raw); err != nil {
			t.Errorf("CASE: %s; ERROR: %s", c.arg, err)
			continue // keep checking the remaining cases
		}
		got, ok := raw.(string)
		if !ok {
			// Report the dynamic type of raw; got is the string zero value
			// here, so formatting it with %T would always print "string".
			t.Errorf("CASE: %s; GOT: %T; WANT: string", c.arg, raw)
			continue
		}
		if got != c.want {
			t.Errorf("CASE: %s; GOT: %q; WANT: %q", c.arg, got, c.want)
		}
	}
}
Loading

0 comments on commit fa8f6a3

Please sign in to comment.