Skip to content

Commit

Permalink
lenient utf8 parser (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
jhump authored Sep 16, 2022
1 parent 113307e commit 03fff2f
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 4 deletions.
11 changes: 7 additions & 4 deletions parser/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,13 @@ func (rr *runeReader) readRune() (r rune, size int, err error) {
return 0, 0, rr.err
}
r, sz := utf8.DecodeRune(rr.data[rr.pos:])
if r == utf8.RuneError {
rr.err = fmt.Errorf("invalid UTF8 at offset %d: %x", rr.pos, rr.data[rr.pos])
return 0, 0, rr.err
}
// TODO: Enable this check to require that input be strictly valid UTF8. We may
// want this to be an optional flag that the parser accepts, making it
// a conditional check. For now, because protoc allows bad UTF8, so must we :(
//if r == utf8.RuneError {
// rr.err = fmt.Errorf("invalid UTF8 at offset %d: %x", rr.pos, rr.data[rr.pos])
// return 0, 0, rr.err
//}
rr.pos = rr.pos + sz
return r, sz, nil
}
Expand Down
34 changes: 34 additions & 0 deletions parser/lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -429,3 +429,37 @@ func newTestLexer(t *testing.T, in io.Reader, h *reporter.Handler) *protoLex {
require.NoError(t, err)
return lexer
}

// TestUTF8 feeds single-quoted string literals to the lexer and checks the
// first token it produces. Each case states whether lexing is expected to
// succeed and, if so, which string value the literal should carry.
func TestUTF8(t *testing.T) {
	t.Parallel()

	type utf8Case struct {
		data      string
		expectVal string
		succeeds  bool
	}
	cases := []utf8Case{
		// Well-formed multi-byte input is passed through unchanged.
		{data: "'😊'", expectVal: "😊", succeeds: true},
		// Malformed bytes: the expected value holds one replacement
		// character per bad byte.
		// TODO: succeeds should be false if enforcing valid UTF8
		{data: "'\xff\x80'", expectVal: "��", succeeds: true},
	}

	for i := range cases {
		c := cases[i]
		h := reporter.NewHandler(nil)
		lexer := newTestLexer(t, strings.NewReader(c.data), h)

		var result protoSymType
		token := lexer.Lex(&result)

		// Failure case: only the error token is checked.
		if !c.succeeds {
			assert.Equal(t, _ERROR, token, "lexer should return error for %v", c.data)
			continue
		}

		// Success case: the value is only inspected once the token kind
		// has been confirmed to be a string literal.
		if !assert.Equal(t, _STRING_LIT, token, "lexer should return string literal token for %v", c.data) {
			continue
		}
		assert.Equal(t, c.expectVal, result.s.Val)
	}
}

0 comments on commit 03fff2f

Please sign in to comment.