Skip to content

Commit

Permalink
starlark: add 'bytes' data type, for binary strings (google#330)
Browse files Browse the repository at this point in the history
THIS IS AN INCOMPATIBLE LANGUAGE CHANGE; see below

This change defines a 'bytes' data type, an immutable string of
bytes. In this Go implementation of Starlark, ordinary strings
are also strings of bytes, so the behavior of the two is very similar.
However, that is not required by the spec. Other implementations of
Starlark, notably in Java, may use strings of UTF-16 code units for the
ordinary string type, and thus need a distinct type for byte strings.

See testdata/bytes.star for a tour of the API, and some remaining
questions. See the attached issue for an outline of the proposed
spec change. A Java implementation is underway, but is greatly
complicated by Bazel's unfortunate misdecoding of UTF-8 files as
Latin1.

The string.elems iterable view is now indexable.

The old syntax.quote function (which was in fact not used
except in tests) has been replaced by syntax.Quote,
which is similar to Go's strconv.Quote.

This change removes go.starlark.net.lib.proto.Bytes.

IMPORTANT: string literals that previously used hex escapes
\xXX or octal escapes \OOO to denote byte values greater than 127
will now result in a compile error advising you to use \u
escapes instead if you want the UTF-8 encoding of a code point
in the range U+80 to U+FF. A string literal can no longer
denote invalid text, such as the 1-element string formerly
written "\xff".

Updates bazelbuild/starlark#112
Fixes google#222
  • Loading branch information
adonovan authored Feb 12, 2021
1 parent 0a10e4f commit ebe61bd
Show file tree
Hide file tree
Showing 20 changed files with 739 additions and 284 deletions.
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ require (
github.com/chzyer/logex v1.1.10 // indirect
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect
golang.org/x/sys v0.0.0-20200803210538-64077c9b5642
github.com/google/go-cmp v0.5.1 // indirect
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/protobuf v1.25.0
)
10 changes: 6 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k=
github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
Expand All @@ -42,15 +43,16 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 h1:B6caxRw+hozq68X2MY7jEpZh/cr4/aHLv9xU8Kkadrw=
golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
Expand Down
32 changes: 24 additions & 8 deletions internal/compile/compile.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"os"
"path/filepath"
"strconv"
"strings"
"sync"

"go.starlark.net/resolve"
Expand All @@ -46,7 +47,7 @@ var Disassemble = false
const debug = false // make code generation verbose, for debugging the compiler

// Increment this to force recompilation of saved bytecode files.
const Version = 11
const Version = 12

type Opcode uint8

Expand Down Expand Up @@ -309,12 +310,15 @@ func (op Opcode) String() string {
type Program struct {
Loads []Binding // name (really, string) and position of each load stmt
Names []string // names of attributes and predeclared variables
Constants []interface{} // = string | int64 | float64 | *big.Int
Constants []interface{} // = string | int64 | float64 | *big.Int | Bytes
Functions []*Funcode
Globals []Binding // for error messages and tracing
Toplevel *Funcode // module initialization function
}

// Bytes is the type of a bytes literal value held as a program constant.
// It is a distinct named type so that a bytes literal can be told apart
// from a text string literal, which is stored as a plain Go string.
type Bytes string

// A Funcode is the code of a compiled Starlark function.
//
// Funcodes are serialized by the encoder.function method,
Expand Down Expand Up @@ -863,6 +867,8 @@ func PrintOp(fn *Funcode, pc uint32, op Opcode, arg uint32) {
switch x := fn.Prog.Constants[arg].(type) {
case string:
comment = strconv.Quote(x)
case Bytes:
comment = "b" + strconv.Quote(string(x))
default:
comment = fmt.Sprint(x)
}
Expand Down Expand Up @@ -1283,8 +1289,12 @@ func (fcomp *fcomp) expr(e syntax.Expr) {
fcomp.lookup(e)

case *syntax.Literal:
// e.Value is int64, float64, *bigInt, or string.
fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(e.Value))
// e.Value is int64, float64, *big.Int, or string.
v := e.Value
if e.Token == syntax.BYTES {
v = Bytes(v.(string))
}
fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(v))

case *syntax.ListExpr:
for _, x := range e.List {
Expand Down Expand Up @@ -1522,14 +1532,16 @@ func (fcomp *fcomp) plus(e *syntax.BinaryExpr) {
}

// addable reports whether e is a statically addable
// expression: a [s]tring, [l]ist, or [t]uple.
// expression: a [s]tring, [b]ytes, [l]ist, or [t]uple.
func addable(e syntax.Expr) rune {
switch e := e.(type) {
case *syntax.Literal:
// TODO(adonovan): opt: support INT/FLOAT/BIGINT constant folding.
switch e.Token {
case syntax.STRING:
return 's'
case syntax.BYTES:
return 'b'
}
case *syntax.ListExpr:
return 'l'
Expand All @@ -1544,12 +1556,16 @@ func addable(e syntax.Expr) rune {
// The resulting syntax is degenerate, lacking position, etc.
func add(code rune, args []summand) syntax.Expr {
switch code {
case 's':
var buf bytes.Buffer
case 's', 'b':
var buf strings.Builder
for _, arg := range args {
buf.WriteString(arg.x.(*syntax.Literal).Value.(string))
}
return &syntax.Literal{Token: syntax.STRING, Value: buf.String()}
tok := syntax.STRING
if code == 'b' {
tok = syntax.BYTES
}
return &syntax.Literal{Token: tok, Value: buf.String()}
case 'l':
var elems []syntax.Expr
for _, arg := range args {
Expand Down
22 changes: 14 additions & 8 deletions internal/compile/serial.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@ package compile
//
// Constant: # type data
// type varint # 0=string string
// data ... # 1=int varint
// # 2=float varint (bits as uint64)
// # 3=bigint string (decimal ASCII text)
// data ... # 1=bytes string
// # 2=int varint
// # 3=float varint (bits as uint64)
// # 4=bigint string (decimal ASCII text)
//
// The encoding starts with a four-byte magic number.
// The next four bytes are a little-endian uint32
Expand Down Expand Up @@ -109,14 +110,17 @@ func (prog *Program) Encode() []byte {
case string:
e.int(0)
e.string(c)
case int64:
case Bytes:
e.int(1)
e.string(string(c))
case int64:
e.int(2)
e.int64(c)
case float64:
e.int(2)
e.int(3)
e.uint64(math.Float64bits(c))
case *big.Int:
e.int(3)
e.int(4)
e.string(c.Text(10))
}
}
Expand Down Expand Up @@ -249,10 +253,12 @@ func DecodeProgram(data []byte) (_ *Program, err error) {
case 0:
c = d.string()
case 1:
c = d.int64()
c = Bytes(d.string())
case 2:
c = math.Float64frombits(d.uint64())
c = d.int64()
case 3:
c = math.Float64frombits(d.uint64())
case 4:
c, _ = new(big.Int).SetString(d.string(), 10)
}
constants[i] = c
Expand Down
87 changes: 5 additions & 82 deletions lib/proto/proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@
package proto

// TODO(adonovan): Go and Starlark API improvements:
// - Contribute the 'bytes' data type to the core language.
// See https://github.com/bazelbuild/starlark/issues/112.
// - Make Message and RepeatedField comparable.
// (NOTE: proto.Equal works only with generated message types.)
// - Support maps, oneof, any. But not messageset if we can avoid it.
Expand Down Expand Up @@ -234,7 +232,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar
if err != nil {
return nil, fmt.Errorf("%s: %v", fn.Name(), err)
}
return Bytes(data), nil
return starlark.Bytes(data), nil
} else {
text, err := prototext.MarshalOptions{Indent: " "}.Marshal(m.Message())
if err != nil {
Expand All @@ -247,7 +245,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar
// unmarshal(msg) decodes a binary protocol message to a Message.
func unmarshal(thread *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
var desc MessageDescriptor
var data Bytes
var data starlark.Bytes
if err := starlark.UnpackPositionalArgs(fn.Name(), args, kwargs, 2, &desc, &data); err != nil {
return nil, err
}
Expand Down Expand Up @@ -486,7 +484,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect
case protoreflect.StringKind:
if s, ok := starlark.AsString(v); ok {
return protoreflect.ValueOfString(s), nil
} else if b, ok := v.(Bytes); ok {
} else if b, ok := v.(starlark.Bytes); ok {
// TODO(adonovan): allow bytes for string? Not friendly to a Java port.
return protoreflect.ValueOfBytes([]byte(b)), nil
}
Expand All @@ -497,7 +495,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect
// Instead provide b"..." literals in the core
// and a bytes(str) conversion.
return protoreflect.ValueOfBytes([]byte(s)), nil
} else if b, ok := v.(Bytes); ok {
} else if b, ok := v.(starlark.Bytes); ok {
return protoreflect.ValueOfBytes([]byte(b)), nil
}

Expand Down Expand Up @@ -588,7 +586,7 @@ func toStarlark1(typ protoreflect.FieldDescriptor, x protoreflect.Value, frozen
return starlark.String(x.String())

case protoreflect.BytesKind:
return Bytes(x.Bytes())
return starlark.Bytes(x.Bytes())

case protoreflect.DoubleKind, protoreflect.FloatKind:
return starlark.Float(x.Float())
Expand Down Expand Up @@ -1232,78 +1230,3 @@ func (x EnumValueDescriptor) CompareSameType(op syntax.Token, y_ starlark.Value,
return false, fmt.Errorf("%s %s %s not implemented", x.Type(), op, y_.Type())
}
}

// A Bytes is an immutable sequence of bytes.
// It is comparable, iterable, indexable, and sliceable.
// Its underlying representation is a Go string; individual elements
// are exposed as ints (see Index and Iterate).
//
// (In go.starlark.net, text Strings are also byte strings,
// but we shouldn't rely on that.
// See https://github.com/bazelbuild/starlark/issues/112.)
type Bytes string

// Compile-time assertions that Bytes satisfies each of the
// starlark interfaces listed below.
var (
_ starlark.Comparable = Bytes("")
_ starlark.Iterable = Bytes("")
_ starlark.Sliceable = Bytes("")
_ starlark.Sequence = Bytes("")
)

// String returns a placeholder of the form "<N bytes>" that reports only
// the length of b; the contents themselves are not rendered.
func (b Bytes) String() string { return fmt.Sprintf("<%d bytes>", len(b)) }
// Type returns the type name "bytes".
func (b Bytes) Type() string { return "bytes" }
// Freeze does nothing, because a Bytes value cannot be mutated.
func (b Bytes) Freeze() {} // immutable
// Truth reports whether b is non-empty.
func (b Bytes) Truth() starlark.Bool { return len(b) > 0 }
// Hash returns the hash of the starlark.String that has the same contents as b.
func (b Bytes) Hash() (uint32, error) { return starlark.String(b).Hash() }
// Len returns the number of bytes in b.
func (b Bytes) Len() int { return len(b) }
// Index returns the i'th byte of b as an int value built by starlark.MakeInt.
// No range check is made here: an out-of-range i panics in the Go indexing.
func (b Bytes) Index(i int) starlark.Value { return starlark.MakeInt(int(b[i])) }

// Slice returns the bytes of b at indices start, start+step, start+2*step, ...
// stopping before end is reached or passed in the direction of step.
// A step of 1 is answered directly with the Go substring b[start:end];
// any other step gathers the selected bytes one by one.
// No range checks are made here (presumably the caller has already
// normalized start, end, and step — confirm against the call site).
func (b Bytes) Slice(start, end, step int) starlark.Value {
	if step != 1 {
		dir := signum(step)
		var picked []byte
		i := start
		// Keep going while end still lies ahead of i in the step direction.
		for signum(end-i) == dir {
			picked = append(picked, b[i])
			i += step
		}
		return Bytes(picked)
	}
	return b[start:end]
}

// signum64 returns the sign of x: -1 if x is negative, +1 if x is
// positive, and 0 if x is zero.
func signum64(x int64) int {
	switch {
	case x < 0:
		return -1
	case x > 0:
		return 1
	default:
		return 0
	}
}
// signum returns the sign of x (-1, 0, or +1) by widening it to int64
// and deferring to signum64.
func signum(x int) int {
	wide := int64(x)
	return signum64(wide)
}

// Iterate returns an iterator over the bytes of b, starting from a
// bytesIterator that holds the whole of b as its remaining input.
func (b Bytes) Iterate() starlark.Iterator { return &bytesIterator{string(b)} }

// bytesIterator walks a byte string from front to back.
// The embedded string holds the bytes that have not been yielded yet.
type bytesIterator struct{ string }

// Next stores the next byte in *p as an int and reports true.
// Once every byte has been yielded it reports false and leaves *p alone.
func (it *bytesIterator) Next(p *starlark.Value) bool {
	remaining := it.string
	if len(remaining) == 0 {
		return false
	}
	*p = starlark.MakeInt(int(remaining[0]))
	it.string = remaining[1:] // drop the byte just yielded
	return true
}

// Done is a no-op: the iterator holds nothing that needs releasing.
func (it *bytesIterator) Done() {}

// CompareSameType compares b with y_, which must also be a Bytes,
// using the byte-wise ordering of strings.Compare, and reports whether
// the relation named by op holds. The depth argument is unused.
// It panics if y_ is not a Bytes or if op is not one of the six
// comparison operators handled below.
func (b Bytes) CompareSameType(op syntax.Token, y_ starlark.Value, depth int) (bool, error) {
	other := y_.(Bytes)
	order := strings.Compare(string(b), string(other))

	var holds bool
	switch op {
	case syntax.EQL:
		holds = order == 0
	case syntax.NEQ:
		holds = order != 0
	case syntax.LE:
		holds = order <= 0
	case syntax.LT:
		holds = order < 0
	case syntax.GE:
		holds = order >= 0
	case syntax.GT:
		holds = order > 0
	default:
		panic(op)
	}
	return holds, nil
}
26 changes: 26 additions & 0 deletions starlark/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,8 @@ func makeToplevelFunction(prog *compile.Program, predeclared StringDict) *Functi
v = MakeBigInt(c)
case string:
v = String(c)
case compile.Bytes:
v = Bytes(c)
case float64:
v = Float(c)
default:
Expand Down Expand Up @@ -796,6 +798,8 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
return xf * y, nil
case String:
return stringRepeat(y, x)
case Bytes:
return bytesRepeat(y, x)
case *List:
elems, err := tupleRepeat(Tuple(y.elems), x)
if err != nil {
Expand All @@ -820,6 +824,10 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
if y, ok := y.(Int); ok {
return stringRepeat(x, y)
}
case Bytes:
if y, ok := y.(Int); ok {
return bytesRepeat(x, y)
}
case *List:
if y, ok := y.(Int); ok {
elems, err := tupleRepeat(Tuple(x.elems), y)
Expand Down Expand Up @@ -996,6 +1004,19 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
return nil, fmt.Errorf("'in <string>' requires string as left operand, not %s", x.Type())
}
return Bool(strings.Contains(string(y), string(needle))), nil
case Bytes:
switch needle := x.(type) {
case Bytes:
return Bool(strings.Contains(string(y), string(needle))), nil
case Int:
var b byte
if err := AsInt(needle, &b); err != nil {
return nil, fmt.Errorf("int in bytes: %s", err)
}
return Bool(strings.IndexByte(string(y), b) >= 0), nil
default:
return nil, fmt.Errorf("'in bytes' requires bytes or int as left operand, not %s", x.Type())
}
case rangeValue:
i, err := NumberToInt(x)
if err != nil {
Expand Down Expand Up @@ -1138,6 +1159,11 @@ func tupleRepeat(elems Tuple, n Int) (Tuple, error) {
return res, nil
}

// bytesRepeat repeats b according to n. It delegates to stringRepeat on a
// String with the same contents and converts the result back to Bytes.
// Any error from stringRepeat is returned alongside the converted result.
func bytesRepeat(b Bytes, n Int) (Bytes, error) {
	text := String(b)
	repeated, err := stringRepeat(text, n)
	out := Bytes(repeated)
	return out, err
}

func stringRepeat(s String, n Int) (String, error) {
if s == "" {
return "", nil
Expand Down
1 change: 1 addition & 0 deletions starlark/eval_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ func TestExecFile(t *testing.T) {
"testdata/assign.star",
"testdata/bool.star",
"testdata/builtins.star",
"testdata/bytes.star",
"testdata/control.star",
"testdata/dict.star",
"testdata/float.star",
Expand Down
Loading

0 comments on commit ebe61bd

Please sign in to comment.