From ac23acb182e187e6835e4c0cc42205fe37a05624 Mon Sep 17 00:00:00 2001
From: alandonovan
Date: Thu, 11 Jun 2020 17:56:15 -0400
Subject: [PATCH] starlarkjson: a standard JSON module for Starlark (#179)

This change defines a standard Starlark module for JSON encoding and decoding.
See json.go for documentation.

It is intended to subsume, generalize, and eventually replace Bazel's
ill-conceived struct.to_json method.

The json module is predeclared in the Starlark REPL environment.

See related issues:
https://github.com/bazelbuild/bazel/issues/7896
https://buganizer.corp.google.com/issues/23962735
https://buganizer.corp.google.com/issues/70210417
https://github.com/bazelbuild/bazel/issues/7879#issuecomment-477713954
https://github.com/bazelbuild/bazel/issues/5542
https://github.com/bazelbuild/bazel/issues/10176
https://github.com/bazelbuild/starlark/pull/83
https://github.com/bazelbuild/bazel/issues/3732
---
 cmd/starlark/starlark.go    |   6 +-
 starlark/eval_test.go       |   7 +
 starlark/testdata/json.star | 152 +++++++++++
 starlarkjson/json.go        | 532 ++++++++++++++++++++++++++++++++++++
 4 files changed, 696 insertions(+), 1 deletion(-)
 create mode 100644 starlark/testdata/json.star
 create mode 100644 starlarkjson/json.go

diff --git a/cmd/starlark/starlark.go b/cmd/starlark/starlark.go
index 9768971a..af4f71f6 100644
--- a/cmd/starlark/starlark.go
+++ b/cmd/starlark/starlark.go
@@ -19,6 +19,7 @@ import (
 	"go.starlark.net/repl"
 	"go.starlark.net/resolve"
 	"go.starlark.net/starlark"
+	"go.starlark.net/starlarkjson"
 )
 
 // flags
@@ -88,6 +89,10 @@ func doMain() int {
 	thread := &starlark.Thread{Load: repl.MakeLoad()}
 	globals := make(starlark.StringDict)
 
+	// Ideally this statement would update the predeclared environment.
+	// TODO(adonovan): plumb predeclared env through to the REPL.
+	starlark.Universe["json"] = starlarkjson.Module
+
 	switch {
 	case flag.NArg() == 1 || *execprog != "":
 		var (
@@ -113,7 +118,6 @@ func doMain() int {
 		fmt.Println("Welcome to Starlark (go.starlark.net)")
 		thread.Name = "REPL"
 		repl.REPL(thread, globals)
-		return 0
 	default:
 		log.Print("want at most one Starlark file name")
 		return 1
diff --git a/starlark/eval_test.go b/starlark/eval_test.go
index 9b186381..e7aef9d6 100644
--- a/starlark/eval_test.go
+++ b/starlark/eval_test.go
@@ -16,6 +16,8 @@ import (
 	"go.starlark.net/internal/chunkedfile"
 	"go.starlark.net/resolve"
 	"go.starlark.net/starlark"
+	"go.starlark.net/starlarkjson"
+	"go.starlark.net/starlarkstruct"
 	"go.starlark.net/starlarktest"
 	"go.starlark.net/syntax"
 )
@@ -119,6 +121,7 @@ func TestExecFile(t *testing.T) {
 		"testdata/float.star",
 		"testdata/function.star",
 		"testdata/int.star",
+		"testdata/json.star",
 		"testdata/list.star",
 		"testdata/misc.star",
 		"testdata/set.star",
@@ -132,6 +135,7 @@ func TestExecFile(t *testing.T) {
 			predeclared := starlark.StringDict{
 				"hasfields": starlark.NewBuiltin("hasfields", newHasFields),
 				"fibonacci": fib{},
+				"struct":    starlark.NewBuiltin("struct", starlarkstruct.Make),
 			}
 
 			setOptions(chunk.Source)
@@ -186,6 +190,9 @@ func load(thread *starlark.Thread, module string) (starlark.StringDict, error) {
 	if module == "assert.star" {
 		return starlarktest.LoadAssertModule()
 	}
+	if module == "json.star" {
+		return starlark.StringDict{"json": starlarkjson.Module}, nil
+	}
 
 	// TODO(adonovan): test load() using this execution path.
 	filename := filepath.Join(filepath.Dir(thread.CallFrame(0).Pos.Filename()), module)
diff --git a/starlark/testdata/json.star b/starlark/testdata/json.star
new file mode 100644
index 00000000..d8af66ba
--- /dev/null
+++ b/starlark/testdata/json.star
@@ -0,0 +1,152 @@
+# Tests of json module.
+# option:float
+
+load("assert.star", "assert")
+load("json.star", "json")
+
+assert.eq(dir(json), ["decode", "encode", "indent"])
+
+# Some of these cases were inspired by github.com/nst/JSONTestSuite.
+
+## json.encode
+
+assert.eq(json.encode(None), "null")
+assert.eq(json.encode(True), "true")
+assert.eq(json.encode(False), "false")
+assert.eq(json.encode(-123), "-123")
+assert.eq(json.encode(12345*12345*12345*12345*12345*12345), "3539537889086624823140625")
+assert.eq(json.encode(float(12345*12345*12345*12345*12345*12345)), "3.539537889086625e+24")
+assert.eq(json.encode(12.345e67), "1.2345e+68")
+assert.eq(json.encode("hello"), '"hello"')
+assert.eq(json.encode([1, 2, 3]), "[1,2,3]")
+assert.eq(json.encode((1, 2, 3)), "[1,2,3]")
+assert.eq(json.encode(range(3)), "[0,1,2]") # a built-in iterable
+assert.eq(json.encode(dict(x = 1, y = "two")), '{"x":1,"y":"two"}')
+assert.eq(json.encode(struct(x = 1, y = "two")), '{"x":1,"y":"two"}')  # a user-defined HasAttrs
+assert.eq(json.encode("\x80"), '"\\ufffd"') # invalid UTF-8 -> replacement char
+assert.eq(json.encode(1.0), "1.0") # integral floats keep a fraction so they decode as float
+assert.eq(json.decode(json.encode("\x7f")), "\x7f") # DEL must not be emitted as the non-JSON escape \x7f
+
+def encode_error(expr, error):
+    assert.fails(lambda: json.encode(expr), error)
+
+encode_error(float("NaN"), "json.encode: cannot encode non-finite float NaN")
+encode_error({1: "two"}, "dict has int key, want string")
+encode_error(len, "cannot encode builtin_function_or_method as JSON")
+encode_error(struct(x=[1, {"x": len}]), # nested failure
+             'in field .x: at list index 1: in dict key "x": cannot encode...')
+encode_error(struct(x=[1, {"x": len}]), # nested failure
+             'in field .x: at list index 1: in dict key "x": cannot encode...')
+encode_error({1: 2}, 'dict has int key, want string')
+
+## json.decode
+
+assert.eq(json.decode("null"), None)
+assert.eq(json.decode("true"), True)
+assert.eq(json.decode("false"), False)
+assert.eq(json.decode("-123"), -123)
+assert.eq(json.decode("-0"), -0)
+assert.eq(json.decode("3539537889086624823140625"), 3539537889086624823140625)
+assert.eq(json.decode("3539537889086624823140625.0"), float(3539537889086624823140625))
+assert.eq(json.decode("3.539537889086625e+24"), 3.539537889086625e+24)
+assert.eq(json.decode("0e+1"), 0)
+assert.eq(json.decode("-0.0"), -0.0)
+assert.eq(json.decode(
+    "-0.000000000000000000000000000000000000000000000000000000000000000000000000000001"),
+    -0.000000000000000000000000000000000000000000000000000000000000000000000000000001)
+assert.eq(json.decode('[]'), [])
+assert.eq(json.decode('[1]'), [1])
+assert.eq(json.decode('[1,2,3]'), [1, 2, 3])
+assert.eq(json.decode('{"one": 1, "two": 2}'), dict(one=1, two=2))
+assert.eq(json.decode('{"foo\u0000bar": 42}'), {"foo\x00bar": 42})
+assert.eq(json.decode('"\ud83d\ude39\ud83d\udc8d"'), "😹💍")
+assert.eq(json.decode('"\u0123"'), 'ģ')
+assert.eq(json.decode('"\x7f"'), "\x7f")
+
+def decode_error(expr, error):
+    assert.fails(lambda: json.decode(expr), error)
+
+decode_error('truefalse',
+             "json.decode: at offset 4, unexpected character 'f' after value")
+
+decode_error('"abc', "unclosed string literal")
+decode_error('"ab\gc"', "invalid character 'g' in string escape code")
+decode_error("'abc'", "unexpected character '\\\\''")
+decode_error('"a\nb"', "invalid character") # raw control character inside a string
+
+decode_error("1.2.3", "invalid number: 1.2.3")
+decode_error("+1", "unexpected character '\\+'")
+decode_error("-abc", "invalid number: -")
+decode_error("-", "invalid number: -")
+decode_error("-00", "invalid number: -00")
+decode_error("00", "invalid number: 00")
+decode_error("--1", "invalid number: --1")
+decode_error("-+1", "invalid number: -\\+1")
+decode_error("1e1e1", "invalid number: 1e1e1")
+decode_error("0123", "invalid number: 0123")
+decode_error("000.123", "invalid number: 000.123")
+decode_error("-0123", "invalid number: -0123")
+decode_error("-000.123", "invalid number: -000.123")
+decode_error("0x123", "unexpected character 'x' after value")
+decode_error("1.", "invalid number: 1.")
+decode_error("-.5", "invalid number: -.5")
+
+decode_error('[1, 2 ', "unexpected end of file")
+decode_error('[1, 2, ', "unexpected end of file")
+decode_error('[1, 2, ]', "unexpected character ']'")
+decode_error('[1, 2, }', "unexpected character '}'")
+decode_error('[1, 2}', "got '}', want ',' or ']'")
+
+decode_error('{"one": 1', "unexpected end of file")
+decode_error('{"one" 1', "after object key, got '1', want ':'")
+decode_error('{"one": 1 "two": 2', "in object, got '\"', want ',' or '}'")
+decode_error('{"one": 1,', "unexpected end of file")
+decode_error('{"one": 1, }', "unexpected character '}'")
+decode_error('{"one": 1]', "in object, got ']', want ',' or '}'")
+
+def codec(x):
+    return json.decode(json.encode(x))
+
+# string round-tripping
+strings = [
+    "😿", # U+1F63F CRYING_CAT_FACE
+    "🐱‍👤", # CAT FACE + ZERO WIDTH JOINER + BUST IN SILHOUETTE
+]
+assert.eq(codec(strings), strings)
+
+# codepoints is a string with every 16-bit code point.
+codepoints = ''.join(['%c' % c for c in range(65536)])
+assert.eq(codec(codepoints), codepoints)
+
+# number round-tripping
+numbers = [
+    0, 1, -1, +1, 1.23e45, -1.23e-45,
+    3539537889086624823140625,
+    float(3539537889086624823140625),
+]
+assert.eq(codec(numbers), numbers)
+
+## json.indent
+
+s = json.encode(dict(x = 1, y = ["one", "two"]))
+
+assert.eq(json.indent(s), '''{
+	"x": 1,
+	"y": [
+		"one",
+		"two"
+	]
+}''')
+
+assert.eq(json.decode(json.indent(s)), {"x": 1, "y": ["one", "two"]})
+
+assert.eq(json.indent(s, prefix='¶', indent='–––'), '''{
+¶–––"x": 1,
+¶–––"y": [
+¶––––––"one",
+¶––––––"two"
+¶–––]
+¶}''')
+
+assert.fails(lambda: json.indent("!@#$%^& this is not json"), 'invalid character')
+---
diff --git a/starlarkjson/json.go b/starlarkjson/json.go
new file mode 100644
index 00000000..bfa397a6
--- /dev/null
+++ b/starlarkjson/json.go
@@ -0,0 +1,532 @@
+// Copyright 2020 The Bazel Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package starlarkjson defines utilities for converting Starlark values
+// to/from JSON strings. The most recent IETF standard for JSON is
+// https://www.ietf.org/rfc/rfc8259.txt.
+package starlarkjson // import "go.starlark.net/starlarkjson"
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"math"
+	"math/big"
+	"sort"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+
+	"go.starlark.net/starlark"
+	"go.starlark.net/starlarkstruct"
+)
+
+// Module json is a Starlark module of JSON-related functions.
+//
+//   json = module(
+//      encode,
+//      decode,
+//      indent,
+//   )
+//
+// def encode(x):
+//
+// The encode function accepts one required positional argument,
+// which it converts to JSON by cases:
+// - A Starlark value that implements Go's standard json.Marshaler
+//   interface defines its own JSON encoding.
+// - None, True, and False are converted to null, true, and false, respectively.
+// - Starlark int values, no matter how large, are encoded as decimal integers.
+//   Some decoders may not be able to decode very large integers.
+// - Starlark float values are encoded using decimal point or exponent notation,
+//   even if the value is an integer.
+//   It is an error to encode a non-finite floating-point value.
+// - Starlark strings are encoded as JSON strings, using UTF-16 escapes.
+// - a Starlark IterableMapping (e.g. dict) is encoded as a JSON object.
+//   It is an error if any key is not a string.
+// - any other Starlark Iterable (e.g. list, tuple) is encoded as a JSON array.
+// - a Starlark HasAttrs (e.g. struct) is encoded as a JSON object.
+// If an application-defined type matches more than one of the cases described above
+// (e.g. it implements both Iterable and HasAttrs), the first case takes precedence.
+// Encoding any other value yields an error.
+//
+// def decode(x):
+//
+// The decode function accepts one positional parameter, a JSON string.
+// It returns the Starlark value that the string denotes.
+// - Numbers are parsed as int or float, depending on whether they
+//   contain a decimal point or exponent.
+// - JSON objects are parsed as new unfrozen Starlark dicts.
+// - JSON arrays are parsed as new unfrozen Starlark lists.
+// Decoding fails if x is not a valid JSON string.
+//
+// def indent(str, *, prefix="", indent="\t"):
+//
+// The indent function pretty-prints a valid JSON encoding,
+// and returns a string containing the indented form.
+// It accepts one required positional parameter, the JSON string,
+// and two optional keyword-only string parameters, prefix and indent,
+// that specify a prefix of each new line, and the unit of indentation.
+//
+var Module = &starlarkstruct.Module{
+	Name: "json",
+	Members: starlark.StringDict{
+		"encode": starlark.NewBuiltin("json.encode", encode),
+		"decode": starlark.NewBuiltin("json.decode", decode),
+		"indent": starlark.NewBuiltin("json.indent", indent),
+	},
+}
+
+// encode implements json.encode(x); the Module comment lists the encoding rules.
+func encode(thread *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
+	var x starlark.Value
+	if err := starlark.UnpackPositionalArgs(b.Name(), args, kwargs, 1, &x); err != nil {
+		return nil, err
+	}
+
+	buf := new(bytes.Buffer)
+
+	var quoteSpace [128]byte
+	quote := func(s string) {
+		// Non-trivial escaping is handled by Go's encoding/json.
+		if isPrintableASCII(s) {
+			buf.Write(strconv.AppendQuote(quoteSpace[:0], s))
+		} else {
+			// TODO(adonovan): opt: RFC 8259 mandates UTF-8 for JSON.
+			// Can we avoid this call?
+			data, _ := json.Marshal(s) // marshaling a string cannot fail
+			buf.Write(data)
+		}
+	}
+
+	var emit func(x starlark.Value) error
+	emit = func(x starlark.Value) error {
+		switch x := x.(type) {
+		case json.Marshaler:
+			// Application-defined starlark.Value types
+			// may define their own JSON encoding.
+			data, err := x.MarshalJSON()
+			if err != nil {
+				return err
+			}
+			buf.Write(data)
+
+		case starlark.NoneType:
+			buf.WriteString("null")
+
+		case starlark.Bool:
+			if x {
+				buf.WriteString("true")
+			} else {
+				buf.WriteString("false")
+			}
+
+		case starlark.Int:
+			fmt.Fprint(buf, x)
+
+		case starlark.Float:
+			if !isFinite(float64(x)) {
+				return fmt.Errorf("cannot encode non-finite float %v", x)
+			}
+			// %g may omit both fraction and exponent (1.0 -> "1"),
+			// which json.decode would read back as an int.
+			s := strconv.FormatFloat(float64(x), 'g', -1, 64)
+			buf.WriteString(s)
+			if !strings.ContainsAny(s, ".e") {
+				buf.WriteString(".0")
+			}
+
+		case starlark.String:
+			quote(string(x))
+
+		case starlark.IterableMapping:
+			// e.g. dict (must have string keys)
+			buf.WriteByte('{')
+			iter := x.Iterate()
+			defer iter.Done()
+			var k starlark.Value
+			for i := 0; iter.Next(&k); i++ {
+				if i > 0 {
+					buf.WriteByte(',')
+				}
+				s, ok := starlark.AsString(k)
+				if !ok {
+					return fmt.Errorf("%s has %s key, want string", x.Type(), k.Type())
+				}
+				v, found, err := x.Get(k)
+				if err != nil || !found {
+					// Report the inconsistency; a library must not exit the process.
+					return fmt.Errorf("internal error: mapping %s has %s among keys but value lookup fails", x.Type(), k)
+				}
+
+				quote(s)
+				buf.WriteByte(':')
+				if err := emit(v); err != nil {
+					return fmt.Errorf("in %s key %s: %v", x.Type(), k, err)
+				}
+			}
+			buf.WriteByte('}')
+
+		case starlark.Iterable:
+			// e.g. tuple, list
+			buf.WriteByte('[')
+			iter := x.Iterate()
+			defer iter.Done()
+			var elem starlark.Value
+			for i := 0; iter.Next(&elem); i++ {
+				if i > 0 {
+					buf.WriteByte(',')
+				}
+				if err := emit(elem); err != nil {
+					return fmt.Errorf("at %s index %d: %v", x.Type(), i, err)
+				}
+			}
+			buf.WriteByte(']')
+
+		case starlark.HasAttrs:
+			// e.g. struct
+			buf.WriteByte('{')
+			var names []string
+			names = append(names, x.AttrNames()...)
+			sort.Strings(names)
+			for i, name := range names {
+				v, err := x.Attr(name)
+				if err != nil || v == nil {
+					// Report the inconsistency; a library must not exit the process.
+					return fmt.Errorf("internal error: dir(%s) includes %q but value has no .%s field", x.Type(), name, name)
+				}
+				if i > 0 {
+					buf.WriteByte(',')
+				}
+				quote(name)
+				buf.WriteByte(':')
+				if err := emit(v); err != nil {
+					return fmt.Errorf("in field .%s: %v", name, err)
+				}
+			}
+			buf.WriteByte('}')
+
+		default:
+			return fmt.Errorf("cannot encode %s as JSON", x.Type())
+		}
+		return nil
+	}
+
+	if err := emit(x); err != nil {
+		return nil, fmt.Errorf("%s: %v", b.Name(), err)
+	}
+	return starlark.String(buf.String()), nil
+}
+
+// isPrintableASCII reports whether s contains only printable ASCII
+// (0x20 through 0x7e). DEL (0x7f) is excluded because strconv.AppendQuote
+// renders it as \x7f, which is not a valid JSON escape.
+func isPrintableASCII(s string) bool {
+	for i := 0; i < len(s); i++ {
+		b := s[i]
+		if b < 0x20 || b >= 0x7f {
+			return false
+		}
+	}
+	return true
+}
+
+// isFinite reports whether f represents a finite rational value.
+// It is equivalent to !math.IsNaN(f) && !math.IsInf(f, 0).
+func isFinite(f float64) bool {
+	return math.Abs(f) <= math.MaxFloat64
+}
+
+// indent implements json.indent(str, *, prefix="", indent="\t") using encoding/json.Indent.
+func indent(thread *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
+	prefix, indent := "", "\t" // keyword-only
+	if err := starlark.UnpackArgs(b.Name(), nil, kwargs,
+		"prefix?", &prefix,
+		"indent?", &indent,
+	); err != nil {
+		return nil, err
+	}
+	var str string // positional-only
+	if err := starlark.UnpackPositionalArgs(b.Name(), args, nil, 1, &str); err != nil {
+		return nil, err
+	}
+
+	buf := new(bytes.Buffer)
+	if err := json.Indent(buf, []byte(str), prefix, indent); err != nil {
+		return nil, fmt.Errorf("%s: %v", b.Name(), err)
+	}
+	return starlark.String(buf.String()), nil
+}
+
+// decode implements json.decode(x) with a recursive-descent parser over the input string.
+func decode(thread *starlark.Thread, b *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (_ starlark.Value, err error) {
+	var s string
+	if err := starlark.UnpackPositionalArgs(b.Name(), args, kwargs, 1, &s); err != nil {
+		return nil, err
+	}
+
+	// The decoder necessarily makes certain representation choices
+	// such as list vs tuple, struct vs dict, int vs float.
+	// In principle, we could parameterize it to allow the caller to
+	// control the returned types, but there's no compelling need yet.
+
+	// Use panic/recover with a distinguished type (failure) for error handling.
+	type failure string
+	fail := func(format string, args ...interface{}) {
+		panic(failure(fmt.Sprintf(format, args...)))
+	}
+
+	i := 0
+
+	// skipSpace consumes leading spaces, and reports whether there is more input.
+	skipSpace := func() bool {
+		for ; i < len(s); i++ {
+			b := s[i]
+			if b != ' ' && b != '\t' && b != '\n' && b != '\r' {
+				return true
+			}
+		}
+		return false
+	}
+
+	// next consumes leading spaces and returns the first non-space.
+	// It panics if at EOF.
+	next := func() byte {
+		if skipSpace() {
+			return s[i]
+		}
+		fail("unexpected end of file")
+		panic("unreachable")
+	}
+
+	// parse returns the next JSON value from the input.
+	// It consumes leading but not trailing whitespace.
+	// It panics on error.
+	var parse func() starlark.Value
+	parse = func() starlark.Value {
+		b := next()
+		switch b {
+		case '"':
+			// string
+
+			// Find end of quotation.
+			// Also, record whether trivial unquoting is safe.
+			// Non-trivial unquoting is handled by Go's encoding/json.
+			safe := true
+			closed := false
+			j := i + 1
+			for ; j < len(s); j++ {
+				b := s[j]
+				if b == '\\' {
+					safe = false
+					j++ // skip x in \x
+				} else if b == '"' {
+					closed = true
+					j++ // skip '"'
+					break
+				} else if b < 0x20 || b >= utf8.RuneSelf {
+					// Raw control characters are invalid in JSON, and non-ASCII
+					// bytes need UTF-8 validation: leave both to encoding/json.
+					safe = false
+				}
+			}
+			if !closed {
+				fail("unclosed string literal")
+			}
+
+			r := s[i:j]
+			i = j
+
+			// unquote
+			if safe {
+				r = r[1 : len(r)-1]
+			} else if err := json.Unmarshal([]byte(r), &r); err != nil {
+				fail("%s", err)
+			}
+			return starlark.String(r)
+
+		case 'n':
+			if strings.HasPrefix(s[i:], "null") {
+				i += len("null")
+				return starlark.None
+			}
+
+		case 't':
+			if strings.HasPrefix(s[i:], "true") {
+				i += len("true")
+				return starlark.True
+			}
+
+		case 'f':
+			if strings.HasPrefix(s[i:], "false") {
+				i += len("false")
+				return starlark.False
+			}
+
+		case '[':
+			// array
+			var elems []starlark.Value
+
+			i++ // '['
+			b = next()
+			if b != ']' {
+				for {
+					elem := parse()
+					elems = append(elems, elem)
+					b = next()
+					if b != ',' {
+						if b != ']' {
+							fail("got %q, want ',' or ']'", b)
+						}
+						break
+					}
+					i++ // ','
+				}
+			}
+			i++ // ']'
+			return starlark.NewList(elems)
+
+		case '{':
+			// object
+			dict := new(starlark.Dict)
+
+			i++ // '{'
+			b = next()
+			if b != '}' {
+				for {
+					key := parse()
+					if _, ok := key.(starlark.String); !ok {
+						fail("got %s for object key, want string", key.Type())
+					}
+					b = next()
+					if b != ':' {
+						fail("after object key, got %q, want ':' ", b)
+					}
+					i++ // ':'
+					value := parse()
+					dict.SetKey(key, value) // can't fail
+					b = next()
+					if b != ',' {
+						if b != '}' {
+							fail("in object, got %q, want ',' or '}'", b)
+						}
+						break
+					}
+					i++ // ','
+				}
+			}
+			i++ // '}'
+			return dict
+
+		default:
+			// number?
+			if isdigit(b) || b == '-' {
+				// scan literal. Allow [0-9+-eE.] for now.
+				float := false
+				var j int
+				for j = i + 1; j < len(s); j++ {
+					b = s[j]
+					if isdigit(b) {
+						// ok
+					} else if b == '.' ||
+						b == 'e' ||
+						b == 'E' ||
+						b == '+' ||
+						b == '-' {
+						float = true
+					} else {
+						break
+					}
+				}
+				num := s[i:j]
+				i = j
+
+				// ParseFloat and SetString are more permissive than JSON
+				// (e.g. "1." and "-.5"), and JSON disallows leading zeros,
+				// so validate against the JSON number grammar first.
+				if !isValidNumber(num) {
+					fail("invalid number: %s", num)
+				}
+
+				// parse literal
+				if float {
+					x, err := strconv.ParseFloat(num, 64)
+					if err != nil {
+						fail("invalid number: %s", num)
+					}
+					return starlark.Float(x)
+				} else {
+					x, ok := new(big.Int).SetString(num, 10)
+					if !ok {
+						fail("invalid number: %s", num)
+					}
+					return starlark.MakeBigInt(x)
+				}
+			}
+		}
+		fail("unexpected character %q", b)
+		panic("unreachable")
+	}
+	defer func() {
+		x := recover()
+		switch x := x.(type) {
+		case failure:
+			err = fmt.Errorf("json.decode: at offset %d, %s", i, x)
+		case nil:
+			// nop
+		default:
+			panic(x) // unexpected panic
+		}
+	}()
+	x := parse()
+	if skipSpace() {
+		fail("unexpected character %q after value", s[i])
+	}
+	return x, nil
+}
+
+// isdigit reports whether b is an ASCII decimal digit.
+func isdigit(b byte) bool {
+	return b >= '0' && b <= '9'
+}
+
+// isValidNumber reports whether num is a complete JSON number:
+//
+//	-? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE] [+-]? [0-9]+)?
+func isValidNumber(num string) bool {
+	i := 0
+	// digits consumes a run of digits and reports whether it was non-empty.
+	digits := func() bool {
+		start := i
+		for i < len(num) && isdigit(num[i]) {
+			i++
+		}
+		return i > start
+	}
+	if i < len(num) && num[i] == '-' {
+		i++
+	}
+	// Integer part: a lone zero, or a run of digits not starting with zero.
+	if i < len(num) && num[i] == '0' {
+		i++
+	} else if !digits() {
+		return false
+	}
+	if i < len(num) && num[i] == '.' {
+		i++
+		if !digits() {
+			return false
+		}
+	}
+	if i < len(num) && (num[i] == 'e' || num[i] == 'E') {
+		i++
+		if i < len(num) && (num[i] == '+' || num[i] == '-') {
+			i++
+		}
+		if !digits() {
+			return false
+		}
+	}
+	return i == len(num)
+}