Skip to content

Commit

Permalink
Fix goog.proto2.TextFormatSerializer parsing
Browse files Browse the repository at this point in the history
It's now more in line with the actual grammar from the text proto spec.  In particular, hex/octal/scientific numeric literals are all handled correctly by canonicalizing them to a single standard format.

RELNOTES: Fix `goog.proto2.TextFormatSerializer` to be more faithful to spec.
PiperOrigin-RevId: 570335111
Change-Id: Ic3a45500d737236ac4c814b9981e48d3244e8fbd
  • Loading branch information
Closure Team authored and copybara-github committed Oct 3, 2023
1 parent 32c006e commit 6b5e9e8
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 39 deletions.
8 changes: 1 addition & 7 deletions closure/goog/proto2/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ closure_js_library(
],
lenient = True,
deps = [
"//closure/goog/array",
"//closure/goog/asserts",
"//closure/goog/object",
"//closure/goog/string",
Expand All @@ -43,7 +42,6 @@ closure_js_library(
lenient = True,
deps = [
":fielddescriptor",
":serializer",
"//closure/goog/asserts",
"//closure/goog/string",
],
Expand All @@ -55,8 +53,6 @@ closure_js_library(
lenient = True,
deps = [
":fielddescriptor",
":lazydeserializer",
":serializer",
"//closure/goog/asserts",
],
)
Expand All @@ -71,12 +67,10 @@ closure_js_library(
srcs = ["textformatserializer.js"],
lenient = True,
deps = [
":fielddescriptor",
":message",
":serializer",
"//closure/goog/array",
"//closure/goog/asserts",
"//closure/goog/math",
"//closure/goog/math:long",
"//closure/goog/object",
"//closure/goog/string",
],
Expand Down
54 changes: 47 additions & 7 deletions closure/goog/proto2/textformatserializer.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ goog.provide('goog.proto2.TextFormatSerializer');

goog.require('goog.asserts');
goog.require('goog.math');
goog.require('goog.math.Long');
goog.require('goog.object');
goog.require('goog.proto2.FieldDescriptor');
goog.require('goog.proto2.Message');
Expand Down Expand Up @@ -466,10 +467,23 @@ goog.proto2.TextFormatSerializer.Tokenizer_.prototype.getCurrent = function() {
* @enum {!RegExp}
*/
goog.proto2.TextFormatSerializer.Tokenizer_.TokenTypes = {
END: /---end---/,
// Terminal tokens: END if the input data has been exhausted; BAD if not.
// Their regexes don't match any string.
END: /$ end $/,
BAD: /$ bad $/,
// Leading "-" to identify "-infinity"."
IDENTIFIER: /^-?[a-zA-Z][a-zA-Z0-9_]*/,
NUMBER: /^(0x[0-9a-f]+)|(([-])?[0-9][0-9]*(\.?[0-9]+)?(e[+-]?[0-9]+|[f])?)/,
// NOTE: the textproto grammar treats negation as a separate token, so this
// serializer accepts a subset language.
// From the grammar, a number is -?(FLOAT|DEC_INT|OCT_INT|HEX_INT)
// FLOAT | DEC_INT = ( float_lit | dec_lit ), [ "F" | "f" ]
// float_lit | dec_lit
// = ".", dec, { dec }, [ exp ]
// | dec_lit, ".", { dec }, [ exp ]
// | dec_lit, exp | dec_lit
// = ( ".", dec, {dec} | dec_lit, [ ".", {dec} ] ) [exp]
NUMBER:
/^-?(0[0-7]+|0x[0-9a-f]+|([.][0-9]+|(0|[1-9][0-9]*)([.][0-9]*)?)(e[+-]?[0-9]+)?f?)/i,
COMMENT: /^#.*/,
OPEN_BRACE: /^{/,
CLOSE_BRACE: /^}/,
Expand Down Expand Up @@ -506,7 +520,7 @@ goog.proto2.TextFormatSerializer.Tokenizer_.prototype.next = function() {

// If we reach this point, set the current token to END.
this.current_ = {
type: goog.proto2.TextFormatSerializer.Tokenizer_.TokenTypes.END,
type: this.currentData_.length == 0 ? types.END : types.BAD,
value: null
};

Expand Down Expand Up @@ -549,6 +563,12 @@ goog.proto2.TextFormatSerializer.Tokenizer_.prototype.nextInternal_ =

// Advance the index by the length of the token.
if (next) {
// From the textformat spec: There is one edge case that requires special
// attention: a number token (FLOAT, DEC_INT, OCT_INT, or HEX_INT) may not
// be immediately followed by an IDENT token.
if (this.current_.type == types.NUMBER && next.type == types.IDENTIFIER) {
return false;
}
this.current_ =
/** @type {goog.proto2.TextFormatSerializer.Tokenizer_.Token} */ (next);
this.index_ += next.value.length;
Expand Down Expand Up @@ -692,6 +712,23 @@ goog.proto2.TextFormatSerializer.Parser.prototype.consumeFieldValue_ = function(
return true;
};

/**
* Detects the radix of a number.
* @param {string} num a number.
* @return {number} The radix of `num`, or 0 if the number is a float.
* @private
*/
goog.proto2.TextFormatSerializer.Parser.getRadix_ = function(num) {
return /^-?0x/i.test(num) ?
16 :
/^-?0[0-7]/.test(num) ?
8 :
// NOTE: hexadecimal literals are already excluded here, so a decimal
// point, an "e" (as in 1e3), or an "f" suffix (as in 1f) all indicate
// floating point.
/[.ef]/i.test(num) ? 0 :
10;
};

/**
* Attempts to convert a string to a number.
Expand All @@ -701,9 +738,8 @@ goog.proto2.TextFormatSerializer.Parser.prototype.consumeFieldValue_ = function(
*/
goog.proto2.TextFormatSerializer.Parser.getNumberFromString_ = function(num) {
'use strict';
var returnValue = goog.string.contains(num, '.') ?
parseFloat(num) : // num is a float.
goog.string.parseInt(num); // num is an int.
const radix = goog.proto2.TextFormatSerializer.Parser.getRadix_(num);
const returnValue = radix == 0 ? parseFloat(num) : parseInt(num, radix);

goog.asserts.assert(!isNaN(returnValue));
goog.asserts.assert(isFinite(returnValue));
Expand Down Expand Up @@ -786,7 +822,11 @@ goog.proto2.TextFormatSerializer.Parser.prototype.getFieldValue_ = function(
return goog.proto2.TextFormatSerializer.Parser.getNumberFromString_(
num);
}

// Normalize numeric literals to decimal.
const radix = goog.proto2.TextFormatSerializer.Parser.getRadix_(num);
if (radix != 10) {
num = goog.math.Long.fromString(num, radix).toString(10);
}
return num; // 64-bit numbers are by default stored as strings.

case goog.proto2.FieldDescriptor.FieldType.BOOL:
Expand Down
129 changes: 104 additions & 25 deletions closure/goog/proto2/textformatserializer_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ function assertTokens(value, tokens, ignoreWhitespace = undefined) {
while (tokenizer.next()) {
tokensFound.push(tokenizer.getCurrent());
}
assertEquals(
TextFormatSerializer.Tokenizer_.TokenTypes.END,
tokenizer.getCurrent().type);

if (!Array.isArray(tokens)) {
tokens = [tokens];
Expand All @@ -45,6 +48,24 @@ function assertTokens(value, tokens, ignoreWhitespace = undefined) {
}
}

/**
* Asserts that the given string value parses to a list of tokens ending in the
* BAD token.
* @param {string} value The string value to parse.
* @param {boolean=} ignoreWhitespace Whether whitespace tokens should be
* skipped by the tokenizer.
*/
function assertEventuallyBadToken(value, ignoreWhitespace = undefined) {
/** @suppress {visibility} suppression added to enable type checking */
const tokenizer =
new TextFormatSerializer.Tokenizer_(value, ignoreWhitespace);
while (tokenizer.next()) {
}
assertEquals(
TextFormatSerializer.Tokenizer_.TokenTypes.BAD,
tokenizer.getCurrent().type);
}

function assertToken(expected, found) {
assertEquals(expected.type, found.type);
if (expected.value) {
Expand Down Expand Up @@ -77,6 +98,16 @@ const floatFormatCases = [
{given: '1.69e+06', expect: 1.69e+06},
{given: '1.69e6', expect: 1.69e+06},
{given: '2.468e-2', expect: 0.02468},
{given: '.1', expect: 0.1},
{given: '-.1', expect: -0.1},
{given: '1.', expect: 1},
{given: '-1.', expect: -1},
{given: '2e2', expect: 200},
{given: '-2e2', expect: -200},
{given: '3E002', expect: 300},
{given: '-3E002', expect: -300},
{given: '.1e2', expect: 10},
{given: '-.1e2', expect: -10},
];

testSuite({
Expand Down Expand Up @@ -287,26 +318,51 @@ testSuite({
assertIdentifier('infinity');
assertIdentifier('nan');

assertNumber(0);
assertNumber(10);
assertNumber(123);
assertNumber(1234);
assertNumber(123.56);
assertNumber(-124);
assertNumber(-1234);
assertNumber(-123.56);
assertNumber('123f');
assertNumber('123.6f');
assertNumber('-123f');
assertNumber('-123.8f');
assertNumber('0x1234');
assertNumber('0x12ac34');
assertNumber('0x49e281db686fb');
// Floating point numbers might be serialized in exponential
// notation:
assertNumber('1.2345e+3');
assertNumber('1.2345e3');
assertNumber('1.2345e-2');
// Numbers
// Octals
assertNumber('00');
assertNumber('-00');
assertNumber('07654321');
assertNumber('-07654321');
// Hexadecimals
assertNumber('0x0');
assertNumber('-0x0');
assertNumber('0X0');
assertNumber('-0X0');
assertNumber('0xFea');
assertNumber('-0xFea');
assertNumber('0XABCDEFabcdef0123456789');
assertNumber('-0XABCDEFabcdef0123456789');
assertNumber('0x09');
assertNumber('-0x09');
// Decimals
assertNumber('0');
assertNumber('-0');
assertNumber('1987654321');
assertNumber('-1987654321');
// Floats that are decimals + f/F.
assertNumber('0f');
assertNumber('-0F');
assertNumber('1987654321f');
assertNumber('-1987654321F');
// Float literals
assertNumber('.0');
assertNumber('-.0');
assertNumber('.1e+11');
assertNumber('-.1E-11');
assertNumber('1.');
assertNumber('-0.');
assertNumber('0.e12');
assertNumber('-0.e12F');
assertNumber('0.32e003');
assertNumber('-1e12f');

// Non-numbers.
assertTokens(
'09',
[{type: types.NUMBER, value: '0'}, {type: types.NUMBER, value: '9'}]);
assertEventuallyBadToken('010f');
assertEventuallyBadToken('.f');

assertString('""');
assertString('"hello world"');
Expand Down Expand Up @@ -378,13 +434,25 @@ testSuite({
assertEquals(255, message.getRepeatedInt32(1));
},

testDeserializationOfInt64AsHexadecimalString() {
testDeserializationOfInt64AsNonDecimalString() {
const message = new TestAllTypes();
const value = 'optional_int64: 0xf';
const value = 'optional_int64: -0xf\n' +
'optional_uint64: 0XF\n' +
'repeated_int64: -0X1cbe991a14\n' +
'repeated_int64: 0x1cBe991a14\n' +
'repeated_int64: -01627646215024\n' +
'repeated_int64: 01627646215024\n' +
'repeated_int64: 1205\n';

new TextFormatSerializer().deserializeTo(message, value);

assertEquals('0xf', message.getOptionalInt64());
assertEquals('-15', message.getOptionalInt64());
assertEquals('15', message.getOptionalUint64());
assertEquals('-123456789012', message.getRepeatedInt64(0));
assertEquals('123456789012', message.getRepeatedInt64(1));
assertEquals('-123456789012', message.getRepeatedInt64(2));
assertEquals('123456789012', message.getRepeatedInt64(3));
assertEquals('1205', message.getRepeatedInt64(4));
},

testDeserializationOfZeroFalseAndEmptyString() {
Expand Down Expand Up @@ -552,7 +620,7 @@ testSuite({
const value =
('repeated_int32: 23\n' +
'repeated_int32: -3\n' +
'repeated_int32: 0xdeadbeef\n' +
'repeated_int32: 0xdeAdBeef\n' +
'repeated_float: 123.0\n' +
'repeated_float: -3.27\n' +
'repeated_float: -35.5f\n');
Expand All @@ -571,7 +639,7 @@ testSuite({
const message = new TestAllTypes();
const value = 'repeated_float: 1.1e5\n' +
'repeated_float: 1.1e-5\n' +
'repeated_double: 1.1e5\n' +
'repeated_double: 1.1E5\n' +
'repeated_double: 1.1e-5\n';
new TextFormatSerializer().deserializeTo(message, value);
assertEquals(1.1e5, message.getRepeatedFloat(0));
Expand Down Expand Up @@ -665,12 +733,23 @@ testSuite({

assertEquals(3735928559, getNumberFromString('0xdeadbeef'));
assertEquals(4276215469, getNumberFromString('0xFEE1DEAD'));
assertEquals(12, getNumberFromString('014'));
assertEquals(-12, getNumberFromString('-014'));
assertEquals(123.1, getNumberFromString('123.1'));
assertEquals(123.0, getNumberFromString('123.0'));
assertEquals(-29.3, getNumberFromString('-29.3f'));
assertEquals(23, getNumberFromString('23'));
assertEquals(-3, getNumberFromString('-3'));
assertEquals(-3.27, getNumberFromString('-3.27'));
// These hexadecimal numbers resemble floats.
assertEquals(63, getNumberFromString('0x3f'));
assertEquals(-63, getNumberFromString('-0x3f'));
assertEquals(63, getNumberFromString('0X3F'));
assertEquals(-63, getNumberFromString('-0X3F'));
assertEquals(15903, getNumberFromString('0x3e1f'));
assertEquals(-15903, getNumberFromString('-0x3e1f'));
assertEquals(993, getNumberFromString('0X3E1'));
assertEquals(-993, getNumberFromString('-0X3E1'));

assertThrows(goog.partial(getNumberFromString, 'cat'));
assertThrows(goog.partial(getNumberFromString, 'NaN'));
Expand Down

0 comments on commit 6b5e9e8

Please sign in to comment.