From 930e93813c2adfdf111b5789b82fd5615ef24a2f Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Thu, 1 May 2025 08:37:06 +0200 Subject: [PATCH 1/8] Fast path for string encoding --- Modules/_json.c | 235 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 204 insertions(+), 31 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 89b0a41dd10acb..82e5dc7e7fd909 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -51,7 +51,7 @@ typedef struct _PyEncoderObject { char sort_keys; char skipkeys; int allow_nan; - PyCFunction fast_encode; + PyObject * (*fast_encode)(PyObject *); } PyEncoderObject; #define PyEncoderObject_CAST(op) ((PyEncoderObject *)(op)) @@ -102,8 +102,8 @@ static PyObject * _encoded_const(PyObject *obj); static void raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end); -static PyObject * -encoder_encode_string(PyEncoderObject *s, PyObject *obj); +static int +encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj); static PyObject * encoder_encode_float(PyEncoderObject *s, PyObject *obj); @@ -209,6 +209,72 @@ ascii_escape_unicode(PyObject *pystr) return rval; } +static PyObject * +ascii_escape_unicode_ex(PyObject *pystr) +{ + /* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */ + Py_ssize_t i; + Py_ssize_t input_chars; + Py_ssize_t output_size; + Py_ssize_t chars; + PyObject *rval; + const void *input; + Py_UCS1 *output; + int kind; + + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + /* Compute the output size */ + for (i = 0, output_size = 0; i < input_chars; i++) { + Py_UCS4 c = PyUnicode_READ(kind, input, i); + Py_ssize_t d; + if (S_CHAR(c)) { + d = 1; + } + else { + switch(c) { + case '\\': case '"': case '\b': case '\f': + case '\n': case '\r': case '\t': + d = 2; break; + default: + d = c >= 0x10000 ? 12 : 6; + } + } + if (output_size > PY_SSIZE_T_MAX - d) { + PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); + return NULL; + } + output_size += d; + } + + if (output_size == input_chars) { + /* No need to escape anything */ + return Py_NewRef(pystr); + } + + rval = PyUnicode_New(output_size, 127); + if (rval == NULL) { + return NULL; + } + output = PyUnicode_1BYTE_DATA(rval); + chars = 0; + for (i = 0; i < input_chars; i++) { + Py_UCS4 c = PyUnicode_READ(kind, input, i); + if (S_CHAR(c)) { + output[chars++] = c; + } + else { + chars = ascii_escape_unichar(c, output, chars); + } + } +#ifdef Py_DEBUG + assert(_PyUnicode_CheckConsistency(rval, 1)); +#endif + return rval; +} + static PyObject * escape_unicode(PyObject *pystr) { @@ -303,6 +369,103 @@ escape_unicode(PyObject *pystr) return rval; } +static PyObject * +escape_unicode_ex(PyObject *pystr) +{ + /* Take a PyUnicode pystr and return a new escaped PyUnicode */ + Py_ssize_t i; + Py_ssize_t input_chars; + Py_ssize_t output_size; + Py_ssize_t chars; + PyObject *rval; + const void *input; + int kind; + Py_UCS4 maxchar; + + maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + /* Compute the output size */ + for (i = 0, output_size = 0; i < input_chars; i++) { + Py_UCS4 c = PyUnicode_READ(kind, input, i); + Py_ssize_t d; + switch (c) { + case '\\': case '"': case '\b': case '\f': + case '\n': case '\r': case '\t': + d = 2; + break; + default: + if (c <= 0x1f) + d = 6; + else + d = 1; + } + if (output_size > PY_SSIZE_T_MAX - d) { + PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); + return NULL; + } + output_size += d; + } + + if (output_size == input_chars) { + /* No need to escape anything */ + return Py_NewRef(pystr); + } + + rval = PyUnicode_New(output_size, maxchar); + if (rval == NULL) + return NULL; + + kind = PyUnicode_KIND(rval); + +#define ENCODE_OUTPUT do { \ + chars = 0; \ + for (i = 0; i < input_chars; i++) { \ + Py_UCS4 c = PyUnicode_READ(kind, input, i); \ + switch (c) { \ + case '\\': output[chars++] = '\\'; output[chars++] = c; break; \ + case '"': output[chars++] = '\\'; output[chars++] = c; break; \ + case '\b': output[chars++] = '\\'; output[chars++] = 'b'; break; \ + case '\f': output[chars++] = '\\'; output[chars++] = 'f'; break; \ + case '\n': output[chars++] = '\\'; output[chars++] = 'n'; break; \ + case '\r': output[chars++] = '\\'; output[chars++] = 'r'; break; \ + case '\t': output[chars++] = '\\'; output[chars++] = 't'; break; \ + default: \ + if (c <= 0x1f) { \ + output[chars++] = '\\'; \ + output[chars++] = 'u'; \ + output[chars++] = '0'; \ + output[chars++] = '0'; \ + output[chars++] = Py_hexdigits[(c >> 4) & 0xf]; \ + output[chars++] = Py_hexdigits[(c ) & 0xf]; \ + } else { \ + output[chars++] = c; \ + } \ + } \ + } \ + } while (0) + + if (kind == PyUnicode_1BYTE_KIND) { + Py_UCS1 *output = PyUnicode_1BYTE_DATA(rval); + ENCODE_OUTPUT; + } else if (kind == PyUnicode_2BYTE_KIND) { + Py_UCS2 *output = PyUnicode_2BYTE_DATA(rval); + ENCODE_OUTPUT; + } else { + Py_UCS4 *output = PyUnicode_4BYTE_DATA(rval); + assert(kind == PyUnicode_4BYTE_KIND); + ENCODE_OUTPUT; + } +#undef ENCODE_OUTPUT + +#ifdef Py_DEBUG + assert(_PyUnicode_CheckConsistency(rval, 1)); +#endif + return rval; +} + static void raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end) { @@ -1255,8 +1418,11 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (PyCFunction_Check(s->encoder)) { PyCFunction f = PyCFunction_GetFunction(s->encoder); - if (f == py_encode_basestring_ascii || f == py_encode_basestring) { - s->fast_encode = f; + if (f == py_encode_basestring_ascii) { + s->fast_encode = ascii_escape_unicode_ex; + } + else if (f == py_encode_basestring) { + s->fast_encode = escape_unicode_ex; } } @@ -1437,33 +1603,46 @@ encoder_encode_float(PyEncoderObject *s, PyObject *obj) return PyFloat_Type.tp_repr(obj); } -static PyObject * -encoder_encode_string(PyEncoderObject *s, PyObject *obj) +static int +_steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen) +{ + /* Append stolen and then decrement its reference count */ + int rval = PyUnicodeWriter_WriteStr(writer, stolen); + Py_DECREF(stolen); + return rval; +} + +static int +encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj) { /* Return the JSON representation of a string */ PyObject *encoded; if (s->fast_encode) { - return s->fast_encode(NULL, obj); + if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { + return -1; + } + encoded = s->fast_encode(obj); + if (encoded == NULL) { + return -1; + } + if (_steal_accumulate(writer, encoded) < 0) { + return -1; + } + return PyUnicodeWriter_WriteChar(writer, '"'); } encoded = PyObject_CallOneArg(s->encoder, obj); - if (encoded != NULL && !PyUnicode_Check(encoded)) { + if (encoded == NULL) { + return -1; + } + if (!PyUnicode_Check(encoded)) { PyErr_Format(PyExc_TypeError, "encoder() must return a string, not %.80s", Py_TYPE(encoded)->tp_name); Py_DECREF(encoded); - return NULL; + return -1; } - return encoded; -} - -static int -_steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen) -{ - /* Append stolen and then decrement its reference count */ - int rval = PyUnicodeWriter_WriteStr(writer, stolen); - Py_DECREF(stolen); - return rval; + return _steal_accumulate(writer, encoded); } static int @@ -1485,10 +1664,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer, return PyUnicodeWriter_WriteUTF8(writer, "false", 5); } else if (PyUnicode_Check(obj)) { - PyObject *encoded = encoder_encode_string(s, obj); - if (encoded == NULL) - return -1; - return _steal_accumulate(writer, encoded); + return encoder_write_string(s, writer, obj); } else if (PyLong_Check(obj)) { if (PyLong_CheckExact(obj)) { @@ -1577,7 +1753,7 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs PyObject *item_separator) { PyObject *keystr = NULL; - PyObject *encoded; + int rv; if (PyUnicode_Check(key)) { keystr = Py_NewRef(key); @@ -1617,14 +1793,11 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs } } - encoded = encoder_encode_string(s, keystr); + rv = encoder_write_string(s, writer, keystr); Py_DECREF(keystr); - if (encoded == NULL) { - return -1; - } - if (_steal_accumulate(writer, encoded) < 0) { - return -1; + if (rv < 0) { + return rv; } if (PyUnicodeWriter_WriteStr(writer, s->key_separator) < 0) { return -1; From 83499a49a0e268e53b582985afd46adb901b701c Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Fri, 30 May 2025 18:07:17 +0200 Subject: [PATCH 2/8] Improve slow path --- Modules/_json.c | 68 +++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 095d4acd485394..e9354888de3109 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -51,7 +51,7 @@ typedef struct _PyEncoderObject { char sort_keys; char skipkeys; int allow_nan; - PyObject * (*fast_encode)(PyObject *); + int (*fast_encode)(PyUnicodeWriter *, PyObject *); } PyEncoderObject; #define PyEncoderObject_CAST(op) ((PyEncoderObject *)(op)) @@ -103,6 +103,8 @@ _encoded_const(PyObject *obj); static void raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end); static int +_steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen); +static int encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj); static PyObject * encoder_encode_float(PyEncoderObject *s, PyObject *obj); @@ -209,8 +211,8 @@ ascii_escape_unicode(PyObject *pystr) return rval; } -static PyObject * -ascii_escape_unicode_ex(PyObject *pystr) +static int +write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) { /* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */ Py_ssize_t i; @@ -227,7 +229,7 @@ ascii_escape_unicode_ex(PyObject *pystr) kind = PyUnicode_KIND(pystr); /* Compute the output size */ - for (i = 0, output_size = 0; i < input_chars; i++) { + for (i = 0, output_size = 2; i < input_chars; i++) { Py_UCS4 c = PyUnicode_READ(kind, input, i); Py_ssize_t d; if (S_CHAR(c)) { @@ -244,22 +246,29 @@ ascii_escape_unicode_ex(PyObject *pystr) } if (output_size > PY_SSIZE_T_MAX - d) { PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); - return NULL; + return -1; } output_size += d; } - if (output_size == input_chars) { + if (output_size == input_chars + 2) { /* No need to escape anything */ - return Py_NewRef(pystr); + if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { + return -1; + } + if (PyUnicodeWriter_WriteStr(writer, pystr) < 0) { + return -1; + } + return PyUnicodeWriter_WriteChar(writer, '"'); } rval = PyUnicode_New(output_size, 127); if (rval == NULL) { - return NULL; + return -1; } output = PyUnicode_1BYTE_DATA(rval); chars = 0; + output[chars++] = '"'; for (i = 0; i < input_chars; i++) { Py_UCS4 c = PyUnicode_READ(kind, input, i); if (S_CHAR(c)) { @@ -269,10 +278,11 @@ ascii_escape_unicode_ex(PyObject *pystr) chars = ascii_escape_unichar(c, output, chars); } } + output[chars++] = '"'; #ifdef Py_DEBUG assert(_PyUnicode_CheckConsistency(rval, 1)); #endif - return rval; + return _steal_accumulate(writer, rval); } static PyObject * @@ -369,8 +379,8 @@ escape_unicode(PyObject *pystr) return rval; } -static PyObject * -escape_unicode_ex(PyObject *pystr) +static int +write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr) { /* Take a PyUnicode pystr and return a new escaped PyUnicode */ Py_ssize_t i; @@ -388,7 +398,7 @@ escape_unicode_ex(PyObject *pystr) kind = PyUnicode_KIND(pystr); /* Compute the output size */ - for (i = 0, output_size = 0; i < input_chars; i++) { + for (i = 0, output_size = 2; i < input_chars; i++) { Py_UCS4 c = PyUnicode_READ(kind, input, i); Py_ssize_t d; switch (c) { @@ -404,24 +414,31 @@ escape_unicode_ex(PyObject *pystr) } if (output_size > PY_SSIZE_T_MAX - d) { PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); - return NULL; + return -1; } output_size += d; } - if (output_size == input_chars) { + if (output_size == input_chars + 2) { /* No need to escape anything */ - return Py_NewRef(pystr); + if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { + return -1; + } + if (PyUnicodeWriter_WriteStr(writer, pystr) < 0) { + return -1; + } + return PyUnicodeWriter_WriteChar(writer, '"'); } rval = PyUnicode_New(output_size, maxchar); if (rval == NULL) - return NULL; + return -1; kind = PyUnicode_KIND(rval); #define ENCODE_OUTPUT do { \ chars = 0; \ + output[chars++] = '"'; \ for (i = 0; i < input_chars; i++) { \ Py_UCS4 c = PyUnicode_READ(kind, input, i); \ switch (c) { \ @@ -445,6 +462,7 @@ escape_unicode_ex(PyObject *pystr) } \ } \ } \ + output[chars++] = '"'; \ } while (0) if (kind == PyUnicode_1BYTE_KIND) { @@ -463,7 +481,7 @@ escape_unicode_ex(PyObject *pystr) #ifdef Py_DEBUG assert(_PyUnicode_CheckConsistency(rval, 1)); #endif - return rval; + return _steal_accumulate(writer, rval); } static void @@ -1419,10 +1437,10 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (PyCFunction_Check(s->encoder)) { PyCFunction f = PyCFunction_GetFunction(s->encoder); if (f == py_encode_basestring_ascii) { - s->fast_encode = ascii_escape_unicode_ex; + s->fast_encode = write_escaped_ascii; } else if (f == py_encode_basestring) { - s->fast_encode = escape_unicode_ex; + s->fast_encode = write_escaped_unicode; } } @@ -1619,17 +1637,7 @@ encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj) PyObject *encoded; if (s->fast_encode) { - if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { - return -1; - } - encoded = s->fast_encode(obj); - if (encoded == NULL) { - return -1; - } - if (_steal_accumulate(writer, encoded) < 0) { - return -1; - } - return PyUnicodeWriter_WriteChar(writer, '"'); + return s->fast_encode(writer, obj); } encoded = PyObject_CallOneArg(s->encoder, obj); if (encoded == NULL) { From 25da453860aa5341ed417ea6b7a5c52dd85a9bc7 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Fri, 30 May 2025 18:08:21 +0200 Subject: [PATCH 3/8] Reduce diff --- Modules/_json.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index e9354888de3109..4f9ca1f96a0b49 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -1621,15 +1621,6 @@ encoder_encode_float(PyEncoderObject *s, PyObject *obj) return PyFloat_Type.tp_repr(obj); } -static int -_steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen) -{ - /* Append stolen and then decrement its reference count */ - int rval = PyUnicodeWriter_WriteStr(writer, stolen); - Py_DECREF(stolen); - return rval; -} - static int encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj) { @@ -1653,6 +1644,15 @@ encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj) return _steal_accumulate(writer, encoded); } +static int +_steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen) +{ + /* Append stolen and then decrement its reference count */ + int rval = PyUnicodeWriter_WriteStr(writer, stolen); + Py_DECREF(stolen); + return rval; +} + static int encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj, From b4a802674bb4ae892109c6ec3af728f6938f81a5 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Tue, 10 Jun 2025 08:20:36 +0200 Subject: [PATCH 4/8] Reduce diff --- Modules/_json.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_json.c b/Modules/_json.c index 4f9ca1f96a0b49..00028a80f53366 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -1805,7 +1805,7 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs Py_DECREF(keystr); if (rv < 0) { - return rv; + return -1; } if (PyUnicodeWriter_WriteStr(writer, s->key_separator) < 0) { return -1; From 8ee4c93abcced6b8fdf248b5492ba2c483d6b2b0 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Tue, 10 Jun 2025 10:09:28 +0200 Subject: [PATCH 5/8] Reuse code --- Modules/_json.c | 227 ++++++++++++++++-------------------------------- 1 file changed, 77 insertions(+), 150 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 00028a80f53366..3b3f58239a9705 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -148,17 +148,13 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars) return chars; } -static PyObject * -ascii_escape_unicode(PyObject *pystr) +static int +ascii_escape_size(PyObject *pystr) { - /* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */ Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; const void *input; - Py_UCS1 *output; int kind; input_chars = PyUnicode_GET_LENGTH(pystr); @@ -183,11 +179,29 @@ ascii_escape_unicode(PyObject *pystr) } if (output_size > PY_SSIZE_T_MAX - d) { PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); - return NULL; + return -1; } output_size += d; } + return output_size; +} + +static PyObject * +ascii_escape_unicode_and_size(PyObject *pystr, Py_ssize_t output_size) +{ + Py_ssize_t i; + Py_ssize_t input_chars; + Py_ssize_t chars; + PyObject *rval; + const void *input; + Py_UCS1 *output; + int kind; + + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + rval = PyUnicode_New(output_size, 127); if (rval == NULL) { return NULL; @@ -211,47 +225,27 @@ ascii_escape_unicode(PyObject *pystr) return rval; } -static int -write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) +static PyObject * +ascii_escape_unicode(PyObject *pystr) { /* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */ - Py_ssize_t i; - Py_ssize_t input_chars; - Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; - const void *input; - Py_UCS1 *output; - int kind; + Py_ssize_t output_size = ascii_escape_size(pystr); + if (output_size < 0) { + return NULL; + } - input_chars = PyUnicode_GET_LENGTH(pystr); - input = PyUnicode_DATA(pystr); - kind = PyUnicode_KIND(pystr); + return ascii_escape_unicode_and_size(pystr, output_size); +} - /* Compute the output size */ - for (i = 0, output_size = 2; i < input_chars; i++) { - Py_UCS4 c = PyUnicode_READ(kind, input, i); - Py_ssize_t d; - if (S_CHAR(c)) { - d = 1; - } - else { - switch(c) { - case '\\': case '"': case '\b': case '\f': - case '\n': case '\r': case '\t': - d = 2; break; - default: - d = c >= 0x10000 ? 12 : 6; - } - } - if (output_size > PY_SSIZE_T_MAX - d) { - PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); - return -1; - } - output_size += d; +static int +write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) +{ + Py_ssize_t output_size = ascii_escape_size(pystr); + if (output_size < 0) { + return -1; } - if (output_size == input_chars + 2) { + if (output_size == PyUnicode_GET_LENGTH(pystr) + 2) { /* No need to escape anything */ if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { return -1; @@ -262,43 +256,23 @@ write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) return PyUnicodeWriter_WriteChar(writer, '"'); } - rval = PyUnicode_New(output_size, 127); + PyObject *rval = ascii_escape_unicode_and_size(pystr, output_size); if (rval == NULL) { return -1; } - output = PyUnicode_1BYTE_DATA(rval); - chars = 0; - output[chars++] = '"'; - for (i = 0; i < input_chars; i++) { - Py_UCS4 c = PyUnicode_READ(kind, input, i); - if (S_CHAR(c)) { - output[chars++] = c; - } - else { - chars = ascii_escape_unichar(c, output, chars); - } - } - output[chars++] = '"'; -#ifdef Py_DEBUG - assert(_PyUnicode_CheckConsistency(rval, 1)); -#endif + return _steal_accumulate(writer, rval); } -static PyObject * -escape_unicode(PyObject *pystr) +static int +escape_size(PyObject *pystr) { - /* Take a PyUnicode pystr and return a new escaped PyUnicode */ Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; const void *input; int kind; - Py_UCS4 maxchar; - maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); input_chars = PyUnicode_GET_LENGTH(pystr); input = PyUnicode_DATA(pystr); kind = PyUnicode_KIND(pystr); @@ -320,11 +294,30 @@ escape_unicode(PyObject *pystr) } if (output_size > PY_SSIZE_T_MAX - d) { PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); - return NULL; + return -1; } output_size += d; } + return output_size; +} + +static PyObject * +escape_unicode_and_size(PyObject *pystr, Py_ssize_t output_size) +{ + Py_ssize_t i; + Py_ssize_t input_chars; + Py_ssize_t chars; + PyObject *rval; + const void *input; + int kind; + Py_UCS4 maxchar; + + maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + rval = PyUnicode_New(output_size, maxchar); if (rval == NULL) return NULL; @@ -379,47 +372,27 @@ escape_unicode(PyObject *pystr) return rval; } -static int -write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr) +static PyObject * +escape_unicode(PyObject *pystr) { /* Take a PyUnicode pystr and return a new escaped PyUnicode */ - Py_ssize_t i; - Py_ssize_t input_chars; - Py_ssize_t output_size; - Py_ssize_t chars; - PyObject *rval; - const void *input; - int kind; - Py_UCS4 maxchar; + Py_ssize_t output_size = escape_size(pystr); + if (output_size < 0) { + return NULL; + } - maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); - input_chars = PyUnicode_GET_LENGTH(pystr); - input = PyUnicode_DATA(pystr); - kind = PyUnicode_KIND(pystr); + return escape_unicode_and_size(pystr, output_size); +} - /* Compute the output size */ - for (i = 0, output_size = 2; i < input_chars; i++) { - Py_UCS4 c = PyUnicode_READ(kind, input, i); - Py_ssize_t d; - switch (c) { - case '\\': case '"': case '\b': case '\f': - case '\n': case '\r': case '\t': - d = 2; - break; - default: - if (c <= 0x1f) - d = 6; - else - d = 1; - } - if (output_size > PY_SSIZE_T_MAX - d) { - PyErr_SetString(PyExc_OverflowError, "string is too long to escape"); - return -1; - } - output_size += d; +static int +write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr) +{ + Py_ssize_t output_size = escape_size(pystr); + if (output_size < 0) { + return -1; } - if (output_size == input_chars + 2) { + if (output_size == PyUnicode_GET_LENGTH(pystr) + 2) { /* No need to escape anything */ if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { return -1; @@ -430,57 +403,11 @@ write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr) return PyUnicodeWriter_WriteChar(writer, '"'); } - rval = PyUnicode_New(output_size, maxchar); - if (rval == NULL) + PyObject *rval = escape_unicode_and_size(pystr, output_size); + if (rval == NULL) { return -1; - - kind = PyUnicode_KIND(rval); - -#define ENCODE_OUTPUT do { \ - chars = 0; \ - output[chars++] = '"'; \ - for (i = 0; i < input_chars; i++) { \ - Py_UCS4 c = PyUnicode_READ(kind, input, i); \ - switch (c) { \ - case '\\': output[chars++] = '\\'; output[chars++] = c; break; \ - case '"': output[chars++] = '\\'; output[chars++] = c; break; \ - case '\b': output[chars++] = '\\'; output[chars++] = 'b'; break; \ - case '\f': output[chars++] = '\\'; output[chars++] = 'f'; break; \ - case '\n': output[chars++] = '\\'; output[chars++] = 'n'; break; \ - case '\r': output[chars++] = '\\'; output[chars++] = 'r'; break; \ - case '\t': output[chars++] = '\\'; output[chars++] = 't'; break; \ - default: \ - if (c <= 0x1f) { \ - output[chars++] = '\\'; \ - output[chars++] = 'u'; \ - output[chars++] = '0'; \ - output[chars++] = '0'; \ - output[chars++] = Py_hexdigits[(c >> 4) & 0xf]; \ - output[chars++] = Py_hexdigits[(c ) & 0xf]; \ - } else { \ - output[chars++] = c; \ - } \ - } \ - } \ - output[chars++] = '"'; \ - } while (0) - - if (kind == PyUnicode_1BYTE_KIND) { - Py_UCS1 *output = PyUnicode_1BYTE_DATA(rval); - ENCODE_OUTPUT; - } else if (kind == PyUnicode_2BYTE_KIND) { - Py_UCS2 *output = PyUnicode_2BYTE_DATA(rval); - ENCODE_OUTPUT; - } else { - Py_UCS4 *output = PyUnicode_4BYTE_DATA(rval); - assert(kind == PyUnicode_4BYTE_KIND); - ENCODE_OUTPUT; } -#undef ENCODE_OUTPUT -#ifdef Py_DEBUG - assert(_PyUnicode_CheckConsistency(rval, 1)); -#endif return _steal_accumulate(writer, rval); } From 7b055be5488f747416762dc2ce8cb7b76df5f1ec Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Tue, 10 Jun 2025 10:33:24 +0200 Subject: [PATCH 6/8] Fix compiler warnings --- Modules/_json.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 3b3f58239a9705..1dd00e80b6b61f 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -148,7 +148,7 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars) return chars; } -static int +static Py_ssize_t ascii_escape_size(PyObject *pystr) { Py_ssize_t i; @@ -264,7 +264,7 @@ write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) return _steal_accumulate(writer, rval); } -static int +static Py_ssize_t escape_size(PyObject *pystr) { Py_ssize_t i; From 660d962602ecd4e5cc34cfe2aeb1f9af021187cc Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Wed, 11 Jun 2025 15:08:04 +0000 Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2025-06-11-15-08-02.gh-issue-135336.6Gq6MI.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-06-11-15-08-02.gh-issue-135336.6Gq6MI.rst diff --git a/Misc/NEWS.d/next/Library/2025-06-11-15-08-02.gh-issue-135336.6Gq6MI.rst b/Misc/NEWS.d/next/Library/2025-06-11-15-08-02.gh-issue-135336.6Gq6MI.rst new file mode 100644 index 00000000000000..8d317c4ca288ea --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-06-11-15-08-02.gh-issue-135336.6Gq6MI.rst @@ -0,0 +1 @@ +:mod:`json` now encodes strings up to 2.2x faster if they consist solely of ASCII characters that don’t require escaping. From 19f252a80e20240bd6915c2b8183c4567dc171a1 Mon Sep 17 00:00:00 2001 From: Nice Zombies Date: Thu, 12 Jun 2025 09:32:17 +0200 Subject: [PATCH 8/8] Improve fast path --- Modules/_json.c | 94 ++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 1dd00e80b6b61f..64c43b62109f3d 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -149,17 +149,10 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars) } static Py_ssize_t -ascii_escape_size(PyObject *pystr) +ascii_escape_size(const void *input, int kind, Py_ssize_t input_chars) { Py_ssize_t i; - Py_ssize_t input_chars; Py_ssize_t output_size; - const void *input; - int kind; - - input_chars = PyUnicode_GET_LENGTH(pystr); - input = PyUnicode_DATA(pystr); - kind = PyUnicode_KIND(pystr); /* Compute the output size */ for (i = 0, output_size = 2; i < input_chars; i++) { @@ -188,19 +181,12 @@ ascii_escape_size(PyObject *pystr) } static PyObject * -ascii_escape_unicode_and_size(PyObject *pystr, Py_ssize_t output_size) +ascii_escape_unicode_and_size(const void *input, int kind, Py_ssize_t input_chars, Py_ssize_t output_size) { Py_ssize_t i; - Py_ssize_t input_chars; Py_ssize_t chars; PyObject *rval; - const void *input; Py_UCS1 *output; - int kind; - - input_chars = PyUnicode_GET_LENGTH(pystr); - input = PyUnicode_DATA(pystr); - kind = PyUnicode_KIND(pystr); rval = PyUnicode_New(output_size, 127); if (rval == NULL) { @@ -229,23 +215,39 @@ static PyObject * ascii_escape_unicode(PyObject *pystr) { /* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */ - Py_ssize_t output_size = ascii_escape_size(pystr); + Py_ssize_t input_chars; + const void *input; + int kind; + + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + Py_ssize_t output_size = ascii_escape_size(input, kind, input_chars); if (output_size < 0) { return NULL; } - return ascii_escape_unicode_and_size(pystr, output_size); + return ascii_escape_unicode_and_size(input, kind, input_chars, output_size); } static int write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) { - Py_ssize_t output_size = ascii_escape_size(pystr); + Py_ssize_t input_chars; + const void *input; + int kind; + + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + Py_ssize_t output_size = ascii_escape_size(input, kind, input_chars); if (output_size < 0) { return -1; } - if (output_size == PyUnicode_GET_LENGTH(pystr) + 2) { + if (output_size == input_chars + 2) { /* No need to escape anything */ if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { return -1; @@ -256,7 +258,7 @@ write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) return PyUnicodeWriter_WriteChar(writer, '"'); } - PyObject *rval = ascii_escape_unicode_and_size(pystr, output_size); + PyObject *rval = ascii_escape_unicode_and_size(input, kind, input_chars, output_size); if (rval == NULL) { return -1; } @@ -265,17 +267,10 @@ write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr) } static Py_ssize_t -escape_size(PyObject *pystr) +escape_size(const void *input, int kind, Py_ssize_t input_chars) { Py_ssize_t i; - Py_ssize_t input_chars; Py_ssize_t output_size; - const void *input; - int kind; - - input_chars = PyUnicode_GET_LENGTH(pystr); - input = PyUnicode_DATA(pystr); - kind = PyUnicode_KIND(pystr); /* Compute the output size */ for (i = 0, output_size = 2; i < input_chars; i++) { @@ -303,20 +298,11 @@ escape_size(PyObject *pystr) } static PyObject * -escape_unicode_and_size(PyObject *pystr, Py_ssize_t output_size) +escape_unicode_and_size(const void *input, int kind, Py_UCS4 maxchar, Py_ssize_t input_chars, Py_ssize_t output_size) { Py_ssize_t i; - Py_ssize_t input_chars; Py_ssize_t chars; PyObject *rval; - const void *input; - int kind; - Py_UCS4 maxchar; - - maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); - input_chars = PyUnicode_GET_LENGTH(pystr); - input = PyUnicode_DATA(pystr); - kind = PyUnicode_KIND(pystr); rval = PyUnicode_New(output_size, maxchar); if (rval == NULL) @@ -376,23 +362,43 @@ static PyObject * escape_unicode(PyObject *pystr) { /* Take a PyUnicode pystr and return a new escaped PyUnicode */ - Py_ssize_t output_size = escape_size(pystr); + Py_ssize_t input_chars; + const void *input; + int kind; + Py_UCS4 maxchar; + + maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + Py_ssize_t output_size = escape_size(input, kind, input_chars); if (output_size < 0) { return NULL; } - return escape_unicode_and_size(pystr, output_size); + return escape_unicode_and_size(input, kind, maxchar, input_chars, output_size); } static int write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr) { - Py_ssize_t output_size = escape_size(pystr); + Py_ssize_t input_chars; + const void *input; + int kind; + Py_UCS4 maxchar; + + maxchar = PyUnicode_MAX_CHAR_VALUE(pystr); + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + Py_ssize_t output_size = escape_size(input, kind, input_chars); if (output_size < 0) { return -1; } - if (output_size == PyUnicode_GET_LENGTH(pystr) + 2) { + if (output_size == input_chars + 2) { /* No need to escape anything */ if (PyUnicodeWriter_WriteChar(writer, '"') < 0) { return -1; @@ -403,7 +409,7 @@ write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr) return PyUnicodeWriter_WriteChar(writer, '"'); } - PyObject *rval = escape_unicode_and_size(pystr, output_size); + PyObject *rval = escape_unicode_and_size(input, kind, maxchar, input_chars, output_size); if (rval == NULL) { return -1; }