From a181c01c8f177850f0bd82fd0d585bed507c9999 Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:11:30 -0500 Subject: [PATCH 01/10] Minor documentation updates + typo fixing --- Lib/pickletools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index c462d26da97ce1..fd202afb211881 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -110,7 +110,7 @@ # # The second major set of additions is now called "protocol 1", and was called # "binary mode" before Python 2.3. This added many opcodes with arguments -# consisting of arbitrary bytes, including NUL bytes and unprintable "high bit" +# consisting of arbitrary bytes, including NULL bytes and unprintable "high bit" # bytes. Binary mode pickles can be substantially smaller than equivalent # text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte # int as 4 bytes following the opcode, which is cheaper to unpickle than the @@ -1228,7 +1228,8 @@ def __init__(self, name, code, arg, The same as INT, except that the literal ends with 'L', and always unpickles to a Python long. There doesn't seem a real purpose to the - trailing 'L'. + trailing 'L', and the trailing 'L' is not required for Python 3.0 or + higher. Note that LONG takes time quadratic in the number of digits when unpickling (this is simply due to the nature of decimal->binary @@ -1271,7 +1272,7 @@ def __init__(self, name, code, arg, The argument is a repr-style string, with bracketing quote characters, and perhaps embedded escapes. The argument extends until the next newline character. These are usually decoded into a str instance - using the encoding given to the Unpickler constructor. or the default, + using the encoding given to the Unpickler constructor, or the default, 'ASCII'. If the encoding given was 'bytes' however, they will be decoded as bytes object instead. 
"""), @@ -1288,7 +1289,7 @@ def __init__(self, name, code, arg, signed int giving the number of bytes in the string, and the second is that many bytes, which are taken literally as the string content. These are usually decoded into a str instance using the - encoding given to the Unpickler constructor. or the default, + encoding given to the Unpickler constructor, or the default, 'ASCII'. If the encoding given was 'bytes' however, they will be decoded as bytes object instead. """), @@ -1305,7 +1306,7 @@ def __init__(self, name, code, arg, the number of bytes in the string, and the second is that many bytes, which are taken literally as the string content. These are usually decoded into a str instance using the encoding given to - the Unpickler constructor. or the default, 'ASCII'. If the + the Unpickler constructor, or the default, 'ASCII'. If the encoding given was 'bytes' however, they will be decoded as bytes object instead. """), From 6801bd24405dd73b7c068170b7dc4312bfa8f405 Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:41:40 -0500 Subject: [PATCH 02/10] Expand documentation on opcodes --- Lib/pickletools.py | 60 ++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index fd202afb211881..5d50af074ca1b2 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -1375,7 +1375,12 @@ def __init__(self, name, code, arg, stack_before=[], stack_after=[pybuffer], proto=5, - doc="Push an out-of-band buffer object."), + doc="""Push an out-of-band buffer object. + + An iterable must be passed to the Unpickler's 'buffer' argument, and + this opcode takes the next element from that iterable and puts it on + the stack. 
+ """), I(name='READONLY_BUFFER', code='\x98', @@ -1383,7 +1388,11 @@ def __init__(self, name, code, arg, stack_before=[pybuffer], stack_after=[pybuffer], proto=5, - doc="Make an out-of-band buffer object read-only."), + doc="""Make an out-of-band buffer object read-only. + + The top of the stack should be the out-of-band buffer object from + NEXT_BUFFER, and this object is set to read-only. + """), # Ways to spell None. @@ -1541,7 +1550,9 @@ def __init__(self, name, code, arg, Stack before: ... pylist markobject stackslice Stack after: ... pylist+stackslice - although pylist is really extended in-place. + although pylist is really extended in-place. The .extend() attribute + function is attempted first, and if that fails the .append() attribute + is ran instead. """), I(name='LIST', @@ -1669,7 +1680,9 @@ def __init__(self, name, code, arg, Stack before: ... pydict key value Stack after: ... pydict - where pydict has been modified via pydict[key] = value. + where pydict has been modified via pydict[key] = value. Note that any + type that supports item assignment can be modified here, such as a list + or bytearray. """), I(name='SETITEMS', @@ -1691,6 +1704,9 @@ def __init__(self, name, code, arg, where pydict has been modified via pydict[key_i] = value_i for i in 1, 2, ..., n, and in that order. + + Note that any type that supports item assignment can be modified here, + such as a list or bytearray. """), # Ways to build sets @@ -1749,7 +1765,12 @@ def __init__(self, name, code, arg, stack_before=[anyobject], stack_after=[], proto=0, - doc="Discard the top stack item, shrinking the stack by one item."), + doc="""Discard the top stack item, shrinking the stack by one item. + + If the stack has no items in it and the metastack is not empty, then + this opcode will act like POP_MARK and pop the top of the metastack + into the current stack. 
+ """), I(name='DUP', code='2', @@ -1796,9 +1817,9 @@ def __init__(self, name, code, arg, proto=0, doc="""Read an object from the memo and push it on the stack. - The index of the memo object to push is given by the newline-terminated - decimal string following. BINGET and LONG_BINGET are space-optimized - versions. + The index of the memo object to push is given by the positive + newline-terminated decimal string following. BINGET and LONG_BINGET + are space-optimized versions. """), I(name='BINGET', @@ -1833,9 +1854,9 @@ def __init__(self, name, code, arg, proto=0, doc="""Store the stack top into the memo. The stack is not popped. - The index of the memo location to write into is given by the newline- - terminated decimal string following. BINPUT and LONG_BINPUT are - space-optimized versions. + The index of the memo location to write into is given by the positive + newline-terminated decimal string following. BINPUT and LONG_BINPUT + are space-optimized versions. """), I(name='BINPUT', @@ -1895,8 +1916,11 @@ def __init__(self, name, code, arg, code registry ought to be global, although a range of codes may be reserved for private use. - EXT1 has a 1-byte integer argument. This is used to index into the - extension registry, and the object at that index is pushed on the stack. + EXT1 has a 1-byte integer argument. This is used to index into + the inverted extension registry, which contains integer to tuple + mappings. The tuples have a length of two in the format of + '("module", "name")'. This tuple is then passed through find_class, + and the result is pushed onto the stack. """), I(name='EXT2', @@ -1946,6 +1970,9 @@ def __init__(self, name, code, arg, stack_after=[anyobject], proto=4, doc="""Push a global object (module.attr) on the stack. + + This opcode behaves the same way as GLOBAL except the module and name + arguments are two separate strings popped from the top of the stack. 
"""), # Ways to build objects of classes pickle doesn't know about directly @@ -1973,13 +2000,6 @@ def __init__(self, name, code, arg, argument to be passed to the object's __setstate__, and then the REDUCE opcode is followed by code to create setstate's argument, and then a BUILD opcode to apply __setstate__ to that argument. - - If not isinstance(callable, type), REDUCE complains unless the - callable has been registered with the copyreg module's - safe_constructors dict, or the callable has a magic - '__safe_for_unpickling__' attribute with a true value. I'm not sure - why it does this, but I've sure seen this complaint often enough when - I didn't want to . """), I(name='BUILD', From 9ff845c52f2993bfa042ff1598bca9b40115d25d Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:47:15 -0500 Subject: [PATCH 03/10] Lint --- Lib/pickletools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 5d50af074ca1b2..75d8221a9c2c6a 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -1817,8 +1817,8 @@ def __init__(self, name, code, arg, proto=0, doc="""Read an object from the memo and push it on the stack. - The index of the memo object to push is given by the positive - newline-terminated decimal string following. BINGET and LONG_BINGET + The index of the memo object to push is given by the positive + newline-terminated decimal string following. BINGET and LONG_BINGET are space-optimized versions. """), @@ -1855,7 +1855,7 @@ def __init__(self, name, code, arg, doc="""Store the stack top into the memo. The stack is not popped. The index of the memo location to write into is given by the positive - newline-terminated decimal string following. BINPUT and LONG_BINPUT + newline-terminated decimal string following. BINPUT and LONG_BINPUT are space-optimized versions. 
"""), From 51fc59f875f37863496272c09a947817ae6f34fb Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:50:17 -0500 Subject: [PATCH 04/10] Lint2 --- Lib/pickletools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 75d8221a9c2c6a..2e570039c7dac7 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -1376,7 +1376,7 @@ def __init__(self, name, code, arg, stack_after=[pybuffer], proto=5, doc="""Push an out-of-band buffer object. - + An iterable must be passed to the Unpickler's 'buffer' argument, and this opcode takes the next element from that iterable and puts it on the stack. @@ -1389,7 +1389,7 @@ def __init__(self, name, code, arg, stack_after=[pybuffer], proto=5, doc="""Make an out-of-band buffer object read-only. - + The top of the stack should be the out-of-band buffer object from NEXT_BUFFER, and this object is set to read-only. """), @@ -1766,7 +1766,7 @@ def __init__(self, name, code, arg, stack_after=[], proto=0, doc="""Discard the top stack item, shrinking the stack by one item. - + If the stack has no items in it and the metastack is not empty, then this opcode will act like POP_MARK and pop the top of the metastack into the current stack. @@ -1970,7 +1970,7 @@ def __init__(self, name, code, arg, stack_after=[anyobject], proto=4, doc="""Push a global object (module.attr) on the stack. - + This opcode behaves the same way as GLOBAL except the module and name arguments are two separate strings popped from the top of the stack. 
 """), From 11dadbadc7f67eb021dbc151eeb27596bde69085 Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Mon, 2 Jun 2025 15:33:29 -0400 Subject: [PATCH 05/10] Expand some FRAME opcode code documentation --- Lib/pickletools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 2e570039c7dac7..9a3f79876e3e9e 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -2177,7 +2177,8 @@ def __init__(self, name, code, arg, doc="""Indicate the beginning of a new frame. The unpickler may use this opcode to safely prefetch data from its - underlying stream. + underlying stream, avoiding several small I/O reads during unpickling. + Frames shouldn't overlap with each other or split opcodes. """), # Ways to deal with persistent IDs. From ee4df6cd163faf4b7ea05e91ccb49d6d835bf2fe Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 2 Jun 2025 19:49:35 +0000 Subject: [PATCH 06/10] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst diff --git a/Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst b/Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst new file mode 100644 index 00000000000000..086c9c06cb62bf --- /dev/null +++ b/Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst @@ -0,0 +1 @@ +Expanded the documentation for some opcodes in the :mod:`pickletools` module. 
From 42eaa17c1ee567e00942404a90f2d0b68bc487cd Mon Sep 17 00:00:00 2001 From: Justin Applegate Date: Tue, 3 Jun 2025 09:26:21 -0400 Subject: [PATCH 07/10] Fixes --- Lib/pickletools.py | 5 ++--- .../2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 8a062a41bf686e..d96669e898669f 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -110,7 +110,7 @@ # # The second major set of additions is now called "protocol 1", and was called # "binary mode" before Python 2.3. This added many opcodes with arguments -# consisting of arbitrary bytes, including NULL bytes and unprintable "high bit" +# consisting of arbitrary bytes, including NUL bytes and unprintable "high bit" # bytes. Binary mode pickles can be substantially smaller than equivalent # text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte # int as 4 bytes following the opcode, which is cheaper to unpickle than the @@ -1228,8 +1228,7 @@ def __init__(self, name, code, arg, The same as INT, except that the literal ends with 'L', and always unpickles to a Python long. There doesn't seem a real purpose to the - trailing 'L', and the trailing 'L' is not required for Python 3.0 or - higher. + trailing 'L' and it's not required for Python 3.0 or higher. 
Note that LONG takes time quadratic in the number of digits when unpickling (this is simply due to the nature of decimal->binary diff --git a/Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst b/Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst deleted file mode 100644 index 086c9c06cb62bf..00000000000000 --- a/Misc/NEWS.d/next/Documentation/2025-06-02-19-49-33.gh-issue-135041.Jt5D8K.rst +++ /dev/null @@ -1 +0,0 @@ -Expanded the documentation for some opcodes in the :mod:`pickletools` module. From d8899903ce63400834dd8e6d4d06d9e1b8ccaf77 Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Tue, 3 Jun 2025 09:58:04 -0400 Subject: [PATCH 08/10] Update Lib/pickletools.py Add Oxford comma Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Lib/pickletools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index d96669e898669f..9f8a1af8a999b7 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -1228,7 +1228,7 @@ def __init__(self, name, code, arg, The same as INT, except that the literal ends with 'L', and always unpickles to a Python long. There doesn't seem a real purpose to the - trailing 'L' and it's not required for Python 3.0 or higher. + trailing 'L', and it's not required for Python 3.0 or higher. 
Note that LONG takes time quadratic in the number of digits when unpickling (this is simply due to the nature of decimal->binary From 05f1c48a72d651713221c234a182f7c51f8b7a43 Mon Sep 17 00:00:00 2001 From: Justin Applegate <70449145+Legoclones@users.noreply.github.com> Date: Tue, 17 Jun 2025 09:41:58 -0400 Subject: [PATCH 09/10] Update Lib/pickletools.py Clarify the `APPENDS` docstring Co-authored-by: Serhiy Storchaka --- Lib/pickletools.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 9f8a1af8a999b7..4ad0903f40874c 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -1549,9 +1549,8 @@ def __init__(self, name, code, arg, Stack before: ... pylist markobject stackslice Stack after: ... pylist+stackslice - although pylist is really extended in-place. The .extend() attribute - function is attempted first, and if that fails the .append() attribute - is ran instead. + although pylist is really extended in-place. The extend() method is + used if it exists, otherwise the append() method is used. """), I(name='LIST', From c5455d2f5ad15882ee181b8222ff1e384250e114 Mon Sep 17 00:00:00 2001 From: Justin Applegate Date: Tue, 17 Jun 2025 09:58:15 -0400 Subject: [PATCH 10/10] Removed implementation details from EXT1 opcode --- Lib/pickletools.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 4ad0903f40874c..b430534f8a7234 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -1763,12 +1763,7 @@ def __init__(self, name, code, arg, stack_before=[anyobject], stack_after=[], proto=0, - doc="""Discard the top stack item, shrinking the stack by one item. - - If the stack has no items in it and the metastack is not empty, then - this opcode will act like POP_MARK and pop the top of the metastack - into the current stack. 
- """), + doc="Discard the top stack item, shrinking the stack by one item."), I(name='DUP', code='2', @@ -1915,10 +1910,9 @@ def __init__(self, name, code, arg, be reserved for private use. EXT1 has a 1-byte integer argument. This is used to index into - the inverted extension registry, which contains integer to tuple - mappings. The tuples have a length of two in the format of - '("module", "name")'. This tuple is then passed through find_class, - and the result is pushed onto the stack. + the inverted extension registry, which is populated through + copyreg.add_extension(). The result is then passed through + find_class() and the callable is pushed onto the stack. """), I(name='EXT2',