py-pdf · stefan6419846 · May 19, 2025 · Feb 27, 2025 · Feb 27, 2025 · Feb 27, 2025
diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml
@@ -70,7 +70,7 @@ jobs:
         sudo apt-get update
     - name: Install APT dependencies
       run:
-        sudo apt-get install ghostscript poppler-utils
+        sudo apt-get install ghostscript jbig2dec poppler-utils
     - name: Checkout Code
       uses: actions/checkout@v4
       with:

diff --git a/docs/user/installation.md b/docs/user/installation.md
@@ -48,6 +48,14 @@ If you plan to use image extraction, you need Pillow:
 pip install pypdf[image]
 ```
 
+For JBIG2 support, you need to install a global OS-level package as well:
+[`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec). The installation procedure
+depends on your operating system. For Ubuntu, for example, use the following:
+
+```
+sudo apt-get install jbig2dec
+```
+
 ## Python Version Support
 
 Since pypdf 4.0, every release, including point releases, should work with all

diff --git a/pypdf/constants.py b/pypdf/constants.py
@@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
     CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
     DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
     JPX_DECODE = "/JPXDecode"
+    JBIG2_DECODE = "/JBIG2Decode"
 
 
 class FilterTypeAbbreviations:

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -35,11 +35,16 @@
 __author_email__ = "[email protected]"
 
 import math
+import os
+import shutil
 import struct
+import subprocess
 import zlib
 from base64 import a85decode
 from dataclasses import dataclass
 from io import BytesIO
+from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from ._codecs._codecs import LzwCodec as _LzwCodec
@@ -56,13 +61,15 @@
 from .constants import ImageAttributes as IA
 from .constants import LzwFilterParameters as LZW
 from .constants import StreamAttributes as SA
-from .errors import DeprecationError, PdfReadError, PdfStreamError
+from .errors import DependencyError, DeprecationError, PdfReadError, PdfStreamError
 from .generic import (
     ArrayObject,
     BooleanObject,
     DictionaryObject,
     IndirectObject,
     NullObject,
+    StreamObject,
+    is_null_or_none,
 )
 
 
@@ -641,6 +648,67 @@ def decode(
         return tiff_header + data
 
 
+JBIG2DEC_BINARY = shutil.which("jbig2dec")
+
+
+class JBIG2Decode:
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        if JBIG2DEC_BINARY is None:
+            raise DependencyError("jbig2dec binary is not available.")
+
+        with TemporaryDirectory() as tempdir:
+            directory = Path(tempdir)
+            paths: List[Path] = []
+
+            if decode_parms and "/JBIG2Globals" in decode_parms:
+                jbig2_globals = decode_parms["/JBIG2Globals"]
+                if not is_null_or_none(jbig2_globals) and not is_null_or_none(pointer := jbig2_globals.get_object()):
+                    assert pointer is not None, "mypy"
+                    if isinstance(pointer, StreamObject):
+                        path = directory.joinpath("globals.jbig2")
+                        path.write_bytes(pointer.get_data())
+                        paths.append(path)
+
+            path = directory.joinpath("image.jbig2")
+            path.write_bytes(data)
+            paths.append(path)
+
+            environment = os.environ.copy()
+            environment["LC_ALL"] = "C"
+            result = subprocess.run(  # noqa: S603
+                [JBIG2DEC_BINARY, "--embedded", "--format", "png", "--output", "-", *paths],
+                capture_output=True,
+                env=environment,
+            )
+            if b"unrecognized option '--embedded'" in result.stderr:
+                raise DependencyError("jbig2dec>=0.15 is required.")
+            if result.stderr:
+                for line in result.stderr.decode("utf-8").splitlines():
+                    logger_warning(line, __name__)
+            if result.returncode != 0:
+                raise PdfStreamError(f"Unable to decode JBIG2 data. Exit code: {result.returncode}")
+        return result.stdout
+
+    @staticmethod
+    def _is_binary_compatible() -> bool:
+        if not JBIG2DEC_BINARY:  # pragma: no cover
+            return False
+        result = subprocess.run(  # noqa: S603
+            [JBIG2DEC_BINARY, "--version"],
+            capture_output=True,
+            text=True,
+        )
+        version = result.stdout.split(" ", maxsplit=1)[1]
+
+        from ._utils import Version
+        return Version(version) >= Version("0.15")
+
+
 def decode_stream_data(stream: Any) -> bytes:
     """
     Decode the stream data based on the specified filters.
@@ -691,6 +759,8 @@ def decode_stream_data(stream: Any) -> bytes:
             data = DCTDecode.decode(data)
         elif filter_name == FT.JPX_DECODE:
             data = JPXDecode.decode(data)
+        elif filter_name == FT.JBIG2_DECODE:
+            data = JBIG2Decode.decode(data, params)
         elif filter_name == "/Crypt":
             if "/Name" in params or "/Type" in params:
                 raise NotImplementedError(
@@ -828,6 +898,13 @@ def _apply_alpha(
             ".tiff",
             False,
         )
+    elif lfilters == FT.JBIG2_DECODE:
+        img, image_format, extension, invert_color = (
+            Image.open(BytesIO(data), formats=("PNG",)),
+            "PNG",
+            ".png",
+            False,
+        )
     elif mode == "CMYK":
         img, image_format, extension, invert_color = (
             _extended_image_frombytes(mode, size, data),

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -6,21 +6,31 @@
 from io import BytesIO
 from itertools import product as cartesian_product
 from pathlib import Path
+from unittest import mock
 
 import pytest
 from PIL import Image, ImageOps
 
 from pypdf import PdfReader
-from pypdf.errors import DeprecationError, PdfReadError
+from pypdf.errors import DependencyError, DeprecationError, PdfReadError, PdfStreamError
 from pypdf.filters import (
     ASCII85Decode,
     ASCIIHexDecode,
     CCITParameters,
     CCITTFaxDecode,
     CCITTParameters,
     FlateDecode,
+    JBIG2Decode,
+)
+from pypdf.generic import (
+    ArrayObject,
+    ContentStream,
+    DictionaryObject,
+    IndirectObject,
+    NameObject,
+    NullObject,
+    NumberObject,
 )
-from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NumberObject
 
 from . import PILContext, get_data_from_url
 from .test_encryption import HAS_AES
@@ -697,3 +707,86 @@ def test_flate_decode__not_rectangular(caplog):
     expected = get_data_from_url(url, name=name)
     assert actual_image.getvalue() == expected
     assert caplog.messages == ["Image data is not rectangular. Adding padding."]
+
+
+def test_jbig2decode__binary_errors():
+    with mock.patch("pypdf.filters.JBIG2DEC_BINARY", None), \
+            pytest.raises(DependencyError, match="jbig2dec binary is not available."):
+        JBIG2Decode.decode(b"dummy")
+
+    result = subprocess.CompletedProcess(
+        args=["dummy"], returncode=0, stdout=b"",
+        stderr=(
+            b"jbig2dec: unrecognized option '--embedded'\n"
+            b"Usage: jbig2dec [options] <file.jbig2>\n"
+            b"   or  jbig2dec [options] <global_stream> <page_stream>\n"
+        )
+    )
+    with mock.patch("pypdf.filters.subprocess.run", return_value=result), \
+            mock.patch("pypdf.filters.JBIG2DEC_BINARY", "/usr/bin/jbig2dec"), \
+            pytest.raises(DependencyError, match="jbig2dec>=0.15 is required."):
+        JBIG2Decode.decode(b"dummy")
+
+
+@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
+def test_jbig2decode__edge_cases(caplog):
+    image_data = (
+        b'\x00\x00\x00\x010\x00\x01\x00\x00\x00\x13\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x06"'
+        b'\x00\x01\x00\x00\x00\x1c\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x9f\xa8_\xff\xac'
+
+    )
+    jbig2_globals = b"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x18\x00\x00\x03\xff\xfd\xff\x02\xfe\xfe\xfe\x00\x00\x00\x01\x00\x00\x00\x01R\xd0u7\xff\xac"  # noqa: E501
+
+    # Validation: Is our image data valid?
+    content_stream = ContentStream(stream=None, pdf=None)
+    content_stream.set_data(jbig2_globals)
+    result = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": content_stream}))
+    image = Image.open(BytesIO(result), formats=("PNG",))
+    for x in range(5):
+        for y in range(5):
+            assert image.getpixel((x, y)) == (255 if x < 3 else 0), (x, y)
+    assert caplog.messages == []
+
+    # No decode_params. Completely white image.
+    result = JBIG2Decode.decode(image_data)
+    image = Image.open(BytesIO(result), formats=("PNG",))
+    for x in range(5):
+        for y in range(5):
+            assert image.getpixel((x, y)) == 255, (x, y)
+    assert caplog.messages == [
+        "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)",
+        "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)"
+    ]
+    caplog.clear()
+
+    # JBIG2Globals is NULL. Completely white image.
+    result = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": NullObject()}))
+    image = Image.open(BytesIO(result), formats=("PNG",))
+    for x in range(5):
+        for y in range(5):
+            assert image.getpixel((x, y)) == 255, (x, y)
+    assert caplog.messages == [
+        "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)",
+        "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)"
+    ]
+    caplog.clear()
+
+    # JBIG2Globals is DictionaryObject. Completely white image.
+    result = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": DictionaryObject()}))
+    image = Image.open(BytesIO(result), formats=("PNG",))
+    for x in range(5):
+        for y in range(5):
+            assert image.getpixel((x, y)) == 255, (x, y)
+    assert caplog.messages == [
+        "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)",
+        "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)"
+    ]
+    caplog.clear()
+
+    # Invalid input.
+    with pytest.raises(PdfStreamError, match="Unable to decode JBIG2 data. Exit code: 1"):
+        JBIG2Decode.decode(b"aaaaaa")
+    assert caplog.messages == [
+        "jbig2dec FATAL ERROR page has no image, cannot be completed",
+        "jbig2dec WARNING unable to complete page"
+    ]
diff --git a/tests/test_images.py b/tests/test_images.py
@@ -15,6 +15,7 @@
 from PIL import Image, ImageChops, ImageDraw
 
 from pypdf import PageObject, PdfReader, PdfWriter
+from pypdf.filters import JBIG2Decode
 from pypdf.generic import ContentStream, NameObject, NullObject
 
 from . import get_data_from_url
@@ -530,3 +531,44 @@ def test_inline_image_containing_ei_in_body():
     output = BytesIO()
     writer.write(output)
     assert expected in output.getvalue()
+
+
+@pytest.mark.enable_socket
+@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
+def test_jbig2decode():
+    url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
+    name = "jbig2.pdf"
+
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    image = next(iter(page.images))
+    assert image.image.size == (5138, 6630)
+    assert image.image.mode == "1"
+    assert image.image.format == "PNG"
+
+    url = "https://github.com/user-attachments/assets/d6f88c80-a2e0-4ea9-b1e0-34442041d004"
+    name = "jbig2.png"
+    img = Image.open(BytesIO(get_data_from_url(url, name=name)))
+
+    assert image_similarity(image.image, img) >= 0.999
+
+
+@pytest.mark.enable_socket
+@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
+def test_jbig2decode__jbig2globals():
+    url = "https://github.com/user-attachments/files/20119148/out.pdf"
+    name = "jbig2_globals.pdf"
+
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    image = next(iter(page.images))
+    assert image.image.size == (1067, 1067)
+    assert image.image.mode == "1"
+    assert image.image.format == "PNG"
+
+    url = "https://github.com/user-attachments/assets/7ac41ee3-9c13-44cf-aa74-8f106287e354"
+    name = "jbig2_globals.png"
+    img = Image.open(BytesIO(get_data_from_url(url, name=name)))
+
+    # Wrong image: 0.9618265964800714
+    assert image_similarity(image.image, img) >= 0.999