Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add basic support for JBIG2 by using jbig2dec #3163

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/github-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
sudo apt-get update
- name: Install APT dependencies
run:
sudo apt-get install ghostscript poppler-utils
sudo apt-get install ghostscript jbig2dec poppler-utils
- name: Checkout Code
uses: actions/checkout@v4
with:
Expand Down
8 changes: 8 additions & 0 deletions docs/user/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ If you plan to use image extraction, you need Pillow:
pip install pypdf[image]
```

For JBIG2 support, you need to install a global OS-level package as well:
[`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec). The installation procedure
depends on your operating system. For example, on Ubuntu use:

```
sudo apt-get install jbig2dec
```

## Python Version Support

Since pypdf 4.0, every release, including point releases, should work with all
Expand Down
1 change: 1 addition & 0 deletions pypdf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF
DCT_DECODE = "/DCTDecode" # abbreviation: DCT
JPX_DECODE = "/JPXDecode"
JBIG2_DECODE = "/JBIG2Decode"


class FilterTypeAbbreviations:
Expand Down
58 changes: 57 additions & 1 deletion pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,15 @@
__author_email__ = "[email protected]"

import math
import os
import shutil
import struct
import subprocess
import zlib
from base64 import a85decode
from dataclasses import dataclass
from io import BytesIO
from tempfile import NamedTemporaryFile
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._codecs._codecs import LzwCodec as _LzwCodec
Expand All @@ -56,7 +60,7 @@
from .constants import ImageAttributes as IA
from .constants import LzwFilterParameters as LZW
from .constants import StreamAttributes as SA
from .errors import DeprecationError, PdfReadError, PdfStreamError
from .errors import DependencyError, DeprecationError, PdfReadError, PdfStreamError
from .generic import (
ArrayObject,
DictionaryObject,
Expand Down Expand Up @@ -614,6 +618,49 @@ def decode(
return tiff_header + data


# Path of the ``jbig2dec`` executable as located on PATH when this module is
# imported, or ``None`` if no such executable was found.
_JBIG2DEC_BINARY = shutil.which("jbig2dec")


class JBIG2Decode:
    """Decode ``/JBIG2Decode`` streams by calling the external ``jbig2dec`` binary."""

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Convert embedded JBIG2 data to a PNG image using ``jbig2dec``.

        Args:
            data: The embedded JBIG2 stream data.
            decode_parms: Unused by this filter.
            **kwargs: Unused by this filter.

        Returns:
            The bytes ``jbig2dec`` wrote to its standard output (PNG output
            is requested on the command line).

        Raises:
            DependencyError: If the ``jbig2dec`` binary is not available or
                does not recognize the ``--embedded`` option.

        """
        if _JBIG2DEC_BINARY is None:
            raise DependencyError("jbig2dec binary is not available.")

        # The file has to be closed before another process opens it by name:
        # on Windows, a NamedTemporaryFile which is still open cannot be opened
        # a second time. Thus, disable auto-deletion and clean up manually.
        infile = NamedTemporaryFile(suffix=".jbig2", delete=False)
        try:
            try:
                infile.write(data)
            finally:
                infile.close()

            # Force the C locale so that the error message matched below is
            # not translated.
            environment = os.environ.copy()
            environment["LC_ALL"] = "C"
            result = subprocess.run(  # noqa: S603
                [_JBIG2DEC_BINARY, "--embedded", "--format", "png", "--output", "-", infile.name],
                capture_output=True,
                env=environment,
            )
        finally:
            os.unlink(infile.name)

        if b"unrecognized option '--embedded'" in result.stderr:
            raise DependencyError("jbig2dec>=0.15 is required.")
        return result.stdout

    @staticmethod
    def _is_binary_compatible() -> bool:
        """
        Check whether a usable ``jbig2dec`` binary is installed.

        Returns:
            ``True`` if the binary is available and reports at least version
            0.15, ``False`` if it is missing or its ``--version`` output does
            not have the expected ``<name> <version>`` form.

        """
        if not _JBIG2DEC_BINARY:  # pragma: no cover
            return False
        result = subprocess.run(  # noqa: S603
            [_JBIG2DEC_BINARY, "--version"],
            capture_output=True,
            text=True,
        )
        # Everything after the first space is treated as the version. Empty or
        # unexpected output must not raise, as this method is evaluated inside
        # a ``skipif`` decorator during test collection.
        name_and_version = result.stdout.split(" ", maxsplit=1)
        if len(name_and_version) < 2:  # pragma: no cover
            return False
        version = name_and_version[1]

        from ._utils import Version
        return Version(version) >= Version("0.15")


def decode_stream_data(stream: Any) -> bytes:
"""
Decode the stream data based on the specified filters.
Expand Down Expand Up @@ -664,6 +711,8 @@ def decode_stream_data(stream: Any) -> bytes:
data = DCTDecode.decode(data)
elif filter_name == FT.JPX_DECODE:
data = JPXDecode.decode(data)
elif filter_name == FT.JBIG2_DECODE:
data = JBIG2Decode.decode(data)
elif filter_name == "/Crypt":
if "/Name" in params or "/Type" in params:
raise NotImplementedError(
Expand Down Expand Up @@ -794,6 +843,13 @@ def _apply_alpha(
".tiff",
False,
)
elif lfilters == FT.JBIG2_DECODE:
img, image_format, extension, invert_color = (
Image.open(BytesIO(data), formats=("PNG",)),
"PNG",
".png",
False,
)
elif mode == "CMYK":
img, image_format, extension, invert_color = (
_extended_image_frombytes(mode, size, data),
Expand Down
23 changes: 22 additions & 1 deletion tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,21 @@
from io import BytesIO
from itertools import product as cartesian_product
from pathlib import Path
from unittest import mock

import pytest
from PIL import Image

from pypdf import PdfReader
from pypdf.errors import DeprecationError, PdfReadError
from pypdf.errors import DependencyError, DeprecationError, PdfReadError
from pypdf.filters import (
ASCII85Decode,
ASCIIHexDecode,
CCITParameters,
CCITTFaxDecode,
CCITTParameters,
FlateDecode,
JBIG2Decode,
)
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NumberObject

Expand Down Expand Up @@ -642,3 +644,22 @@ def test_ascii85decode__non_recoverable(caplog):
with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"):
ASCII85Decode.decode(data)
assert caplog.text == ""


def test_jbig2decode__binary_errors():
    """Check the DependencyError cases of JBIG2Decode.decode."""
    # Case 1: no jbig2dec binary has been found at all.
    no_binary = mock.patch("pypdf.filters._JBIG2DEC_BINARY", None)
    expect_unavailable = pytest.raises(DependencyError, match="jbig2dec binary is not available.")
    with no_binary, expect_unavailable:
        JBIG2Decode.decode(b"dummy")

    # Case 2: the binary exists, but rejects the `--embedded` option.
    usage_message = (
        b"jbig2dec: unrecognized option '--embedded'\n"
        b"Usage: jbig2dec [options] <file.jbig2>\n"
        b" or jbig2dec [options] <global_stream> <page_stream>\n"
    )
    completed = subprocess.CompletedProcess(
        args=["dummy"],
        returncode=0,
        stdout=b"",
        stderr=usage_message,
    )
    fake_run = mock.patch("pypdf.filters.subprocess.run", return_value=completed)
    fake_binary = mock.patch("pypdf.filters._JBIG2DEC_BINARY", "/usr/bin/jbig2dec")
    expect_outdated = pytest.raises(DependencyError, match="jbig2dec>=0.15 is required.")
    with fake_run, fake_binary, expect_outdated:
        JBIG2Decode.decode(b"dummy")
21 changes: 21 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from PIL import Image, ImageChops, ImageDraw

from pypdf import PageObject, PdfReader, PdfWriter
from pypdf.filters import JBIG2Decode
from pypdf.generic import ContentStream, NameObject, NullObject

from . import get_data_from_url
Expand Down Expand Up @@ -530,3 +531,23 @@ def test_inline_image_containing_ei_in_body():
output = BytesIO()
writer.write(output)
assert expected in output.getvalue()


@pytest.mark.enable_socket
@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
def test_jbig2decode():
    """Extract a JBIG2 image from a PDF file and compare it to a reference PNG."""
    pdf_url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
    pdf_data = get_data_from_url(pdf_url, name="jbig2.pdf")

    # The first image on the first page is the one under test.
    first_page = PdfReader(BytesIO(pdf_data)).pages[0]
    extracted = next(iter(first_page.images))
    assert extracted.image.size == (5138, 6630)
    assert extracted.image.mode == "1"
    assert extracted.image.format == "PNG"

    # Compare against the reference rendering.
    reference_url = "https://github.com/user-attachments/assets/d6f88c80-a2e0-4ea9-b1e0-34442041d004"
    reference_data = get_data_from_url(reference_url, name="jbig2.png")
    reference_image = Image.open(BytesIO(reference_data))

    assert image_similarity(extracted.image, reference_image) >= 0.999
Loading