Skip to content

Commit

Permalink
feat: enhance PDF processing with improved logging and filename handling
Browse files: browse the repository at this point in the history
  • Loading branch information
titusz committed Oct 3, 2024
1 parent 4048919 commit cb2958a
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions iscc_metagen/pdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,7 +18,11 @@ def pdf_open(doc):
:raises TypeError: If input is not a string, Path, or Document object
"""
if isinstance(doc, (str, Path)):
doc = Path(doc)
filename = doc.name
log.info(f"{doc.name} -> open with PDF processor")
doc = pymupdf.open(doc)
doc.filename = filename
elif not isinstance(doc, Document):
raise TypeError("Input must be a string, Path, or Document object")
return doc
Expand All @@ -36,26 +40,29 @@ def pdf_extract_pages(doc, first=None, middle=None, last=None):
:return: Extracted pages as Markdown text
"""
doc = pdf_open(doc)
log.debug(f"{doc.name} -> {doc.page_count}")

first = first if first is not None else mg_opts.front_pages
middle = middle if middle is not None else mg_opts.mid_pages
last = last if last is not None else mg_opts.back_pages
log.info(f"{doc.filename} -> Extracting markdown for {first} first, {middle} middle, {last} last pages")

first_pages = list(range(first)) if first else []
center = doc.page_count // 2
middle_pages = list(range(center, center + middle)) if middle else []
last_pages = list(range(doc.page_count - last, doc.page_count)) if last else []
page_numbers = first_pages + middle_pages + last_pages

return pymupdf4llm.to_markdown(
text_md = pymupdf4llm.to_markdown(
doc,
pages=page_numbers,
embed_images=False,
page_chunks=False,
show_progress=False,
)

log.info(f"{doc.filename} -> Extraced {len(text_md)} characters")
return text_md


def pdf_extract_cover(doc):
# type: (str|Path|Document) -> io.BytesIO|None
Expand All @@ -66,6 +73,7 @@ def pdf_extract_cover(doc):
:return: An in-memory image object (BytesIO) or None if extraction fails
"""
doc = pdf_open(doc)
log.info(f"{doc.filename} -> Extracting cover image")
try:
first_page = doc[0]
pix = first_page.get_pixmap(matrix=pymupdf.Matrix(2, 2)) # Scale up for better quality
Expand Down

0 comments on commit cb2958a

Please sign in to comment.