Skip to content

Commit

Permalink
Merge pull request facebookresearch#101 from filipbasara0/main
Browse files Browse the repository at this point in the history
  • Loading branch information
lukas-blecher authored Sep 20, 2023
2 parents 5542cf0 + 4b9fdb1 commit fa061db
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 10 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,7 @@ dmypy.json
# Pyre type checker
.pyre/

-ckpt*/
+ckpt*/
+
+# Misc
+pdfs
9 changes: 4 additions & 5 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,14 @@
"""
import os
import sys
-import io
from functools import partial
from http import HTTPStatus
from fastapi import FastAPI, File, UploadFile
from PIL import Image
from pathlib import Path
import hashlib
from fastapi.middleware.cors import CORSMiddleware
-import pypdf
+import pypdfium2
import torch
from nougat import NougatModel
from nougat.postprocessing import markdown_compatible, close_envs
Expand Down Expand Up @@ -84,14 +83,14 @@ async def predict(
str: The extracted text in Markdown format.
"""
pdfbin = file.file.read()
-pdf = pypdf.PdfReader(io.BytesIO(pdfbin))
+pdf = pypdfium2.PdfDocument(pdfbin)
md5 = hashlib.md5(pdfbin).hexdigest()
save_path = SAVE_DIR / md5

if start is not None and stop is not None:
pages = list(range(start - 1, stop))
else:
-pages = list(range(len(pdf.pages)))
+pages = list(range(len(pdf)))
predictions = [""] * len(pages)
dellist = []
if save_path.exists():
Expand Down Expand Up @@ -148,7 +147,7 @@ async def predict(
)

(save_path / "pages").mkdir(parents=True, exist_ok=True)
-pypdf.PdfWriter(clone_from=pdf).write(save_path / "doc.pdf")
+pdf.save(save_path / "doc.pdf")
if len(images) > 0:
thumb = Image.open(images[0])
thumb.thumbnail((400, 400))
Expand Down
2 changes: 1 addition & 1 deletion nougat/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
LICENSE file in the root directory of this source tree.
"""

-__version__ = "0.1.10"
+__version__ = "0.1.11"
6 changes: 3 additions & 3 deletions nougat/dataset/rasterize.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
logging.getLogger("pypdfium2").setLevel(logging.WARNING)

def rasterize_paper(
-pdf: Path,
+pdf: Path | bytes,
outpath: Optional[Path] = None,
dpi: int = 96,
return_pil=False,
Expand Down Expand Up @@ -54,8 +54,8 @@ def rasterize_paper(
pils.append(page_bytes)
else:
image.save((outpath / ("%02d.png" % (i + 1))), "png")
-except Exception:
-pass
+except Exception as e:
+logging.error(e)
if return_pil:
return pils

Expand Down

0 comments on commit fa061db

Please sign in to comment.