Skip to content

Commit

Permalink
Consolidate get_date onto the DocumentParser parent class
Browse files Browse the repository at this point in the history
  • Loading branch information
danielquinn committed Oct 7, 2018
1 parent 14bb52b commit 2a3f766
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 90 deletions.
2 changes: 2 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Changelog
for reporting this. `#414`_.
* A bug in the Dockerfile meant that Tesseract language files weren't being
installed correctly. `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
* The ``get_date()`` functionality of the parsers has been consolidated onto
the ``DocumentParser`` class since much of that code was redundant anyway.

2.4.0
=====
Expand Down
53 changes: 51 additions & 2 deletions src/documents/parsers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import logging
import os
import re
import shutil
import tempfile
import re

import dateparser
from django.conf import settings
from django.utils import timezone

# This regular expression will try to find dates in the document at
# hand and will match the following formats:
Expand Down Expand Up @@ -32,6 +35,7 @@ class DocumentParser:
"""

SCRATCH = settings.SCRATCH_DIR
DATE_ORDER = settings.DATE_ORDER

def __init__(self, path):
self.document_path = path
Expand All @@ -55,7 +59,52 @@ def get_date(self):
"""
Returns the date of the document.
"""
# NOTE(review): this is a rendered diff hunk with the +/- markers and the
# indentation stripped. The `raise` below appears to be the line this
# commit removes, with everything after it being the added body -- confirm
# against the repository before treating it as live code.
raise NotImplementedError()

# Both stay None unless a candidate in the text is accepted below.
date = None
date_string = None

# A ParseError from get_text() means there is no text to scan, so the
# method reports "no date" instead of propagating the error.
try:
text = self.get_text()
except ParseError:
return None

# Despite the name, this is the current year plus five; it serves as the
# exclusive upper bound on accepted years in the range check below.
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit

# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):

# The whole matched substring is what gets handed to dateparser.
date_string = m.group(0)

try:
date = dateparser.parse(
date_string,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
except TypeError:
# Skip all matches that do not parse to a proper date
continue

# Stop at the first candidate whose year lies strictly between 1900 and
# the future limit; any other result is reset to None.
# NOTE(review): with indentation lost, this view cannot show whether the
# `else` pairs with the `if` or with the `for` -- confirm in the repository.
if date is not None and next_year > date.year > 1900:
break
else:
date = None

# Log the outcome either way; the message strings are emitted at "info".
if date is not None:
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
date_string
)
)
else:
self.log("info", "Unable to detect date for document")

return date

def log(self, level, message):
getattr(self.logger, level)(message, extra={
Expand Down
49 changes: 1 addition & 48 deletions src/paperless_tesseract/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import subprocess
from multiprocessing.pool import Pool

import dateparser
import langdetect
import pyocr
from django.conf import settings
Expand All @@ -14,7 +13,7 @@
from pyocr.tesseract import TesseractError

import pdftotext
from documents.parsers import DocumentParser, ParseError, DATE_REGEX
from documents.parsers import DocumentParser, ParseError

from .languages import ISO639

Expand All @@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS

Expand Down Expand Up @@ -202,51 +200,6 @@ def _assemble_ocr_sections(self, imgs, middle, text):
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text

def get_date(self):
# NOTE(review): this hunk is headed "1 addition & 48 deletions", so this
# whole method appears to be the copy the commit removes from
# RasterisedDocumentParser -- confirm. Indentation is lost in this view.

# Both stay None unless a candidate in the text is accepted below.
date = None
datestring = None

# A ParseError from get_text() results in "no date" rather than an error.
# The bound name `e` is not used in the lines visible here.
try:
text = self.get_text()
except ParseError as e:
return None

# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):
# The whole matched substring is what gets handed to dateparser.
datestring = m.group(0)

try:
date = dateparser.parse(
datestring,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
except TypeError:
# Skip all matches that do not parse to a proper date
continue

# Only a lower bound on the year is applied here (later than 1900); there
# is no upper bound, so far-future years are accepted by this version.
# NOTE(review): with indentation lost, this view cannot show whether the
# `else` pairs with the `if` or with the `for` -- confirm in the repository.
if date is not None and date.year > 1900:
break
else:
date = None

# Log the outcome either way at "info" level.
if date is not None:
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
datestring
)
)
else:
self.log("info", "Unable to detect date for document")

return date


def run_convert(*args):

Expand Down
28 changes: 27 additions & 1 deletion src/paperless_tesseract/tests/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,33 @@ def test_get_text_9_pdf(self):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date(self, *args):
def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())

# get_text is patched so the parser sees one timestamp dated in the year 2350.
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-2350 00:00:00"
)
# The class-level SCRATCH attribute is replaced with this module's SCRATCH value.
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_future(self, *args):
# Checks that a far-future date is rejected: get_date() must return None.
# *args absorbs the mock object(s) injected by the patch decorators.
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())

# get_text is patched so the parser sees one timestamp dated in the year 0590.
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00"
)
# The class-level SCRATCH attribute is replaced with this module's SCRATCH value.
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
# NOTE(review): a method with this same name, test_crazy_date_past, already
# appears earlier in this hunk. If both are in the same class, the later
# definition replaces the earlier one and only one of them runs -- confirm
# and give one of them a distinct name.
def test_crazy_date_past(self, *args):
# Checks that a date far in the past is rejected: get_date() must return None.
# *args absorbs the mock object(s) injected by the patch decorators.
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())
41 changes: 2 additions & 39 deletions src/paperless_text/parsers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import os
import re
import subprocess

import dateparser
from django.conf import settings

from documents.parsers import DocumentParser, ParseError, DATE_REGEX
from documents.parsers import DocumentParser, ParseError


class TextDocumentParser(DocumentParser):
Expand All @@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
CONVERT = settings.CONVERT_BINARY
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS

Expand All @@ -26,7 +23,7 @@ def __init__(self, path):

def get_thumbnail(self):
"""
The thumbnail of a txt is just a 500px wide image of the text
The thumbnail of a text file is just a 500px wide image of the text
rendered onto a letter-sized page.
"""
# The below is heavily cribbed from https://askubuntu.com/a/590951
Expand Down Expand Up @@ -84,40 +81,6 @@ def get_text(self):

return self._text

def get_date(self):
# NOTE(review): this hunk is headed "2 additions & 39 deletions", so this
# whole method appears to be the copy the commit removes from
# TextDocumentParser -- confirm. Indentation is lost in this view.

# Both stay None unless a candidate in the text parses successfully below.
date = None
datestring = None

# A ParseError from get_text() results in "no date" rather than an error.
# The bound name `e` is not used in the lines visible here.
try:
text = self.get_text()
except ParseError as e:
return None

# Iterate through all regex matches and try to parse the date
for m in re.finditer(DATE_REGEX, text):
# The whole matched substring is what gets handed to dateparser.
datestring = m.group(0)

try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue

# The first successfully parsed candidate wins; unlike the other get_date
# versions shown on this page, no year range check is applied here.
if date is not None:
break

# Log the outcome either way at "info" level.
if date is not None:
self.log("info", "Detected document date " + date.isoformat() +
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")

return date


def run_command(*args):
environment = os.environ.copy()
Expand Down

0 comments on commit 2a3f766

Please sign in to comment.