Consolidate get_date onto the DocumentParser parent class

IbnNafis007 · Oct 7, 2018 · 2a3f766
1 parent 14bb52b
commit 2a3f766
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 90 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -9,6 +9,8 @@ Changelog
   for reporting this. `#414`_.
 * A bug in the Dockerfile meant that Tesseract language files weren't being
   installed correctly.  `euri10`_ was quick to provide a fix: `#406`_, `#413`_.
+* The ``get_date()`` functionality of the parsers has been consolidated onto
+  the ``DocumentParser`` class since much of that code was redundant anyway.
 
 2.4.0
 =====

diff --git a/src/documents/parsers.py b/src/documents/parsers.py
@@ -1,9 +1,12 @@
 import logging
+import os
+import re
 import shutil
 import tempfile
-import re
 
+import dateparser
 from django.conf import settings
+from django.utils import timezone
 
 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@@ -32,6 +35,7 @@ class DocumentParser:
     """
 
     SCRATCH = settings.SCRATCH_DIR
+    DATE_ORDER = settings.DATE_ORDER
 
     def __init__(self, path):
         self.document_path = path
@@ -55,7 +59,52 @@ def get_date(self):
         """
         Returns the date of the document.
         """
-        raise NotImplementedError()
+
+        date = None
+        date_string = None
+
+        try:
+            text = self.get_text()
+        except ParseError:
+            return None
+
+        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
+
+        # Iterate through all regex matches and try to parse the date
+        for m in re.finditer(DATE_REGEX, text):
+
+            date_string = m.group(0)
+
+            try:
+                date = dateparser.parse(
+                    date_string,
+                    settings={
+                        "DATE_ORDER": self.DATE_ORDER,
+                        "PREFER_DAY_OF_MONTH": "first",
+                        "RETURN_AS_TIMEZONE_AWARE": True
+                    }
+                )
+            except TypeError:
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None and next_year > date.year > 1900:
+                break
+            else:
+                date = None
+
+        if date is not None:
+            self.log(
+                "info",
+                "Detected document date {} based on string {}".format(
+                    date.isoformat(),
+                    date_string
+                )
+            )
+        else:
+            self.log("info", "Unable to detect date for document")
+
+        return date
 
     def log(self, level, message):
         getattr(self.logger, level)(message, extra={

diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
@@ -4,7 +4,6 @@
 import subprocess
 from multiprocessing.pool import Pool
 
-import dateparser
 import langdetect
 import pyocr
 from django.conf import settings
@@ -14,7 +13,7 @@
 from pyocr.tesseract import TesseractError
 
 import pdftotext
-from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+from documents.parsers import DocumentParser, ParseError
 
 from .languages import ISO639
 
@@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
     DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
     UNPAPER = settings.UNPAPER_BINARY
-    DATE_ORDER = settings.DATE_ORDER
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
     OCR_ALWAYS = settings.OCR_ALWAYS
 
@@ -202,51 +200,6 @@ def _assemble_ocr_sections(self, imgs, middle, text):
         text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
         return text
 
-    def get_date(self):
-
-        date = None
-        datestring = None
-
-        try:
-            text = self.get_text()
-        except ParseError as e:
-            return None
-
-        # Iterate through all regex matches and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            datestring = m.group(0)
-
-            try:
-                date = dateparser.parse(
-                    datestring,
-                    settings={
-                        "DATE_ORDER": self.DATE_ORDER,
-                        "PREFER_DAY_OF_MONTH": "first",
-                        "RETURN_AS_TIMEZONE_AWARE": True
-                    }
-                )
-            except TypeError:
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None and date.year > 1900:
-                break
-            else:
-                date = None
-
-        if date is not None:
-            self.log(
-                "info",
-                "Detected document date {} based on string {}".format(
-                    date.isoformat(),
-                    datestring
-                )
-            )
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
-
 
 def run_convert(*args):
 

diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py
@@ -393,7 +393,33 @@ def test_get_text_9_pdf(self):
         "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
         SCRATCH
     )
-    def test_crazy_date(self, *args):
+    def test_crazy_date_past(self, *args):
+        document = RasterisedDocumentParser("/dev/null")
+        document.get_text()
+        self.assertIsNone(document.get_date())
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
+        return_value="01-07-2350 00:00:00"
+    )
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_crazy_date_future(self, *args):
+        document = RasterisedDocumentParser("/dev/null")
+        document.get_text()
+        self.assertIsNone(document.get_date())
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
+        return_value="01-07-0590 00:00:00"
+    )
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_crazy_date_past(self, *args):
         document = RasterisedDocumentParser("/dev/null")
         document.get_text()
         self.assertIsNone(document.get_date())
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
@@ -1,11 +1,9 @@
 import os
-import re
 import subprocess
 
-import dateparser
 from django.conf import settings
 
-from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+from documents.parsers import DocumentParser, ParseError
 
 
 class TextDocumentParser(DocumentParser):
@@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
     CONVERT = settings.CONVERT_BINARY
     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
     UNPAPER = settings.UNPAPER_BINARY
-    DATE_ORDER = settings.DATE_ORDER
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
     OCR_ALWAYS = settings.OCR_ALWAYS
 
@@ -26,7 +23,7 @@ def __init__(self, path):
 
     def get_thumbnail(self):
         """
-        The thumbnail of a txt is just a 500px wide image of the text
+        The thumbnail of a text file is just a 500px wide image of the text
         rendered onto a letter-sized page.
         """
         # The below is heavily cribbed from https://askubuntu.com/a/590951
@@ -84,40 +81,6 @@ def get_text(self):
 
         return self._text
 
-    def get_date(self):
-        date = None
-        datestring = None
-
-        try:
-            text = self.get_text()
-        except ParseError as e:
-            return None
-
-        # Iterate through all regex matches and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            datestring = m.group(0)
-
-            try:
-                date = dateparser.parse(
-                           datestring,
-                           settings={'DATE_ORDER': self.DATE_ORDER,
-                                     'PREFER_DAY_OF_MONTH': 'first',
-                                     'RETURN_AS_TIMEZONE_AWARE': True})
-            except TypeError:
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None:
-                break
-
-        if date is not None:
-            self.log("info", "Detected document date " + date.isoformat() +
-                             " based on string " + datestring)
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
-
 
 def run_command(*args):
     environment = os.environ.copy()