Merge pull request the-paperless-project#542 from grembo/master

Allow configuring transformations to be applied to the filename before
IbnNafis007 · Sep 9, 2019 · 1c95665 · 1c95665
2 parents ebd9f91 + 4f85d9e
commit 1c95665
Show file tree

Hide file tree

Showing 5 changed files with 141 additions and 1 deletion.
diff --git a/docs/guesswork.rst b/docs/guesswork.rst
@@ -54,6 +54,34 @@ filename as described above.
 
 .. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings
 
+Transforming filenames for parsing
+----------------------------------
+Some devices can't produce filenames that can be parsed by the default
+parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
+``paperless.conf`` one can add transformations that are applied to the filename
+before it's parsed.
+
+The option contains a JSON list of dictionaries, each holding a regular
+expression (key: ``pattern``) and a replacement (key: ``repl``). They are
+tried in order by passing them to ``re.subn``. Transformation stops after
+the first match, so at most one transformation is applied. The general
+syntax is:
+
+.. code:: python
+
+   [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
+
+The example below is for a Brother ADS-2400N, a scanner that allows
+assigning different names to different hardware buttons (useful for
+handling multiple entities in one instance), but insists on adding
+``_<count>`` to the filename.
+
+.. code:: python
+
+   # Brother profile configuration, supports "Name_Date_Count" (the default
+   # setting) and "Name_Count" (uses "Name" as tag and "Count" as title).
+   PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
+
 .. _guesswork-content:
 
 Reading the Document Contents

diff --git a/paperless.conf.example b/paperless.conf.example
@@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
 # as normal.
 #PAPERLESS_FILENAME_DATE_ORDER="YMD"
 
+# Sometimes devices won't create filenames which can be parsed properly
+# by the filename parser (see
+# https://paperless.readthedocs.io/en/latest/guesswork.html).
+#
+# This setting allows specifying a list of transformations
+# in regular expression syntax, which are passed in order to re.subn.
+# Transformation stops after the first match, so at most one transformation
+# is applied.
+#
+# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
+# as keys.
+#
+# The example below transforms filenames created by a Brother ADS-2400N
+# document scanner in its standard configuration `Name_Date_Count', so that
+# count is used as title, name as tag and date can be parsed by paperless.
+#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]
+
 #
 # The following values use sensible defaults for modern systems, but if you're
 # running Paperless on a low-resource device (like a Raspberry Pi), modifying

diff --git a/src/documents/models.py b/src/documents/models.py
@@ -483,8 +483,18 @@ def from_path(cls, path):
           "<title>.<suffix>"
         """
 
+        filename = os.path.basename(path)
+
+        # Rewrite the filename before parsing its components by
+        # applying at most one of the configured transformations.
+        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
+            (filename, count) = pattern.subn(repl, filename)
+            if count:
+                break
+
+        # Parse filename components.
         for regex in cls.REGEXES.values():
-            m = regex.match(os.path.basename(path))
+            m = regex.match(filename)
             if m:
                 properties = m.groupdict()
                 cls._mangle_property(properties, "created")

diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
@@ -1,3 +1,5 @@
+import re
+
 from django.test import TestCase
 from unittest import mock
 from tempfile import TemporaryDirectory
@@ -372,3 +374,79 @@ def test_invalid_date_format(self):
         info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
         self.assertEqual(info.title, "title")
         self.assertIsNone(info.created)
+
+    def test_filename_parse_transforms(self):
+        # Exercises FileInfo.from_path under various FILENAME_PARSE_TRANSFORMS.
+        path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
+        all_patt = re.compile("^.*$")
+        none_patt = re.compile("$a")  # never matches: needs "a" after the end
+        exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
+        repl1 = " - \\4 - \\1."    # (empty) correspondent, title and tags
+        repl2 = "\\2Z - " + repl1  # creation date + repl1
+
+        # No transformations configured (= default)
+        info = FileInfo.from_path(path)
+        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
+        self.assertEqual(info.extension, "pdf")
+        self.assertEqual(info.tags, ())
+        self.assertIsNone(info.created)
+
+        # Pattern doesn't match (filename unaltered)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
+            self.assertEqual(info.extension, "pdf")
+
+        # Simple transformation (match all)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "all")
+            self.assertEqual(info.extension, "gif")
+
+        # Multiple transformations configured (first pattern matches)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[
+                    (all_patt, "all.gif"),
+                    (all_patt, "anotherall.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "all")
+            self.assertEqual(info.extension, "gif")
+
+        # Multiple transformations configured (second pattern matches)
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[
+                    (none_patt, "none.gif"),
+                    (all_patt, "anotherall.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "anotherall")
+            self.assertEqual(info.extension, "gif")
+
+        # Complex transformation without date in replacement string
+        with self.settings(
+                FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "0001")
+            self.assertEqual(info.extension, "pdf")
+            self.assertEqual(len(info.tags), 2)
+            self.assertEqual(info.tags[0].slug, "tag1")
+            self.assertEqual(info.tags[1].slug, "tag2")
+            self.assertIsNone(info.created)
+
+        # Complex transformation with date in replacement string
+        with self.settings(
+            FILENAME_PARSE_TRANSFORMS=[
+                (none_patt, "none.gif"),
+                (exact_patt, repl2),    # <-- matches
+                (exact_patt, repl1),
+                (all_patt, "all.gif")]):
+            info = FileInfo.from_path(path)
+            self.assertEqual(info.title, "0001")
+            self.assertEqual(info.extension, "pdf")
+            self.assertEqual(len(info.tags), 2)
+            self.assertEqual(info.tags[0].slug, "tag1")
+            self.assertEqual(info.tags[1].slug, "tag2")
+            self.assertEqual(info.created.year, 2019)
+            self.assertEqual(info.created.month, 9)
+            self.assertEqual(info.created.day, 8)
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
@@ -10,7 +10,9 @@
 https://docs.djangoproject.com/en/1.10/ref/settings/
 """
 
+import json
 import os
+import re
 
 from dotenv import load_dotenv
 
@@ -322,6 +324,11 @@ def __get_boolean(key, default="NO"):
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
 FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
 
+# (compiled pattern, repl) tuples applied before filename parsing
+FILENAME_PARSE_TRANSFORMS = []
+for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
+    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
+
 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as
 # well. Set to 0 to disable this filter.