diff --git a/docs/guesswork.rst b/docs/guesswork.rst index 0e728d7af..c12ecd0c4 100644 --- a/docs/guesswork.rst +++ b/docs/guesswork.rst @@ -54,6 +54,34 @@ filename as described above. .. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings +Transforming filenames for parsing +---------------------------------- +Some devices can't produce filenames that can be parsed by the default +parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in +``paperless.conf`` one can add transformations that are applied to the filename +before it's parsed. + +The option contains a list of dictionaries of regular expressions (key: +``pattern``) and replacements (key: ``repl``) in JSON format, which are +applied in order by passing them to ``re.subn``. Transformation stops +after the first match, so at most one transformation is applied. The general +syntax is + +.. code:: python + + [{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}] + +The example below is for a Brother ADS-2400N, a scanner that allows assigning +different names to different hardware buttons (useful for handling +multiple entities in one instance), but insists on adding ``_`` +to the filename. + +.. code:: python + + # Brother profile configuration, supports "Name_Date_Count" (the default + # setting) and "Name_Count" (use "Name" as tag and "Count" as title). + PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}] + .. _guesswork-content: Reading the Document Contents diff --git a/paperless.conf.example b/paperless.conf.example index 05a6c9cca..b04e93f94 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET="" # as normal. 
#PAPERLESS_FILENAME_DATE_ORDER="YMD" +# Sometimes devices won't create filenames which can be parsed properly +# by the filename parser (see +# https://paperless.readthedocs.io/en/latest/guesswork.html). +# +# This setting allows you to specify a list of transformations +# in regular expression syntax, which are passed in order to re.subn. +# Transformation stops after the first match, so at most one transformation +# is applied. +# +# Syntax is a JSON array of dictionaries containing "pattern" and "repl" +# as keys. +# +# The example below transforms filenames created by a Brother ADS-2400N +# document scanner in its standard configuration `Name_Date_Count', so that +# count is used as title, name as tag and date can be parsed by paperless. +#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}] + # # The following values use sensible defaults for modern systems, but if you're # running Paperless on a low-resource device (like a Raspberry Pi), modifying diff --git a/src/documents/models.py b/src/documents/models.py index 37c1cfdbf..c6fc8191e 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -483,8 +483,18 @@ def from_path(cls, path): ".<suffix>" """ + filename = os.path.basename(path) + + # Mutate filename in-place before parsing its components + # by applying at most one of the configured transformations. + for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: + (filename, count) = pattern.subn(repl, filename) + if count: + break + + # Parse filename components. 
for regex in cls.REGEXES.values(): - m = regex.match(os.path.basename(path)) + m = regex.match(filename) if m: properties = m.groupdict() cls._mangle_property(properties, "created") diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 3f5c69774..512447741 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,3 +1,5 @@ +import re + from django.test import TestCase from unittest import mock from tempfile import TemporaryDirectory @@ -372,3 +374,79 @@ def test_invalid_date_format(self): info = FileInfo.from_path("/path/to/06112017Z - title.pdf") self.assertEqual(info.title, "title") self.assertIsNone(info.created) + + def test_filename_parse_transforms(self): + + path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf" + all_patt = re.compile("^.*$") + none_patt = re.compile("$a") + exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.") + repl1 = " - \\4 - \\1." # (empty) correspondent, title and tags + repl2 = "\\2Z - " + repl1 # creation date + repl1 + + # No transformations configured (= default) + info = FileInfo.from_path(path) + self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") + self.assertEqual(info.extension, "pdf") + self.assertEqual(info.tags, ()) + self.assertIsNone(info.created) + + # Pattern doesn't match (filename unaltered) + with self.settings( + FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]): + info = FileInfo.from_path(path) + self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001") + self.assertEqual(info.extension, "pdf") + + # Simple transformation (match all) + with self.settings( + FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]): + info = FileInfo.from_path(path) + self.assertEqual(info.title, "all") + self.assertEqual(info.extension, "gif") + + # Multiple transformations configured (first pattern matches) + with self.settings( + FILENAME_PARSE_TRANSFORMS=[ + (all_patt, "all.gif"), + (all_patt, "anotherall.gif")]): 
+ info = FileInfo.from_path(path) + self.assertEqual(info.title, "all") + self.assertEqual(info.extension, "gif") + + # Multiple transformations configured (second pattern matches) + with self.settings( + FILENAME_PARSE_TRANSFORMS=[ + (none_patt, "none.gif"), + (all_patt, "anotherall.gif")]): + info = FileInfo.from_path(path) + self.assertEqual(info.title, "anotherall") + self.assertEqual(info.extension, "gif") + + # Complex transformation without date in replacement string + with self.settings( + FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]): + info = FileInfo.from_path(path) + self.assertEqual(info.title, "0001") + self.assertEqual(info.extension, "pdf") + self.assertEqual(len(info.tags), 2) + self.assertEqual(info.tags[0].slug, "tag1") + self.assertEqual(info.tags[1].slug, "tag2") + self.assertIsNone(info.created) + + # Complex transformation with date in replacement string + with self.settings( + FILENAME_PARSE_TRANSFORMS=[ + (none_patt, "none.gif"), + (exact_patt, repl2), # <-- matches + (exact_patt, repl1), + (all_patt, "all.gif")]): + info = FileInfo.from_path(path) + self.assertEqual(info.title, "0001") + self.assertEqual(info.extension, "pdf") + self.assertEqual(len(info.tags), 2) + self.assertEqual(info.tags[0].slug, "tag1") + self.assertEqual(info.tags[1].slug, "tag2") + self.assertEqual(info.created.year, 2019) + self.assertEqual(info.created.month, 9) + self.assertEqual(info.created.day, 8) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index ce387be07..ddc903857 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -10,7 +10,9 @@ https://docs.djangoproject.com/en/1.10/ref/settings/ """ +import json import os +import re from dotenv import load_dotenv @@ -322,6 +324,11 @@ def __get_boolean(key, default="NO"): DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") +# Transformations applied before filename parsing +FILENAME_PARSE_TRANSFORMS = [] 
+for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS") or "[]"): + FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"])) + # Specify for how many years a correspondent is considered recent. Recent # correspondents will be shown in a separate "Recent correspondents" filter as # well. Set to 0 to disable this filter.