Skip to content

Commit

Permalink
Merge pull request the-paperless-project#542 from grembo/master
Browse files Browse the repository at this point in the history
Allow configuring transformations to be applied to the filename before
  • Loading branch information
danielquinn authored Sep 9, 2019
2 parents ebd9f91 + 4f85d9e commit 1c95665
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 1 deletion.
28 changes: 28 additions & 0 deletions docs/guesswork.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,34 @@ filename as described above.

.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings

Transforming filenames for parsing
----------------------------------
Some devices can't produce filenames that can be parsed by the default
parser. By configuring the option ``PAPERLESS_FILENAME_PARSE_TRANSFORMS`` in
``paperless.conf`` one can add transformations that are applied to the filename
before it's parsed.

The option contains a list of dictionaries of regular expressions (key:
``pattern``) and replacements (key: ``repl``) in JSON format, which are
applied in order by passing them to ``re.subn``. Transformation stops
after the first match, so at most one transformation is applied. The general
syntax is

.. code:: python
[{"pattern":"pattern1", "repl":"repl1"}, {"pattern":"pattern2", "repl":"repl2"}, ..., {"pattern":"patternN", "repl":"replN"}]
The example below is for a Brother ADS-2400N, a scanner that allows
assigning different names to different hardware buttons (useful for
handling multiple entities in one instance), but insists on appending
``_<count>`` to the filename.

.. code:: python
# Brother profile configuration, support "Name_Date_Count" (the default
# setting) and "Name_Count" (use "Name" as tag and "Count" as title).
PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}, {"pattern":"^([a-z]+)_([0-9]+)\\.", "repl":" - \\2 - \\1."}]
.. _guesswork-content:

Reading the Document Contents
Expand Down
17 changes: 17 additions & 0 deletions paperless.conf.example
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,23 @@ PAPERLESS_EMAIL_SECRET=""
# as normal.
#PAPERLESS_FILENAME_DATE_ORDER="YMD"

# Sometimes devices won't create filenames which can be parsed properly
# by the filename parser (see
# https://paperless.readthedocs.io/en/latest/guesswork.html).
#
# This setting lets you specify a list of regular-expression
# transformations, which are tried in order using re.subn.
# Transformation stops after the first match, so at most one transformation
# is applied.
#
# Syntax is a JSON array of dictionaries containing "pattern" and "repl"
# as keys.
#
# The example below transforms filenames created by a Brother ADS-2400N
# document scanner in its standard configuration `Name_Date_Count', so that
# count is used as title, name as tag and date can be parsed by paperless.
#PAPERLESS_FILENAME_PARSE_TRANSFORMS=[{"pattern":"^([a-z]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.", "repl":"\\2\\3Z - \\4 - \\1."}]

#
# The following values use sensible defaults for modern systems, but if you're
# running Paperless on a low-resource device (like a Raspberry Pi), modifying
Expand Down
12 changes: 11 additions & 1 deletion src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,8 +483,18 @@ def from_path(cls, path):
"<title>.<suffix>"
"""

filename = os.path.basename(path)

# Mutate filename in-place before parsing its components
# by applying at most one of the configured transformations.
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
(filename, count) = pattern.subn(repl, filename)
if count:
break

# Parse filename components.
for regex in cls.REGEXES.values():
m = regex.match(os.path.basename(path))
m = regex.match(filename)
if m:
properties = m.groupdict()
cls._mangle_property(properties, "created")
Expand Down
78 changes: 78 additions & 0 deletions src/documents/tests/test_consumer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from django.test import TestCase
from unittest import mock
from tempfile import TemporaryDirectory
Expand Down Expand Up @@ -372,3 +374,79 @@ def test_invalid_date_format(self):
info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
self.assertEqual(info.title, "title")
self.assertIsNone(info.created)

def test_filename_parse_transforms(self):
    """
    Check that FILENAME_PARSE_TRANSFORMS rewrites the filename before
    it is parsed, and that only the first matching rule is applied.
    """

    scanned = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
    untouched_title = "tag1,tag2_20190908_180610_0001"

    matches_nothing = re.compile("$a")
    matches_anything = re.compile("^.*$")
    matches_scanner = re.compile(
        "^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")

    # (empty) correspondent, title and tags
    undated_repl = " - \\4 - \\1."
    # creation date followed by the undated replacement
    dated_repl = "\\2Z - " + undated_repl

    def parse_with(*transforms):
        # Parse the sample path while the given rules are configured.
        with self.settings(FILENAME_PARSE_TRANSFORMS=list(transforms)):
            return FileInfo.from_path(scanned)

    # Default configuration: no transformations at all
    result = FileInfo.from_path(scanned)
    self.assertEqual(result.title, untouched_title)
    self.assertEqual(result.extension, "pdf")
    self.assertEqual(result.tags, ())
    self.assertIsNone(result.created)

    # A rule that never matches leaves the filename alone
    result = parse_with((matches_nothing, "none.gif"))
    self.assertEqual(result.title, untouched_title)
    self.assertEqual(result.extension, "pdf")

    # A single catch-all rule replaces the whole filename
    result = parse_with((matches_anything, "all.gif"))
    self.assertEqual(result.title, "all")
    self.assertEqual(result.extension, "gif")

    # Two rules, the first one matches and wins
    result = parse_with(
        (matches_anything, "all.gif"),
        (matches_anything, "anotherall.gif"))
    self.assertEqual(result.title, "all")
    self.assertEqual(result.extension, "gif")

    # Two rules, only the second one matches
    result = parse_with(
        (matches_nothing, "none.gif"),
        (matches_anything, "anotherall.gif"))
    self.assertEqual(result.title, "anotherall")
    self.assertEqual(result.extension, "gif")

    # Group-based rewrite whose replacement carries no date
    result = parse_with((matches_scanner, undated_repl))
    self.assertEqual(result.title, "0001")
    self.assertEqual(result.extension, "pdf")
    self.assertEqual(len(result.tags), 2)
    self.assertEqual(
        [tag.slug for tag in result.tags], ["tag1", "tag2"])
    self.assertIsNone(result.created)

    # Group-based rewrite whose replacement carries the date
    result = parse_with(
        (matches_nothing, "none.gif"),
        (matches_scanner, dated_repl),  # <-- matches
        (matches_scanner, undated_repl),
        (matches_anything, "all.gif"))
    self.assertEqual(result.title, "0001")
    self.assertEqual(result.extension, "pdf")
    self.assertEqual(len(result.tags), 2)
    self.assertEqual(
        [tag.slug for tag in result.tags], ["tag1", "tag2"])
    self.assertEqual(
        (result.created.year, result.created.month, result.created.day),
        (2019, 9, 8))
7 changes: 7 additions & 0 deletions src/paperless/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
https://docs.djangoproject.com/en/1.10/ref/settings/
"""

import json
import os
import re

from dotenv import load_dotenv

Expand Down Expand Up @@ -322,6 +324,11 @@ def __get_boolean(key, default="NO"):
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")

# Filename rewrite rules consulted before a filename is parsed. The
# environment variable holds a JSON array of objects with "pattern" and
# "repl" keys; each becomes a (compiled regex, replacement) tuple, kept in
# the configured order.
FILENAME_PARSE_TRANSFORMS = [
    (re.compile(rule["pattern"]), rule["repl"])
    for rule in json.loads(
        os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")
    )
]

# Specify for how many years a correspondent is considered recent. Recent
# correspondents will be shown in a separate "Recent correspondents" filter as
# well. Set to 0 to disable this filter.
Expand Down

0 comments on commit 1c95665

Please sign in to comment.