-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtxt.py
90 lines (77 loc) · 4.18 KB
/
txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from typing import List, Optional, Dict
import logging
from pathlib import Path
from haystack.nodes.file_converter.base import BaseConverter
from haystack.schema import Document
import jieba
logger = logging.getLogger(__name__)
class TextConverter(BaseConverter):
    """
    Converts plain-text (.txt) files into haystack ``Document`` objects.

    Uses ``jieba`` word segmentation (rather than whitespace splitting) so the
    numeric-table heuristic also works on Chinese text, where words are not
    separated by spaces.
    """

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Reads text from a txt file and executes optional preprocessing steps.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `utf-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
                             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
                             In this case the id will be generated by using the content and the defined metadata.
        :return: a single-element list containing the converted Document.
        """
        # Fall back to the converter-level defaults set at construction time.
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        # BUG FIX: honor the caller-supplied `encoding` parameter. The original
        # hard-coded encoding='utf-8' here, silently ignoring the argument.
        with open(file_path, encoding=encoding, errors="ignore") as f:
            text = f.read()

        # Form-feed (\f) is the conventional page separator in extracted text.
        pages = text.split("\f")

        cleaned_pages = []
        for page in pages:
            cleaned_lines = []
            for line in page.splitlines():
                # jieba segmentation so digit-ratio works for CJK text too.
                words = jieba.lcut(line)
                digits = [word for word in words if any(ch.isdigit() for ch in word)]

                # Heuristic: drop lines where > 40% of words contain digits AND
                # the line does not end with a period (likely a table row, not prose).
                if (
                    remove_numeric_tables
                    and words
                    and len(digits) / len(words) > 0.4
                    and not line.strip().endswith(".")
                ):
                    logger.debug("Removing line '%s' from %s", line, file_path)
                    continue
                cleaned_lines.append(line)

            cleaned_pages.append("\n".join(cleaned_lines))

        # Join once; the same text is used for language validation and the Document.
        # NOTE: pages are concatenated without a separator, matching prior behavior.
        cleaned_text = "".join(cleaned_pages)

        if valid_languages and not self.validate_language(cleaned_text, valid_languages):
            logger.warning(
                "The language for %s is not one of %s. The file may not have "
                "been decoded in the correct text format.",
                file_path,
                valid_languages,
            )

        document = Document(content=cleaned_text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]