-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathpdf_parser.py
148 lines (131 loc) · 6.19 KB
/
pdf_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from base_class import AbstractPDFParser
import pickle
from scipdf_utils import parse_pdf_to_dict
class GrobidSciPDFPaser(AbstractPDFParser):
    """Parse a scientific PDF into metadata, section headings and paragraphs.

    The article dict is produced by ``parse_pdf_to_dict`` and memoised in
    ``self.db_cache`` under the key ``(pdf_link, db_name)``; the cache is
    pickled to ``self.db_cache_path`` whenever a new entry is added.

    NOTE(review): ``db_cache`` and ``db_cache_path`` are not assigned in this
    class -- presumably ``AbstractPDFParser.__init__`` provides them; confirm.
    """

    # Imported in the class body, so ``pysbd`` is also bound as a class
    # attribute; the segmenters are built once, when the class is created.
    import pysbd
    seg_en = pysbd.Segmenter(language="en", clean=False)
    seg_chinese = pysbd.Segmenter(language="zh", clean=False)

    def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
        """Initialize the parser and immediately parse ``pdf_link``.

        Args:
            pdf_link: link to the PDF file; it can be a web link or a local
                file path.
            db_name: name forwarded to the base class and used, together with
                ``pdf_link``, as the cache key of the parsed article.
            short_thereshold: a non-blank paragraph whose length is at most
                this many characters is appended to the preceding paragraph.

        Attributes populated by ``parse_pdf``:
            metadata: dict with the keys authors, pub_date, abstract,
                references, doi and title.
            split_paragraphs: list of ``{"heading": ..., "texts": [...]}``
                dicts, one per section.
            flattn_paragraphs: list of the paragraphs of all sections,
                concatenated in section order.
        """
        super().__init__(db_name=db_name)
        self.db_name = db_name
        self.pdf_link = pdf_link
        self.pdf = None
        self.metadata = {}
        self.flattn_paragraphs = None
        self.split_paragraphs = None
        self.short_thereshold = short_thereshold
        self.parse_pdf()

    def _contact_too_short_paragraphs(self, ):
        """Concatenate too-short paragraphs onto the paragraph before them.

        A paragraph is "too short" when it is non-blank and at most
        ``self.short_thereshold`` characters long. It is appended (without a
        separator) to the previous paragraph of the same section; when it is
        the first paragraph of the section it is kept as is. Blank paragraphs
        and long paragraphs are kept unchanged. Nothing is discarded.
        """
        for section in self.split_paragraphs:
            merged = []
            for paragraph in section['texts']:
                is_short = (len(paragraph) <= self.short_thereshold
                            and bool(paragraph.strip()))
                if is_short and merged:
                    merged[-1] += paragraph
                else:
                    merged.append(paragraph)
            # Rebind instead of mutating in place: the original list may be
            # shared with the article dict.
            section['texts'] = merged

    @staticmethod
    def _find_largest_font_strings(file_name, search_strings) -> dict:
        """Locate several strings in one pass over the PDF.

        For every string, the text lines containing it (after stripping the
        string) are inspected and the page holding the occurrence with the
        largest character size wins; on ties the earliest occurrence wins.

        Args:
            file_name: path of the PDF file handed to pdfminer.
            search_strings: iterable of strings to look for.

        Returns:
            Dict mapping each given string to its 1-based page number, or -1
            when the string is empty/blank, is not found, or the PDF could
            not be read.
        """
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LTTextContainer, LTChar

        needles = {original: original.strip() for original in search_strings}
        best_size = dict.fromkeys(needles, -1)
        best_page = dict.fromkeys(needles, -1)
        # An empty needle is contained in every line, so it cannot identify
        # a page; such strings simply stay at -1.
        active = {original: needle for original, needle in needles.items() if needle}

        try:
            if active:
                for index, page_layout in enumerate(extract_pages(file_name)):
                    for element in page_layout:
                        if not isinstance(element, LTTextContainer):
                            continue
                        for text_line in element:
                            line_text = text_line.get_text()
                            hits = [original for original, needle in active.items()
                                    if needle in line_text]
                            if not hits:
                                continue
                            line_size = max(
                                (character.size for character in text_line
                                 if isinstance(character, LTChar)),
                                default=-1,
                            )
                            for original in hits:
                                # Strictly greater keeps the first occurrence on ties.
                                if line_size > best_size[original]:
                                    best_size[original] = line_size
                                    best_page[original] = index
        except Exception:
            # Best effort: the page index is only informative, so an
            # unreadable file (e.g. a web link) must not abort parsing.
            import logging
            logging.getLogger(__name__).debug(
                "Could not locate section pages in %r", file_name, exc_info=True)
            return dict.fromkeys(needles, -1)

        return {original: (page + 1 if page != -1 else -1)
                for original, page in best_page.items()}

    @staticmethod
    def _find_largest_font_string(file_name, search_string):
        """Return the 1-based page where ``search_string`` is printed largest.

        Returns -1 when the string is blank, is not found, or the PDF could
        not be read.
        """
        pages = GrobidSciPDFPaser._find_largest_font_strings(file_name, [search_string])
        return pages[search_string]

    def _find_section_page(self, section_name) -> int:
        """Return the 1-based page of ``section_name`` in this PDF, or -1."""
        return GrobidSciPDFPaser._find_largest_font_string(self.pdf_link, section_name)

    def _retrive_or_parse(self, ):
        """Return pdf dict from cache if present, otherwise parse the pdf.

        A freshly parsed dict is stored in ``self.db_cache`` and the whole
        cache is pickled to ``self.db_cache_path``.
        """
        cache_key = (self.pdf_link, self.db_name)
        if cache_key not in self.db_cache:
            self.db_cache[cache_key] = parse_pdf_to_dict(self.pdf_link)
            with open(self.db_cache_path, "wb") as db_cache_file:
                pickle.dump(self.db_cache, db_cache_file)
        return self.db_cache[cache_key]

    @staticmethod
    def _check_chinese(text) -> bool:
        """Return True if ``text`` has a character in U+4E00..U+9FFF."""
        return any(u'\u4e00' <= char <= u'\u9fff' for char in text)

    def parse_pdf(self) -> None:
        """Parse the PDF file and populate the instance attributes.

        Order matters: metadata and sections are read from the article dict,
        short paragraphs are merged, and only then is the flat paragraph list
        (and ``content2section``) built from the merged paragraphs.
        """
        article_dict = self._retrive_or_parse()
        self.article_dict = article_dict
        self._get_metadata()
        self.split_paragraphs = self.get_split_paragraphs()
        self._contact_too_short_paragraphs()
        self.flattn_paragraphs = self.get_paragraphs()

    def get_paragraphs(self) -> list:
        """Get the paragraphs of the PDF file as one flat list.

        Also rebuilds ``self.content2section``, mapping each paragraph text
        to the heading of its section; if the same text occurs in several
        sections, the last section wins.
        """
        paragraphs = []
        self.content2section = {}
        for section in self.split_paragraphs:
            paragraphs.extend(section["texts"])
            for para in section["texts"]:
                self.content2section[para] = section["heading"]
        return paragraphs

    def _get_metadata(self) -> None:
        """Copy metadata from the article dict and index the section names.

        Sets ``metadata``, ``section_names``, ``section_names2page`` (heading
        -> 1-based page, -1 when unknown) and
        ``section_names_with_page_index`` (heading + " (Page N)").
        """
        for meta in ('authors', "pub_date", "abstract", "references", "doi", 'title'):
            self.metadata[meta] = self.article_dict[meta]
        self.section_names = [section["heading"]
                              for section in self.article_dict['sections']]
        # Resolve all headings in a single pass over the PDF rather than
        # re-reading the whole document once per heading.
        self.section_names2page = GrobidSciPDFPaser._find_largest_font_strings(
            self.pdf_link, self.section_names)
        self.section_names_with_page_index = [
            section_name + " (Page {})".format(self.section_names2page[section_name])
            for section_name in self.section_names
        ]

    def get_split_paragraphs(self, ) -> list:
        """Return one ``{"heading", "texts"}`` dict per article section.

        ``texts`` is the section's ``all_paragraphs`` value from the article
        dict (the same object, not a copy).
        """
        return [
            {
                "heading": section["heading"],
                "texts": section["all_paragraphs"],
            }
            for section in self.article_dict['sections']
        ]

    @staticmethod
    def _determine_optimal_split_of_pargraphs(section_pair_list) -> list:
        """Split the texts of every section into sentences.

        Each paragraph under ``"texts"`` is segmented with the Chinese
        segmenter when the section contains a character accepted by
        ``_check_chinese``, otherwise with the English one. Segments of two
        characters or fewer are dropped.

        Args:
            section_pair_list: list of dicts with a ``"texts"`` entry that is
                either a list of paragraph strings or a single string.

        Returns:
            The same list; every ``"texts"`` entry is replaced in place by
            the list of kept segments.
        """
        for section_pair in section_pair_list:
            texts = section_pair["texts"]
            if isinstance(texts, str):
                texts = [texts]
            # Test characters paragraph by paragraph: iterating the list
            # itself would compare whole paragraphs, not characters.
            if any(GrobidSciPDFPaser._check_chinese(text) for text in texts):
                seg = GrobidSciPDFPaser.seg_chinese
            else:
                seg = GrobidSciPDFPaser.seg_en
            section_pair["texts"] = [
                sentence
                for text in texts
                for sentence in seg.segment(text)
                if len(sentence) > 2
            ]
        return section_pair_list