Skip to content

Commit

Permalink
Close PDF files after using
Browse files Browse the repository at this point in the history
As mentioned in the [pikepdf documentation](https://pikepdf.readthedocs.io/en/latest/topics/pages.html#pdf-split-merge-and-document-assembly),
we should close the PDF files after using them. Letting them hang around can
cause issues with Python and C++ references.

One problem that this patch seems to fix is a crash that can occur when calling
`pdf_concatenate` under load. Before this patch, with 6 individual sessions
of the [Assembly Line Weaver](https://github.com/SuffolkLITLab/docassemble-ALWeaver)
open in different browsers, I can consistently get the following error
to appear in the `uwsgi.log`:

```
terminate called after throwing an instance of 'std::runtime_error'
  what():  pybind11_object_dealloc(): Tried to deallocate unregistered instance!
DAMN ! worker 1 (pid: 666329) died, killed by signal 6 :( trying respawn ...
Respawned uWSGI worker 1 (new pid: 683455)
```

This unexpected closing of the uWSGI worker causes other issues with
sqlalchemy, and can cause the server to return 502s for 5-10 seconds after
the worker crashes.

After the patch, I can't reproduce the issue anymore. I can't say it's completely
gone, but it's definitely less prevalent.
  • Loading branch information
BryceStevenWilley committed Mar 14, 2023
1 parent 36991c4 commit 4a5abab
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 20 deletions.
4 changes: 2 additions & 2 deletions docassemble_base/docassemble/base/file_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,8 +694,8 @@ def pdf_pages(file_info, width):
server.fg_make_pdf_for_word_path(file_info['path'], file_info['extension'])
if 'pages' not in file_info:
try:
reader = Pdf.open(file_info['path'] + '.pdf')
file_info['pages'] = len(reader.pages)
with Pdf.open(file_info['path'] + '.pdf') as reader:
file_info['pages'] = len(reader.pages)
except:
file_info['pages'] = 1
max_pages = 1 + int(file_info['pages'])
Expand Down
16 changes: 8 additions & 8 deletions docassemble_base/docassemble/base/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,8 +846,8 @@ def image_as_rtf(match, question=None):
server.fg_make_pdf_for_word_path(file_info['path'], file_info['extension'])
if 'pages' not in file_info:
try:
reader = Pdf.open(file_info['path'] + '.pdf')
file_info['pages'] = len(reader.pages)
with Pdf.open(file_info['path'] + '.pdf') as reader:
file_info['pages'] = len(reader.pages)
except:
file_info['pages'] = 1
max_pages = 1 + int(file_info['pages'])
Expand Down Expand Up @@ -1047,8 +1047,8 @@ def image_url(file_reference, alt_text, width, emoji=False, question=None, exter
sf.finalize()
if 'pages' not in file_info:
try:
reader = Pdf.open(file_info['path'] + '.pdf')
file_info['pages'] = len(reader.pages)
with Pdf.open(file_info['path'] + '.pdf') as reader:
file_info['pages'] = len(reader.pages)
except:
file_info['pages'] = 1
the_image_url = server.url_finder(file_reference, size="screen", page=1, _question=question, _external=external)
Expand All @@ -1063,10 +1063,10 @@ def image_url(file_reference, alt_text, width, emoji=False, question=None, exter
else:
the_alt_text = alt_text
try:
reader = Pdf.open(file_info['path'] + '.pdf')
layout_width = str(reader.pages[0].mediabox[2] - reader.pages[0].mediabox[0])
layout_height = str(reader.pages[0].mediabox[3] - reader.pages[0].mediabox[1])
output = '<a target="_blank"' + title + ' class="daimageref" href="' + the_url + '"><img ' + the_alt_text + 'class="daicon dapdfscreen' + extra_class + '" width=' + layout_width + ' height=' + layout_height + ' style="' + width_string + '; height: auto;" src="' + the_image_url + '"/></a>'
with Pdf.open(file_info['path'] + '.pdf') as reader:
layout_width = str(reader.pages[0].mediabox[2] - reader.pages[0].mediabox[0])
layout_height = str(reader.pages[0].mediabox[3] - reader.pages[0].mediabox[1])
output = '<a target="_blank"' + title + ' class="daimageref" href="' + the_url + '"><img ' + the_alt_text + 'class="daicon dapdfscreen' + extra_class + '" width=' + layout_width + ' height=' + layout_height + ' style="' + width_string + '; height: auto;" src="' + the_image_url + '"/></a>'
except:
output = '<a target="_blank"' + title + ' class="daimageref" href="' + the_url + '"><img ' + the_alt_text + 'class="daicon dapdfscreen' + extra_class + '" style="' + width_string + '; height: auto;" src="' + the_image_url + '"/></a>'
if 'pages' in file_info and file_info['pages'] > 1:
Expand Down
10 changes: 5 additions & 5 deletions docassemble_base/docassemble/base/pandoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,11 +746,11 @@ def concatenate_files(path_list, pdfa=False, password=None):
if len(new_path_list) == 1:
shutil.copyfile(new_path_list[0], pdf_file.name)
else:
original = Pdf.open(new_path_list[0])
for additional_file in new_path_list[1:]:
additional_pdf = Pdf.open(additional_file)
original.pages.extend(additional_pdf.pages)
original.save(pdf_file.name)
with Pdf.open(new_path_list[0]) as original:
for additional_file in new_path_list[1:]:
with Pdf.open(additional_file) as additional_pdf:
original.pages.extend(additional_pdf.pages)
original.save(pdf_file.name)
if pdfa:
pdf_to_pdfa(pdf_file.name)
if password:
Expand Down
8 changes: 5 additions & 3 deletions docassemble_base/docassemble/base/pdftk.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,9 @@ def fill_template(template, data_strings=None, data_names=None, hidden=None, rea
for item in image_todo:
xone, yone, xtwo, ytwo = fields[item['field']]['rect']
logmessage("Trying to save to page " + repr(item['pageno'] - 1))
overlay_file = Pdf.open(item['overlay_file'])
overlay_page = overlay_file.pages[0]
pdf.pages[item['pageno'] - 1].add_overlay(overlay_page, rect=pikepdf.Rectangle(xone, yone, xtwo, ytwo))
with Pdf.open(item['overlay_file']) as overlay_file:
overlay_page = overlay_file.pages[0]
pdf.pages[item['pageno'] - 1].add_overlay(overlay_page, rect=pikepdf.Rectangle(xone, yone, xtwo, ytwo))
pdf.save(pdf_file.name)
if (pdfa or not editable) and len(images) > 0:
flatten_pdf(pdf_file.name)
Expand Down Expand Up @@ -480,6 +480,8 @@ def overlay_pdf(main_file, logo_file, out_file, first_page=None, last_page=None,
continue
main_pdf.pages[page_no].add_overlay(logo_pdf.pages[logo_page - 1])
main_pdf.save(out_file)
logo_pdf.close()
main_pdf.close()


def apply_qpdf(filename):
Expand Down
8 changes: 6 additions & 2 deletions docassemble_base/docassemble/base/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9977,15 +9977,19 @@ def ocr_page_tasks(image_file, language=None, psm=6, f=None, l=None, x=None, y=N
raise Exception("document with extension " + doc.extension + " is not a readable image file")
if doc.extension == 'pdf':
# doc.page_path(1, 'page')
for i in range(len(Pdf.open(doc.path()).pages)):
with Pdf.open(doc.path()) as tmp_pdf:
page_count = len(tmp_pdf.pages)
for i in range(page_count):
if f is not None and i + 1 < f:
continue
if l is not None and i + 1 > l:
continue
todo.append(dict(doc=doc, page=i+1, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code, user=user, pdf=pdf, preserve_color=preserve_color))
elif doc.extension in ("docx", "doc", "odt", "rtf"):
doc_conv = pdf_concatenate(doc)
for i in range(len(Pdf.open(doc_conv.path()).pages)):
with Pdf.open(doc_conv.path()) as tmp_pdf:
page_count = len(tmp_pdf.pages)
for i in range(page_count):
if f is not None and i + 1 < f:
continue
if l is not None and i + 1 > l:
Expand Down

0 comments on commit 4a5abab

Please sign in to comment.