Skip to content

Commit

Permalink
上载readPdf.py
Browse files Browse the repository at this point in the history
  • Loading branch information
gz51837844 committed Jul 19, 2016
1 parent d3c7084 commit 7b9f53b
Showing 1 changed file with 28 additions and 0 deletions.
28 changes: 28 additions & 0 deletions test/readPdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# readPdf.py
# Read the text of a PDF document with Python

from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

def readPDF(pdfFile):
    """Extract the text of a PDF document.

    Args:
        pdfFile: An open file-like object containing the PDF data; it is
            handed unchanged to ``process_pdf``. The caller keeps ownership
            and is responsible for closing it.

    Returns:
        Everything the ``TextConverter`` wrote for the document, as one
        string.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The in-memory buffer and the converter are released even when parsing
    # fails, so a broken PDF does not leak either of them.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()

# Download the sample chapter, extract its text and print it. The ``with``
# block closes the HTTP response even if PDF parsing raises.
with urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf") as pdfFile:
    outputString = readPDF(pdfFile)
print(outputString)

0 comments on commit 7b9f53b

Please sign in to comment.