-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOCR.py
executable file
·58 lines (45 loc) · 2 KB
/
OCR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
############################################################################################
### ###
### Algorithm for converting multiple image-based PDF files into txt files ###
### Diego Telles - 18/04/2020 ###
### github.com/diegozd ###
### ###
############################################################################################
import io
import os
from PIL import Image
import pytesseract
from wand.image import Image as wi
# Folder scanned for input PDFs, and folder that receives one .txt file per PDF.
end_filespfds = './marrero/pdf'
end_filesptxt = './marrero/txt'

# Resolution (DPI) passed to Wand when rasterising the PDF pages for OCR.
PDF_RESOLUTION = 300

# Tesseract language code passed to pytesseract. None keeps pytesseract's default;
# set e.g. 'por' for Portuguese (the matching language data must be installed).
OCR_LANG = None


def _ocr_pdf(pdf_path):
    """Return the recognised text of *pdf_path* as a list with one string per page.

    Pages are rasterised to JPEG and OCR'd one at a time, so only a single page
    blob is held in memory at once. Every Wand/Pillow image is opened in a
    ``with`` block so it is closed as soon as it is no longer needed, even when
    OCR fails part-way through.
    """
    page_texts = []
    with wi(filename=pdf_path, resolution=PDF_RESOLUTION) as pdf:
        with pdf.convert('jpeg') as pdf_image:
            for page in pdf_image.sequence:
                with wi(image=page) as page_image:
                    blob = page_image.make_blob('jpeg')
                with Image.open(io.BytesIO(blob)) as im:
                    page_texts.append(
                        pytesseract.image_to_string(im, lang=OCR_LANG)
                    )
    return page_texts


def _remove_magick_temp_files(tmp_dir='/tmp'):
    """Best-effort removal of regular files named ``magick-*`` directly in *tmp_dir*.

    Does the same job as ``find /tmp -maxdepth 1 -type f -name "magick-*" -delete``
    but without spawning a shell. Failures are ignored on purpose: this is only
    housekeeping and must never abort a conversion run.
    """
    try:
        with os.scandir(tmp_dir) as entries:
            for entry in entries:
                if not entry.name.startswith('magick-'):
                    continue
                try:
                    # follow_symlinks=False mirrors `find -type f`.
                    if entry.is_file(follow_symlinks=False):
                        os.remove(entry.path)
                except OSError:
                    # File vanished or is not ours to delete; skip it.
                    continue
    except OSError:
        # tmp_dir missing or unreadable (e.g. non-POSIX system): nothing to clean.
        pass


# Create the output folder on first use so listing it below cannot fail.
os.makedirs(end_filesptxt, exist_ok=True)

pdfsList = os.listdir(end_filespfds)
txtList = os.listdir(end_filesptxt)
existing_txt = set(txtList)  # O(1) "already converted?" checks inside the loop
total = len(pdfsList)

for position, arqpdf in enumerate(pdfsList, start=1):
    filenamepdf = str(arqpdf)

    # splitext + a case-insensitive comparison replaces rfind('.pdf'), which
    # returned -1 for names without '.pdf' and silently chopped the last character.
    stem, extension = os.path.splitext(filenamepdf)
    if extension.lower() != '.pdf':
        print(position, '/', total, ' ', filenamepdf, ' ignorado (nao e PDF)')
        continue

    filenametxt = stem + '.txt'
    if filenametxt in existing_txt:
        print(position, '/', total, ' ', filenametxt, ' arquivo ja existente')
        continue

    print(position, '/', total, 'convertendo ', arqpdf)
    try:
        recognized_text = _ocr_pdf(os.path.join(end_filespfds, filenamepdf))
    finally:
        # Safety net for scratch files ImageMagick may still have left behind.
        _remove_magick_temp_files()

    # The file is written only after the whole PDF was OCR'd, so a failed
    # conversion never leaves a partial .txt that would be skipped on the next run.
    # UTF-8 is explicit so accented characters do not depend on the locale.
    txt_path = os.path.join(end_filesptxt, filenametxt)
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.writelines(text + '\n' for text in recognized_text)

    # Guards against converting twice when two inputs map to the same .txt name.
    existing_txt.add(filenametxt)

print('FIM!')