Skip to content

Commit

Permalink
TIKA-3002 -- fix bug in OCR AUTO mode
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Dec 2, 2019
1 parent adb6545 commit f5edbbd
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,6 @@ protected void endPage(PDPage page) throws IOException {
metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
unmappedUnicodeCharsPerPage);
totalCharsPerPage = 0;
unmappedUnicodeCharsPerPage = 0;

try {
for (PDAnnotation annotation : page.getAnnotations()) {
Expand Down Expand Up @@ -458,6 +456,9 @@ protected void endPage(PDPage page) throws IOException {
throw new IOExceptionWithCause("Unable to end a page", e);
} catch (IOException e) {
handleCatchableIOE(e);
} finally {
totalCharsPerPage = 0;
unmappedUnicodeCharsPerPage = 0;
}

if (config.getExtractFontNames()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1284,6 +1284,21 @@ public void testJBIG2OCROnly() throws Exception {
assertContains("Norconex", xmlResult.xml);
}

/**
 * Regression test for TIKA-3002 (OCR AUTO mode).
 *
 * <p>Parses {@code testOCR.pdf} twice through the same {@link ParseContext}:
 * first with the OCR strategy set to {@code AUTO}, expecting the XML output to
 * contain "Happy New Year"; then with the same config object switched to
 * {@code NO_OCR}, expecting the extracted text to be empty once trimmed.
 *
 * <p>The test is skipped (via a JUnit assumption) when {@code canRunOCR()}
 * reports that OCR is not available.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testOCRAutoMode() throws Exception {
    assumeTrue("can run OCR", canRunOCR());

    // Register a PDF config in the parse context, starting in AUTO mode.
    ParseContext parseContext = new ParseContext();
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
    parseContext.set(PDFParserConfig.class, pdfConfig);

    // AUTO: the expected phrase must show up in the XML output.
    String autoModeXml = getXML("testOCR.pdf", parseContext).xml;
    assertContains("Happy New Year", autoModeXml);

    // NO_OCR: flip the strategy on the config instance that was registered
    // above and re-parse; nothing but whitespace should come back.
    pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
    String noOcrText = getText("testOCR.pdf", new Metadata(), parseContext).trim();
    assertEquals("", noOcrText);
}

@Test
public void testTesseractInitializationWorks() throws Exception {
//TIKA-2970 -- make sure that configurations set on the TesseractOCRParser
Expand Down

0 comments on commit f5edbbd

Please sign in to comment.