diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 8acc3ffe60..d3f56f6fc9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -370,8 +370,6 @@ protected void endPage(PDPage page) throws IOException { metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage); metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE, unmappedUnicodeCharsPerPage); - totalCharsPerPage = 0; - unmappedUnicodeCharsPerPage = 0; try { for (PDAnnotation annotation : page.getAnnotations()) { @@ -458,6 +456,9 @@ protected void endPage(PDPage page) throws IOException { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { handleCatchableIOE(e); + } finally { + totalCharsPerPage = 0; + unmappedUnicodeCharsPerPage = 0; } if (config.getExtractFontNames()) { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 3ad4dbfd62..6a816c9b40 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1284,6 +1284,21 @@ public void testJBIG2OCROnly() throws Exception { assertContains("Norconex", xmlResult.xml); } + @Test + public void testOCRAutoMode() throws Exception { + assumeTrue("can run OCR", canRunOCR()); + PDFParserConfig config = new PDFParserConfig(); + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + XMLResult xmlResult = getXML("testOCR.pdf", context); + assertContains("Happy New Year", xmlResult.xml); + + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + String txt = getText("testOCR.pdf", new Metadata(), context); + assertEquals("", txt.trim()); + } + @Test public void testTesseractInitializationWorks() throws Exception { //TIKA-2970 -- make sure that configurations set on the TesseractOCRParser