Skip to content

Commit

Permalink
TIKA-1201 enable parameter for NonSequentialPDFParser
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1547250 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
tballison committed Dec 3, 2013
1 parent b7986ea commit 1babc9e
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ public class PDFParser extends AbstractParser {
// (necessary for some PDFs, but messes up other PDFs):
private boolean sortByPosition = false;

// True if documents should be loaded through PDFBox's NonSequentialParser
// (PDDocument.loadNonSeq); false (the default) keeps the regular
// PDDocument.load code path.
private boolean useNonSequentialParser = false;
/**
* Metadata key for giving the document password to the parser.
*
Expand Down Expand Up @@ -106,14 +108,18 @@ public void parse(
// for unpacked / processed resources
// Decide which to do based on if we're reading from a file or not already
TikaInputStream tstream = TikaInputStream.cast(stream);
if (tstream != null && tstream.hasFile()) {
// File based, take that as a cue to use a temporary file
RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
if (useNonSequentialParser == true) {
RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
} else if (tstream != null && tstream.hasFile()) {
// File based, take that as a cue to use a temporary file
RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
} else {
// Go for the normal, stream based in-memory parsing
pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
// Go for the normal, stream based in-memory parsing
pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
}


if (pdfDocument.isEncrypted()) {
String password = null;
Expand Down Expand Up @@ -233,11 +239,25 @@ private void addMetadata(Metadata metadata, String name, COSBase value) {
}
} else if(value instanceof COSString) {
addMetadata(metadata, name, ((COSString)value).getString());
} else {
} else if (value != null){
addMetadata(metadata, name, value.toString());
}
}

/**
 * Selects which PDFBox loading strategy this parser uses.
 * When set to true, documents are loaded with the NonSequentialParser,
 * which may be faster than the full document parser.
 * When set to false (the default), the full document parser is used.
 *
 * @param v true to use the NonSequentialParser, false for the full doc parser
 */
public void setUseNonSequentialParser(boolean v) {
    this.useNonSequentialParser = v;
}

/**
 * Reports whether the NonSequentialParser is currently enabled.
 *
 * @return the value last given to {@link #setUseNonSequentialParser(boolean)}
 * @see #setUseNonSequentialParser(boolean)
 */
public boolean getUseNonSequentialParser() {
    return this.useNonSequentialParser;
}

/**
* If true (the default), the parser should estimate
* where spaces should be inserted between words. For
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.pdf;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.tika.TikaTest;
Expand Down Expand Up @@ -516,4 +518,52 @@ public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
}

/**
 * Checks that the traditional sequential parser and the newer
 * non-sequential parser produce the same text and metadata for
 * every PDF in the test-documents directory.
 *
 * TODO: more testing
 */
public void testSequentialParser() throws Exception {
    PDFParser classicParser = new PDFParser();
    PDFParser nonSeqParser = new PDFParser();
    nonSeqParser.setUseNonSequentialParser(true);

    File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
    int pdfs = 0;
    for (File candidate : testDocs.listFiles()) {
        String fileName = candidate.getName();
        if (fileName.toLowerCase().endsWith(".pdf")) {
            pdfs++;

            // Run the same file through both parser configurations.
            Metadata classicMetadata = new Metadata();
            String classicContent = getText(candidate, classicParser, classicMetadata);

            Metadata nonSeqMetadata = new Metadata();
            String nonSeqContent = getText(candidate, nonSeqParser, nonSeqMetadata);

            assertEquals(fileName, classicContent, nonSeqContent);

            //TODO: until PDFBox fixes metadata extraction for this file,
            //skip the metadata comparison for this one file.
            boolean metadataComparable = !fileName.equals("testAnnotations.pdf");
            if (metadataComparable) {
                assertEquals(fileName, classicMetadata, nonSeqMetadata);
            }
        }
    }
    assertEquals("Number of pdf files tested", 14, pdfs);
}

/**
 * Parses the given file with the supplied parser and returns the
 * text collected by a fresh body content handler.
 *
 * @param f file to read and parse
 * @param parser parser instance to run against the file
 * @param metadata metadata object handed to the parser
 * @return the string form of the body content handler after parsing
 * @throws Exception if the file cannot be opened or the parse fails
 */
private String getText(File f, PDFParser parser, Metadata metadata) throws Exception {
    ContentHandler handler = new BodyContentHandler();
    ParseContext context = new ParseContext();
    // Open the stream before entering the try block: if the open fails,
    // the original exception propagates instead of being masked by a
    // NullPointerException from closing a null stream in finally.
    FileInputStream is = new FileInputStream(f);
    try {
        parser.parse(is, handler, metadata, context);
    } finally {
        is.close();
    }
    return handler.toString();
}

}

0 comments on commit 1babc9e

Please sign in to comment.