Skip to content

Commit

Permalink
TIKA-1202 added PDFParserConfig and refactored PDFParserTest and Tika…
Browse files Browse the repository at this point in the history
…Test to reduce boilerplate

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1548700 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
tballison committed Dec 6, 2013
1 parent 0fe86c2 commit 173b90f
Show file tree
Hide file tree
Showing 6 changed files with 458 additions and 187 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,14 @@ class PDF2XHTML extends PDFTextStripper {
*/
public static void process(
PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
boolean extractAnnotationText, boolean enableAutoSpace,
boolean suppressDuplicateOverlappingText, boolean sortByPosition)
PDFParserConfig config)
throws SAXException, TikaException {
try {
// Extract text using a dummy Writer as we override the
// key methods to output to the given content
// handler.
PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata,
extractAnnotationText, enableAutoSpace,
suppressDuplicateOverlappingText, sortByPosition);
PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);

pdf2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
Expand All @@ -105,27 +103,27 @@ public void close() {
private final ContentHandler originalHandler;
private final ParseContext context;
private final XHTMLContentHandler handler;
private final boolean extractAnnotationText;

private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
boolean extractAnnotationText, boolean enableAutoSpace,
boolean suppressDuplicateOverlappingText, boolean sortByPosition)
private final PDFParserConfig config;

private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig defaultConfig)
throws IOException {

this.config = context.get(PDFParserConfig.class, defaultConfig);
this.originalHandler = handler;
this.context = context;
this.handler = new XHTMLContentHandler(handler, metadata);
this.extractAnnotationText = extractAnnotationText;
setForceParsing(true);
setSortByPosition(sortByPosition);
if (enableAutoSpace) {
setSortByPosition(config.getSortByPosition());
if (config.getEnableAutoSpace()) {
setWordSeparator(" ");
} else {
setWordSeparator("");
}
// TODO: maybe expose setting these too:
//setAverageCharTolerance(1.0f);
//setSpacingTolerance(1.0f);
setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
}

void extractBookmarkText() throws SAXException {
Expand Down Expand Up @@ -190,7 +188,7 @@ protected void endPage(PDPage page) throws IOException {
try {
writeParagraphEnd();
// TODO: remove once PDFBOX-1143 is fixed:
if (extractAnnotationText) {
if (config.getExtractAnnotationText()) {
for(Object o : page.getAnnotations()) {
if( o instanceof PDAnnotationLink ) {
PDAnnotationLink annotationlink = (PDAnnotationLink) o;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,7 @@ public class PDFParser extends AbstractParser {
/** Serial version UID */
private static final long serialVersionUID = -752276948656079347L;

// True if we let PDFBox "guess" where spaces should go:
private boolean enableAutoSpace = true;

// True if we let PDFBox remove duplicate overlapping text:
private boolean suppressDuplicateOverlappingText;

// True if we extract annotation text ourselves
// (workaround for PDFBOX-1143):
private boolean extractAnnotationText = true;

// True if we should sort text tokens by position
// (necessary for some PDFs, but messes up other PDFs):
private boolean sortByPosition = false;

//True if we should use PDFBox's NonSequentialParser
private boolean useNonSequentialParser = false;
private PDFParserConfig config = new PDFParserConfig();
/**
* Metadata key for giving the document password to the parser.
*
Expand Down Expand Up @@ -108,7 +93,7 @@ public void parse(
// for unpacked / processed resources
// Decide which to do based on if we're reading from a file or not already
TikaInputStream tstream = TikaInputStream.cast(stream);
if (useNonSequentialParser == true) {
if (config.getUseNonSequentialParser() == true) {
RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
} else if (tstream != null && tstream.hasFile()) {
Expand Down Expand Up @@ -148,9 +133,7 @@ public void parse(
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
PDF2XHTML.process(pdfDocument, handler, context, metadata,
extractAnnotationText, enableAutoSpace,
suppressDuplicateOverlappingText, sortByPosition);
PDF2XHTML.process(pdfDocument, handler, context, metadata, config);

} finally {
if (pdfDocument != null) {
Expand Down Expand Up @@ -244,48 +227,69 @@ private void addMetadata(Metadata metadata, String name, COSBase value) {
}
}

/**
 * Replaces the configuration object held by this parser.
 * <p>
 * The instance is stored as-is (no copy and no null check), so the
 * deprecated per-option setters on this parser, which delegate to the
 * held configuration, will modify the object supplied here.
 *
 * @param config the configuration this parser should use from now on
 */
public void setPDFParserConfig(PDFParserConfig config) {
    this.config = config;
}

/**
 * Returns the configuration object currently held by this parser.
 * <p>
 * The live instance is returned rather than a copy, so changes made
 * through the returned object are visible to this parser.
 *
 * @return the parser's current configuration
 */
public PDFParserConfig getPDFParserConfig() {
    return this.config;
}

/**
* If true, the parser will use the NonSequentialParser. This may
* be faster than the full doc parser.
* If false (default), this will use the full doc parser.
*
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setUseNonSequentialParser(boolean v){
useNonSequentialParser = v;
config.setUseNonSequentialParser(v);
}

/** @see #setUseNonSequentialParser(boolean) */
/**
* @see #setUseNonSequentialParser(boolean)
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getUseNonSequentialParser(){
return useNonSequentialParser;
return config.getUseNonSequentialParser();
}

/**
* If true (the default), the parser should estimate
* where spaces should be inserted between words. For
* many PDFs this is necessary as they do not include
* explicit whitespace characters.
*
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setEnableAutoSpace(boolean v) {
enableAutoSpace = v;
config.setEnableAutoSpace(v);
}

/** @see #setEnableAutoSpace. */
/**
* @see #setEnableAutoSpace(boolean)
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getEnableAutoSpace() {
return enableAutoSpace;
return config.getEnableAutoSpace();
}

/**
* If true (the default), text in annotations will be
* extracted.
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setExtractAnnotationText(boolean v) {
extractAnnotationText = v;
config.setExtractAnnotationText(v);
}

/**
* If true, text in annotations will be extracted.
*
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getExtractAnnotationText() {
return extractAnnotationText;
return config.getExtractAnnotationText();
}

/**
Expand All @@ -296,14 +300,20 @@ public boolean getExtractAnnotationText() {
* slow down extraction substantially (PDFBOX-956) and
* sometimes remove characters that were not in fact
* duplicated (PDFBOX-1155). By default this is disabled.
*
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setSuppressDuplicateOverlappingText(boolean v) {
suppressDuplicateOverlappingText = v;
config.setSuppressDuplicateOverlappingText(v);
}

/** @see #setSuppressDuplicateOverlappingText. */
/**
* @see #setSuppressDuplicateOverlappingText(boolean)
*
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getSuppressDuplicateOverlappingText() {
return suppressDuplicateOverlappingText;
return config.getSuppressDuplicateOverlappingText();
}

/**
Expand All @@ -313,14 +323,20 @@ public boolean getSuppressDuplicateOverlappingText() {
* order"), while for other PDFs it can produce the
* wrong result (for example if there are 2 columns,
* the text will be interleaved). Default is false.
*
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
public void setSortByPosition(boolean v) {
sortByPosition = v;
config.setSortByPosition(v);
}

/** @see #setSortByPosition. */
/**
* @see #setSortByPosition(boolean)
*
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getSortByPosition() {
return sortByPosition;
return config.getSortByPosition();
}

}
Loading

0 comments on commit 173b90f

Please sign in to comment.