TIKA-1202 added PDFParserConfig and refactored PDFParserTest and TikaTest to reduce boilerplate

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1548700 13f79535-47bb-0310-9956-ffa450edef68
willp-bl · Dec 6, 2013 · 173b90f · 173b90f
1 parent 0fe86c2
commit 173b90f
Show file tree

Hide file tree

Showing 6 changed files with 458 additions and 187 deletions.
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -71,16 +71,14 @@ class PDF2XHTML extends PDFTextStripper {
      */
     public static void process(
             PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
-            boolean extractAnnotationText, boolean enableAutoSpace,
-            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
+            PDFParserConfig config)
             throws SAXException, TikaException {
         try {
             // Extract text using a dummy Writer as we override the
             // key methods to output to the given content
             // handler.
-            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata,
-                                                extractAnnotationText, enableAutoSpace,
-                                                suppressDuplicateOverlappingText, sortByPosition);
+            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);
+
             pdf2XHTML.writeText(document, new Writer() {
                 @Override
                 public void write(char[] cbuf, int off, int len) {
@@ -105,27 +103,27 @@ public void close() {
     private final ContentHandler originalHandler;
     private final ParseContext context;
     private final XHTMLContentHandler handler;
-    private final boolean extractAnnotationText;
-
-    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
-                      boolean extractAnnotationText, boolean enableAutoSpace,
-                      boolean suppressDuplicateOverlappingText, boolean sortByPosition)
+    private final PDFParserConfig config;
+
+    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata, 
+            PDFParserConfig defaultConfig)
             throws IOException {
+
+        this.config = context.get(PDFParserConfig.class, defaultConfig);
         this.originalHandler = handler;
         this.context = context;
         this.handler = new XHTMLContentHandler(handler, metadata);
-        this.extractAnnotationText = extractAnnotationText;
         setForceParsing(true);
-        setSortByPosition(sortByPosition);
-        if (enableAutoSpace) {
+        setSortByPosition(config.getSortByPosition());
+        if (config.getEnableAutoSpace()) {
             setWordSeparator(" ");
         } else {
             setWordSeparator("");
         }
         // TODO: maybe expose setting these too:
         //setAverageCharTolerance(1.0f);
         //setSpacingTolerance(1.0f);
-        setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
+        setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
     }
 
     void extractBookmarkText() throws SAXException {
@@ -190,7 +188,7 @@ protected void endPage(PDPage page) throws IOException {
         try {
             writeParagraphEnd();
             // TODO: remove once PDFBOX-1143 is fixed:
-            if (extractAnnotationText) {
+            if (config.getExtractAnnotationText()) {
                 for(Object o : page.getAnnotations()) {
                     if( o instanceof PDAnnotationLink ) {
                         PDAnnotationLink annotationlink = (PDAnnotationLink) o;

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -64,22 +64,7 @@ public class PDFParser extends AbstractParser {
     /** Serial version UID */
     private static final long serialVersionUID = -752276948656079347L;
 
-    // True if we let PDFBox "guess" where spaces should go:
-    private boolean enableAutoSpace = true;
-
-    // True if we let PDFBox remove duplicate overlapping text:
-    private boolean suppressDuplicateOverlappingText;
-
-    // True if we extract annotation text ourselves
-    // (workaround for PDFBOX-1143):
-    private boolean extractAnnotationText = true;
-
-    // True if we should sort text tokens by position
-    // (necessary for some PDFs, but messes up other PDFs):
-    private boolean sortByPosition = false;
-
-    //True if we should use PDFBox's NonSequentialParser
-    private boolean useNonSequentialParser = false;
+    private PDFParserConfig config = new PDFParserConfig();
     /**
      * Metadata key for giving the document password to the parser.
      *
@@ -108,7 +93,7 @@ public void parse(
             //  for unpacked / processed resources
             // Decide which to do based on if we're reading from a file or not already
             TikaInputStream tstream = TikaInputStream.cast(stream);
-            if (useNonSequentialParser == true) {
+            if (config.getUseNonSequentialParser() == true) {
                   RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
                   pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
             } else if (tstream != null && tstream.hasFile()) {
@@ -148,9 +133,7 @@ public void parse(
             }
             metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
             extractMetadata(pdfDocument, metadata);
-            PDF2XHTML.process(pdfDocument, handler, context, metadata,
-                              extractAnnotationText, enableAutoSpace,
-                              suppressDuplicateOverlappingText, sortByPosition);
+            PDF2XHTML.process(pdfDocument, handler, context, metadata, config);
 
         } finally {
             if (pdfDocument != null) {
@@ -244,48 +227,69 @@ private void addMetadata(Metadata metadata, String name, COSBase value) {
         }
     }
 
+    public void setPDFParserConfig(PDFParserConfig config){
+        this.config = config;
+    }
+
+    public PDFParserConfig getPDFParserConfig(){
+        return config;
+    }
+
     /**
      * If true, the parser will use the NonSequentialParser.  This may
      * be faster than the full doc parser.
      * If false (default), this will use the full doc parser.
+     * 
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setUseNonSequentialParser(boolean v){
-        useNonSequentialParser = v;
+        config.setUseNonSequentialParser(v);
     }
 
-    /** @see #setUseNonSequentialParser(boolean) */
+    /** 
+     * @see #setUseNonSequentialParser(boolean) 
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getUseNonSequentialParser(){
-        return useNonSequentialParser;
+        return config.getUseNonSequentialParser();
     }
 
     /**
      *  If true (the default), the parser should estimate
      *  where spaces should be inserted between words.  For
      *  many PDFs this is necessary as they do not include
      *  explicit whitespace characters.
+     *
+     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setEnableAutoSpace(boolean v) {
-        enableAutoSpace = v;
+        config.setEnableAutoSpace(v);
     }
 
-    /** @see #setEnableAutoSpace. */
+    /** 
+     * @see #setEnableAutoSpace(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getEnableAutoSpace() {
-        return enableAutoSpace;
+        return config.getEnableAutoSpace();
     }
 
     /**
      * If true (the default), text in annotations will be
      * extracted.
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setExtractAnnotationText(boolean v) {
-        extractAnnotationText = v;
+        config.setExtractAnnotationText(v);
     }
 
     /**
      * If true, text in annotations will be extracted.
+     * 
+     * @deprecated use {@link #getPDFParserConfig()}
      */
     public boolean getExtractAnnotationText() {
-        return extractAnnotationText;
+        return config.getExtractAnnotationText();
     }
 
     /**
@@ -296,14 +300,20 @@ public boolean getExtractAnnotationText() {
      *  slow down extraction substantially (PDFBOX-956) and
      *  sometimes remove characters that were not in fact
      *  duplicated (PDFBOX-1155).  By default this is disabled.
+     *  
+     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setSuppressDuplicateOverlappingText(boolean v) {
-        suppressDuplicateOverlappingText = v;
+        config.setSuppressDuplicateOverlappingText(v);
     }
 
-    /** @see #setSuppressDuplicateOverlappingText. */
+    /** 
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     * 
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getSuppressDuplicateOverlappingText() {
-        return suppressDuplicateOverlappingText;
+        return config.getSuppressDuplicateOverlappingText();
     }
 
     /**
@@ -313,14 +323,20 @@ public boolean getSuppressDuplicateOverlappingText() {
      *  order"), while for other PDFs it can produce the
      *  wrong result (for example if there are 2 columns,
      *  the text will be interleaved).  Default is false.
+     *  
+     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setSortByPosition(boolean v) {
-        sortByPosition = v;
+        config.setSortByPosition(v);
     }
 
-    /** @see #setSortByPosition. */
+    /** 
+     * @see #setSortByPosition(boolean)
+     * 
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
     public boolean getSortByPosition() {
-        return sortByPosition;
+        return config.getSortByPosition();
     }
 
 }