Skip to content

Commit

Permalink
TIKA-1663 add a DigestingParser
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1687981 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
tballison committed Jun 28, 2015
1 parent 1a3749f commit 90a2202
Show file tree
Hide file tree
Showing 54 changed files with 1,199 additions and 205 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
Release 1.10 - Current Development

* Added DigestingParser to calculate digest hashes
and record them in metadata. Integrated with
tika-app and tika-server (TIKA-1663).

* Fixed ZipContainerDetector to detect all IPA files
(TIKA-1659).

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tika.batch;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.Parser;

/**
 * {@link ParserFactory} that hands out an {@link AutoDetectParser} and, once a
 * digester has been supplied via {@link #setDigester}, wraps that parser in a
 * {@link DigestingParser}.
 */
public class DigestingAutoDetectParserFactory extends ParserFactory {

    /** Digester passed to the wrapping DigestingParser; {@code null} means "do not wrap". */
    private DigestingParser.Digester digester;

    /**
     * Creates a new parser for the given configuration.
     *
     * @param config configuration passed to the {@link AutoDetectParser} constructor
     * @return the plain auto-detect parser when no digester is set, otherwise
     *         a {@link DigestingParser} wrapping it
     */
    @Override
    public Parser getParser(TikaConfig config) {
        Parser autoDetect = new AutoDetectParser(config);
        return (digester != null)
                ? new DigestingParser(autoDetect, digester)
                : autoDetect;
    }

    /**
     * Sets the digester used by parsers created after this call.
     *
     * @param digester digester to use, or {@code null} to disable wrapping
     */
    public void setDigester(DigestingParser.Digester digester) {
        this.digester = digester;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tika.batch.builders;

import java.util.Locale;
import java.util.Map;

import org.apache.tika.batch.DigestingAutoDetectParserFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.XMLDOMUtil;
import org.w3c.dom.Node;

/**
 * Builds a {@link ParserFactory} from an XML configuration node, applying the
 * optional "parseRecursively" attribute and, for a
 * {@link DigestingAutoDetectParserFactory}, the digest settings.
 */
public class AppParserFactoryBuilder implements IParserFactoryBuilder {

    /**
     * Instantiates the factory named by the "class" attribute and configures it
     * from the attributes returned by
     * {@code XMLDOMUtil.mapifyAttrs(node, runtimeAttrs)}.
     *
     * @param node         configuration node describing the parser factory
     * @param runtimeAttrs runtime attributes passed through to {@code mapifyAttrs}
     * @return the configured parser factory
     * @throws RuntimeException         if "parseRecursively" is present but is
     *                                  neither "true" nor "false" (case-insensitive)
     * @throws IllegalArgumentException if the factory is a
     *                                  {@link DigestingAutoDetectParserFactory} and
     *                                  "digestMarkLimit" is missing or not an int
     */
    @Override
    public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttrs);
        String className = localAttrs.get("class");
        ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class, className);

        if (localAttrs.containsKey("parseRecursively")) {
            // Lower-case with a fixed locale so the comparison is not locale-sensitive.
            String bString = localAttrs.get("parseRecursively").toLowerCase(Locale.ENGLISH);
            if (bString.equals("true")) {
                pf.setParseRecursively(true);
            } else if (bString.equals("false")) {
                pf.setParseRecursively(false);
            } else {
                throw new RuntimeException("parseRecursively must have value of \"true\" or \"false\": "+
                        bString);
            }
        }
        // Digest settings are only read when the configured factory supports them.
        if (pf instanceof DigestingAutoDetectParserFactory) {
            DigestingParser.Digester d = buildDigester(localAttrs);
            ((DigestingAutoDetectParserFactory)pf).setDigester(d);
        }
        return pf;
    }

    /**
     * Creates a {@link CommonsDigester} from the "digest" and "digestMarkLimit"
     * attributes.
     *
     * @param localAttrs attributes of the parser factory node
     * @return digester configured with the parsed algorithms and mark limit
     * @throws IllegalArgumentException if "digestMarkLimit" is missing or cannot
     *                                  be parsed as an int
     */
    private DigestingParser.Digester buildDigester(Map<String, String> localAttrs) {
        // NOTE(review): "digest" may be absent, in which case null is handed to
        // CommonsDigester.parse -- confirm that parse tolerates null.
        String digestString = localAttrs.get("digest");
        CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(digestString);

        String readLimitString = localAttrs.get("digestMarkLimit");
        if (readLimitString == null) {
            throw new IllegalArgumentException("Must specify \"digestMarkLimit\" for "+
                    "the DigestingAutoDetectParserFactory");
        }

        final int readLimit;
        try {
            readLimit = Integer.parseInt(readLimitString);
        } catch (NumberFormatException e) {
            // Chain the original exception so the parse failure is not lost.
            throw new IllegalArgumentException("Parameter \"digestMarkLimit\" must be a parseable int: "+
                    readLimitString, e);
        }
        return new CommonsDigester(readLimit, algos);
    }
}
17 changes: 17 additions & 0 deletions tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,15 @@
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.NetworkParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
Expand All @@ -108,6 +110,9 @@
* Simple command line interface for Apache Tika.
*/
public class TikaCLI {

private final int MAX_MARK = 20*1024*1024;//20MB

private File extractDir = new File(".");

private static final Log logger = LogFactory.getLog(TikaCLI.class);
Expand Down Expand Up @@ -334,6 +339,8 @@ public void process(
*/
private String password = System.getenv("TIKA_PASSWORD");

private DigestingParser.Digester digester = null;

private boolean pipeMode = true;

private boolean serverMode = false;
Expand Down Expand Up @@ -400,6 +407,11 @@ public void process(String arg) throws Exception {
fork = true;
} else if (arg.startsWith("--config=")) {
configure(arg.substring("--config=".length()));
} else if (arg.startsWith("--digest=")) {
CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(
arg.substring("--digest=".length()));
digester = new CommonsDigester(MAX_MARK,algos);
parser = new DigestingParser(parser, digester);
} else if (arg.startsWith("-e")) {
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
Expand Down Expand Up @@ -545,6 +557,8 @@ private void usage() {
out.println(" with -x, -h, -t or -m; default is -x)");
out.println(" -l or --language Output only language");
out.println(" -d or --detect Detect document type");
out.println(" --digest=X Include digest X (md2, md5, sha1,");
out.println(" sha256, sha384, sha512");
out.println(" -eX or --encoding=X Use output encoding X");
out.println(" -pX or --password=X Use document password X");
out.println(" -z or --extract Extract all attachements into current directory");
Expand Down Expand Up @@ -662,6 +676,9 @@ private void configure(String configFilePath) throws Exception {
this.configFilePath = configFilePath;
TikaConfig config = new TikaConfig(new File(configFilePath));
parser = new AutoDetectParser(config);
if (digester != null) {
parser = new DigestingParser(parser, digester);
}
detector = config.getDetector();
context.set(Parser.class, parser);
}
Expand Down
35 changes: 29 additions & 6 deletions tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowEvent;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowEvent;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
Expand All @@ -71,10 +75,12 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
Expand All @@ -92,6 +98,9 @@
public class TikaGUI extends JFrame
implements ActionListener, HyperlinkListener {

//maximum length to allow for mark for reparse to get JSON
private static final int MAX_MARK = 20*1024*1024;//20MB

/**
* Serial version UID.
*/
Expand All @@ -115,13 +124,16 @@ public static void main(String[] args) throws Exception {
final TikaConfig finalConfig = config;
SwingUtilities.invokeLater(new Runnable() {
public void run() {
new TikaGUI(new AutoDetectParser(finalConfig)).setVisible(true);
new TikaGUI(new DigestingParser(
new AutoDetectParser(finalConfig),
new CommonsDigester(MAX_MARK,
CommonsDigester.DigestAlgorithm.MD5,
CommonsDigester.DigestAlgorithm.SHA256)
)).setVisible(true);
}
});
}

//maximum length to allow for mark for reparse to get JSON
private final int MAX_MARK = 20*1024*1024;//20MB
/**
* Parsing context.
*/
Expand Down Expand Up @@ -334,11 +346,22 @@ private void handleStream(InputStream input, Metadata md)
getXmlContentHandler(xmlBuffer));

context.set(DocumentSelector.class, new ImageDocumentSelector());

input = TikaInputStream.get(new ProgressMonitorInputStream(
this, "Parsing stream", input));

if (input.markSupported()) {
input.mark(MAX_MARK);
int mark = -1;
if (input instanceof TikaInputStream) {
if (((TikaInputStream)input).hasFile()) {
mark = (int)((TikaInputStream)input).getLength();
}
}
if (mark == -1) {
mark = MAX_MARK;
}
input.mark(mark);
}
input = new ProgressMonitorInputStream(
this, "Parsing stream", input);
parser.parse(input, handler, md, context);

String[] names = md.names();
Expand Down
Loading

0 comments on commit 90a2202

Please sign in to comment.