Skip to content

Commit

Permalink
TIKA-3976 (apache#972)
Browse files Browse the repository at this point in the history
* TIKA-3976 -- allow users to turn off exception on zero-byte files
  • Loading branch information
tballison authored Feb 17, 2023
1 parent 5e61a58 commit e48b10f
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 13 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
Release 2.7.1 - ???

* Users may now avoid the ZeroByteFileException by setting
throwOnZeroBytes to false on the AutoDetectParserConfig (TIKA-3976).

* Fix bug in closing <a> elements in the presence of <b> elements
in RTF files (TIKA-3972).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,13 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
}
//check for zero-byte inputstream
if (tis.getOpenContainer() == null) {
tis.mark(1);
if (tis.read() == -1) {
throw new ZeroByteFileException("InputStream must have > 0 bytes");
if (autoDetectParserConfig.getThrowOnZeroBytes()) {
tis.mark(1);
if (tis.read() == -1) {
throw new ZeroByteFileException("InputStream must have > 0 bytes");
}
tis.reset();
}
tis.reset();
}
handler = decorateHandler(handler, metadata, context, autoDetectParserConfig);
// TIKA-216: Zip bomb prevention
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ public static AutoDetectParserConfig load(Element element)

private DigestingParser.DigesterFactory digesterFactory = null;

private boolean throwOnZeroBytes = true;

/**
* Creates a SecureContentHandlerConfig using the passed in parameters.
*
Expand Down Expand Up @@ -198,6 +200,14 @@ public DigestingParser.DigesterFactory getDigesterFactory() {
return this.digesterFactory;
}

/**
 * Sets whether a zero-byte input should cause a ZeroByteFileException.
 * The backing field defaults to {@code true}.
 *
 * @param throwOnZeroBytes {@code true} to keep the exception on empty
 *                         input, {@code false} to turn it off
 */
public void setThrowOnZeroBytes(boolean throwOnZeroBytes) {
    this.throwOnZeroBytes = throwOnZeroBytes;
}

/**
 * Reports whether a zero-byte input should cause a ZeroByteFileException.
 *
 * @return the current value of the {@code throwOnZeroBytes} setting
 */
public boolean getThrowOnZeroBytes() {
    return this.throwOnZeroBytes;
}

@Override
public String toString() {
return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" +
Expand All @@ -206,7 +216,8 @@ public String toString() {
maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" +
embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" +
contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + '}';
contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory +
", throwOnZeroBytes=" + throwOnZeroBytes + '}';
}
}

18 changes: 11 additions & 7 deletions tika-core/src/test/java/org/apache/tika/TikaTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -342,29 +342,33 @@ protected List<Metadata> getRecursiveMetadata(String filePath, Parser wrapped,

protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context,
boolean suppressException) throws Exception {
try (TikaInputStream tis = TikaInputStream.get(path)) {
return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, new Metadata(), context,
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context,
suppressException);
}
}

protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
boolean suppressException) throws Exception {
try (TikaInputStream tis = TikaInputStream.get(path)) {
return getRecursiveMetadata(tis, parser, new Metadata(), new ParseContext(),
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
return getRecursiveMetadata(tis, parser, metadata, new ParseContext(),
suppressException);
}
}

protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException)
throws Exception {
try (TikaInputStream tis = TikaInputStream.get(p)) {
return getRecursiveMetadata(tis, new Metadata(), new ParseContext(), suppressException);
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
return getRecursiveMetadata(tis, metadata, new ParseContext(), suppressException);
}
}

protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
try (TikaInputStream tis = TikaInputStream.get(filePath)) {
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(filePath, metadata)) {
return getRecursiveMetadata(tis, true);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import static org.junit.jupiter.api.Assertions.assertEquals;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -104,7 +106,7 @@ public void testDigests() throws Exception {
//test to make sure that the decorator is only applied once for
//legacy (e.g. not RecursiveParserWrapperHandler) parsing
TikaConfig tikaConfig = null;
try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream(
"/configs/tika-config-digests.xml")) {
tikaConfig = new TikaConfig(is);
}
Expand Down Expand Up @@ -138,4 +140,23 @@ public void testDigestsEmptyParser() throws Exception {
assertEquals("org.apache.tika.parser.EmptyParser",
metadataList.get(0).get("X-TIKA:Parsed-By"));
}

/**
 * Parses an empty temporary file with a parser built from
 * tika-config-digests.xml and checks the first metadata entry: the MD5
 * digest must be the digest of empty input and the content length "0".
 */
@Test
public void testContainerZeroBytes() throws Exception {
    Path emptyFile = Files.createTempFile("tika-test", "");
    try {
        TikaConfig tikaConfig;
        try (InputStream configStream =
                     AutoDetectParserConfigTest.class.getResourceAsStream(
                             "/configs/tika-config-digests.xml")) {
            tikaConfig = new TikaConfig(configStream);
        }
        Parser parser = new AutoDetectParser(tikaConfig);
        List<Metadata> metadataList = getRecursiveMetadata(emptyFile, parser, true);
        Metadata first = metadataList.get(0);
        // d41d8cd9... is the MD5 digest of zero bytes
        assertEquals("d41d8cd98f00b204e9800998ecf8427e", first.get("X-TIKA:digest:MD5"));
        assertEquals("0", first.get(Metadata.CONTENT_LENGTH));
    } finally {
        // always remove the temp file, even when an assertion fails
        Files.delete(emptyFile);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@
<markLimit>100000</markLimit>
<algorithmString>sha256:32,md5</algorithmString>
</digesterFactory>
<throwOnZeroBytes>false</throwOnZeroBytes>
</autoDetectParserConfig>
</properties>

0 comments on commit e48b10f

Please sign in to comment.