From 7d89a5e455686a945f2e3302f0d201e5e6c4a985 Mon Sep 17 00:00:00 2001 From: Jukka Zitting Date: Sun, 8 Jul 2012 22:44:00 +0000 Subject: [PATCH] TIKA-431: Tika currently misuses the HTTP Content-Encoding header, and does not seem to use the charset part of the Content-Type header properly. Make text and html parsers return character encoding as a charset parameter in the content type metadata field git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358858 13f79535-47bb-0310-9956-ffa450edef68 --- .gitattributes | 2 + CHANGES.txt | 9 ++ .../org/apache/tika/config/ServiceLoader.java | 2 - .../org/apache/tika/detect/TextDetector.java | 34 ++--- .../apache/tika/detect/TextStatistics.java | 133 ++++++++++++++++++ .../java/org/apache/tika/mime/MediaType.java | 25 ++++ .../org/apache/tika/mime/package-info.java | 2 +- .../apache/tika/detect/TextDetectorTest.java | 6 +- .../apache/tika/mime/MimeDetectionTest.java | 6 +- .../apache/tika/parser/html/HtmlParser.java | 14 +- .../microsoft/POIFSContainerDetector.java | 16 +-- .../org/apache/tika/parser/txt/TXTParser.java | 22 ++- .../parser/txt/UniversalEncodingListener.java | 33 +++-- .../org/apache/tika/mime/MimeTypesTest.java | 32 ----- .../org/apache/tika/mime/TestMimeTypes.java | 26 ++++ .../tika/parser/AutoDetectParserTest.java | 8 +- .../tika/parser/html/HtmlParserTest.java | 2 +- .../apache/tika/parser/txt/TXTParserTest.java | 85 +++++++---- 18 files changed, 325 insertions(+), 132 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java diff --git a/.gitattributes b/.gitattributes index c75b2a4533..64c8501ddf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf +tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf +tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf diff --git a/CHANGES.txt 
b/CHANGES.txt index 01b542ff70..3a1a6a0e9a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -43,6 +43,15 @@ Release 1.2 - Current Development ICU4J algorithms are still used as a fallback thanks to their wider coverage of custom character encodings. (TIKA-322, TIKA-471) + * Charset parameter: Related to the character encoding improvements + mentioned above, Tika now returns the detected character encoding as + a "charset" parameter of the content type metadata field for text/plain + and text/html documents. For example, instead of just "text/plain", the + returned content type will be something like "text/plain; charset=UTF-8" + for a UTF-8 encoded text document. Character encoding information is still + present also in the content encoding metadata field for backwards + compatibility, but that field should be considered deprecated. (TIKA-431) + * Extraction of embedded resources from OLE2 Office Documents, where the resource isn't another office document, has been fixed (TIKA-948) diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java index 0e025da135..e2f80ef59d 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java +++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java @@ -26,10 +26,8 @@ import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.regex.Pattern; /** diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java index 31a1fa509f..09d3af08d2 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java @@ -116,30 +116,18 @@ public MediaType detect(InputStream input, Metadata metadata) input.mark(bytesToTest); try { - 
int chars = 0; - int controls = 0; - int asciis = 0; - int ch = input.read(); - while (ch != -1 && chars < bytesToTest) { - if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) { - controls++; - } else if (ch < 127) { - asciis++; - } - ch = input.read(); - chars++; + TextStatistics stats = new TextStatistics(); + + byte[] buffer = new byte[1024]; + int n = 0; + int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length)); + while (m != -1 && n < bytesToTest) { + stats.addData(buffer, 0, m); + n += m; + m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length)); } - if (chars == 0) { - // Empty document, so treat it as binary - // See https://issues.apache.org/jira/browse/TIKA-483 - return MediaType.OCTET_STREAM; - } else if (controls == 0) { - // No control characters, so treat it as text - return MediaType.TEXT_PLAIN; - } else if (controls < chars * 2 / 100 - && asciis > chars * 90 / 100) { - // Almost plain text (< 2% control, > 90% ASCII range) - // See https://issues.apache.org/jira/browse/TIKA-688 + + if (stats.isMostlyAscii()) { return MediaType.TEXT_PLAIN; } else { return MediaType.OCTET_STREAM; diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java new file mode 100644 index 0000000000..581a1334de --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +/** + * Utility class for computing a histogram of the bytes seen in a stream. + * + * @since Apache Tika 1.2 + */ +public class TextStatistics { + + private final int[] counts = new int[256]; + + private int total = 0; + + public void addData(byte[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + counts[buffer[offset + i] & 0xff]++; + total++; + } + } + + /** + * Checks whether at least one byte was seen and that the bytes that + * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range). + * + * @see TIKA-483 + * @see TIKA-688 + * @return true if the seen bytes were mostly safe ASCII, + * false otherwise + */ + public boolean isMostlyAscii() { + int control = count(0, 0x20); + int ascii = count(0x20, 128); + int safe = countSafeControl(); + return total > 0 + && (control - safe) * 100 < total * 2 + && (ascii + safe) * 100 > total * 90; + } + + /** + * Returns the total number of bytes seen so far. + * + * @return count of all bytes + */ + public int count() { + return total; + } + + /** + * Returns the number of occurrences of the given byte. + * + * @param b byte + * @return count of the given byte + */ + public int count(int b) { + return counts[b & 0xff]; + } + + /** + * Counts control characters (i.e. < 0x20, excluding tab, CR, LF, + * page feed and escape). + *

+ * This definition of control characters is based on section 4 of the + * "Content-Type Processing Model" Internet-draft + * (draft-abarth-mime-sniff-01). + *

+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * 
+ * + * @see TIKA-154 + * @return count of control characters + */ + public int countControl() { + return count(0, 0x20) - countSafeControl(); + } + + /** + * Counts "safe" (i.e. seven-bit non-control) ASCII characters. + * + * @see #countControl() + * @return count of safe ASCII characters + */ + public int countSafeAscii() { + return count(0x20, 128) + countSafeControl(); + } + + /** + * Counts eight bit characters, i.e. bytes with their highest bit set. + * + * @return count of eight bit characters + */ + public int countEightBit() { + return count(128, 256); + } + + private int count(int from, int to) { + assert 0 <= from && to <= counts.length; + int count = 0; + for (int i = from; i < to; i++) { + count += counts[i]; + } + return count; + } + + private int countSafeControl() { + return count('\t') + count('\n') + count('\r') // tab, LF, CR + + count(0x0c) + count(0x1b); // new page, escape + } + +} diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java index 5eb0cb277f..0080c4aab0 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java @@ -17,6 +17,7 @@ package org.apache.tika.mime; import java.io.Serializable; +import java.nio.charset.Charset; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -72,6 +73,8 @@ public final class MediaType implements Comparable, Serializable { public static final MediaType TEXT_PLAIN = parse("text/plain"); + public static final MediaType TEXT_HTML = parse("text/html"); + public static final MediaType APPLICATION_XML = parse("application/xml"); public static final MediaType APPLICATION_ZIP = parse("application/zip"); @@ -345,6 +348,28 @@ public MediaType(MediaType type, Map parameters) { union(type.parameters, parameters)); } + /** + * Creates a media type by adding a parameter to a base type. 
+ * + * @param type base type + * @param name parameter name + * @param value parameter value + * @since Apache Tika 1.2 + */ + public MediaType(MediaType type, String name, String value) { + this(type, Collections.singletonMap(name, value)); + } + + /** + * Creates a media type by adding the "charset" parameter to a base type. + * + * @param type base type + * @param charset charset value + * @since Apache Tika 1.2 + */ + public MediaType(MediaType type, Charset charset) { + this(type, "charset", charset.name()); + } /** * Returns the base form of the MediaType, excluding * any parameters, such as "text/plain" for diff --git a/tika-core/src/main/java/org/apache/tika/mime/package-info.java b/tika-core/src/main/java/org/apache/tika/mime/package-info.java index 4e3246f5c9..104dc3acf9 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/mime/package-info.java @@ -18,5 +18,5 @@ /** * Media type information. */ -@aQute.bnd.annotation.Version("1.0.0") +@aQute.bnd.annotation.Version("1.2.0") package org.apache.tika.mime; diff --git a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java index 441da0585b..cdf625a14b 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java @@ -51,16 +51,16 @@ public void testDetectEmpty() throws Exception { public void testDetectText() throws Exception { assertText("Hello, World!".getBytes("UTF-8")); assertText(" \t\r\n".getBytes("UTF-8")); - assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B }); + assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B }); assertNotText(new byte[] { 0 }); assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 }); byte[] data = new byte[512]; Arrays.fill(data, (byte) '.'); assertText(data); - Arrays.fill(data, 100, 109, (byte) 
0x1f); - assertText(data); // almost text Arrays.fill(data, 100, 110, (byte) 0x1f); + assertText(data); // almost text + Arrays.fill(data, 100, 111, (byte) 0x1f); assertNotText(data); // no longer almost text, too many control chars Arrays.fill(data, (byte) 0x1f); assertNotText(data); diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 996b8a9956..58dfbbe215 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -67,13 +67,13 @@ public void testDetection() throws Exception { public void testByteOrderMark() throws Exception { assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect( - new ByteArrayInputStream("\ufffetest".getBytes("UTF-16LE")), + new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")), new Metadata())); assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect( - new ByteArrayInputStream("\ufffetest".getBytes("UTF-16BE")), + new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")), new Metadata())); assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect( - new ByteArrayInputStream("\ufffetest".getBytes("UTF-8")), + new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")), new Metadata())); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 4be67e7d28..cc8adc0642 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -57,7 +58,7 @@ public class HtmlParser extends AbstractParser { new ServiceLoader(HtmlParser.class.getClassLoader()); /** - * HTML 
schema singleton used to amortize the heavy instantiation time. + * HTML schema singleton used to amortize the heavy instantiation time. */ private static final Schema HTML_SCHEMA = new HTMLSchema(); @@ -73,11 +74,14 @@ public void parse( AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, LOADER); try { - if (metadata.get(Metadata.CONTENT_TYPE) == null) { - // TODO: Include charset - metadata.set(Metadata.CONTENT_TYPE, "text/html"); + Charset charset = reader.getCharset(); + String previous = metadata.get(Metadata.CONTENT_TYPE); + if (previous == null || previous.startsWith("text/html")) { + MediaType type = new MediaType(MediaType.TEXT_HTML, charset); + metadata.set(Metadata.CONTENT_TYPE, type.toString()); } - metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name()); + // deprecated, see TIKA-431 + metadata.set(Metadata.CONTENT_ENCODING, charset.name()); // Get the HTML mapper from the parse context HtmlMapper mapper = diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java index f30fbc29b1..82aabaf850 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java @@ -22,9 +22,7 @@ import java.io.InputStream; import java.nio.channels.FileChannel; import java.util.Collections; -import java.util.HashMap; import java.util.HashSet; -import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -76,10 +74,12 @@ public class POIFSContainerDetector implements Detector { public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded"); /** An OLE10 Native embedded document within another OLE2 document */ - public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, format("ole10_native")); + public 
static final MediaType OLE10_NATIVE = + new MediaType(GENERAL_EMBEDDED, "format", "ole10_native"); /** Some other kind of embedded document, in a CompObj container within another OLE2 document */ - public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, format("comp_obj")); + public static final MediaType COMP_OBJ = + new MediaType(GENERAL_EMBEDDED, "format", "comp_obj"); /** Microsoft Excel */ public static final MediaType XLS = application("vnd.ms-excel"); @@ -122,13 +122,7 @@ public class POIFSContainerDetector implements Detector { /** Regexp for matching the MPP Project Data stream */ private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+"); - - private static Map format(String format) { - Map params = new HashMap(); - params.put("format", format); - return params; - } - + public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java index c540e7db67..6531774520 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.util.Collections; import java.util.Set; @@ -36,20 +37,14 @@ /** * Plain text parser. The text encoding of the document stream is * automatically detected based on the byte patterns found at the - * beginning of the stream. The input metadata key - * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING} is used - * as an encoding hint if the automatic encoding detection fails. + * beginning of the stream and the given document metadata, most + * notably the charset parameter of a + * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value. *

* This parser sets the following output metadata entries: *

*
{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}
- *
text/plain
- *
{@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING}
- *
The detected text encoding of the document.
- *
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_LANGUAGE} and - * {@link org.apache.tika.metadata.DublinCore#LANGUAGE} - *
+ *
text/plain; charset=...
*
*/ public class TXTParser extends AbstractParser { @@ -75,8 +70,11 @@ public void parse( AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, LOADER); try { - metadata.set(Metadata.CONTENT_TYPE, "text/plain"); // TODO: charset - metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name()); + Charset charset = reader.getCharset(); + MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + // deprecated, see TIKA-431 + metadata.set(Metadata.CONTENT_ENCODING, charset.name()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java index 826febc40d..5e215a99f7 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java @@ -18,6 +18,7 @@ import java.nio.charset.Charset; +import org.apache.tika.detect.TextStatistics; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; @@ -33,14 +34,16 @@ class UniversalEncodingListener implements CharsetListener { private static final String CHARSET_ISO_8859_1 = "ISO-8859-1"; + private static final String CHARSET_ISO_8859_15 = "ISO-8859-15"; + + private final TextStatistics statistics = new TextStatistics(); + private final UniversalDetector detector = new UniversalDetector(this); private String hint = null; private Charset charset = null; - private boolean hasCR = false; - public UniversalEncodingListener(Metadata metadata) { MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); if (type != null) { @@ -54,11 +57,20 @@ public UniversalEncodingListener(Metadata metadata) { public void report(String name) { if 
(Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { - // Use the encoding hint to distinguish between latin charsets + // Use the encoding hint when available name = hint; - } else if (!hasCR) { - // If there are no CRLFs, it's more likely to be ISO-8859-1 - name = CHARSET_ISO_8859_1; + } else if (statistics.count('\r') == 0) { + // If there are no CR(LF)s, then the encoding is more + // likely to be ISO-8859-1(5) than windows-1252 + if (statistics.count(0xa4) > 0) { // currency/euro sign + // The general currency sign is hardly ever used in + // ISO-8859-1, so it's more likely that we're dealing + // with ISO-8859-15, where the character is used for + // the euro symbol, which is more commonly used. + name = CHARSET_ISO_8859_15; + } else { + name = CHARSET_ISO_8859_1; + } } } try { @@ -73,16 +85,15 @@ public boolean isDone() { } public void handleData(byte[] buf, int offset, int length) { - for (int i = 0; !hasCR && i < length; i++) { - if (buf[offset + i] == '\r') { - hasCR = true; - } - } + statistics.addData(buf, offset, length); detector.handleData(buf, offset, length); } public Charset dataEnd() { detector.dataEnd(); + if (charset == null && statistics.isMostlyAscii()) { + report(Constants.CHARSET_WINDOWS_1252); + } return charset; } diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java index 9f45cb7b33..78b42d142f 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java @@ -19,11 +19,6 @@ import static org.apache.tika.mime.MediaType.OCTET_STREAM; import static org.apache.tika.mime.MediaType.TEXT_PLAIN; -import java.io.ByteArrayInputStream; -import java.io.IOException; - -import org.apache.tika.metadata.Metadata; - import junit.framework.TestCase; public class MimeTypesTest extends TestCase { @@ -95,31 +90,4 @@ public void testCompareTo() { 
assertTrue(html.compareTo(html) == 0); } - /** Test getMimeType(byte[]) - * @throws IOException */ - public void testGetMimeType_byteArray() throws IOException { - // Plain text detection - assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); - assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); - assertText(new byte[] { (byte) 0xEF, (byte) 0xFB, (byte) 0xBF }); - assertText(new byte[] { 'a', 'b', 'c' }); - assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B }); - assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C }); - } - - private void assertText(byte[] prefix) throws IOException { - assertMagic("text/plain", prefix); - } - - private void assertNotText(byte[] prefix) throws IOException { - assertMagic("application/octet-stream", prefix); - } - - private void assertMagic(String expected, byte[] prefix) throws IOException { - MediaType type = - types.detect(new ByteArrayInputStream(prefix), new Metadata()); - assertNotNull(type); - assertEquals(expected, type.toString()); - } - } diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 57bb4ca96a..67b8195c15 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -609,6 +609,32 @@ public void testEmlx() throws IOException { assertTypeDetection("testEMLX.emlx", "message/x-emlx"); } + /** Test getMimeType(byte[]) */ + public void testGetMimeType_byteArray() throws IOException { + // Plain text detection + assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); + assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); + assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }); + assertText(new byte[] { 'a', 'b', 'c' }); + assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B }); + assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C }); + } + + private void assertText(byte[] prefix) throws IOException { + 
assertMagic("text/plain", prefix); + } + + private void assertNotText(byte[] prefix) throws IOException { + assertMagic("application/octet-stream", prefix); + } + + private void assertMagic(String expected, byte[] prefix) throws IOException { + MediaType type = + repo.detect(new ByteArrayInputStream(prefix), new Metadata()); + assertNotNull(type); + assertEquals(expected, type.toString()); + } + private void assertType(String expected, String filename) throws Exception { InputStream stream = TestMimeTypes.class.getResourceAsStream( "/test-documents/" + filename); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index ad62dc417b..ee980caa92 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -28,7 +28,6 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPDM; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.BodyContentHandler; @@ -40,14 +39,14 @@ public class AutoDetectParserTest extends TestCase { // Easy to read constants for the MIME types: private static final String RAW = "application/octet-stream"; private static final String EXCEL = "application/vnd.ms-excel"; - private static final String HTML = "text/html"; + private static final String HTML = "text/html; charset=ISO-8859-1"; private static final String PDF = "application/pdf"; private static final String POWERPOINT = "application/vnd.ms-powerpoint"; private static final String KEYNOTE = "application/vnd.apple.keynote"; private static final String PAGES = "application/vnd.apple.pages"; private static final String NUMBERS = "application/vnd.apple.numbers"; private static final String RTF = 
"application/rtf"; - private static final String PLAINTEXT = "text/plain"; + private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1"; private static final String WORD = "application/msword"; private static final String XML = "application/xml"; private static final String RSS = "application/rss+xml"; @@ -236,11 +235,12 @@ public void testZipBombPrevention() throws Exception { } } - + /** * Test to ensure that the Vorbis and FLAC parsers have been correctly * included, and are available */ + @SuppressWarnings("deprecation") public void testVorbisFlac() throws Exception { // The three test files should all have similar test data String[] testFiles = new String[] { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index f5032eecd9..4976640be9 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -571,7 +571,7 @@ public void testMetaTagHandling() throws Exception { String result = sw.toString(); // tag for Content-Type should exist, but nothing for Language - assertTrue(Pattern.matches("(?s).*.*$", result)); + assertTrue(Pattern.matches("(?s).*.*$", result)); assertFalse(Pattern.matches("(?s).*hello world".getBytes("UTF-8")), + new ByteArrayInputStream("hello world".getBytes("ISO-8859-1")), new WriteOutContentHandler(writer), metadata, new ParseContext()); - assertNotSame("IBM500", metadata.get(Metadata.CONTENT_ENCODING)); + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); } }