Skip to content

Commit

Permalink
TIKA-431: Tika currently misuses the HTTP Content-Encoding header, an…
Browse files Browse the repository at this point in the history
…d does not seem to use the charset part of the Content-Type header properly.

Make text and html parsers return character encoding as a charset parameter in the content type metadata field

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358858 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jukka committed Jul 8, 2012
1 parent 95a1cf9 commit 7d89a5e
Show file tree
Hide file tree
Showing 18 changed files with 325 additions and 132 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf
tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf
tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf
tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf
9 changes: 9 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ Release 1.2 - Current Development
ICU4J algorithms are still used as a fallback thanks to their wider
coverage of custom character encodings. (TIKA-322, TIKA-471)

* Charset parameter: Related to the character encoding improvements
mentioned above, Tika now returns the detected character encoding as
a "charset" parameter of the content type metadata field for text/plain
and text/html documents. For example, instead of just "text/plain", the
returned content type will be something like "text/plain; charset=UTF-8"
for a UTF-8 encoded text document. Character encoding information is also
still present in the content encoding metadata field for backwards
compatibility, but that field should be considered deprecated. (TIKA-431)

* Extraction of embedded resources from OLE2 Office Documents, where
the resource isn't another office document, has been fixed (TIKA-948)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,8 @@
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
Expand Down
34 changes: 11 additions & 23 deletions tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,30 +116,18 @@ public MediaType detect(InputStream input, Metadata metadata)

input.mark(bytesToTest);
try {
int chars = 0;
int controls = 0;
int asciis = 0;
int ch = input.read();
while (ch != -1 && chars < bytesToTest) {
if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
controls++;
} else if (ch < 127) {
asciis++;
}
ch = input.read();
chars++;
TextStatistics stats = new TextStatistics();

byte[] buffer = new byte[1024];
int n = 0;
int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length));
while (m != -1 && n < bytesToTest) {
stats.addData(buffer, 0, m);
n += m;
m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
}
if (chars == 0) {
// Empty document, so treat it as binary
// See https://issues.apache.org/jira/browse/TIKA-483
return MediaType.OCTET_STREAM;
} else if (controls == 0) {
// No control characters, so treat it as text
return MediaType.TEXT_PLAIN;
} else if (controls < chars * 2 / 100
&& asciis > chars * 90 / 100) {
// Almost plain text (< 2% control, > 90% ASCII range)
// See https://issues.apache.org/jira/browse/TIKA-688

if (stats.isMostlyAscii()) {
return MediaType.TEXT_PLAIN;
} else {
return MediaType.OCTET_STREAM;
Expand Down
133 changes: 133 additions & 0 deletions tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;

/**
 * Utility class for computing a histogram of the bytes seen in a stream,
 * used to decide whether the stream looks like plain text.
 *
 * @since Apache Tika 1.2
 */
public class TextStatistics {

    /** Number of times each byte value (0-255) has been seen so far. */
    private final int[] counts = new int[256];

    /** Total number of bytes seen so far. */
    private int total = 0;

    /**
     * Adds the given bytes to the statistics.
     *
     * @param buffer byte buffer
     * @param offset offset of the first byte to add
     * @param length number of bytes to add
     */
    public void addData(byte[] buffer, int offset, int length) {
        for (int i = 0; i < length; i++) {
            counts[buffer[offset + i] & 0xff]++;
            total++;
        }
    }

    /**
     * Checks whether at least one byte was seen and that the bytes that
     * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range).
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-688">TIKA-688</a>
     * @return <code>true</code> if the seen bytes were mostly safe ASCII,
     *         <code>false</code> otherwise
     */
    public boolean isMostlyAscii() {
        int control = count(0, 0x20);
        int ascii = count(0x20, 128);
        int safe = countSafeControl();
        // long arithmetic: the int products (e.g. total * 100) overflow
        // once more than ~21 MB of data has been added
        return total > 0
                && (control - safe) * 100L < total * 2L
                && (ascii + safe) * 100L > total * 90L;
    }

    /**
     * Returns the total number of bytes seen so far.
     *
     * @return count of all bytes
     */
    public int count() {
        return total;
    }

    /**
     * Returns the number of occurrences of the given byte.
     *
     * @param b byte (only the low eight bits are used)
     * @return count of the given byte
     */
    public int count(int b) {
        return counts[b & 0xff];
    }

    /**
     * Counts control characters (i.e. < 0x20, excluding tab, CR, LF,
     * page feed and escape).
     * <p>
     * This definition of control characters is based on section 4 of the
     * "Content-Type Processing Model" Internet-draft
     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
     * >draft-abarth-mime-sniff-01</a>).
     * <pre>
     * +-------------------------+
     * | Binary data byte ranges |
     * +-------------------------+
     * | 0x00 -- 0x08 |
     * | 0x0B |
     * | 0x0E -- 0x1A |
     * | 0x1C -- 0x1F |
     * +-------------------------+
     * </pre>
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
     * @return count of control characters
     */
    public int countControl() {
        return count(0, 0x20) - countSafeControl();
    }

    /**
     * Counts "safe" (i.e. seven-bit non-control) ASCII characters.
     *
     * @see #countControl()
     * @return count of safe ASCII characters
     */
    public int countSafeAscii() {
        return count(0x20, 128) + countSafeControl();
    }

    /**
     * Counts eight bit characters, i.e. bytes with their highest bit set.
     *
     * @return count of eight bit characters
     */
    public int countEightBit() {
        return count(128, 256);
    }

    /**
     * Counts the bytes with values within the half-open range
     * <code>[from, to)</code>.
     *
     * @param from first byte value included in the count
     * @param to exclusive upper bound of the counted range
     * @return count of bytes within the range
     */
    private int count(int from, int to) {
        // "to" is an exclusive bound, so it may legitimately equal
        // counts.length (countEightBit() calls count(128, 256)).  The
        // previous assertion used "to < counts.length", which made
        // countEightBit() throw an AssertionError when run with -ea.
        assert 0 <= from && to <= counts.length;
        int count = 0;
        for (int i = from; i < to; i++) {
            count += counts[i];
        }
        return count;
    }

    /** Counts the control bytes that are considered safe in text. */
    private int countSafeControl() {
        return count('\t') + count('\n') + count('\r') // tab, LF, CR
                + count(0x0c) + count(0x1b);           // new page, escape
    }

}
25 changes: 25 additions & 0 deletions tika-core/src/main/java/org/apache/tika/mime/MediaType.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.tika.mime;

import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -72,6 +73,8 @@ public final class MediaType implements Comparable<MediaType>, Serializable {

public static final MediaType TEXT_PLAIN = parse("text/plain");

public static final MediaType TEXT_HTML = parse("text/html");

public static final MediaType APPLICATION_XML = parse("application/xml");

public static final MediaType APPLICATION_ZIP = parse("application/zip");
Expand Down Expand Up @@ -345,6 +348,28 @@ public MediaType(MediaType type, Map<String, String> parameters) {
union(type.parameters, parameters));
}

/**
 * Creates a media type by adding a single parameter to a base type.
 * Delegates to the map-based constructor with a singleton parameter map,
 * so the parameter is merged with any parameters of the base type.
 *
 * @param type base type
 * @param name parameter name
 * @param value parameter value
 * @since Apache Tika 1.2
 */
public MediaType(MediaType type, String name, String value) {
this(type, Collections.singletonMap(name, value));
}

/**
 * Creates a media type by adding the "charset" parameter to a base type.
 * The parameter value is the canonical name of the given charset, as
 * returned by {@link java.nio.charset.Charset#name()}.
 *
 * @param type base type
 * @param charset charset value
 * @since Apache Tika 1.2
 */
public MediaType(MediaType type, Charset charset) {
this(type, "charset", charset.name());
}
/**
* Returns the base form of the MediaType, excluding
* any parameters, such as "text/plain" for
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@
/**
* Media type information.
*/
@aQute.bnd.annotation.Version("1.0.0")
@aQute.bnd.annotation.Version("1.2.0")
package org.apache.tika.mime;
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,16 @@ public void testDetectEmpty() throws Exception {
public void testDetectText() throws Exception {
assertText("Hello, World!".getBytes("UTF-8"));
assertText(" \t\r\n".getBytes("UTF-8"));
assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
assertNotText(new byte[] { 0 });
assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });

byte[] data = new byte[512];
Arrays.fill(data, (byte) '.');
assertText(data);
Arrays.fill(data, 100, 109, (byte) 0x1f);
assertText(data); // almost text
Arrays.fill(data, 100, 110, (byte) 0x1f);
assertText(data); // almost text
Arrays.fill(data, 100, 111, (byte) 0x1f);
assertNotText(data); // no longer almost text, too many control chars
Arrays.fill(data, (byte) 0x1f);
assertNotText(data);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,13 @@ public void testDetection() throws Exception {

public void testByteOrderMark() throws Exception {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufffetest".getBytes("UTF-16LE")),
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufffetest".getBytes("UTF-16BE")),
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufffetest".getBytes("UTF-8")),
new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")),
new Metadata()));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
Expand Down Expand Up @@ -57,7 +58,7 @@ public class HtmlParser extends AbstractParser {
new ServiceLoader(HtmlParser.class.getClassLoader());

/**
* HTML schema singleton used to amortize the heavy instantiation time.
* HTML schema singleton used to amortise the heavy instantiation time.
*/
private static final Schema HTML_SCHEMA = new HTMLSchema();

Expand All @@ -73,11 +74,14 @@ public void parse(
AutoDetectReader reader = new AutoDetectReader(
new CloseShieldInputStream(stream), metadata, LOADER);
try {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
// TODO: Include charset
metadata.set(Metadata.CONTENT_TYPE, "text/html");
Charset charset = reader.getCharset();
String previous = metadata.get(Metadata.CONTENT_TYPE);
if (previous == null || previous.startsWith("text/html")) {
MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name());
// deprecated, see TIKA-431
metadata.set(Metadata.CONTENT_ENCODING, charset.name());

// Get the HTML mapper from the parse context
HtmlMapper mapper =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
import java.io.InputStream;
import java.nio.channels.FileChannel;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -76,10 +74,12 @@ public class POIFSContainerDetector implements Detector {
public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");

/** An OLE10 Native embedded document within another OLE2 document */
public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, format("ole10_native"));
public static final MediaType OLE10_NATIVE =
new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");

/** Some other kind of embedded document, in a CompObj container within another OLE2 document */
public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, format("comp_obj"));
public static final MediaType COMP_OBJ =
new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");

/** Microsoft Excel */
public static final MediaType XLS = application("vnd.ms-excel");
Expand Down Expand Up @@ -122,13 +122,7 @@ public class POIFSContainerDetector implements Detector {

/** Regexp for matching the MPP Project Data stream */
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");

private static Map<String,String> format(String format) {
Map<String, String> params = new HashMap<String, String>();
params.put("format", format);
return params;
}


public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
Expand Down
Loading

0 comments on commit 7d89a5e

Please sign in to comment.