From 7d89a5e455686a945f2e3302f0d201e5e6c4a985 Mon Sep 17 00:00:00 2001 From: Jukka Zitting Date: Sun, 8 Jul 2012 22:44:00 +0000 Subject: [PATCH] TIKA-431: Tika currently misuses the HTTP Content-Encoding header, and does not seem to use the charset part of the Content-Type header properly. Make text and html parsers return character encoding as a charset parameter in the content type metadata field git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358858 13f79535-47bb-0310-9956-ffa450edef68 --- .gitattributes | 2 + CHANGES.txt | 9 ++ .../org/apache/tika/config/ServiceLoader.java | 2 - .../org/apache/tika/detect/TextDetector.java | 34 ++--- .../apache/tika/detect/TextStatistics.java | 133 ++++++++++++++++++ .../java/org/apache/tika/mime/MediaType.java | 25 ++++ .../org/apache/tika/mime/package-info.java | 2 +- .../apache/tika/detect/TextDetectorTest.java | 6 +- .../apache/tika/mime/MimeDetectionTest.java | 6 +- .../apache/tika/parser/html/HtmlParser.java | 14 +- .../microsoft/POIFSContainerDetector.java | 16 +-- .../org/apache/tika/parser/txt/TXTParser.java | 22 ++- .../parser/txt/UniversalEncodingListener.java | 33 +++-- .../org/apache/tika/mime/MimeTypesTest.java | 32 ----- .../org/apache/tika/mime/TestMimeTypes.java | 26 ++++ .../tika/parser/AutoDetectParserTest.java | 8 +- .../tika/parser/html/HtmlParserTest.java | 2 +- .../apache/tika/parser/txt/TXTParserTest.java | 85 +++++++---- 18 files changed, 325 insertions(+), 132 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java diff --git a/.gitattributes b/.gitattributes index c75b2a4533..64c8501ddf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf +tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf +tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf diff --git a/CHANGES.txt 
b/CHANGES.txt index 01b542ff70..3a1a6a0e9a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -43,6 +43,15 @@ Release 1.2 - Current Development ICU4J algorithms are still used as a fallback thanks to their wider coverage of custom character encodings. (TIKA-322, TIKA-471) + * Charset parameter: Related to the character encoding improvements + mentioned above, Tika now returns the detected character encoding as + a "charset" parameter of the content type metadata field for text/plain + and text/html documents. For example, instead of just "text/plain", the + returned content type will be something like "text/plain; charset=UTF-8" + for a UTF-8 encoded text document. Character encoding information is still + present also in the content encoding metadata field for backwards + compatibility, but that field should be considered deprecated. (TIKA-431) + * Extraction of embedded resources from OLE2 Office Documents, where the resource isn't another office document, has been fixed (TIKA-948) diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java index 0e025da135..e2f80ef59d 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java +++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java @@ -26,10 +26,8 @@ import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.regex.Pattern; /** diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java index 31a1fa509f..09d3af08d2 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java @@ -116,30 +116,18 @@ public MediaType detect(InputStream input, Metadata metadata) input.mark(bytesToTest); try { - 
int chars = 0; - int controls = 0; - int asciis = 0; - int ch = input.read(); - while (ch != -1 && chars < bytesToTest) { - if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) { - controls++; - } else if (ch < 127) { - asciis++; - } - ch = input.read(); - chars++; + TextStatistics stats = new TextStatistics(); + + byte[] buffer = new byte[1024]; + int n = 0; + int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length)); + while (m != -1 && n < bytesToTest) { + stats.addData(buffer, 0, m); + n += m; + m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length)); } - if (chars == 0) { - // Empty document, so treat it as binary - // See https://issues.apache.org/jira/browse/TIKA-483 - return MediaType.OCTET_STREAM; - } else if (controls == 0) { - // No control characters, so treat it as text - return MediaType.TEXT_PLAIN; - } else if (controls < chars * 2 / 100 - && asciis > chars * 90 / 100) { - // Almost plain text (< 2% control, > 90% ASCII range) - // See https://issues.apache.org/jira/browse/TIKA-688 + + if (stats.isMostlyAscii()) { return MediaType.TEXT_PLAIN; } else { return MediaType.OCTET_STREAM; diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java new file mode 100644 index 0000000000..581a1334de --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +/** + * Utility class for computing a histogram of the bytes seen in a stream. + * + * @since Apache Tika 1.2 + */ +public class TextStatistics { + + private final int[] counts = new int[256]; + + private int total = 0; + + public void addData(byte[] buffer, int offset, int length) { + for (int i = 0; i < length; i++) { + counts[buffer[offset + i] & 0xff]++; + total++; + } + } + + /** + * Checks whether at least one byte was seen and that the bytes that + * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range). + * + * @see TIKA-483 + * @see TIKA-688 + * @return true if the seen bytes were mostly safe ASCII, + * false otherwise + */ + public boolean isMostlyAscii() { + int control = count(0, 0x20); + int ascii = count(0x20, 128); + int safe = countSafeControl(); + return total > 0 + && (control - safe) * 100 < total * 2 + && (ascii + safe) * 100 > total * 90; + } + + /** + * Returns the total number of bytes seen so far. + * + * @return count of all bytes + */ + public int count() { + return total; + } + + /** + * Returns the number of occurrences of the given byte. + * + * @param b byte + * @return count of the given byte + */ + public int count(int b) { + return counts[b & 0xff]; + } + + /** + * Counts control characters (i.e. < 0x20, excluding tab, CR, LF, + * page feed and escape). + *

+ * This definition of control characters is based on section 4 of the + * "Content-Type Processing Model" Internet-draft + * (draft-abarth-mime-sniff-01). + *

+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * 
+ * + * @see TIKA-154 + * @return count of control characters + */ + public int countControl() { + return count(0, 0x20) - countSafeControl(); + } + + /** + * Counts "safe" (i.e. seven-bit non-control) ASCII characters. + * + * @see #countControl() + * @return count of safe ASCII characters + */ + public int countSafeAscii() { + return count(0x20, 128) + countSafeControl(); + } + + /** + * Counts eight bit characters, i.e. bytes with their highest bit set. + * + * @return count of eight bit characters + */ + public int countEightBit() { + return count(128, 256); + } + + private int count(int from, int to) { + assert 0 <= from && to <= counts.length; + int count = 0; + for (int i = from; i < to; i++) { + count += counts[i]; + } + return count; + } + + private int countSafeControl() { + return count('\t') + count('\n') + count('\r') // tab, LF, CR + + count(0x0c) + count(0x1b); // new page, escape + } + +} diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java index 5eb0cb277f..0080c4aab0 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java @@ -17,6 +17,7 @@ package org.apache.tika.mime; import java.io.Serializable; +import java.nio.charset.Charset; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -72,6 +73,8 @@ public final class MediaType implements Comparable, Serializable { public static final MediaType TEXT_PLAIN = parse("text/plain"); + public static final MediaType TEXT_HTML = parse("text/html"); + public static final MediaType APPLICATION_XML = parse("application/xml"); public static final MediaType APPLICATION_ZIP = parse("application/zip"); @@ -345,6 +348,28 @@ public MediaType(MediaType type, Map parameters) { union(type.parameters, parameters)); } + /** + * Creates a media type by adding a parameter to a base type. 
+ * + * @param type base type + * @param name parameter name + * @param value parameter value + * @since Apache Tika 1.2 + */ + public MediaType(MediaType type, String name, String value) { + this(type, Collections.singletonMap(name, value)); + } + + /** + * Creates a media type by adding the "charset" parameter to a base type. + * + * @param type base type + * @param charset charset value + * @since Apache Tika 1.2 + */ + public MediaType(MediaType type, Charset charset) { + this(type, "charset", charset.name()); + } /** * Returns the base form of the MediaType, excluding * any parameters, such as "text/plain" for diff --git a/tika-core/src/main/java/org/apache/tika/mime/package-info.java b/tika-core/src/main/java/org/apache/tika/mime/package-info.java index 4e3246f5c9..104dc3acf9 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/mime/package-info.java @@ -18,5 +18,5 @@ /** * Media type information. */ -@aQute.bnd.annotation.Version("1.0.0") +@aQute.bnd.annotation.Version("1.2.0") package org.apache.tika.mime; diff --git a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java index 441da0585b..cdf625a14b 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java @@ -51,16 +51,16 @@ public void testDetectEmpty() throws Exception { public void testDetectText() throws Exception { assertText("Hello, World!".getBytes("UTF-8")); assertText(" \t\r\n".getBytes("UTF-8")); - assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B }); + assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B }); assertNotText(new byte[] { 0 }); assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 }); byte[] data = new byte[512]; Arrays.fill(data, (byte) '.'); assertText(data); - Arrays.fill(data, 100, 109, (byte) 
0x1f); - assertText(data); // almost text Arrays.fill(data, 100, 110, (byte) 0x1f); + assertText(data); // almost text + Arrays.fill(data, 100, 111, (byte) 0x1f); assertNotText(data); // no longer almost text, too many control chars Arrays.fill(data, (byte) 0x1f); assertNotText(data); diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 996b8a9956..58dfbbe215 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -67,13 +67,13 @@ public void testDetection() throws Exception { public void testByteOrderMark() throws Exception { assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect( - new ByteArrayInputStream("\ufffetest".getBytes("UTF-16LE")), + new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")), new Metadata())); assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect( - new ByteArrayInputStream("\ufffetest".getBytes("UTF-16BE")), + new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")), new Metadata())); assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect( - new ByteArrayInputStream("\ufffetest".getBytes("UTF-8")), + new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")), new Metadata())); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 4be67e7d28..cc8adc0642 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -57,7 +58,7 @@ public class HtmlParser extends AbstractParser { new ServiceLoader(HtmlParser.class.getClassLoader()); /** - * HTML 
schema singleton used to amortize the heavy instantiation time. + * HTML schema singleton used to amortize the heavy instantiation time. */ private static final Schema HTML_SCHEMA = new HTMLSchema(); @@ -73,11 +74,14 @@ public void parse( AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, LOADER); try { - if (metadata.get(Metadata.CONTENT_TYPE) == null) { - // TODO: Include charset - metadata.set(Metadata.CONTENT_TYPE, "text/html"); + Charset charset = reader.getCharset(); + String previous = metadata.get(Metadata.CONTENT_TYPE); + if (previous == null || previous.startsWith("text/html")) { + MediaType type = new MediaType(MediaType.TEXT_HTML, charset); + metadata.set(Metadata.CONTENT_TYPE, type.toString()); } - metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name()); + // deprecated, see TIKA-431 + metadata.set(Metadata.CONTENT_ENCODING, charset.name()); // Get the HTML mapper from the parse context HtmlMapper mapper = diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java index f30fbc29b1..82aabaf850 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java @@ -22,9 +22,7 @@ import java.io.InputStream; import java.nio.channels.FileChannel; import java.util.Collections; -import java.util.HashMap; import java.util.HashSet; -import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -76,10 +74,12 @@ public class POIFSContainerDetector implements Detector { public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded"); /** An OLE10 Native embedded document within another OLE2 document */ - public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, format("ole10_native")); + public 
static final MediaType OLE10_NATIVE = + new MediaType(GENERAL_EMBEDDED, "format", "ole10_native"); /** Some other kind of embedded document, in a CompObj container within another OLE2 document */ - public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, format("comp_obj")); + public static final MediaType COMP_OBJ = + new MediaType(GENERAL_EMBEDDED, "format", "comp_obj"); /** Microsoft Excel */ public static final MediaType XLS = application("vnd.ms-excel"); @@ -122,13 +122,7 @@ public class POIFSContainerDetector implements Detector { /** Regexp for matching the MPP Project Data stream */ private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+"); - - private static Map format(String format) { - Map params = new HashMap(); - params.put("format", format); - return params; - } - + public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java index c540e7db67..6531774520 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.util.Collections; import java.util.Set; @@ -36,20 +37,14 @@ /** * Plain text parser. The text encoding of the document stream is * automatically detected based on the byte patterns found at the - * beginning of the stream. The input metadata key - * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING} is used - * as an encoding hint if the automatic encoding detection fails. + * beginning of the stream and the given document metadata, most + * notably the charset parameter of a + * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} value. *

* This parser sets the following output metadata entries: *

*
{@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}
- *
text/plain
- *
{@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING}
- *
The detected text encoding of the document.
- *
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_LANGUAGE} and - * {@link org.apache.tika.metadata.DublinCore#LANGUAGE} - *
+ *
text/plain; charset=...
*
*/ public class TXTParser extends AbstractParser { @@ -75,8 +70,11 @@ public void parse( AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, LOADER); try { - metadata.set(Metadata.CONTENT_TYPE, "text/plain"); // TODO: charset - metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name()); + Charset charset = reader.getCharset(); + MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + // deprecated, see TIKA-431 + metadata.set(Metadata.CONTENT_ENCODING, charset.name()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java index 826febc40d..5e215a99f7 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java @@ -18,6 +18,7 @@ import java.nio.charset.Charset; +import org.apache.tika.detect.TextStatistics; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; @@ -33,14 +34,16 @@ class UniversalEncodingListener implements CharsetListener { private static final String CHARSET_ISO_8859_1 = "ISO-8859-1"; + private static final String CHARSET_ISO_8859_15 = "ISO-8859-15"; + + private final TextStatistics statistics = new TextStatistics(); + private final UniversalDetector detector = new UniversalDetector(this); private String hint = null; private Charset charset = null; - private boolean hasCR = false; - public UniversalEncodingListener(Metadata metadata) { MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); if (type != null) { @@ -54,11 +57,20 @@ public UniversalEncodingListener(Metadata metadata) { public void report(String name) { if 
(Constants.CHARSET_WINDOWS_1252.equals(name)) { if (hint != null) { - // Use the encoding hint to distinguish between latin charsets + // Use the encoding hint when available name = hint; - } else if (!hasCR) { - // If there are no CRLFs, it's more likely to be ISO-8859-1 - name = CHARSET_ISO_8859_1; + } else if (statistics.count('\r') == 0) { + // If there are no CR(LF)s, then the encoding is more + // likely to be ISO-8859-1(5) than windows-1252 + if (statistics.count(0xa4) > 0) { // currency/euro sign + // The general currency sign is hardly ever used in + // ISO-8859-1, so it's more likely that we're dealing + // with ISO-8859-15, where the character is used for + // the euro symbol, which is more commonly used. + name = CHARSET_ISO_8859_15; + } else { + name = CHARSET_ISO_8859_1; + } } } try { @@ -73,16 +85,15 @@ public boolean isDone() { } public void handleData(byte[] buf, int offset, int length) { - for (int i = 0; !hasCR && i < length; i++) { - if (buf[offset + i] == '\r') { - hasCR = true; - } - } + statistics.addData(buf, offset, length); detector.handleData(buf, offset, length); } public Charset dataEnd() { detector.dataEnd(); + if (charset == null && statistics.isMostlyAscii()) { + report(Constants.CHARSET_WINDOWS_1252); + } return charset; } diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java index 9f45cb7b33..78b42d142f 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java @@ -19,11 +19,6 @@ import static org.apache.tika.mime.MediaType.OCTET_STREAM; import static org.apache.tika.mime.MediaType.TEXT_PLAIN; -import java.io.ByteArrayInputStream; -import java.io.IOException; - -import org.apache.tika.metadata.Metadata; - import junit.framework.TestCase; public class MimeTypesTest extends TestCase { @@ -95,31 +90,4 @@ public void testCompareTo() { 
assertTrue(html.compareTo(html) == 0); } - /** Test getMimeType(byte[]) - * @throws IOException */ - public void testGetMimeType_byteArray() throws IOException { - // Plain text detection - assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); - assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); - assertText(new byte[] { (byte) 0xEF, (byte) 0xFB, (byte) 0xBF }); - assertText(new byte[] { 'a', 'b', 'c' }); - assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B }); - assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C }); - } - - private void assertText(byte[] prefix) throws IOException { - assertMagic("text/plain", prefix); - } - - private void assertNotText(byte[] prefix) throws IOException { - assertMagic("application/octet-stream", prefix); - } - - private void assertMagic(String expected, byte[] prefix) throws IOException { - MediaType type = - types.detect(new ByteArrayInputStream(prefix), new Metadata()); - assertNotNull(type); - assertEquals(expected, type.toString()); - } - } diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 57bb4ca96a..67b8195c15 100644 --- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -609,6 +609,32 @@ public void testEmlx() throws IOException { assertTypeDetection("testEMLX.emlx", "message/x-emlx"); } + /** Test getMimeType(byte[]) */ + public void testGetMimeType_byteArray() throws IOException { + // Plain text detection + assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); + assertText(new byte[] { (byte) 0xFF, (byte) 0xFE }); + assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }); + assertText(new byte[] { 'a', 'b', 'c' }); + assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B }); + assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C }); + } + + private void assertText(byte[] prefix) throws IOException { + 
assertMagic("text/plain", prefix); + } + + private void assertNotText(byte[] prefix) throws IOException { + assertMagic("application/octet-stream", prefix); + } + + private void assertMagic(String expected, byte[] prefix) throws IOException { + MediaType type = + repo.detect(new ByteArrayInputStream(prefix), new Metadata()); + assertNotNull(type); + assertEquals(expected, type.toString()); + } + private void assertType(String expected, String filename) throws Exception { InputStream stream = TestMimeTypes.class.getResourceAsStream( "/test-documents/" + filename); diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index ad62dc417b..ee980caa92 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -28,7 +28,6 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPDM; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.BodyContentHandler; @@ -40,14 +39,14 @@ public class AutoDetectParserTest extends TestCase { // Easy to read constants for the MIME types: private static final String RAW = "application/octet-stream"; private static final String EXCEL = "application/vnd.ms-excel"; - private static final String HTML = "text/html"; + private static final String HTML = "text/html; charset=ISO-8859-1"; private static final String PDF = "application/pdf"; private static final String POWERPOINT = "application/vnd.ms-powerpoint"; private static final String KEYNOTE = "application/vnd.apple.keynote"; private static final String PAGES = "application/vnd.apple.pages"; private static final String NUMBERS = "application/vnd.apple.numbers"; private static final String RTF = 
"application/rtf"; - private static final String PLAINTEXT = "text/plain"; + private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1"; private static final String WORD = "application/msword"; private static final String XML = "application/xml"; private static final String RSS = "application/rss+xml"; @@ -236,11 +235,12 @@ public void testZipBombPrevention() throws Exception { } } - + /** * Test to ensure that the Vorbis and FLAC parsers have been correctly * included, and are available */ + @SuppressWarnings("deprecation") public void testVorbisFlac() throws Exception { // The three test files should all have similar test data String[] testFiles = new String[] { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index f5032eecd9..4976640be9 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -571,7 +571,7 @@ public void testMetaTagHandling() throws Exception { String result = sw.toString(); // tag for Content-Type should exist, but nothing for Language - assertTrue(Pattern.matches("(?s).*.*$", result)); + assertTrue(Pattern.matches("(?s).*.*$", result)); assertFalse(Pattern.matches("(?s).*hello world".getBytes("UTF-8")), + new ByteArrayInputStream("hello world".getBytes("ISO-8859-1")), new WriteOutContentHandler(writer), metadata, new ParseContext()); - assertNotSame("IBM500", metadata.get(Metadata.CONTENT_ENCODING)); + assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); } }