Skip to content

Commit

Permalink
TIKA-431: Tika currently misuses the HTTP Content-Encoding header, an…
Browse files Browse the repository at this point in the history
…d does not seem to use the charset part of the Content-Type header properly.

Make text and html parsers return character encoding as a charset parameter in the content type metadata field

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1358858 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jukka committed Jul 8, 2012
1 parent 95a1cf9 commit 7d89a5e
Show file tree
Hide file tree
Showing 18 changed files with 325 additions and 132 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
tika-parsers/src/test/resources/test-documents/testARofText.ar eol=lf
tika-parsers/src/test/resources/test-documents/testEMLX.emlx eol=lf
tika-parsers/src/test/resources/test-documents/testTXT.txt eol=lf
tika-parsers/src/test/resources/test-documents/testHTML.html eol=lf
9 changes: 9 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ Release 1.2 - Current Development
ICU4J algorithms are still used as a fallback thanks to their wider
coverage of custom character encodings. (TIKA-322, TIKA-471)

* Charset parameter: Related to the character encoding improvements
mentioned above, Tika now returns the detected character encoding as
a "charset" parameter of the content type metadata field for text/plain
and text/html documents. For example, instead of just "text/plain", the
returned content type will be something like "text/plain; charset=UTF-8"
for a UTF-8 encoded text document. Character encoding information is also
still present in the content encoding metadata field for backwards
compatibility, but that field should be considered deprecated. (TIKA-431)

* Extraction of embedded resources from OLE2 Office Documents, where
the resource isn't another office document, has been fixed (TIKA-948)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,8 @@
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
Expand Down
34 changes: 11 additions & 23 deletions tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
Original file line number Diff line number Diff line change
Expand Up @@ -116,30 +116,18 @@ public MediaType detect(InputStream input, Metadata metadata)

input.mark(bytesToTest);
try {
int chars = 0;
int controls = 0;
int asciis = 0;
int ch = input.read();
while (ch != -1 && chars < bytesToTest) {
if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
controls++;
} else if (ch < 127) {
asciis++;
}
ch = input.read();
chars++;
TextStatistics stats = new TextStatistics();

byte[] buffer = new byte[1024];
int n = 0;
int m = input.read(buffer, 0, Math.min(bytesToTest, buffer.length));
while (m != -1 && n < bytesToTest) {
stats.addData(buffer, 0, m);
n += m;
m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
}
if (chars == 0) {
// Empty document, so treat it as binary
// See https://issues.apache.org/jira/browse/TIKA-483
return MediaType.OCTET_STREAM;
} else if (controls == 0) {
// No control characters, so treat it as text
return MediaType.TEXT_PLAIN;
} else if (controls < chars * 2 / 100
&& asciis > chars * 90 / 100) {
// Almost plain text (< 2% control, > 90% ASCII range)
// See https://issues.apache.org/jira/browse/TIKA-688

if (stats.isMostlyAscii()) {
return MediaType.TEXT_PLAIN;
} else {
return MediaType.OCTET_STREAM;
Expand Down
133 changes: 133 additions & 0 deletions tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;

/**
 * Utility class for computing a histogram of the bytes seen in a stream,
 * used to decide whether the stream looks like plain text.
 *
 * @since Apache Tika 1.2
 */
public class TextStatistics {

    /** Number of times each byte value (0-255) has been seen so far. */
    private final int[] counts = new int[256];

    /** Total number of bytes seen so far. */
    private int total = 0;

    /**
     * Adds the given bytes to the statistics.
     *
     * @param buffer byte buffer
     * @param offset offset of the first byte to add
     * @param length number of bytes to add
     */
    public void addData(byte[] buffer, int offset, int length) {
        for (int i = 0; i < length; i++) {
            counts[buffer[offset + i] & 0xff]++;
            total++;
        }
    }

    /**
     * Checks whether at least one byte was seen and that the bytes that
     * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range).
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-688">TIKA-688</a>
     * @return <code>true</code> if the seen bytes were mostly safe ASCII,
     *         <code>false</code> otherwise
     */
    public boolean isMostlyAscii() {
        int control = count(0, 0x20);
        int ascii = count(0x20, 128);
        int safe = countSafeControl();
        // long arithmetic: the int products (e.g. total * 100) overflow
        // once more than ~21 MB of data has been added
        return total > 0
                && (control - safe) * 100L < total * 2L
                && (ascii + safe) * 100L > total * 90L;
    }

    /**
     * Returns the total number of bytes seen so far.
     *
     * @return count of all bytes
     */
    public int count() {
        return total;
    }

    /**
     * Returns the number of occurrences of the given byte.
     *
     * @param b byte (only the low eight bits are used)
     * @return count of the given byte
     */
    public int count(int b) {
        return counts[b & 0xff];
    }

    /**
     * Counts control characters (i.e. < 0x20, excluding tab, CR, LF,
     * page feed and escape).
     * <p>
     * This definition of control characters is based on section 4 of the
     * "Content-Type Processing Model" Internet-draft
     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
     * >draft-abarth-mime-sniff-01</a>).
     * <pre>
     * +-------------------------+
     * | Binary data byte ranges |
     * +-------------------------+
     * | 0x00 -- 0x08 |
     * | 0x0B |
     * | 0x0E -- 0x1A |
     * | 0x1C -- 0x1F |
     * +-------------------------+
     * </pre>
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
     * @return count of control characters
     */
    public int countControl() {
        return count(0, 0x20) - countSafeControl();
    }

    /**
     * Counts "safe" (i.e. seven-bit non-control) ASCII characters.
     *
     * @see #countControl()
     * @return count of safe ASCII characters
     */
    public int countSafeAscii() {
        return count(0x20, 128) + countSafeControl();
    }

    /**
     * Counts eight bit characters, i.e. bytes with their highest bit set.
     *
     * @return count of eight bit characters
     */
    public int countEightBit() {
        return count(128, 256);
    }

    /**
     * Counts the bytes with values within the half-open range
     * <code>[from, to)</code>.
     *
     * @param from first byte value included in the count
     * @param to exclusive upper bound of the counted range
     * @return count of bytes within the range
     */
    private int count(int from, int to) {
        // "to" is an exclusive bound, so it may legitimately equal
        // counts.length (countEightBit() calls count(128, 256)).  The
        // previous assertion used "to < counts.length", which made
        // countEightBit() throw an AssertionError when run with -ea.
        assert 0 <= from && to <= counts.length;
        int count = 0;
        for (int i = from; i < to; i++) {
            count += counts[i];
        }
        return count;
    }

    /** Counts the control bytes that are considered safe in text. */
    private int countSafeControl() {
        return count('\t') + count('\n') + count('\r') // tab, LF, CR
                + count(0x0c) + count(0x1b);           // new page, escape
    }

}
25 changes: 25 additions & 0 deletions tika-core/src/main/java/org/apache/tika/mime/MediaType.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.tika.mime;

import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -72,6 +73,8 @@ public final class MediaType implements Comparable<MediaType>, Serializable {

public static final MediaType TEXT_PLAIN = parse("text/plain");

public static final MediaType TEXT_HTML = parse("text/html");

public static final MediaType APPLICATION_XML = parse("application/xml");

public static final MediaType APPLICATION_ZIP = parse("application/zip");
Expand Down Expand Up @@ -345,6 +348,28 @@ public MediaType(MediaType type, Map<String, String> parameters) {
union(type.parameters, parameters));
}

/**
 * Creates a media type by adding a single parameter to a base type.
 * Delegates to the map-based constructor with a singleton parameter map,
 * so the parameter is merged with any parameters of the base type.
 *
 * @param type base type
 * @param name parameter name
 * @param value parameter value
 * @since Apache Tika 1.2
 */
public MediaType(MediaType type, String name, String value) {
this(type, Collections.singletonMap(name, value));
}

/**
 * Creates a media type by adding the "charset" parameter to a base type.
 * The parameter value is the canonical name of the given charset, as
 * returned by {@link java.nio.charset.Charset#name()}.
 *
 * @param type base type
 * @param charset charset value
 * @since Apache Tika 1.2
 */
public MediaType(MediaType type, Charset charset) {
this(type, "charset", charset.name());
}
/**
* Returns the base form of the MediaType, excluding
* any parameters, such as "text/plain" for
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@
/**
* Media type information.
*/
@aQute.bnd.annotation.Version("1.0.0")
@aQute.bnd.annotation.Version("1.2.0")
package org.apache.tika.mime;
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,16 @@ public void testDetectEmpty() throws Exception {
public void testDetectText() throws Exception {
assertText("Hello, World!".getBytes("UTF-8"));
assertText(" \t\r\n".getBytes("UTF-8"));
assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
assertNotText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
assertNotText(new byte[] { 0 });
assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });

byte[] data = new byte[512];
Arrays.fill(data, (byte) '.');
assertText(data);
Arrays.fill(data, 100, 109, (byte) 0x1f);
assertText(data); // almost text
Arrays.fill(data, 100, 110, (byte) 0x1f);
assertText(data); // almost text
Arrays.fill(data, 100, 111, (byte) 0x1f);
assertNotText(data); // no longer almost text, too many control chars
Arrays.fill(data, (byte) 0x1f);
assertNotText(data);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,13 @@ public void testDetection() throws Exception {

public void testByteOrderMark() throws Exception {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufffetest".getBytes("UTF-16LE")),
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16LE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufffetest".getBytes("UTF-16BE")),
new ByteArrayInputStream("\ufefftest".getBytes("UTF-16BE")),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufffetest".getBytes("UTF-8")),
new ByteArrayInputStream("\ufefftest".getBytes("UTF-8")),
new Metadata()));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
Expand Down Expand Up @@ -57,7 +58,7 @@ public class HtmlParser extends AbstractParser {
new ServiceLoader(HtmlParser.class.getClassLoader());

/**
* HTML schema singleton used to amortize the heavy instantiation time.
* HTML schema singleton used to amortise the heavy instantiation time.
*/
private static final Schema HTML_SCHEMA = new HTMLSchema();

Expand All @@ -73,11 +74,14 @@ public void parse(
AutoDetectReader reader = new AutoDetectReader(
new CloseShieldInputStream(stream), metadata, LOADER);
try {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
// TODO: Include charset
metadata.set(Metadata.CONTENT_TYPE, "text/html");
Charset charset = reader.getCharset();
String previous = metadata.get(Metadata.CONTENT_TYPE);
if (previous == null || previous.startsWith("text/html")) {
MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
metadata.set(Metadata.CONTENT_ENCODING, reader.getCharset().name());
// deprecated, see TIKA-431
metadata.set(Metadata.CONTENT_ENCODING, charset.name());

// Get the HTML mapper from the parse context
HtmlMapper mapper =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
import java.io.InputStream;
import java.nio.channels.FileChannel;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -76,10 +74,12 @@ public class POIFSContainerDetector implements Detector {
public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");

/** An OLE10 Native embedded document within another OLE2 document */
public static final MediaType OLE10_NATIVE = new MediaType(GENERAL_EMBEDDED, format("ole10_native"));
public static final MediaType OLE10_NATIVE =
new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");

/** Some other kind of embedded document, in a CompObj container within another OLE2 document */
public static final MediaType COMP_OBJ = new MediaType(GENERAL_EMBEDDED, format("comp_obj"));
public static final MediaType COMP_OBJ =
new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");

/** Microsoft Excel */
public static final MediaType XLS = application("vnd.ms-excel");
Expand Down Expand Up @@ -122,13 +122,7 @@ public class POIFSContainerDetector implements Detector {

/** Regexp for matching the MPP Project Data stream */
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");

private static Map<String,String> format(String format) {
Map<String, String> params = new HashMap<String, String>();
params.put("format", format);
return params;
}


public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
Expand Down
Loading

0 comments on commit 7d89a5e

Please sign in to comment.