Skip to content

Commit

Permalink
BAEL-7175: UTF-8 Validation in Java (eugenp#15428)
Browse files Browse the repository at this point in the history
  • Loading branch information
manfred106 authored Dec 22, 2023
1 parent 65a9ccf commit f66f49b
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 0 deletions.
17 changes: 17 additions & 0 deletions core-java-modules/core-java-string-operations-7/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@
<artifactId>commons-text</artifactId>
<version>${commons-text.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${apache.tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${apache.tika.version}</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>${icu4j.version}</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
Expand Down Expand Up @@ -60,7 +75,9 @@
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<apache.commons.lang3.version>3.13.0</apache.commons.lang3.version>
<apache.tika.version>2.9.1</apache.tika.version>
<commons-text.version>1.10.0</commons-text.version>
<icu4j.version>74.1</icu4j.version>
<liquibase.core.version>4.25.0</liquibase.core.version>
</properties>

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package com.baeldung.utf8validation;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
import org.junit.jupiter.api.Test;

import java.io.*;
import java.nio.CharBuffer;
import java.nio.charset.*;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Demonstrates several ways of checking whether a byte sequence is valid UTF-8:
 * the {@link String} constructor, a strict {@link CharsetDecoder}, Apache Tika's
 * {@link UniversalEncodingDetector} and ICU4J's {@link CharsetDetector}.
 *
 * <p>Only immutable byte arrays are shared between tests. Every test that needs an
 * {@link InputStream} builds its own via {@link #newStream(byte[])}, so no test
 * depends on another test (or a detector) having rewound a shared stream.
 */
class UTF8ValidationUnitTest {

    private static final String UTF8_STRING = "Hello 你好";

    private static final byte[] UTF8_BYTES = UTF8_STRING.getBytes(StandardCharsets.UTF_8);

    /** Pure ASCII text; its bytes are identical in UTF-8 and ISO-8859-1. */
    private static final byte[] ENGLISH_BYTES = "Hello".getBytes(StandardCharsets.UTF_8);

    /**
     * Not well-formed UTF-8: 0xF0 opens a four-byte sequence but 0xC1 is not a
     * continuation byte (and is never a legal lead byte), 0x8C and 0xBC are stray
     * continuation bytes, and 0xD1 opens a two-byte sequence that is cut off.
     */
    private static final byte[] INVALID_UTF8_BYTES = {(byte) 0xF0, (byte) 0xC1, (byte) 0x8C, (byte) 0xBC, (byte) 0xD1};

    /**
     * Creates a fresh, independent stream positioned at the start of {@code bytes}.
     *
     * @param bytes the content the stream should yield
     * @return a new in-memory stream over {@code bytes}
     */
    private static InputStream newStream(byte[] bytes) {
        return new ByteArrayInputStream(bytes);
    }

    /** Decoding valid UTF-8 bytes through the String constructor round-trips the text. */
    @Test
    void whenConvertValidUTF8BytesToString_thenReturnExpectedString() {
        String decodedStr = new String(UTF8_BYTES, StandardCharsets.UTF_8);
        assertEquals(UTF8_STRING, decodedStr);
    }

    /**
     * The String constructor does not fail on malformed input; the decoded string
     * consists of U+FFFD replacement characters instead.
     */
    @Test
    void whenConvertInvalidUTF8BytesToString_thenReturnReplacementCharacters() {
        String decodedStr = new String(INVALID_UTF8_BYTES, StandardCharsets.UTF_8);
        // Written as an escape so the expectation does not depend on the source file's encoding.
        String expected = "\uFFFD".repeat(5);
        assertEquals(expected, decodedStr);
    }

    /** A freshly created decoder decodes valid UTF-8 bytes without throwing. */
    @Test
    void whenDecodeValidUTF8Bytes_thenSucceeds() throws CharacterCodingException {
        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
        CharBuffer decodedCharBuffer = charsetDecoder.decode(java.nio.ByteBuffer.wrap(UTF8_BYTES));
        assertEquals(UTF8_STRING, decodedCharBuffer.toString());
    }

    /** A freshly created decoder rejects the malformed bytes with {@link MalformedInputException}. */
    @Test
    void whenDecodeInvalidUTF8Bytes_thenThrowsMalformedInputException() {
        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
        assertThrows(MalformedInputException.class,
            () -> charsetDecoder.decode(java.nio.ByteBuffer.wrap(INVALID_UTF8_BYTES)));
    }

    /** Tika is expected to report UTF-8 for text containing multi-byte characters. */
    @Test
    void whenValidateValidInputStreamByTika_thenReturnsUTF8() throws IOException {
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        Charset detectedCharset = encodingDetector.detect(newStream(UTF8_BYTES), new Metadata());
        assertEquals(StandardCharsets.UTF_8, detectedCharset);
    }

    /** Tika is expected to report ISO-8859-1 for the pure-ASCII sample. */
    @Test
    void whenValidateValidEnglishInputStreamByTika_thenReturnsISO_8859_1() throws IOException {
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        Charset detectedCharset = encodingDetector.detect(newStream(ENGLISH_BYTES), new Metadata());
        assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
    }

    /** Tika is expected to return {@code null} (no charset detected) for the malformed bytes. */
    @Test
    void whenValidateInvalidInputStreamByTika_thenReturnsNull() throws IOException {
        EncodingDetector encodingDetector = new UniversalEncodingDetector();
        Charset detectedCharset = encodingDetector.detect(newStream(INVALID_UTF8_BYTES), new Metadata());
        assertNull(detectedCharset);
    }

    /** ICU4J's best match is expected to be UTF-8 for text containing multi-byte characters. */
    @Test
    void whenValidateValidInputStreamByICU4J_thenReturnsUTF8() throws IOException {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(newStream(UTF8_BYTES));
        CharsetMatch charsetMatch = detector.detect();
        // Fail with a clear assertion rather than a NullPointerException if nothing matched.
        assertNotNull(charsetMatch);
        assertEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
    }

    /** ICU4J's best match is expected to be ISO-8859-1 for the pure-ASCII sample. */
    @Test
    void whenValidateValidEnglishInputStreamByICU4J_thenReturnsISO_8859_1() throws IOException {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(newStream(ENGLISH_BYTES));
        CharsetMatch charsetMatch = detector.detect();
        assertNotNull(charsetMatch);
        assertEquals(StandardCharsets.ISO_8859_1.name(), charsetMatch.getName());
    }

    /** For the malformed bytes, ICU4J's best match is expected to be something other than UTF-8. */
    @Test
    void whenValidateInvalidInputStreamByICU4J_thenReturnsNotEqualToUTF_8() throws IOException {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(newStream(INVALID_UTF8_BYTES));
        CharsetMatch charsetMatch = detector.detect();
        assertNotNull(charsetMatch);
        assertNotEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
    }

}

0 comments on commit f66f49b

Please sign in to comment.