BAEL-7175: UTF-8 Validation in Java (eugenp#15428)

zw1127 · Dec 22, 2023 · f66f49b · f66f49b
1 parent 65a9ccf
commit f66f49b
Show file tree

Hide file tree

Showing 2 changed files with 125 additions and 0 deletions.
diff --git a/core-java-modules/core-java-string-operations-7/pom.xml b/core-java-modules/core-java-string-operations-7/pom.xml
@@ -24,6 +24,21 @@
             <artifactId>commons-text</artifactId>
             <version>${commons-text.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${apache.tika.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers-standard-package</artifactId>
+            <version>${apache.tika.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>${icu4j.version}</version>
+        </dependency>
         <dependency>
             <groupId>org.junit.jupiter</groupId>
             <artifactId>junit-jupiter</artifactId>
@@ -60,7 +75,9 @@
         <maven.compiler.source>11</maven.compiler.source>
         <maven.compiler.target>11</maven.compiler.target>
         <apache.commons.lang3.version>3.13.0</apache.commons.lang3.version>
+        <apache.tika.version>2.9.1</apache.tika.version>
         <commons-text.version>1.10.0</commons-text.version>
+        <icu4j.version>74.1</icu4j.version>
         <liquibase.core.version>4.25.0</liquibase.core.version>
     </properties>
 

diff --git a/...string-operations-7/src/test/java/com/baeldung/utf8validation/UTF8ValidationUnitTest.java b/...string-operations-7/src/test/java/com/baeldung/utf8validation/UTF8ValidationUnitTest.java
@@ -0,0 +1,108 @@
+package com.baeldung.utf8validation;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.txt.UniversalEncodingDetector;
+import org.junit.jupiter.api.Test;
+
+import java.io.*;
+import java.nio.CharBuffer;
+import java.nio.charset.*;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Demonstrates four ways of checking whether a byte sequence is valid UTF-8:
+ * the {@link String} constructor, a strict {@link CharsetDecoder}, Apache Tika's
+ * {@link UniversalEncodingDetector} and ICU4J's {@link CharsetDetector}.
+ *
+ * <p>All fixtures are immutable byte arrays. Every test that needs an
+ * {@link InputStream} opens its own via {@link #newStream(byte[])}, so no test
+ * depends on the read position left behind by another test.
+ */
+class UTF8ValidationUnitTest {
+
+    private static final String UTF8_STRING = "Hello 你好";
+
+    private static final byte[] UTF8_BYTES = UTF8_STRING.getBytes(StandardCharsets.UTF_8);
+
+    /** ASCII-only text; the detector tests below expect it to be reported as ISO-8859-1. */
+    private static final byte[] ENGLISH_BYTES = "Hello".getBytes(StandardCharsets.UTF_8);
+
+    /**
+     * Malformed UTF-8: 0xF0 opens a four-byte sequence, but the following 0xC1 is
+     * not a continuation byte (continuation bytes lie in 0x80-0xBF).
+     */
+    private static final byte[] INVALID_UTF8_BYTES = {(byte) 0xF0, (byte) 0xC1, (byte) 0x8C, (byte) 0xBC, (byte) 0xD1};
+
+    /** U+FFFD REPLACEMENT CHARACTER, written as an escape so it survives any source encoding. */
+    private static final String REPLACEMENT_CHARACTER = "\uFFFD";
+
+    /**
+     * Opens a new, independent stream positioned at the start of {@code bytes}.
+     *
+     * @param bytes the fixture to expose as a stream
+     * @return a fresh {@link ByteArrayInputStream} over {@code bytes}
+     */
+    private static InputStream newStream(byte[] bytes) {
+        return new ByteArrayInputStream(bytes);
+    }
+
+    /** Decoding well-formed UTF-8 through the {@link String} constructor round-trips the text. */
+    @Test
+    void whenConvertValidUTF8BytesToString_thenReturnExpectedString() {
+        String decodedStr = new String(UTF8_BYTES, StandardCharsets.UTF_8);
+
+        assertEquals(UTF8_STRING, decodedStr);
+    }
+
+    /**
+     * The {@link String} constructor never throws on malformed input; the test expects
+     * five replacement characters for the five bytes of {@link #INVALID_UTF8_BYTES}.
+     */
+    @Test
+    void whenConvertInvalidUTF8BytesToString_thenReturnReplacementCharacters() {
+        String expected = REPLACEMENT_CHARACTER.repeat(5);
+
+        String decodedStr = new String(INVALID_UTF8_BYTES, StandardCharsets.UTF_8);
+
+        assertEquals(expected, decodedStr);
+    }
+
+    /** A freshly created {@link CharsetDecoder} decodes well-formed UTF-8 without error. */
+    @Test
+    void whenDecodeValidUTF8Bytes_thenSucceeds() throws CharacterCodingException {
+        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
+
+        CharBuffer decodedCharBuffer = charsetDecoder.decode(java.nio.ByteBuffer.wrap(UTF8_BYTES));
+
+        assertEquals(UTF8_STRING, decodedCharBuffer.toString());
+    }
+
+    /** A freshly created {@link CharsetDecoder} is expected to reject malformed input with an exception. */
+    @Test
+    void whenDecodeInvalidUTF8Bytes_thenThrowsMalformedInputException() {
+        CharsetDecoder charsetDecoder = StandardCharsets.UTF_8.newDecoder();
+
+        assertThrows(MalformedInputException.class,
+            () -> charsetDecoder.decode(java.nio.ByteBuffer.wrap(INVALID_UTF8_BYTES)));
+    }
+
+    /** Tika is expected to detect UTF-8 for text containing multi-byte characters. */
+    @Test
+    void whenValidateValidInputStreamByTika_thenReturnsUTF8() throws IOException {
+        EncodingDetector encodingDetector = new UniversalEncodingDetector();
+
+        Charset detectedCharset = encodingDetector.detect(newStream(UTF8_BYTES), new Metadata());
+
+        assertEquals(StandardCharsets.UTF_8, detectedCharset);
+    }
+
+    /**
+     * Tika is expected to report ISO-8859-1 for ASCII-only text.
+     *
+     * <p>NOTE(review): the method name spells the charset "ISO_88591_1"; the intended
+     * name is presumably ISO_8859_1. Kept unchanged so existing references still resolve.
+     */
+    @Test
+    void whenValidateValidEnglishInputStreamByTika_thenReturnsISO_88591_1() throws IOException {
+        EncodingDetector encodingDetector = new UniversalEncodingDetector();
+
+        Charset detectedCharset = encodingDetector.detect(newStream(ENGLISH_BYTES), new Metadata());
+
+        assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
+    }
+
+    /** Tika is expected to return {@code null} when it cannot identify the encoding of malformed bytes. */
+    @Test
+    void whenValidateInvalidInputStreamByTika_thenReturnsNull() throws IOException {
+        EncodingDetector encodingDetector = new UniversalEncodingDetector();
+
+        Charset detectedCharset = encodingDetector.detect(newStream(INVALID_UTF8_BYTES), new Metadata());
+
+        assertNull(detectedCharset);
+    }
+
+    /** ICU4J's best match for text containing multi-byte characters is expected to be UTF-8. */
+    @Test
+    void whenValidateValidInputStreamByICU4J_thenReturnsUTF8() throws IOException {
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(newStream(UTF8_BYTES));
+
+        CharsetMatch charsetMatch = detector.detect();
+
+        // Fail with a clear message instead of a NullPointerException if there is no match.
+        assertNotNull(charsetMatch, "ICU4J returned no charset match");
+        assertEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
+    }
+
+    /** ICU4J's best match for ASCII-only text is expected to be ISO-8859-1. */
+    @Test
+    void whenValidateValidEnglishInputStreamByICU4J_thenReturnsISO_8859_1() throws IOException {
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(newStream(ENGLISH_BYTES));
+
+        CharsetMatch charsetMatch = detector.detect();
+
+        // Fail with a clear message instead of a NullPointerException if there is no match.
+        assertNotNull(charsetMatch, "ICU4J returned no charset match");
+        assertEquals(StandardCharsets.ISO_8859_1.name(), charsetMatch.getName());
+    }
+
+    /**
+     * ICU4J's best match for the malformed bytes is expected to be something other than UTF-8.
+     *
+     * <p>NOTE(review): despite "Valid" in the method name, this test feeds
+     * {@link #INVALID_UTF8_BYTES}. The name is kept unchanged so existing references still resolve.
+     */
+    @Test
+    void whenValidateValidInputStreamByICU4J_thenReturnsNotEqualToUTF_8() throws IOException {
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(newStream(INVALID_UTF8_BYTES));
+
+        CharsetMatch charsetMatch = detector.detect();
+
+        // Fail with a clear message instead of a NullPointerException if there is no match.
+        assertNotNull(charsetMatch, "ICU4J returned no charset match");
+        assertNotEquals(StandardCharsets.UTF_8.name(), charsetMatch.getName());
+    }
+
+}