Skip to content

Commit

Permalink
[SPARK-30201][SQL] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

Currently Spark uses `ObjectInspectorCopyOption.JAVA` as the object-inspector copy option, which converts every string to a UTF-8 string. When writing data that is not valid UTF-8, the replacement bytes `EFBFBD` appear in the output.
We should use `ObjectInspectorCopyOption.DEFAULT` so the bytes are passed through unchanged.

### Why are the changes needed?

Here is the way to reproduce:
1. create a file containing the hexadecimal bytes 'AABBCC', which is not valid UTF-8.
2. create table test1 (c string) location '$file_path';
3. select hex(c) from test1; // AABBCC
4. create table test2 (c string) as select c from test1;
5. select hex(c) from test2; // EFBFBDEFBFBDEFBFBD

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Closes apache#26831 from ulysses-you/SPARK-30201.

Authored-by: ulysses <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
ulysses-you authored and cloud-fan committed Dec 17, 2019
1 parent e75d9af commit 1da7e82
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,17 @@ private[hive] trait HiveInspectors {
withNullSafe(o => getByteWritable(o))
case _: ByteObjectInspector =>
withNullSafe(o => o.asInstanceOf[java.lang.Byte])
case _: JavaHiveVarcharObjectInspector =>
// To spark HiveVarchar and HiveChar are same as string
case _: HiveVarcharObjectInspector if x.preferWritable() =>
withNullSafe(o => getStringWritable(o))
case _: HiveVarcharObjectInspector =>
withNullSafe { o =>
val s = o.asInstanceOf[UTF8String].toString
new HiveVarchar(s, s.length)
}
case _: JavaHiveCharObjectInspector =>
case _: HiveCharObjectInspector if x.preferWritable() =>
withNullSafe(o => getStringWritable(o))
case _: HiveCharObjectInspector =>
withNullSafe { o =>
val s = o.asInstanceOf[UTF8String].toString
new HiveChar(s, s.length)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,15 @@ class HiveOutputWriter(
new Path(path),
Reporter.NULL)

/**
 * SPARK-30201: use `ObjectInspectorCopyOption.DEFAULT` instead of `ObjectInspectorCopyOption.JAVA`.
 * The JAVA option copies string values through `java.lang.String`, re-encoding them as UTF-8 and
 * corrupting data that is not valid UTF-8 (the bytes become the `EFBFBD` replacement sequence).
 * DEFAULT copies `UTF8String` to `Text` as raw bytes, so non UTF-8 encoded data is written out
 * unchanged.
 */
private val standardOI = ObjectInspectorUtils
.getStandardObjectInspector(
tableDesc.getDeserializer(jobConf).getObjectInspector,
ObjectInspectorCopyOption.DEFAULT)
.asInstanceOf[StructObjectInspector]

private val fieldOIs =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive

import java.io.File

import com.google.common.io.Files
import org.apache.hadoop.fs.Path
import org.scalatest.{BeforeAndAfter, PrivateMethodTester}

Expand Down Expand Up @@ -823,4 +824,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
}
}
}

test("SPARK-30201 HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT") {
  withTable("t1", "t2") {
    withTempDir { dir =>
      // Write raw non UTF-8 bytes (hex AABBCC) into a text file backing table t1.
      val hexValue = "AABBCC"
      val rawBytes = org.apache.commons.codec.binary.Hex.decodeHex(hexValue.toCharArray)
      val dataFile = new File(dir, "test.hex")
      Files.write(rawBytes, dataFile)

      // Reading the external table back must yield the original bytes.
      val tableLocation = dataFile.getParent
      sql(s"create table t1 (c string) STORED AS TEXTFILE location '$tableLocation'")
      checkAnswer(
        sql("select hex(c) from t1"),
        Row(hexValue)
      )

      // A CTAS copy must preserve the bytes too (no UTF-8 re-encoding on write).
      sql("create table t2 as select c from t1")
      checkAnswer(
        sql("select hex(c) from t2"),
        Row(hexValue)
      )
    }
  }
}
}

0 comments on commit 1da7e82

Please sign in to comment.