Skip to content

Commit

Permalink
[SPARK-30201][SQL] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

Currently Spark uses `ObjectInspectorCopyOption.JAVA` as the object-inspector copy option, which converts every string to a UTF-8 string. When writing data that is not valid UTF-8, the replacement bytes `EFBFBD` appear in the output.
We should use `ObjectInspectorCopyOption.DEFAULT` so the bytes are passed through unchanged.

### Why are the changes needed?

Here is the way to reproduce:
1. create a file containing the hexadecimal bytes 'AABBCC', which is not valid UTF-8.
2. create table test1 (c string) location '$file_path';
3. select hex(c) from test1; // AABBCC
4. create table test2 (c string) as select c from test1;
5. select hex(c) from test2; // EFBFBDEFBFBDEFBFBD

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Closes apache#26831 from ulysses-you/SPARK-30201.

Authored-by: ulysses <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
ulysses-you authored and cloud-fan committed Dec 17, 2019
1 parent e75d9af commit 1da7e82
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,17 @@ private[hive] trait HiveInspectors {
withNullSafe(o => getByteWritable(o))
case _: ByteObjectInspector =>
withNullSafe(o => o.asInstanceOf[java.lang.Byte])
case _: JavaHiveVarcharObjectInspector =>
// To spark HiveVarchar and HiveChar are same as string
case _: HiveVarcharObjectInspector if x.preferWritable() =>
withNullSafe(o => getStringWritable(o))
case _: HiveVarcharObjectInspector =>
withNullSafe { o =>
val s = o.asInstanceOf[UTF8String].toString
new HiveVarchar(s, s.length)
}
case _: JavaHiveCharObjectInspector =>
case _: HiveCharObjectInspector if x.preferWritable() =>
withNullSafe(o => getStringWritable(o))
case _: HiveCharObjectInspector =>
withNullSafe { o =>
val s = o.asInstanceOf[UTF8String].toString
new HiveChar(s, s.length)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,15 @@ class HiveOutputWriter(
new Path(path),
Reporter.NULL)

/**
 * SPARK-30201: use `ObjectInspectorCopyOption.DEFAULT` instead of `ObjectInspectorCopyOption.JAVA`.
 * The JAVA option copies string values through `java.lang.String`, re-encoding them as UTF-8 and
 * corrupting data that is not valid UTF-8 (the bytes become the `EFBFBD` replacement sequence).
 * DEFAULT copies `UTF8String` to `Text` as raw bytes, so non UTF-8 encoded data is written out
 * unchanged.
 */
private val standardOI = ObjectInspectorUtils
.getStandardObjectInspector(
tableDesc.getDeserializer(jobConf).getObjectInspector,
ObjectInspectorCopyOption.DEFAULT)
.asInstanceOf[StructObjectInspector]

private val fieldOIs =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive

import java.io.File

import com.google.common.io.Files
import org.apache.hadoop.fs.Path
import org.scalatest.{BeforeAndAfter, PrivateMethodTester}

Expand Down Expand Up @@ -823,4 +824,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
}
}
}

test("SPARK-30201 HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT") {
  withTable("t1", "t2") {
    withTempDir { dir =>
      // Write raw non UTF-8 bytes (hex AABBCC) into a text file backing table t1.
      val hexValue = "AABBCC"
      val rawBytes = org.apache.commons.codec.binary.Hex.decodeHex(hexValue.toCharArray)
      val dataFile = new File(dir, "test.hex")
      Files.write(rawBytes, dataFile)

      // Reading the external table back must yield the original bytes.
      val tableLocation = dataFile.getParent
      sql(s"create table t1 (c string) STORED AS TEXTFILE location '$tableLocation'")
      checkAnswer(
        sql("select hex(c) from t1"),
        Row(hexValue)
      )

      // A CTAS copy must preserve the bytes too (no UTF-8 re-encoding on write).
      sql("create table t2 as select c from t1")
      checkAnswer(
        sql("select hex(c) from t2"),
        Row(hexValue)
      )
    }
  }
}
}

0 comments on commit 1da7e82

Please sign in to comment.