Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 7d669a5

Browse files
committed Jun 11, 2015
[SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
Unit test is still in Scala. Author: Reynold Xin <[email protected]> Closes apache#6738 from rxin/utf8string-java and squashes the following commits: 562dc6e [Reynold Xin] Flag... 98e600b [Reynold Xin] Another try with encoding setting .. cfa6bdf [Reynold Xin] Merge branch 'master' into utf8string-java a3b124d [Reynold Xin] Try different UTF-8 encoded characters. 1ff7c82 [Reynold Xin] Enable UTF-8 encoding. 82d58cc [Reynold Xin] Reset run-tests. 2cb3c69 [Reynold Xin] Use utf-8 encoding in set bytes. 53f8ef4 [Reynold Xin] Hack Jenkins to run one test. 9a48e8d [Reynold Xin] Fixed runtime compilation error. 911c450 [Reynold Xin] Moved unit test also to Java. 4eff7bd [Reynold Xin] Improved unit test coverage. 8e89a3c [Reynold Xin] Fixed tests. 77c64bd [Reynold Xin] Fixed string type codegen. ffedb62 [Reynold Xin] Code review feedback. 0967ce6 [Reynold Xin] Fixed import ordering. 45a123d [Reynold Xin] [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
1 parent 9cbdf31 commit 7d669a5

File tree

34 files changed

+390
-335
lines changed

34 files changed

+390
-335
lines changed
 

‎project/SparkBuild.scala

+3-1
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,9 @@ object SparkBuild extends PomBuild {
149149
javacOptions in (Compile, doc) ++= {
150150
val Array(major, minor, _) = System.getProperty("java.version").split("\\.", 3)
151151
if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty
152-
}
152+
},
153+
154+
javacOptions in Compile ++= Seq("-encoding", "UTF-8")
153155
)
154156

155157
def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = {

‎sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
import org.apache.spark.sql.BaseMutableRow;
3131
import org.apache.spark.sql.types.DataType;
3232
import org.apache.spark.sql.types.StructType;
33-
import org.apache.spark.sql.types.UTF8String;
33+
import org.apache.spark.unsafe.types.UTF8String;
3434
import org.apache.spark.unsafe.PlatformDependent;
3535
import org.apache.spark.unsafe.bitset.BitSetMethods;
3636

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystTypeConverters.scala

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import scala.collection.mutable.HashMap
2828
import org.apache.spark.sql.catalyst.expressions._
2929
import org.apache.spark.sql.catalyst.util.DateUtils
3030
import org.apache.spark.sql.types._
31+
import org.apache.spark.unsafe.types.UTF8String
3132

3233
/**
3334
* Functions to convert Scala types to Catalyst types and vice versa.
@@ -257,7 +258,7 @@ object CatalystTypeConverters {
257258

258259
private object StringConverter extends CatalystTypeConverter[Any, String, Any] {
259260
override def toCatalystImpl(scalaValue: Any): UTF8String = scalaValue match {
260-
case str: String => UTF8String(str)
261+
case str: String => UTF8String.fromString(str)
261262
case utf8: UTF8String => utf8
262263
}
263264
override def toScala(catalystValue: Any): String = catalystValue match {

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.sql.catalyst
1919

20+
import org.apache.spark.unsafe.types.UTF8String
2021
import org.apache.spark.util.Utils
2122
import org.apache.spark.sql.catalyst.expressions._
2223
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala

+5-4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import org.apache.spark.Logging
2424
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
2525
import org.apache.spark.sql.catalyst.util.DateUtils
2626
import org.apache.spark.sql.types._
27+
import org.apache.spark.unsafe.types.UTF8String
2728

2829
/** Cast the child expression to the target data type. */
2930
case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with Logging {
@@ -111,11 +112,11 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
111112

112113
// UDFToString
113114
private[this] def castToString(from: DataType): Any => Any = from match {
114-
case BinaryType => buildCast[Array[Byte]](_, UTF8String(_))
115-
case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d)))
115+
case BinaryType => buildCast[Array[Byte]](_, UTF8String.fromBytes)
116+
case DateType => buildCast[Int](_, d => UTF8String.fromString(DateUtils.toString(d)))
116117
case TimestampType => buildCast[Long](_,
117-
t => UTF8String(timestampToString(DateUtils.toJavaTimestamp(t))))
118-
case _ => buildCast[Any](_, o => UTF8String(o.toString))
118+
t => UTF8String.fromString(timestampToString(DateUtils.toJavaTimestamp(t))))
119+
case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString))
119120
}
120121

121122
// BinaryConverter

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.spark.sql.catalyst.expressions
1919

2020
import org.apache.spark.sql.types._
21+
import org.apache.spark.unsafe.types.UTF8String
2122

2223
/**
2324
* A parent class for mutable container objects that are reused when the values are changed,
@@ -240,7 +241,8 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
240241
}
241242
}
242243

243-
override def setString(ordinal: Int, value: String): Unit = update(ordinal, UTF8String(value))
244+
override def setString(ordinal: Int, value: String): Unit =
245+
update(ordinal, UTF8String.fromString(value))
244246

245247
override def getString(ordinal: Int): String = apply(ordinal).toString
246248

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
2020
import org.apache.spark.sql.types._
2121
import org.apache.spark.unsafe.PlatformDependent
2222
import org.apache.spark.unsafe.array.ByteArrayMethods
23+
import org.apache.spark.unsafe.types.UTF8String
2324

2425
/**
2526
* Converts Rows into UnsafeRow format. This class is NOT thread-safe.

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import org.codehaus.janino.ClassBodyEvaluator
2626
import org.apache.spark.Logging
2727
import org.apache.spark.sql.catalyst.expressions._
2828
import org.apache.spark.sql.types._
29+
import org.apache.spark.unsafe.types.UTF8String
30+
2931

3032
// These classes are here to avoid issues with serialization and integration with quasiquotes.
3133
class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int]

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters
2323
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
2424
import org.apache.spark.sql.catalyst.util.DateUtils
2525
import org.apache.spark.sql.types._
26+
import org.apache.spark.unsafe.types.UTF8String
2627

2728
object Literal {
2829
def apply(v: Any): Literal = v match {
@@ -32,7 +33,7 @@ object Literal {
3233
case f: Float => Literal(f, FloatType)
3334
case b: Byte => Literal(b, ByteType)
3435
case s: Short => Literal(s, ShortType)
35-
case s: String => Literal(UTF8String(s), StringType)
36+
case s: String => Literal(UTF8String.fromString(s), StringType)
3637
case b: Boolean => Literal(b, BooleanType)
3738
case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
3839
case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala

+5-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
package org.apache.spark.sql.catalyst.expressions
1919

20-
import org.apache.spark.sql.types.{UTF8String, DataType, StructType, AtomicType}
20+
import org.apache.spark.sql.types.{DataType, StructType, AtomicType}
21+
import org.apache.spark.unsafe.types.UTF8String
2122

2223
/**
2324
* An extended interface to [[Row]] that allows the values for each column to be updated. Setting
@@ -197,7 +198,9 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow {
197198
override def setFloat(ordinal: Int, value: Float): Unit = { values(ordinal) = value }
198199
override def setInt(ordinal: Int, value: Int): Unit = { values(ordinal) = value }
199200
override def setLong(ordinal: Int, value: Long): Unit = { values(ordinal) = value }
200-
override def setString(ordinal: Int, value: String) { values(ordinal) = UTF8String(value)}
201+
override def setString(ordinal: Int, value: String) {
202+
values(ordinal) = UTF8String.fromString(value)
203+
}
201204
override def setNullAt(i: Int): Unit = { values(i) = null }
202205

203206
override def setShort(ordinal: Int, value: Short): Unit = { values(ordinal) = value }

‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import java.util.regex.Pattern
2222
import org.apache.spark.sql.catalyst.analysis.UnresolvedException
2323
import org.apache.spark.sql.catalyst.expressions.codegen._
2424
import org.apache.spark.sql.types._
25+
import org.apache.spark.unsafe.types.UTF8String
2526

2627
trait StringRegexExpression extends ExpectsInputTypes {
2728
self: BinaryExpression =>
@@ -277,7 +278,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
277278
ba.slice(st, end)
278279
case s: UTF8String =>
279280
val (st, end) = slicePos(start, length, () => s.length())
280-
s.slice(st, end)
281+
s.substring(st, end)
281282
}
282283
}
283284
}

‎sql/catalyst/src/main/scala/org/apache/spark/sql/types/StringType.scala

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.typeTag
2222

2323
import org.apache.spark.annotation.DeveloperApi
2424
import org.apache.spark.sql.catalyst.ScalaReflectionLock
25+
import org.apache.spark.unsafe.types.UTF8String
2526

2627
/**
2728
* :: DeveloperApi ::

‎sql/catalyst/src/main/scala/org/apache/spark/sql/types/UTF8String.scala

-221
This file was deleted.

‎sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite
2121
import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
2222
import org.apache.spark.sql.catalyst.dsl.expressions._
2323
import org.apache.spark.sql.types._
24+
import org.apache.spark.unsafe.types.UTF8String
2425

2526

2627
class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
There was a problem loading the remainder of the diff.

0 commit comments

Comments (0)
Failed to load comments.