Skip to content

Commit

Permalink
[SPARK-17299] TRIM/LTRIM/RTRIM should not strip characters other tha…
Browse files Browse the repository at this point in the history
…n spaces

## What changes were proposed in this pull request?
TRIM/LTRIM/RTRIM should not strip characters other than spaces. Previously, they trimmed every byte in the range 0x00 to 0x20 (space) rather than only the space character itself.

## How was this patch tested?
Updated the existing tests to match the new behavior and added a test case covering characters below 0x20.

Author: Sandeep Singh <[email protected]>

Closes apache#14924 from techaddict/SPARK-17299.
  • Loading branch information
techaddict authored and srowen committed Sep 6, 2016
1 parent 6c08dbf commit 7775d9f
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -465,9 +465,9 @@ public UTF8String trim() {
int s = 0;
int e = this.numBytes - 1;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
while (s < this.numBytes && getByte(s) == 0x20) s++;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
while (e >= 0 && getByte(e) == 0x20) e--;
if (s > e) {
// empty string
return EMPTY_UTF8;
Expand All @@ -479,7 +479,7 @@ public UTF8String trim() {
public UTF8String trimLeft() {
int s = 0;
// skip all of the space (0x20) in the left side
while (s < this.numBytes && getByte(s) <= 0x20 && getByte(s) >= 0x00) s++;
while (s < this.numBytes && getByte(s) == 0x20) s++;
if (s == this.numBytes) {
// empty string
return EMPTY_UTF8;
Expand All @@ -491,7 +491,7 @@ public UTF8String trimLeft() {
public UTF8String trimRight() {
int e = numBytes - 1;
// skip all of the space (0x20) in the right side
while (e >= 0 && getByte(e) <= 0x20 && getByte(e) >= 0x00) e--;
while (e >= 0 && getByte(e) == 0x20) e--;

if (e < 0) {
// empty string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,16 @@ public void trims() {
assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());

char[] charsLessThan0x20 = new char[10];
Arrays.fill(charsLessThan0x20, (char)(' ' - 1));
String stringStartingWithSpace =
new String(charsLessThan0x20) + "hello" + new String(charsLessThan0x20);
assertEquals(fromString(stringStartingWithSpace), fromString(stringStartingWithSpace).trim());
assertEquals(fromString(stringStartingWithSpace),
fromString(stringStartingWithSpace).trimLeft());
assertEquals(fromString(stringStartingWithSpace),
fromString(stringStartingWithSpace).trimRight());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
}
}

val whitespaceChar: Gen[Char] = Gen.choose(0x00, 0x20).map(_.toChar)
val whitespaceChar: Gen[Char] = Gen.const(0x20.toChar)
val whitespaceString: Gen[String] = Gen.listOf(whitespaceChar).map(_.mkString)
val randomString: Gen[String] = Arbitrary.arbString.arbitrary

Expand All @@ -107,15 +107,15 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
def lTrim(s: String): String = {
var st = 0
val array: Array[Char] = s.toCharArray
while ((st < s.length) && (array(st) <= ' ')) {
while ((st < s.length) && (array(st) == ' ')) {
st += 1
}
if (st > 0) s.substring(st, s.length) else s
}
def rTrim(s: String): String = {
var len = s.length
val array: Array[Char] = s.toCharArray
while ((len > 0) && (array(len - 1) <= ' ')) {
while ((len > 0) && (array(len - 1) == ' ')) {
len -= 1
}
if (len < s.length) s.substring(0, len) else s
Expand All @@ -127,7 +127,7 @@ class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenProperty
whitespaceString
) { (start: String, middle: String, end: String) =>
val s = start + middle + end
assert(toUTF8(s).trim() === toUTF8(s.trim()))
assert(toUTF8(s).trim() === toUTF8(rTrim(lTrim(s))))
assert(toUTF8(s).trimLeft() === toUTF8(lTrim(s)))
assert(toUTF8(s).trimRight() === toUTF8(rTrim(s)))
}
Expand Down

0 comments on commit 7775d9f

Please sign in to comment.