@@ -835,6 +835,190 @@ public UTF8String translate(Map<Character, Character> dict) {
835
835
return fromString (sb .toString ());
836
836
}
837
837
838
+ private int getDigit (byte b ) {
839
+ if (b >= '0' && b <= '9' ) {
840
+ return b - '0' ;
841
+ }
842
+ throw new NumberFormatException (toString ());
843
+ }
844
+
845
+ /**
846
+ * Parses this UTF8String to long.
847
+ *
848
+ * Note that, in this method we accumulate the result in negative format, and convert it to
849
+ * positive format at the end, if this string is not started with '-'. This is because min value
850
+ * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
851
+ * Integer.MIN_VALUE is '-2147483648'.
852
+ *
853
+ * This code is mostly copied from LazyLong.parseLong in Hive.
854
+ */
855
+ public long toLong () {
856
+ if (numBytes == 0 ) {
857
+ throw new NumberFormatException ("Empty string" );
858
+ }
859
+
860
+ byte b = getByte (0 );
861
+ final boolean negative = b == '-' ;
862
+ int offset = 0 ;
863
+ if (negative || b == '+' ) {
864
+ offset ++;
865
+ if (numBytes == 1 ) {
866
+ throw new NumberFormatException (toString ());
867
+ }
868
+ }
869
+
870
+ final byte separator = '.' ;
871
+ final int radix = 10 ;
872
+ final long stopValue = Long .MIN_VALUE / radix ;
873
+ long result = 0 ;
874
+
875
+ while (offset < numBytes ) {
876
+ b = getByte (offset );
877
+ offset ++;
878
+ if (b == separator ) {
879
+ // We allow decimals and will return a truncated integral in that case.
880
+ // Therefore we won't throw an exception here (checking the fractional
881
+ // part happens below.)
882
+ break ;
883
+ }
884
+
885
+ int digit = getDigit (b );
886
+ // We are going to process the new digit and accumulate the result. However, before doing
887
+ // this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
888
+ // result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
889
+ if (result < stopValue ) {
890
+ throw new NumberFormatException (toString ());
891
+ }
892
+
893
+ result = result * radix - digit ;
894
+ // Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we
895
+ // can just use `result > 0` to check overflow. If result overflows, we should stop and throw
896
+ // exception.
897
+ if (result > 0 ) {
898
+ throw new NumberFormatException (toString ());
899
+ }
900
+ }
901
+
902
+ // This is the case when we've encountered a decimal separator. The fractional
903
+ // part will not change the number, but we will verify that the fractional part
904
+ // is well formed.
905
+ while (offset < numBytes ) {
906
+ if (getDigit (getByte (offset )) == -1 ) {
907
+ throw new NumberFormatException (toString ());
908
+ }
909
+ offset ++;
910
+ }
911
+
912
+ if (!negative ) {
913
+ result = -result ;
914
+ if (result < 0 ) {
915
+ throw new NumberFormatException (toString ());
916
+ }
917
+ }
918
+
919
+ return result ;
920
+ }
921
+
922
+ /**
923
+ * Parses this UTF8String to int.
924
+ *
925
+ * Note that, in this method we accumulate the result in negative format, and convert it to
926
+ * positive format at the end, if this string is not started with '-'. This is because min value
927
+ * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
928
+ * Integer.MIN_VALUE is '-2147483648'.
929
+ *
930
+ * This code is mostly copied from LazyInt.parseInt in Hive.
931
+ *
932
+ * Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
933
+ * reasons, like Hive does.
934
+ */
935
+ public int toInt () {
936
+ if (numBytes == 0 ) {
937
+ throw new NumberFormatException ("Empty string" );
938
+ }
939
+
940
+ byte b = getByte (0 );
941
+ final boolean negative = b == '-' ;
942
+ int offset = 0 ;
943
+ if (negative || b == '+' ) {
944
+ offset ++;
945
+ if (numBytes == 1 ) {
946
+ throw new NumberFormatException (toString ());
947
+ }
948
+ }
949
+
950
+ final byte separator = '.' ;
951
+ final int radix = 10 ;
952
+ final int stopValue = Integer .MIN_VALUE / radix ;
953
+ int result = 0 ;
954
+
955
+ while (offset < numBytes ) {
956
+ b = getByte (offset );
957
+ offset ++;
958
+ if (b == separator ) {
959
+ // We allow decimals and will return a truncated integral in that case.
960
+ // Therefore we won't throw an exception here (checking the fractional
961
+ // part happens below.)
962
+ break ;
963
+ }
964
+
965
+ int digit = getDigit (b );
966
+ // We are going to process the new digit and accumulate the result. However, before doing
967
+ // this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
968
+ // result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
969
+ if (result < stopValue ) {
970
+ throw new NumberFormatException (toString ());
971
+ }
972
+
973
+ result = result * radix - digit ;
974
+ // Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix),
975
+ // we can just use `result > 0` to check overflow. If result overflows, we should stop and
976
+ // throw exception.
977
+ if (result > 0 ) {
978
+ throw new NumberFormatException (toString ());
979
+ }
980
+ }
981
+
982
+ // This is the case when we've encountered a decimal separator. The fractional
983
+ // part will not change the number, but we will verify that the fractional part
984
+ // is well formed.
985
+ while (offset < numBytes ) {
986
+ if (getDigit (getByte (offset )) == -1 ) {
987
+ throw new NumberFormatException (toString ());
988
+ }
989
+ offset ++;
990
+ }
991
+
992
+ if (!negative ) {
993
+ result = -result ;
994
+ if (result < 0 ) {
995
+ throw new NumberFormatException (toString ());
996
+ }
997
+ }
998
+
999
+ return result ;
1000
+ }
1001
+
1002
+ public short toShort () {
1003
+ int intValue = toInt ();
1004
+ short result = (short ) intValue ;
1005
+ if (result != intValue ) {
1006
+ throw new NumberFormatException (toString ());
1007
+ }
1008
+
1009
+ return result ;
1010
+ }
1011
+
1012
+ public byte toByte () {
1013
+ int intValue = toInt ();
1014
+ byte result = (byte ) intValue ;
1015
+ if (result != intValue ) {
1016
+ throw new NumberFormatException (toString ());
1017
+ }
1018
+
1019
+ return result ;
1020
+ }
1021
+
838
1022
@ Override
839
1023
public String toString () {
840
1024
return new String (getBytes (), StandardCharsets .UTF_8 );
0 commit comments