Skip to content

Commit

Permalink
Switch arrow type for string array to large string (pandas-dev#56220)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Dec 21, 2023
1 parent 5ef4a35 commit 2488e5e
Show file tree
Hide file tree
Showing 13 changed files with 118 additions and 27 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ Other enhancements
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`)


.. ---------------------------------------------------------------------------
.. _whatsnew_220.notable_bug_fixes:
Expand Down
21 changes: 16 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def _from_sequence_of_strings(
pa_type is None
or pa.types.is_binary(pa_type)
or pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
):
# pa_type is None: Let pa.array infer
# pa_type is string/binary: scalars already correct type
Expand Down Expand Up @@ -632,7 +633,9 @@ def __invert__(self) -> Self:
# This is a bit wise op for integer types
if pa.types.is_integer(self._pa_array.type):
return type(self)(pc.bit_wise_not(self._pa_array))
elif pa.types.is_string(self._pa_array.type):
elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
self._pa_array.type
):
# Raise TypeError instead of pa.ArrowNotImplementedError
raise TypeError("__invert__ is not supported for string dtypes")
else:
Expand Down Expand Up @@ -692,7 +695,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
pa_type = self._pa_array.type
other = self._box_pa(other)

if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
if (
pa.types.is_string(pa_type)
or pa.types.is_large_string(pa_type)
or pa.types.is_binary(pa_type)
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
if op is operator.add:
Expand All @@ -709,7 +716,9 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
result = pc.binary_repeat(binary, pa_integral)
return type(self)(result)
elif (
pa.types.is_string(other.type) or pa.types.is_binary(other.type)
pa.types.is_string(other.type)
or pa.types.is_binary(other.type)
or pa.types.is_large_string(other.type)
) and op in [operator.mul, roperator.rmul]:
binary = other
integral = self._pa_array
Expand Down Expand Up @@ -1467,7 +1476,7 @@ def _concat_same_type(cls, to_concat) -> Self:
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
if to_concat[0].dtype == "string":
# StringDtype has no attribute pyarrow_dtype
pa_dtype = pa.string()
pa_dtype = pa.large_string()
else:
pa_dtype = to_concat[0].dtype.pyarrow_dtype
arr = pa.chunked_array(chunks, type=pa_dtype)
Expand Down Expand Up @@ -2271,7 +2280,9 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
return type(self)(result)

def _str_join(self, sep: str):
if pa.types.is_string(self._pa_array.type):
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
self._pa_array.type
):
result = self._apply_elementwise(list)
result = pa.chunked_array(result, type=pa.list_(pa.string()))
else:
Expand Down
38 changes: 26 additions & 12 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,40 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
_storage = "pyarrow"

def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
values.type
):
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage)

if not pa.types.is_string(self._pa_array.type) and not (
if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
and pa.types.is_string(self._pa_array.type.value_type)
and pa.types.is_large_string(self._pa_array.type.value_type)
):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
)

@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
pa_scalar = super()._box_pa_scalar(value, pa_type)
if pa.types.is_string(pa_scalar.type) and pa_type is None:
pa_scalar = pc.cast(pa_scalar, pa.large_string())
return pa_scalar

@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
pa_array = super()._box_pa_array(value, pa_type)
if pa.types.is_string(pa_array.type) and pa_type is None:
pa_array = pc.cast(pa_array, pa.large_string())
return pa_array

def __len__(self) -> int:
"""
Length of this array.
Expand Down Expand Up @@ -574,15 +597,6 @@ def _rank(
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"

def __init__(self, values) -> None:
_chk_pyarrow_available()

if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
values.type
):
values = pc.cast(values, pa.string())
super().__init__(values)

@classmethod
def _result_converter(cls, values, na=None):
if not isna(na):
Expand Down
14 changes: 11 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,17 @@ def _convert_arrays_to_dataframe(
)
if dtype_backend == "pyarrow":
pa = import_optional_dependency("pyarrow")
arrays = [
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
]

result_arrays = []
for arr in arrays:
pa_array = pa.array(arr, from_pandas=True)
if arr.dtype == "string":
# TODO: Arrow still infers strings arrays as regular strings instead
# of large_string, which is what we preserver everywhere else for
# dtype_backend="pyarrow". We may want to reconsider this
pa_array = pa_array.cast(pa.string())
result_arrays.append(ArrowExtensionArray(pa_array))
arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
df.columns = columns
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,13 +487,15 @@ def test_fillna_args(dtype, arrow_string_storage):
def test_arrow_array(dtype):
# protocol added in 0.15.0
pa = pytest.importorskip("pyarrow")
import pyarrow.compute as pc

data = pd.array(["a", "b", "c"], dtype=dtype)
arr = pa.array(data)
expected = pa.array(list(data), type=pa.string(), from_pandas=True)
expected = pa.array(list(data), type=pa.large_string(), from_pandas=True)
if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
expected = pa.chunked_array(expected)

if dtype.storage == "python":
expected = pc.cast(expected, pa.string())
assert arr.equals(expected)


Expand All @@ -512,7 +514,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
Expand All @@ -539,7 +544,10 @@ def test_arrow_load_from_zero_chunks(
data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
if dtype.storage == "python":
assert table.field("a").type == "string"
else:
assert table.field("a").type == "large_string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
with pd.option_context("string_storage", string_storage2):
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
else:
msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)
Expand All @@ -76,17 +76,20 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
arr = pa.chunked_array(arr)

msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
"ArrowStringArray requires a PyArrow (chunked) array of large_string type"
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)


@pytest.mark.xfail(
reason="dict conversion does not seem to be implemented for large string in arrow"
)
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(chunked):
pa = pytest.importorskip("pyarrow")

arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8()))
arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
if chunked:
arr = pa.chunked_array(arr)

Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2054,6 +2054,13 @@ def test_read_json_dtype_backend(
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,12 @@ def test_dtype_backend(string_storage, dtype_backend):
if string_storage == "python":
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

arr = ArrowExtensionArray(pa.array(["a", "b"]))
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
else:
pa = pytest.importorskip("pyarrow")
arr = ArrowStringArray(pa.array(["a", "b"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,13 @@ def test_read_clipboard_dtype_backend(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow" and engine != "c":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
string_array = ArrowStringArray(pa.array(["x", "y"]))
string_array_na = ArrowStringArray(pa.array(["x", None]))
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,12 @@ def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
if string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -3647,6 +3647,13 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment]
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment]

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,13 @@ def test_read_xml_nullable_dtypes(
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["x", "y"]))
Expand Down

0 comments on commit 2488e5e

Please sign in to comment.