From 2f0272c4ea861dac53e47894ba80f3cf2ffee720 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Nov 2024 23:23:11 +0100 Subject: [PATCH 1/3] ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet IO --- pandas/io/_util.py | 9 +++++++-- pandas/tests/io/test_parquet.py | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index a1c3318f04466..9a8c87a738d4c 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -35,7 +36,11 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { + mapping = { pa.string(): pd.StringDtype(na_value=np.nan), pa.large_string(): pd.StringDtype(na_value=np.nan), - }.get + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6ef7105cf5ccc..d6a832843ec12 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,6 +17,7 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, + pa_version_under18p0, ) import pandas as pd @@ -1144,6 +1145,26 @@ def test_infer_string_large_string_type(self, tmp_path, pa): ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") + def test_infer_string_string_view_type(self, tmp_path, pa): + # GH#54798 + import pyarrow as pa + import pyarrow.parquet as pq + + path = tmp_path / "string_view.parquet" + + table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) + pq.write_table(table, path) + + with pd.option_context("future.infer_string", True): + result = read_parquet(path) + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), + ) + tm.assert_frame_equal(result, expected) + # NOTE: this test is not run by default, because it requires a lot of memory (>5GB) # @pytest.mark.slow # def test_string_column_above_2GB(self, tmp_path, pa): From c444bfa5ac7da65c1a2937eaad13bcf75d3058ea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 13:59:12 +0100 Subject: [PATCH 2/3] move test to feather --- pandas/tests/io/test_feather.py | 19 +++++++++++++++++++ pandas/tests/io/test_parquet.py | 21 --------------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 8ae2033faab4f..fee2aac344656 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under18p0 + import pandas as pd import pandas._testing as tm @@ -249,6 +251,23 @@ def test_string_inference(self, tmp_path): ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") + def test_string_inference_string_view_type(self, tmp_path): + # GH#54798 + import pyarrow as pa + from pyarrow import feather + + path = tmp_path / "string_view.parquet" + table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) + feather.write_feather(table, path) + + with pd.option_context("future.infer_string", True): + result = read_feather(path) + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) + def test_out_of_bounds_datetime_to_feather(self): # GH#47832 df = pd.DataFrame( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d6a832843ec12..6ef7105cf5ccc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,7 +17,6 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, - pa_version_under18p0, ) import pandas as pd @@ -1145,26 +1144,6 @@ def test_infer_string_large_string_type(self, tmp_path, pa): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") - def test_infer_string_string_view_type(self, tmp_path, pa): - # GH#54798 - import pyarrow as pa - import pyarrow.parquet as pq - - path = tmp_path / "string_view.parquet" - - table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) - pq.write_table(table, path) - - with pd.option_context("future.infer_string", True): - result = read_parquet(path) - expected = pd.DataFrame( - data={"a": [None, "b", "c"]}, - dtype=pd.StringDtype(na_value=np.nan), - columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), - ) - tm.assert_frame_equal(result, expected) - # NOTE: this test is not run by default, because it requires a lot of memory (>5GB) # @pytest.mark.slow # def test_string_column_above_2GB(self, tmp_path, pa): From a566e687f881dc2f5549fdfb0486a3ac4b5bf619 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 9 Nov 2024 17:48:53 +0100 Subject: [PATCH 3/3] fixup --- pandas/tests/io/test_feather.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index fee2aac344656..69354066dd5ef 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -263,9 +263,10 @@ def test_string_inference_string_view_type(self, tmp_path): with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame( - data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) - ) + + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) def test_out_of_bounds_datetime_to_feather(self):