diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 59512ddc91a8a..d908a68a8be4a 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -140,9 +140,6 @@ jobs:
       moto:
         image: motoserver/moto:5.0.27
-        env:
-          AWS_ACCESS_KEY_ID: foobar_key
-          AWS_SECRET_ACCESS_KEY: foobar_secret
         ports:
           - 5000:5000
diff --git a/pandas/conftest.py b/pandas/conftest.py
index f9c10a7758bd2..f81dc0af234e3 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -2116,3 +2116,9 @@ def temp_file(tmp_path):
     file_path = tmp_path / str(uuid.uuid4())
     file_path.touch()
     return file_path
+
+
+@pytest.fixture(scope="session")
+def monkeysession():
+    with pytest.MonkeyPatch.context() as mp:
+        yield mp
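
As a rough sketch (not part of the patch), this is how a session-scoped MonkeyPatch fixture like monkeysession is typically consumed; the dependent fixture and test names here are illustrative only:

    # Hypothetical usage, assuming pytest >= 6.2 for pytest.MonkeyPatch
    import os
    import pytest

    @pytest.fixture(scope="session")
    def fake_region(monkeysession):
        # setenv is undone automatically when the session-scoped context exits
        monkeysession.setenv("AWS_DEFAULT_REGION", "us-east-1")
        return "us-east-1"

    def test_region_is_visible(fake_region):
        assert os.environ["AWS_DEFAULT_REGION"] == fake_region
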
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index a5ddda9d66e7a..5ce44e87570a7 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -1,6 +1,3 @@
-import shlex
-import subprocess
-import time
 import uuid
 
 import pytest
@@ -50,93 +47,77 @@ def xml_file(datapath):
     return datapath("io", "data", "xml", "books.xml")
 
 
-@pytest.fixture
-def s3_base(worker_id, monkeypatch):
-    """
-    Fixture for mocking S3 interaction.
+@pytest.fixture(scope="session")
+def aws_credentials(monkeysession):
+    """Mocked AWS Credentials for moto."""
+    monkeysession.setenv("AWS_ACCESS_KEY_ID", "testing")
+    monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "testing")
+    monkeysession.setenv("AWS_SECURITY_TOKEN", "testing")
+    monkeysession.setenv("AWS_SESSION_TOKEN", "testing")
+    monkeysession.setenv("AWS_DEFAULT_REGION", "us-east-1")
 
-    Sets up moto server in separate process locally
-    Return url for motoserver/moto CI service
-    """
-    pytest.importorskip("s3fs")
-    pytest.importorskip("boto3")
-
-    # temporary workaround as moto fails for botocore >= 1.11 otherwise,
-    # see https://github.com/spulec/moto/issues/1924 & 1952
-    monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
-    monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
-    if is_ci_environment():
-        if is_platform_arm() or is_platform_mac() or is_platform_windows():
-            # NOT RUN on Windows/macOS, only Ubuntu
-            # - subprocess in CI can cause timeouts
-            # - GitHub Actions do not support
-            #   container services for the above OSs
-            pytest.skip(
-                "S3 tests do not have a corresponding service on "
-                "Windows or macOS platforms"
-            )
-        else:
-            # set in .github/workflows/unit-tests.yml
-            yield "http://localhost:5000"
+
+@pytest.fixture(scope="session")
+def moto_server(aws_credentials):
+    # use service container for Linux on GitHub Actions
+    if is_ci_environment() and not (
+        is_platform_mac() or is_platform_arm() or is_platform_windows()
+    ):
+        yield "http://localhost:5000"
     else:
-        requests = pytest.importorskip("requests")
-        pytest.importorskip("moto")
-        pytest.importorskip("flask")  # server mode needs flask too
-
-        # Launching moto in server mode, i.e., as a separate process
-        # with an S3 endpoint on localhost
-
-        worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
-        endpoint_port = f"555{worker_id}"
-        endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"
-
-        # pipe to null to avoid logging in terminal
-        with subprocess.Popen(
-            shlex.split(f"moto_server s3 -p {endpoint_port}"),
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        ) as proc:
-            timeout = 5
-            while timeout > 0:
-                try:
-                    # OK to go once server is accepting connections
-                    r = requests.get(endpoint_uri)
-                    if r.ok:
-                        break
-                except Exception:
-                    pass
-                timeout -= 0.1
-                time.sleep(0.1)
-            yield endpoint_uri
-
-            proc.terminate()
+        moto_server = pytest.importorskip("moto.server")
+        server = moto_server.ThreadedMotoServer(port=0)
+        server.start()
+        host, port = server.get_host_and_port()
+        yield f"http://{host}:{port}"
+        server.stop()
 
 
 @pytest.fixture
-def s3so(s3_base):
-    return {"client_kwargs": {"endpoint_url": s3_base}}
+def moto_s3_resource(moto_server):
+    boto3 = pytest.importorskip("boto3")
+    s3 = boto3.resource("s3", endpoint_url=moto_server)
+    return s3
 
 
-@pytest.fixture
-def s3_resource(s3_base):
-    import boto3
+@pytest.fixture(scope="session")
+def s3so(moto_server):
+    return {
+        "client_kwargs": {
+            "endpoint_url": moto_server,
+        }
+    }
 
-    s3 = boto3.resource("s3", endpoint_url=s3_base)
-    return s3
+
+@pytest.fixture
+def s3_bucket_public(moto_s3_resource):
+    """
+    Create a public S3 bucket using moto.
+    """
+    bucket_name = f"pandas-test-{uuid.uuid4()}"
+    bucket = moto_s3_resource.Bucket(bucket_name)
+    bucket.create(ACL="public-read")
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
 
 
 @pytest.fixture
-def s3_public_bucket(s3_resource):
-    bucket = s3_resource.Bucket(f"pandas-test-{uuid.uuid4()}")
-    bucket.create()
+def s3_bucket_private(moto_s3_resource):
+    """
+    Create a private S3 bucket using moto.
+    """
+    bucket_name = f"cant_get_it-{uuid.uuid4()}"
+    bucket = moto_s3_resource.Bucket(bucket_name)
+    bucket.create(ACL="private")
     yield bucket
     bucket.objects.delete()
     bucket.delete()
 
 
 @pytest.fixture
-def s3_public_bucket_with_data(
-    s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file
+def s3_bucket_public_with_data(
+    s3_bucket_public, tips_file, jsonl_file, feather_file, xml_file
 ):
     """
     The following datasets
@@ -158,22 +139,13 @@ def s3_public_bucket_with_data(
     ]
     for s3_key, file_name in test_s3_files:
         with open(file_name, "rb") as f:
-            s3_public_bucket.put_object(Key=s3_key, Body=f)
-    return s3_public_bucket
-
-
-@pytest.fixture
-def s3_private_bucket(s3_resource):
-    bucket = s3_resource.Bucket(f"cant_get_it-{uuid.uuid4()}")
-    bucket.create(ACL="private")
-    yield bucket
-    bucket.objects.delete()
-    bucket.delete()
+            s3_bucket_public.put_object(Key=s3_key, Body=f)
+    return s3_bucket_public
 
 
 @pytest.fixture
-def s3_private_bucket_with_data(
-    s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file
+def s3_bucket_private_with_data(
+    s3_bucket_private, tips_file, jsonl_file, feather_file, xml_file
 ):
     """
     The following datasets
@@ -195,8 +167,8 @@ def s3_private_bucket_with_data(
     ]
     for s3_key, file_name in test_s3_files:
         with open(file_name, "rb") as f:
-            s3_private_bucket.put_object(Key=s3_key, Body=f)
-    return s3_private_bucket
+            s3_bucket_private.put_object(Key=s3_key, Body=f)
+    return s3_bucket_private
 
 
 _compression_formats_params = [
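
For orientation, a minimal standalone sketch of the server lifecycle the new fixtures wrap, assuming moto, boto3, and dummy credentials in the environment; the bucket and key names are illustrative:

    import os
    import boto3
    from moto.server import ThreadedMotoServer

    # boto3 still wants credentials for request signing, so set dummies
    # (this is what the aws_credentials fixture does via monkeysession)
    os.environ.setdefault("AWS_ACCESS_KEY_ID", "testing")
    os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "testing")
    os.environ.setdefault("AWS_DEFAULT_REGION", "us-east-1")

    server = ThreadedMotoServer(port=0)  # port=0 lets the OS pick a free port
    server.start()
    host, port = server.get_host_and_port()

    s3 = boto3.resource("s3", endpoint_url=f"http://{host}:{port}")
    bucket = s3.Bucket("example-bucket")  # illustrative name
    bucket.create(ACL="public-read")
    bucket.put_object(Key="hello.csv", Body=b"a,b\n1,2\n")

    bucket.objects.delete()
    bucket.delete()
    server.stop()
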
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 140cf39b26556..71fb8f490e114 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -934,29 +934,27 @@ def test_read_from_http_url(self, httpserver, read_ext):
 
     @td.skip_if_not_us_locale
     @pytest.mark.single_cpu
-    def test_read_from_s3_url(self, read_ext, s3_public_bucket, s3so):
-        # Bucket created in tests/io/conftest.py
+    def test_read_from_s3_url(self, read_ext, s3_bucket_public, s3so):
         with open("test1" + read_ext, "rb") as f:
-            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
+            s3_bucket_public.put_object(Key="test1" + read_ext, Body=f)
 
-        url = f"s3://{s3_public_bucket.name}/test1" + read_ext
+        url = f"s3://{s3_bucket_public.name}/test1" + read_ext
 
         url_table = pd.read_excel(url, storage_options=s3so)
         local_table = pd.read_excel("test1" + read_ext)
         tm.assert_frame_equal(url_table, local_table)
 
     @pytest.mark.single_cpu
-    def test_read_from_s3_object(self, read_ext, s3_public_bucket, s3so):
+    def test_read_from_s3_object(self, read_ext, s3_bucket_public, s3so):
         # GH 38788
-        # Bucket created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
-            s3_public_bucket.put_object(Key="test1" + read_ext, Body=f)
+            s3_bucket_public.put_object(Key="test1" + read_ext, Body=f)
 
         import s3fs
 
         s3 = s3fs.S3FileSystem(**s3so)
-        with s3.open(f"s3://{s3_public_bucket.name}/test1" + read_ext) as f:
+        with s3.open(f"s3://{s3_bucket_public.name}/test1" + read_ext) as f:
             url_table = pd.read_excel(f)
 
         local_table = pd.read_excel("test1" + read_ext)
 
diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py
index 0e13b2f94ed58..8c70e5eed3c97 100644
--- a/pandas/tests/io/excel/test_style.py
+++ b/pandas/tests/io/excel/test_style.py
@@ -318,16 +318,16 @@ def custom_converter(css):
 
 @pytest.mark.single_cpu
 @td.skip_if_not_us_locale
-def test_styler_to_s3(s3_public_bucket, s3so):
+def test_styler_to_s3(s3_bucket_public, s3so):
     # GH#46381
-
-    mock_bucket_name, target_file = s3_public_bucket.name, "test.xlsx"
+    mock_bucket_name = s3_bucket_public.name
+    target_file = f"{uuid.uuid4()}.xlsx"
     df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
     styler = df.style.set_sticky(axis="index")
     styler.to_excel(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
     timeout = 5
     while True:
-        if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
+        if target_file in (obj.key for obj in s3_bucket_public.objects.all()):
             break
         time.sleep(0.1)
         timeout -= 0.1
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 953a9246da1cd..f43d03b5a8354 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -2,6 +2,7 @@
     BytesIO,
     StringIO,
 )
+import uuid
 
 import pytest
 
@@ -42,17 +43,18 @@ def test_read_zipped_json(datapath):
 @td.skip_if_not_us_locale
 @pytest.mark.single_cpu
 @pytest.mark.network
-def test_with_s3_url(compression, s3_public_bucket, s3so):
+def test_with_s3_url(compression, s3_bucket_public, s3so):
     # Bucket created in tests/io/conftest.py
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
 
+    key = f"{uuid.uuid4()}.json"
     with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
         with open(path, "rb") as f:
-            s3_public_bucket.put_object(Key="test-1", Body=f)
+            s3_bucket_public.put_object(Key=key, Body=f)
 
     roundtripped_df = pd.read_json(
-        f"s3://{s3_public_bucket.name}/test-1",
+        f"s3://{s3_bucket_public.name}/{key}",
         compression=compression,
         storage_options=s3so,
     )
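
The uuid-based keys above keep tests from colliding now that several of them share one session-scoped bucket. A rough sketch of the same round trip outside pytest, with an assumed endpoint and bucket name:

    import uuid
    import pandas as pd

    s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000"}}  # assumed moto endpoint
    key = f"{uuid.uuid4()}.json"  # unique key per run

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df.to_json(f"s3://example-bucket/{key}", storage_options=s3so)
    roundtripped = pd.read_json(f"s3://example-bucket/{key}", storage_options=s3so)
    pd.testing.assert_frame_equal(df, roundtripped)
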
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 32eeb30de4b69..98f437e757e31 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -8,6 +8,7 @@
 import os
 import sys
 import time
+import uuid
 
 import numpy as np
 import pytest
@@ -1411,11 +1412,10 @@ def test_read_inline_jsonl(self):
     @pytest.mark.single_cpu
     @pytest.mark.network
     @td.skip_if_not_us_locale
-    def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
+    def test_read_s3_jsonl(self, s3_bucket_public_with_data, s3so):
         # GH17200
-
         result = read_json(
-            f"s3n://{s3_public_bucket_with_data.name}/items.jsonl",
+            f"s3n://{s3_bucket_public_with_data.name}/items.jsonl",
             lines=True,
             storage_options=s3so,
         )
@@ -2011,14 +2011,15 @@ def test_json_multiindex(self):
 
     @pytest.mark.single_cpu
     @pytest.mark.network
-    def test_to_s3(self, s3_public_bucket, s3so):
+    def test_to_s3(self, s3_bucket_public, s3so):
         # GH 28375
-        mock_bucket_name, target_file = s3_public_bucket.name, "test.json"
+        mock_bucket_name = s3_bucket_public.name
+        target_file = f"{uuid.uuid4()}.json"
         df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
         df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
         timeout = 5
         while True:
-            if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
+            if target_file in (obj.key for obj in s3_bucket_public.objects.all()):
                 break
             time.sleep(0.1)
             timeout -= 0.1
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 836ac71d8e865..d6bdff5be9b4e 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -76,211 +76,126 @@ def tips_df(datapath):
 
 @pytest.mark.single_cpu
 @pytest.mark.network
-@pytest.mark.usefixtures("s3_resource")
 @td.skip_if_not_us_locale()
 class TestS3:
-    def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
+    @pytest.mark.parametrize(
+        "suffix, compression",
+        [
+            ("", None),
+            (".gz", "gzip"),
+            (".bz2", "bz2"),
+        ],
+    )
+    @pytest.mark.parametrize("nrows", [None, 10])
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_parse_public_s3_bucket(
+        self,
+        s3_bucket_public_with_data,
+        s3so,
+        tips_df,
+        suffix,
+        compression,
+        nrows,
+        engine,
+    ):
         # more of an integration test due to the not-public contents portion
         # can probably mock this though.
         pytest.importorskip("s3fs")
-        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            df = read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                compression=comp,
-                storage_options=s3so,
-            )
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(df, tips_df)
+        df = read_csv(
+            f"s3://{s3_bucket_public_with_data.name}/tips.csv{suffix}",
+            nrows=nrows,
+            compression=compression,
+            storage_options=s3so,
+            engine=engine,
+        )
+        tm.assert_frame_equal(df, tips_df.iloc[:nrows])
 
-    def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
+    def test_parse_private_s3_bucket(self, s3_bucket_private_with_data, s3so, tips_df):
         # Read public file from bucket with not-public contents
         pytest.importorskip("s3fs")
         df = read_csv(
-            f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so
+            f"s3://{s3_bucket_private_with_data.name}/tips.csv", storage_options=s3so
         )
-        assert isinstance(df, DataFrame)
-        assert not df.empty
         tm.assert_frame_equal(df, tips_df)
 
-    def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
-        # Read from AWS s3 as "s3n" URL
-        df = read_csv(
-            f"s3n://{s3_public_bucket_with_data.name}/tips.csv",
-            nrows=10,
-            storage_options=s3so,
-        )
-        assert isinstance(df, DataFrame)
-        assert not df.empty
-        tm.assert_frame_equal(tips_df.iloc[:10], df)
-
-    def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
-        # Read from AWS s3 as "s3a" URL
+    @pytest.mark.parametrize("scheme", ["s3n", "s3a"])
+    def test_parse_public_bucket_s3n_s3a(
+        self, s3_bucket_public_with_data, s3so, tips_df, scheme
+    ):
+        nrows = 10
         df = read_csv(
-            f"s3a://{s3_public_bucket_with_data.name}/tips.csv",
-            nrows=10,
+            f"{scheme}://{s3_bucket_public_with_data.name}/tips.csv",
+            nrows=nrows,
             storage_options=s3so,
         )
-        assert isinstance(df, DataFrame)
-        assert not df.empty
-        tm.assert_frame_equal(tips_df.iloc[:10], df)
-
-    def test_parse_public_s3_bucket_nrows(
-        self, s3_public_bucket_with_data, tips_df, s3so
-    ):
-        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            df = read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                nrows=10,
-                compression=comp,
-                storage_options=s3so,
-            )
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(tips_df.iloc[:10], df)
-
+        tm.assert_frame_equal(df, tips_df.iloc[:nrows])
+
+    @pytest.mark.parametrize(
+        "suffix, compression",
+        [
+            ("", None),
+            (".gz", "gzip"),
+            (".bz2", "bz2"),
+        ],
+    )
+    @pytest.mark.parametrize("engine", ["c", "python"])
     def test_parse_public_s3_bucket_chunked(
-        self, s3_public_bucket_with_data, tips_df, s3so
+        self, s3_bucket_public_with_data, s3so, tips_df, suffix, compression, engine
     ):
         # Read with a chunksize
         chunksize = 5
-        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            with read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                chunksize=chunksize,
-                compression=comp,
-                storage_options=s3so,
-            ) as df_reader:
-                assert df_reader.chunksize == chunksize
-                for i_chunk in [0, 1, 2]:
-                    # Read a couple of chunks and make sure we see them
-                    # properly.
-                    df = df_reader.get_chunk()
-                    assert isinstance(df, DataFrame)
-                    assert not df.empty
-                    true_df = tips_df.iloc[
-                        chunksize * i_chunk : chunksize * (i_chunk + 1)
-                    ]
-                    tm.assert_frame_equal(true_df, df)
-
-    def test_parse_public_s3_bucket_chunked_python(
-        self, s3_public_bucket_with_data, tips_df, s3so
-    ):
-        # Read with a chunksize using the Python parser
-        chunksize = 5
-        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            with read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                chunksize=chunksize,
-                compression=comp,
-                engine="python",
-                storage_options=s3so,
-            ) as df_reader:
-                assert df_reader.chunksize == chunksize
-                for i_chunk in [0, 1, 2]:
-                    # Read a couple of chunks and make sure we see them properly.
-                    df = df_reader.get_chunk()
-                    assert isinstance(df, DataFrame)
-                    assert not df.empty
-                    true_df = tips_df.iloc[
-                        chunksize * i_chunk : chunksize * (i_chunk + 1)
-                    ]
-                    tm.assert_frame_equal(true_df, df)
-
-    def test_parse_public_s3_bucket_python(
-        self, s3_public_bucket_with_data, tips_df, s3so
-    ):
-        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            df = read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                engine="python",
-                compression=comp,
-                storage_options=s3so,
-            )
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(df, tips_df)
-
-    def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
-        for ext in ["", ".gz", ".bz2"]:
-            df = read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                engine="python",
-                compression="infer",
-                storage_options=s3so,
-            )
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(df, tips_df)
-
-    def test_parse_public_s3_bucket_nrows_python(
-        self, s3_public_bucket_with_data, tips_df, s3so
+        with read_csv(
+            f"s3://{s3_bucket_public_with_data.name}/tips.csv{suffix}",
+            chunksize=chunksize,
+            compression=compression,
+            storage_options=s3so,
+            engine=engine,
+        ) as df_reader:
+            assert df_reader.chunksize == chunksize
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them
+                # properly.
+                df = df_reader.get_chunk()
+                assert isinstance(df, DataFrame)
+                assert not df.empty
+                true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
+
+    @pytest.mark.parametrize("suffix", ["", ".gz", ".bz2"])
+    def test_infer_s3_compression(
+        self, s3_bucket_public_with_data, s3so, tips_df, suffix
     ):
-        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            df = read_csv(
-                f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext,
-                engine="python",
-                nrows=10,
-                compression=comp,
-                storage_options=s3so,
-            )
-            assert isinstance(df, DataFrame)
-            assert not df.empty
-            tm.assert_frame_equal(tips_df.iloc[:10], df)
+        df = read_csv(
+            f"s3://{s3_bucket_public_with_data.name}/tips.csv{suffix}",
+            engine="python",
+            compression="infer",
+            storage_options=s3so,
+        )
+        tm.assert_frame_equal(df, tips_df)
 
-    def test_read_s3_fails(self, s3so):
+    def test_read_s3_fails(self, s3_bucket_public_with_data, s3so):
         msg = "The specified bucket does not exist"
         with pytest.raises(OSError, match=msg):
             read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)
 
-    def test_read_s3_fails_private(self, s3_private_bucket, s3so):
-        msg = "The specified bucket does not exist"
+    def test_read_s3_fails_private(self, s3_bucket_private_with_data, s3so):
+        s3_url = f"{s3_bucket_private_with_data.name}/file.csv"
+        msg = rf"{s3_url}"
         # Receive a permission error when trying to read a private bucket.
         # It's irrelevant here that this isn't actually a table.
-        with pytest.raises(OSError, match=msg):
-            read_csv(f"s3://{s3_private_bucket.name}/file.csv")
-
-    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
-    def test_write_s3_csv_fails(self, tips_df, s3so):
-        # GH 32486
-        # Attempting to write to an invalid S3 path should raise
-        import botocore
-
-        # GH 34087
-        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
-        # Catch a ClientError since AWS Service Errors are defined dynamically
-        error = (FileNotFoundError, botocore.exceptions.ClientError)
-
-        with pytest.raises(error, match="The specified bucket does not exist"):
-            tips_df.to_csv(
-                "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
-            )
-
-    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
-    def test_write_s3_parquet_fails(self, tips_df, s3so):
-        # GH 27679
-        # Attempting to write to an invalid S3 path should raise
-        pytest.importorskip("pyarrow")
-        import botocore
-
-        # GH 34087
-        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
-        # Catch a ClientError since AWS Service Errors are defined dynamically
-        error = (FileNotFoundError, botocore.exceptions.ClientError)
-
-        with pytest.raises(error, match="The specified bucket does not exist"):
-            tips_df.to_parquet(
-                "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
+        with pytest.raises(FileNotFoundError, match=msg):
+            read_csv(
+                f"s3://{s3_url}",
                 storage_options=s3so,
             )
 
     @pytest.mark.single_cpu
     def test_read_csv_handles_boto_s3_object(
-        self, s3_public_bucket_with_data, tips_file
+        self, s3_bucket_public_with_data, tips_file
     ):
         # see gh-16135
 
-        s3_object = s3_public_bucket_with_data.Object("tips.csv")
+        s3_object = s3_bucket_public_with_data.Object("tips.csv")
 
         with BytesIO(s3_object.get()["Body"].read()) as buffer:
             result = read_csv(buffer, encoding="utf8")
@@ -291,12 +206,12 @@ def test_read_csv_handles_boto_s3_object(
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.single_cpu
-    def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
+    def test_read_csv_chunked_download(self, s3_bucket_public, s3so, caplog):
         # 8 MB, S3FS uses 5MB chunks
         df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
         with BytesIO(df.to_csv().encode("utf-8")) as buf:
-            s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
-            uri = f"{s3_public_bucket.name}/large-file.csv"
+            s3_bucket_public.put_object(Key="large-file.csv", Body=buf)
+            uri = f"{s3_bucket_public.name}/large-file.csv"
             match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
             with caplog.at_level(logging.DEBUG, logger="s3fs"):
                 read_csv(
@@ -309,21 +224,21 @@ def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
                 # Less than 8 MB
                 assert int(match.group("stop")) < 8000000
 
-    def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
+    def test_read_s3_with_hash_in_key(self, s3_bucket_public_with_data, s3so, tips_df):
         # GH 25945
         result = read_csv(
-            f"s3://{s3_public_bucket_with_data.name}/tips#1.csv", storage_options=s3so
+            f"s3://{s3_bucket_public_with_data.name}/tips#1.csv", storage_options=s3so
         )
         tm.assert_frame_equal(tips_df, result)
 
     def test_read_feather_s3_file_path(
-        self, s3_public_bucket_with_data, feather_file, s3so
+        self, s3_bucket_public_with_data, s3so, feather_file
     ):
         # GH 29055
         pytest.importorskip("pyarrow")
         expected = read_feather(feather_file)
         res = read_feather(
-            f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather",
+            f"s3://{s3_bucket_public_with_data.name}/simple_dataset.feather",
             storage_options=s3so,
         )
         tm.assert_frame_equal(expected, res)
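
A short sketch of the chunked read that the parametrized test above exercises, assuming the mocked endpoint and a bucket already populated with tips.csv:

    import pandas as pd

    s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000"}}  # assumed
    with pd.read_csv(
        "s3://example-bucket/tips.csv",  # illustrative bucket/key
        chunksize=5,
        storage_options=s3so,
    ) as reader:
        for chunk in reader:
            print(chunk.shape)
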
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 2e3e74a9d31ff..5f3c32fc9854d 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -230,49 +230,36 @@ def test_fastparquet_options(fsspectest):
 
 
 @pytest.mark.single_cpu
-def test_from_s3_csv(s3_public_bucket_with_data, tips_file, s3so):
+@pytest.mark.parametrize("compression_suffix", ["", ".gz", ".bz2"])
+def test_from_s3_csv(s3_bucket_public_with_data, s3so, tips_file, compression_suffix):
     pytest.importorskip("s3fs")
-    tm.assert_equal(
-        read_csv(
-            f"s3://{s3_public_bucket_with_data.name}/tips.csv", storage_options=s3so
-        ),
-        read_csv(tips_file),
-    )
-    # the following are decompressed by pandas, not fsspec
-    tm.assert_equal(
-        read_csv(
-            f"s3://{s3_public_bucket_with_data.name}/tips.csv.gz", storage_options=s3so
-        ),
-        read_csv(tips_file),
-    )
-    tm.assert_equal(
-        read_csv(
-            f"s3://{s3_public_bucket_with_data.name}/tips.csv.bz2", storage_options=s3so
-        ),
-        read_csv(tips_file),
+    df_from_s3 = read_csv(
+        f"s3://{s3_bucket_public_with_data.name}/tips.csv{compression_suffix}",
+        storage_options=s3so,
     )
+    df_from_local = read_csv(tips_file)
+    tm.assert_equal(df_from_s3, df_from_local)
 
 
 @pytest.mark.single_cpu
 @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"])
-def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
+def test_s3_protocols(s3_bucket_public_with_data, s3so, tips_file, protocol):
     pytest.importorskip("s3fs")
-    tm.assert_equal(
-        read_csv(
-            f"{protocol}://{s3_public_bucket_with_data.name}/tips.csv",
-            storage_options=s3so,
-        ),
-        read_csv(tips_file),
+    df_from_s3 = read_csv(
+        f"{protocol}://{s3_bucket_public_with_data.name}/tips.csv",
+        storage_options=s3so,
     )
+    df_from_local = read_csv(tips_file)
+    tm.assert_equal(df_from_s3, df_from_local)
 
 
 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 @pytest.mark.single_cpu
-def test_s3_parquet(s3_public_bucket, s3so, df1):
+def test_s3_parquet(s3_bucket_public, s3so, df1):
     pytest.importorskip("fastparquet")
     pytest.importorskip("s3fs")
 
-    fn = f"s3://{s3_public_bucket.name}/test.parquet"
+    fn = f"s3://{s3_bucket_public.name}/test.parquet"
     df1.to_parquet(
         fn, index=False, engine="fastparquet", compression=None, storage_options=s3so
     )
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 78f39b649cb9a..5221725f700b4 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -809,26 +809,26 @@ def test_categorical(self, pa):
         check_round_trip(df, pa)
 
     @pytest.mark.single_cpu
-    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_public_bucket, pa, s3so):
+    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_bucket_public, s3so, pa):
         s3fs = pytest.importorskip("s3fs")
         s3 = s3fs.S3FileSystem(**s3so)
         kw = {"filesystem": s3}
         check_round_trip(
             df_compat,
             pa,
-            path=f"{s3_public_bucket.name}/pyarrow.parquet",
+            path=f"{s3_bucket_public.name}/pyarrow.parquet",
             read_kwargs=kw,
             write_kwargs=kw,
         )
 
     @pytest.mark.single_cpu
-    def test_s3_roundtrip(self, df_compat, s3_public_bucket, pa, s3so):
+    def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, pa):
         # GH #19134
         s3so = {"storage_options": s3so}
         check_round_trip(
             df_compat,
             pa,
-            path=f"s3://{s3_public_bucket.name}/pyarrow.parquet",
+            path=f"s3://{s3_bucket_public.name}/pyarrow.parquet",
             read_kwargs=s3so,
             write_kwargs=s3so,
         )
@@ -836,7 +836,7 @@ def test_s3_roundtrip(self, df_compat, s3_public_bucket, pa, s3so):
     @pytest.mark.single_cpu
     @pytest.mark.parametrize("partition_col", [["A"], []])
     def test_s3_roundtrip_for_dir(
-        self, df_compat, s3_public_bucket, pa, partition_col, s3so
+        self, df_compat, s3_bucket_public, pa, partition_col, s3so
     ):
         pytest.importorskip("s3fs")
         # GH #26388
@@ -855,7 +855,7 @@ def test_s3_roundtrip_for_dir(
             df_compat,
             pa,
             expected=expected_df,
-            path=f"s3://{s3_public_bucket.name}/parquet_dir",
+            path=f"s3://{s3_bucket_public.name}/parquet_dir",
             read_kwargs={"storage_options": s3so},
             write_kwargs={
                 "partition_cols": partition_col,
@@ -1306,12 +1306,12 @@ def test_filter_row_groups(self, fp):
         assert len(result) == 1
 
     @pytest.mark.single_cpu
-    def test_s3_roundtrip(self, df_compat, s3_public_bucket, fp, s3so):
+    def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, fp):
         # GH #19134
         check_round_trip(
             df_compat,
             fp,
-            path=f"s3://{s3_public_bucket.name}/fastparquet.parquet",
+            path=f"s3://{s3_bucket_public.name}/fastparquet.parquet",
             read_kwargs={"storage_options": s3so},
             write_kwargs={"compression": None, "storage_options": s3so},
         )
diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py
index 79473895b662d..31d22223b0a33 100644
--- a/pandas/tests/io/test_s3.py
+++ b/pandas/tests/io/test_s3.py
@@ -5,39 +5,29 @@
 from pandas import read_csv
 
 
-def test_streaming_s3_objects():
-    # GH17135
+@pytest.mark.parametrize("data", [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"])
+def test_streaming_s3_objects(data):
+    # GH 17135
     # botocore gained iteration support in 1.10.47, can now be used in read_*
     pytest.importorskip("botocore", minversion="1.10.47")
     from botocore.response import StreamingBody
 
-    data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"]
-    for el in data:
-        body = StreamingBody(BytesIO(el), content_length=len(el))
-        read_csv(body)
+    body = StreamingBody(BytesIO(data), content_length=len(data))
+    read_csv(body)
 
 
 @pytest.mark.single_cpu
-def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):
-    # GH 34626
-    pytest.importorskip("s3fs")
-    result = read_csv(
-        f"s3://{s3_public_bucket_with_data.name}/tips.csv",
-        nrows=3,
-        storage_options=s3so,
-    )
-    assert len(result) == 3
-
-
-@pytest.mark.single_cpu
-def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):
-    # Ensure we can read from a public bucket with credentials
+@pytest.mark.parametrize("header", ["infer", None])
+def test_read_with_and_without_creds_from_pub_bucket(
+    s3_bucket_public_with_data, s3so, header
+):
     # GH 34626
     pytest.importorskip("s3fs")
+    nrows = 5
     df = read_csv(
-        f"s3://{s3_public_bucket_with_data.name}/tips.csv",
-        nrows=5,
-        header=None,
+        f"s3://{s3_bucket_public_with_data.name}/tips.csv",
+        nrows=nrows,
+        header=header,
         storage_options=s3so,
     )
-    assert len(df) == 5
+    assert len(df) == nrows
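
For reference, a sketch of the botocore StreamingBody path that test_streaming_s3_objects covers; the CSV bytes are illustrative:

    from io import BytesIO

    import pandas as pd
    from botocore.response import StreamingBody

    data = b"foo,bar,baz\n1,2,3\n4,5,6\n"
    body = StreamingBody(BytesIO(data), content_length=len(data))
    df = pd.read_csv(body)
    assert df.shape == (2, 3)
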
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index 4446dbe320b69..ccd7e5d894943 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -1355,14 +1355,14 @@ def test_unsupported_compression(parser, geom_df):
 
 
 @pytest.mark.single_cpu
-def test_s3_permission_output(parser, s3_public_bucket, geom_df):
+def test_s3_permission_output(parser, s3_bucket_public, geom_df):
     s3fs = pytest.importorskip("s3fs")
     pytest.importorskip("lxml")
 
     with tm.external_error_raised((PermissionError, FileNotFoundError)):
         fs = s3fs.S3FileSystem(anon=True)
-        fs.ls(s3_public_bucket.name)
+        fs.ls(s3_bucket_public.name)
 
         geom_df.to_xml(
-            f"s3://{s3_public_bucket.name}/geom.xml", compression="zip", parser=parser
+            f"s3://{s3_bucket_public.name}/geom.xml", compression="zip", parser=parser
         )
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index cf8ae28c4d9b5..e73c685a09b23 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -1972,11 +1972,10 @@ def test_unsupported_compression(parser):
 
 
 @pytest.mark.network
 @pytest.mark.single_cpu
-def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
+def test_s3_parser_consistency(s3_bucket_public_with_data, s3so):
     pytest.importorskip("s3fs")
     pytest.importorskip("lxml")
-    s3 = f"s3://{s3_public_bucket_with_data.name}/books.xml"
-
+    s3 = f"s3://{s3_bucket_public_with_data.name}/books.xml"
     df_lxml = read_xml(s3, parser="lxml", storage_options=s3so)
     df_etree = read_xml(s3, parser="etree", storage_options=s3so)
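
And a sketch of the parser-consistency check test_s3_parser_consistency performs, with a local file standing in for the S3 URL:

    import pandas as pd

    path = "books.xml"  # stand-in for f"s3://{bucket}/books.xml"
    df_lxml = pd.read_xml(path, parser="lxml")
    df_etree = pd.read_xml(path, parser="etree")
    pd.testing.assert_frame_equal(df_lxml, df_etree)
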