test: dataset split functionality tests
sbrugman committed Jul 5, 2022
1 parent ca22268 commit 25c0301
Showing 4 changed files with 302 additions and 27 deletions.
2 changes: 1 addition & 1 deletion tests/popmon/conftest.py
@@ -62,7 +62,7 @@ def pytest_configure():
        pytest.date = load(f)

    with open("{}/{}".format(TEMPLATE_PATH, "eyesColor.json")) as f:
-        pytest.eyesColor = load(f)
+        pytest.eyeColor = load(f)

    with open("{}/{}".format(TEMPLATE_PATH, "gender.json")) as f:
        pytest.gender = load(f)
112 changes: 112 additions & 0 deletions tests/popmon/pipeline/test_split_dataset.py
@@ -0,0 +1,112 @@
from datetime import datetime, timedelta

import pandas as pd
import pytest

from popmon.pipeline.dataset_splitter import split_dataset


@pytest.fixture
def test_dataframe_pandas():
n_samples = 1000
start = datetime.today()
return pd.DataFrame(
{
"date": [start + timedelta(days=delta) for delta in range(n_samples)],
"f1": [1] * n_samples,
"f2": [0] * n_samples,
}
)


def test_split_dataset_pandas_int(test_dataframe_pandas):
reference, df = split_dataset(test_dataframe_pandas, split=3, time_axis="date")

assert reference.shape[0] == 3
assert df.shape[0] == 997
assert reference.columns.values.tolist() == ["date", "f1", "f2"]
assert df.columns.values.tolist() == ["date", "f1", "f2"]


def test_split_dataset_pandas_int_underflow(test_dataframe_pandas):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_pandas, split=0, time_axis="date")

assert e.value.args[0] == "Number of instances should be greater than 0"


def test_split_dataset_pandas_int_overflow(test_dataframe_pandas):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_pandas, split=1001, time_axis="date")

assert (
e.value.args[0]
== "Returned dataframe is empty. Please adjust the `split` argument"
)


def test_split_dataset_pandas_float(test_dataframe_pandas):
reference, df = split_dataset(test_dataframe_pandas, split=0.45, time_axis="date")

assert reference.shape[0] == 450
assert df.shape[0] == 550
assert reference.columns.values.tolist() == ["date", "f1", "f2"]
assert df.columns.values.tolist() == ["date", "f1", "f2"]


def test_split_dataset_pandas_float_round(test_dataframe_pandas):
reference, df = split_dataset(test_dataframe_pandas, split=0.8888, time_axis="date")

assert reference.shape[0] == 888
assert df.shape[0] == 112
assert reference.columns.values.tolist() == ["date", "f1", "f2"]
assert df.columns.values.tolist() == ["date", "f1", "f2"]


def test_split_dataset_pandas_float_underflow(test_dataframe_pandas):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_pandas, split=0.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"

with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_pandas, split=-1.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"


def test_split_dataset_pandas_float_overflow(test_dataframe_pandas):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_pandas, split=1.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"

with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_pandas, split=10.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"


def test_split_dataset_pandas_condition(test_dataframe_pandas):
reference, df = split_dataset(
test_dataframe_pandas,
split=test_dataframe_pandas.date
< datetime.today() + timedelta(days=50, hours=5),
time_axis="date",
)

assert reference.shape[0] == 51
assert df.shape[0] == 949
assert reference.columns.values.tolist() == ["date", "f1", "f2"]
assert df.columns.values.tolist() == ["date", "f1", "f2"]


def test_split_dataset_pandas_condition_false(test_dataframe_pandas):
with pytest.raises(ValueError) as e:
split_dataset(
test_dataframe_pandas,
split=test_dataframe_pandas.date < datetime.today() - timedelta(days=1),
time_axis="date",
)

assert e.value.args[0] == "Reference is empty. Please adjust the `split` argument"
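Taken together, these tests pin down the pandas contract of split_dataset: an int takes the first n rows along time_axis, a float in (0, 1) takes the leading fraction (truncated, so 0.8888 of 1000 rows yields 888), and a boolean mask selects the reference rows directly, with either side coming out empty raising the asserted errors. The sketch below is inferred from the tests alone, not taken from the committed implementation in popmon/pipeline/dataset_splitter.py; the name split_dataset_pandas_sketch is invented for illustration.

# Hypothetical reconstruction inferred from the tests above;
# not the committed popmon implementation.
import pandas as pd


def split_dataset_pandas_sketch(df: pd.DataFrame, split, time_axis: str):
    df = df.sort_values(time_axis)
    if isinstance(split, int):
        if split <= 0:
            raise ValueError("Number of instances should be greater than 0")
        reference, remainder = df.iloc[:split], df.iloc[split:]
    elif isinstance(split, float):
        if not 0.0 < split < 1.0:
            raise ValueError("Fraction should be 0 > fraction > 1")
        n = int(split * len(df))  # truncates: 0.8888 * 1000 -> 888
        reference, remainder = df.iloc[:n], df.iloc[n:]
    else:
        # boolean mask, e.g. df.date < cutoff (aligned on the index)
        reference, remainder = df[split], df[~split]
    if reference.empty:
        raise ValueError("Reference is empty. Please adjust the `split` argument")
    if remainder.empty:
        raise ValueError("Returned dataframe is empty. Please adjust the `split` argument")
    return reference, remainder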
54 changes: 28 additions & 26 deletions tests/popmon/spark/test_spark.py
@@ -1,4 +1,5 @@
-from os.path import abspath, dirname, join
+from copy import deepcopy
+from pathlib import Path

import pandas as pd
import pytest
@@ -20,19 +21,16 @@ def spark_context():
if not spark_found:
return None

-    current_path = dirname(abspath(__file__))
+    current_path = Path(__file__).parent

scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11"
-    hist_spark_jar = join(
-        current_path, f"jars/histogrammar-sparksql_{scala}-1.0.11.jar"
-    )
-    hist_jar = join(current_path, f"jars/histogrammar_{scala}-1.0.11.jar")
+    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.11.jar"
+    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.11.jar"

spark = (
SparkSession.builder.master("local")
.appName("popmon-pytest")
.config("spark.jars", f"{hist_spark_jar},{hist_jar}")
.config("spark.sql.execution.arrow.enabled", "false")
.config("spark.sql.session.timeZone", "GMT")
.getOrCreate()
)
@@ -51,8 +49,8 @@ def test_spark_stability_metrics(spark_context):
features = ["date:isActive", "date:eyeColor", "date:latitude"]
bin_specs = {
"date": {
"bin_width": pd.Timedelta("1y").value,
"bin_offset": pd.Timestamp("2000-1-1").value,
"bin_width": pd.Timedelta(365, "days").value,
"bin_offset": pd.Timestamp(year=2000, month=1, day=1).value,
},
"latitude": {"bin_width": 5.0, "bin_offset": 0.0},
}
@@ -75,16 +73,17 @@ def test_spark_stability_metrics(spark_context):
"ignore:createDataFrame attempted Arrow optimization because"
)
def test_spark_make_histograms(spark_context):
-    pytest.age["data"]["name"] = "b'age'"
-    pytest.company["data"]["name"] = "b'company'"
-    pytest.eyesColor["data"]["name"] = "b'eyeColor'"
-    pytest.gender["data"]["name"] = "b'gender'"
-    pytest.isActive["data"]["name"] = "b'isActive'"
-    pytest.latitude["data"]["name"] = "b'latitude'"
-    pytest.longitude["data"]["name"] = "b'longitude'"
-    pytest.transaction["data"]["name"] = "b'transaction'"

-    pytest.latitude_longitude["data"]["name"] = "b'latitude:longitude'"
+    names = [
+        "age",
+        "company",
+        "eyeColor",
+        "gender",
+        "latitude",
+        "longitude",
+        "transaction",
+    ]

+    pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

spark_df = spark_context.createDataFrame(pytest.test_df)
@@ -113,10 +112,13 @@ def test_spark_make_histograms(spark_context):
binning="unit",
)

-    assert current_hists["age"].toJson() == pytest.age
-    assert current_hists["company"].toJson() == pytest.company
-    assert current_hists["eyeColor"].toJson() == pytest.eyesColor
-    assert current_hists["gender"].toJson() == pytest.gender
-    assert current_hists["latitude"].toJson() == pytest.latitude
-    assert current_hists["longitude"].toJson() == pytest.longitude
-    assert current_hists["transaction"].toJson() == pytest.transaction
+    # backwards compatibility
+    for name in names:
+        v1 = deepcopy(getattr(pytest, name))
+        v1["data"]["name"] = f"'{name}'"
+
+        v2 = deepcopy(getattr(pytest, name))
+        v2["data"]["name"] = f"b'{name}'"
+
+        output = current_hists[name].toJson()
+        assert output == v1 or output == v2
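The loop above replaces the hard-coded name assertions that preceded it: depending on the histogrammar version, toJson() has encoded the histogram name either as 'age' or as b'age', so the test now accepts either serialization rather than pinning one of them.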
161 changes: 161 additions & 0 deletions tests/popmon/spark/test_split_dataset_spark.py
@@ -0,0 +1,161 @@
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import pytest

from popmon.pipeline.dataset_splitter import split_dataset

try:
from pyspark import __version__ as pyspark_version
from pyspark.sql import SparkSession

spark_found = True
except (ModuleNotFoundError, AttributeError):
spark_found = False


@pytest.fixture
def spark_context():
if not spark_found:
return None

current_path = Path(__file__).parent

scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11"
hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.11.jar"
hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.11.jar"

spark = (
SparkSession.builder.master("local")
.appName("popmon-pytest")
.config("spark.jars", f"{hist_spark_jar},{hist_jar}")
.config("spark.sql.session.timeZone", "GMT")
.getOrCreate()
)
return spark


@pytest.fixture
def test_dataframe_spark(spark_context):
n_samples = 1000
start = datetime.today()
df = pd.DataFrame(
{
"date": [start + timedelta(days=delta) for delta in range(n_samples)],
"f1": [1] * n_samples,
"f2": [0] * n_samples,
}
)
spark_df = spark_context.createDataFrame(df)
return spark_df


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_int(test_dataframe_spark):
reference, df = split_dataset(test_dataframe_spark, split=3, time_axis="date")

assert reference.count() == 3
assert df.count() == 997
assert reference.columns == ["date", "f1", "f2"]
assert df.columns == ["date", "f1", "f2"]


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_int_underflow(test_dataframe_spark):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_spark, split=0, time_axis="date")

assert e.value.args[0] == "Number of instances should be greater than 0"


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_int_overflow(test_dataframe_spark):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_spark, split=1001, time_axis="date")

assert (
e.value.args[0]
== "Returned dataframe is empty. Please adjust the `split` argument"
)


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_float(test_dataframe_spark):
reference, df = split_dataset(test_dataframe_spark, split=0.45, time_axis="date")

assert reference.count() == 450
assert df.count() == 550
assert reference.columns == ["date", "f1", "f2"]
assert df.columns == ["date", "f1", "f2"]


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_float_round(test_dataframe_spark):
reference, df = split_dataset(test_dataframe_spark, split=0.8888, time_axis="date")

assert reference.count() == 888
assert df.count() == 112
assert reference.columns == ["date", "f1", "f2"]
assert df.columns == ["date", "f1", "f2"]


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_float_underflow(test_dataframe_spark):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_spark, split=0.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"

with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_spark, split=-1.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_float_overflow(test_dataframe_spark):
with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_spark, split=1.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"

with pytest.raises(ValueError) as e:
_ = split_dataset(test_dataframe_spark, split=10.0, time_axis="date")

assert e.value.args[0] == "Fraction should be 0 > fraction > 1"


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_condition(test_dataframe_spark):
reference, df = split_dataset(
test_dataframe_spark,
split=f"date < '{(datetime.today() + timedelta(days=50, hours=5)).strftime('%Y-%m-%d %H:%M:%S')}'",
time_axis="date",
)

assert reference.count() == 51
assert df.count() == 949
assert reference.columns == ["date", "f1", "f2"]
assert df.columns == ["date", "f1", "f2"]


@pytest.mark.spark
@pytest.mark.skipif(not spark_found, reason="spark not found")
def test_split_dataset_spark_condition_false(test_dataframe_spark):
with pytest.raises(ValueError) as e:
split_dataset(
test_dataframe_spark,
split=f"date < '{(datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d %H:%M:%S')}'",
time_axis="date",
)

assert e.value.args[0] == "Reference is empty. Please adjust the `split` argument"
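The Spark variant differs from the pandas tests in one respect: the condition split is passed as a SQL expression string rather than a boolean mask. A sketch mirroring the pandas one, again inferred from the tests rather than copied from the committed dataset_splitter, could look like the following; filter, limit, subtract and orderBy are standard PySpark DataFrame methods, and note that subtract also deduplicates rows, which is harmless here only because the date column makes every row unique.

# Hypothetical reconstruction of the Spark branch;
# not the committed popmon implementation.
from pyspark.sql import DataFrame


def split_dataset_spark_sketch(df: DataFrame, split, time_axis: str):
    df = df.orderBy(time_axis)
    if isinstance(split, str):
        # SQL condition string, e.g. "date < '2022-08-24 12:00:00'"
        reference = df.filter(split)
        remainder = df.filter(f"NOT ({split})")
    else:
        if isinstance(split, float):
            if not 0.0 < split < 1.0:
                raise ValueError("Fraction should be 0 > fraction > 1")
            n = int(split * df.count())
        else:
            if split <= 0:
                raise ValueError("Number of instances should be greater than 0")
            n = split
        reference = df.limit(n)  # first n rows along the time axis
        remainder = df.subtract(reference)
    if reference.rdd.isEmpty():
        raise ValueError("Reference is empty. Please adjust the `split` argument")
    if remainder.rdd.isEmpty():
        raise ValueError("Returned dataframe is empty. Please adjust the `split` argument")
    return reference, remainder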
