diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml index 90b377a..d657dc6 100644 --- a/.github/workflows/confidence.yml +++ b/.github/workflows/confidence.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index fcfc6d6..4f2e205 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -22,7 +22,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.9' - name: Install dependencies run: | python -m pip install --upgrade pip @@ -39,4 +39,4 @@ jobs: with: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ \ No newline at end of file + repository_url: https://test.pypi.org/legacy/ diff --git a/HISTORY.rst b/HISTORY.rst index 2c7848b..5549064 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,28 @@ History ======= +4.0.0 (2024-11-24) +------------------ +* Update minimum requirement for Chartify to avoid a cropping bug in `chrome-webdriver` +* Dropping support for Python 3.8 + + +3.0.4 (2023-10-12) +------------------ +* Fixing so that multiple correction methods other than Bonferroni are applied correctly. + + +3.0.3 (2023-10-12) +------------------ +* Relaxing version requirements for scipy and pandas to allow versions 2.x + + +3.0.2 (2023-08-08) +------------------ +* Added docstring to Experiment +* When using variance reduction and there is no pre-exposure data, so that the corresponding matrix is not invertible, we now fall back to using a matrix of zeroes, leading to no variance reduction for this case instead of crashing. + + 3.0.1 (2023-04-20) ------------------ * Added reference level point estimate to the hover box of difference plots diff --git a/README.md b/README.md index 3677894..ee73955 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,17 @@ Spotify Confidence ======== ![Status](https://img.shields.io/badge/Status-Beta-blue.svg) -![Latest release](https://img.shields.io/badge/release-3.0.1-green.svg "Latest release: 3.0.1") -![Python](https://img.shields.io/badge/Python-3.7-blue.svg "Python") -![Python](https://img.shields.io/badge/Python-3.8-blue.svg "Python") +![Latest release](https://img.shields.io/badge/release-4.0.0-green.svg "Latest release: 4.0.0") ![Python](https://img.shields.io/badge/Python-3.9-blue.svg "Python") ![Python](https://img.shields.io/badge/Python-3.10-blue.svg "Python") +![Python](https://img.shields.io/badge/Python-3.11-blue.svg "Python") Python library for AB test analysis. Why use Spotify Confidence? ----------------- -Spotify Confidence provides convinience wrappers around statsmodel's various functions for computing p-values and confidence intervalls. +Spotify Confidence provides convenience wrappers around statsmodels' various functions for computing p-values and confidence intervals. With Spotify Confidence it's easy to compute several p-values and confidence bounds in one go, e.g. one for each country or for each date. 
Each function comes in two versions: - one that return a pandas dataframe, diff --git a/requirements_dev.txt b/requirements_dev.txt index 51b4ffc..cf02b2e 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -17,3 +17,4 @@ ipywidgets>=7.1.0 black==23.1.0 ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability +tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/setup.cfg b/setup.cfg index 23d452e..4186a42 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = spotify-confidence -version = 3.0.1 +version = 4.0.0 author = Per Sillren author_email = pers@spotify.com description = Package for calculating and visualising confidence intervals, e.g. for A/B test analysis. @@ -18,13 +18,13 @@ classifiers = package_dir = = . packages = find: -python_requires = >=3.8 +python_requires = >=3.9 install_requires = numpy>=1.20.0,<2.0.0 - scipy>=1.6.0,<1.8.0 - pandas>=1.2.0,<2.0.0 + scipy>=1.6.0 + pandas>=1.2.0 statsmodels>=0.13.0,<1.0.0 - chartify>=4.0.3 + chartify>=5.0.1 ipywidgets>=8.0.0 [options.packages.find] diff --git a/spotify_confidence/analysis/frequentist/chartify_grapher.py b/spotify_confidence/analysis/frequentist/chartify_grapher.py index 3a297c1..874a5da 100644 --- a/spotify_confidence/analysis/frequentist/chartify_grapher.py +++ b/spotify_confidence/analysis/frequentist/chartify_grapher.py @@ -278,12 +278,14 @@ def _categorical_difference_chart( df[~df[NIM].isna()] .assign( color_column=lambda df: df.apply( - lambda row: "red" - if row[LOWER] < row[NULL_HYPOTHESIS] - and row[PREFERENCE] == "increase" - or row[NULL_HYPOTHESIS] < row[UPPER] - and row[PREFERENCE] == "decrease" - else "green", + lambda row: ( + "red" + if row[LOWER] < row[NULL_HYPOTHESIS] + and row[PREFERENCE] == "increase" + or row[NULL_HYPOTHESIS] < row[UPPER] + and row[PREFERENCE] == "decrease" + else "green" + ), axis=1, ) ) diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py index da64a3a..4c47c22 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py @@ -524,6 +524,14 @@ def join(df: DataFrame) -> DataFrame: ), lambda df: _compute_comparisons(df, **kwargs), ) + comparison_df = comparison_df.pipe(add_adjusted_p_and_is_significant, **kwargs) + comparison_df = groupbyApplyParallel( + comparison_df.groupby( + groups_except_ordinal + [self._method_column, "level_1", "level_2"], as_index=False, sort=False + ), + lambda df: _add_ci_and_adjust_if_absolute(df, **kwargs), + ) + return comparison_df def achieved_power(self, level_1, level_2, mde, alpha, groupby): @@ -557,9 +565,8 @@ def _compute_comparisons(df: DataFrame, **kwargs: Dict) -> DataFrame: return ( df.assign(**{DIFFERENCE: lambda df: df[POINT_ESTIMATE + SFX2] - df[POINT_ESTIMATE + SFX1]}) .assign(**{STD_ERR: confidence_computers[df[kwargs[METHOD]].values[0]].std_err(df, **kwargs)}) - .pipe(_add_p_value_and_ci, **kwargs) + .pipe(_add_p_value, **kwargs) .pipe(_powered_effect_and_required_sample_size_from_difference_df, **kwargs) - .pipe(_adjust_if_absolute, absolute=kwargs[ABSOLUTE]) .assign(**{PREFERENCE: lambda df: df[PREFERENCE].map(PREFERENCE_DICT)}) .pipe(_add_variance_reduction_rate, **kwargs) ) @@ -580,15 +587,16 @@ def 
_add_variance_reduction_rate(df: DataFrame, **kwargs: Dict) -> DataFrame: return df -def _add_p_value_and_ci(df: DataFrame, **kwargs: Dict) -> DataFrame: - return ( - df.pipe(set_alpha_and_adjust_preference, **kwargs) - .assign(**{P_VALUE: lambda df: df.pipe(_p_value, **kwargs)}) - .pipe(add_adjusted_p_and_is_significant, **kwargs) - .pipe(add_ci, **kwargs) +def _add_p_value(df: DataFrame, **kwargs: Dict) -> DataFrame: + return df.pipe(set_alpha_and_adjust_preference, **kwargs).assign( + **{P_VALUE: lambda df: df.pipe(_p_value, **kwargs)} ) +def _add_ci_and_adjust_if_absolute(df: DataFrame, **kwargs: Dict) -> DataFrame: + return df.pipe(add_ci, **kwargs).pipe(_adjust_if_absolute, absolute=kwargs[ABSOLUTE]) + + def _adjust_if_absolute(df: DataFrame, absolute: bool) -> DataFrame: if absolute: return df.assign(absolute_difference=absolute) diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py index bbe923f..7f0395d 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py @@ -4,7 +4,11 @@ from pandas import DataFrame, Series from scipy import optimize from scipy import stats as st -from scipy.stats.stats import _unequal_var_ttest_denom + +try: + from scipy.stats._stats_py import _unequal_var_ttest_denom +except ImportError: # Fallback for scipy<1.8.0 + from scipy.stats.stats import _unequal_var_ttest_denom from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic @@ -137,9 +141,11 @@ def adjusted_alphas_for_group(grp: DataFrame) -> Series: .assign( **{ ADJUSTED_ALPHA: lambda df: df.apply( - lambda row: 2 * (1 - st.norm.cdf(row["zb"])) - if (grp[PREFERENCE_TEST] == TWO_SIDED).all() - else 1 - st.norm.cdf(row["zb"]), + lambda row: ( + 2 * (1 - st.norm.cdf(row["zb"])) + if (grp[PREFERENCE_TEST] == TWO_SIDED).all() + else 1 - st.norm.cdf(row["zb"]) + ), axis=1, ) } diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py index 8c812bb..cc8f1da 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py @@ -38,7 +38,10 @@ def dimension(x): Xy0[0,] = col_sum(df[kwargs[NUMERATOR]]) Xy0[1 : (k + 1),] = np.atleast_2d(col_sum(df[kwargs[FEATURE_CROSS]])).reshape(-1, 1) - b = np.matmul(np.linalg.inv(XX0), Xy0) + try: + b = np.matmul(np.linalg.inv(XX0), Xy0) + except np.linalg.LinAlgError: + b = np.zeros((k + 1, 1)) out = b[1 : (k + 1)] if out.size == 1: out = out.item() @@ -88,9 +91,11 @@ def lin_reg_variance_delta(row, **kwargs): def variance(df: DataFrame, **kwargs) -> Series: variance1 = z_test_computer.variance(df, **kwargs) - if kwargs[FEATURE] in df: - return variance1 + df.apply(lin_reg_variance_delta, axis=1, **kwargs) + computed_variances = variance1 + df.apply(lin_reg_variance_delta, axis=1, **kwargs) + if (computed_variances < 0).any(): + raise ValueError("Computed variance is negative, please check sufficient " "statistics.") + return computed_variances else: return variance1 diff --git a/spotify_confidence/analysis/frequentist/experiment.py b/spotify_confidence/analysis/frequentist/experiment.py index 6fae40f..276dad7 100644 --- 
a/spotify_confidence/analysis/frequentist/experiment.py +++ b/spotify_confidence/analysis/frequentist/experiment.py @@ -33,6 +33,37 @@ class Experiment(ConfidenceABC): + """ + This class represents an experiment which might include several metrics and treatment groups, as well as other + dimensions to break down the results by. It provides several methods to analyze and visualize the results of the + experiment. The experiment is based on data provided as a DataFrame with sufficient statistics, such as information about + the numerator and denominator (number of units in the experiment) of the tested metrics, and their grouping columns. + + Attributes: + data_frame (DataFrame): DataFrame that contains the experimental data. + numerator_column (str): Name of the DataFrame column that contains the numerator of the tested metric. + numerator_sum_squares_column (Union[str, None]): Name of the DataFrame column that contains the sum of squares + of the numerator. + denominator_column (str): Name of the DataFrame column that contains the denominator of the tested metric. + categorical_group_columns (Union[str, Iterable]): Column(s) that categorically group the data. + ordinal_group_column (Union[str, None]): Column that ordinally groups the data. Values need to be of + type int or datetime. + interval_size (float): Size of the confidence interval. Defaults to 0.95. + correction_method (str): Method for multiple comparison correction. Defaults to "bonferroni". + confidence_computer (ConfidenceComputerABC): ConfidenceComputerABC object to compute confidence intervals. + confidence_grapher (ConfidenceGrapherABC): ConfidenceGrapherABC object to plot confidence intervals. + method_column (str): Column that contains the statistical test method, e.g. "z-test", "t-test". + bootstrap_samples_column (str): Column that contains the bootstrap samples when method is "bootstrap". + metric_column (str): Column that contains the names of metrics. + treatment_column (str): Column that contains the names of treatment and control groups. + power (float): Desired statistical power. Defaults to 0.8. + feature_column (str): Column that contains the features when method is "z-test-linreg". + feature_sum_squares_column (str): Column that contains the sum of squares of the features when method is + "z-test-linreg". + feature_cross_sum_column (str): Column that contains the cross product sum of the features when method is + "z-test-linreg". 
+ """ + def __init__( self, data_frame: DataFrame, diff --git a/spotify_confidence/analysis/frequentist/multiple_comparison.py b/spotify_confidence/analysis/frequentist/multiple_comparison.py index 60cee96..65cdc46 100644 --- a/spotify_confidence/analysis/frequentist/multiple_comparison.py +++ b/spotify_confidence/analysis/frequentist/multiple_comparison.py @@ -1,9 +1,10 @@ from _warnings import warn from typing import Iterable, Dict -from pandas import DataFrame, Series +from pandas import DataFrame from statsmodels.stats.multitest import multipletests +from spotify_confidence.analysis.confidence_utils import groupbyApplyParallel from spotify_confidence.analysis.constants import ( BONFERRONI, BONFERRONI_ONLY_COUNT_TWOSIDED, @@ -32,6 +33,7 @@ NIM, NUMBER_OF_COMPARISONS, FINAL_EXPECTED_SAMPLE_SIZE, + ORDINAL_GROUP_COLUMN, CORRECTION_METHOD, METHOD, IS_SIGNIFICANT, @@ -144,11 +146,18 @@ def add_adjusted_p_and_is_significant(df: DataFrame, **kwargs: Dict) -> DataFram f"{BONFERRONI}, {BONFERRONI_ONLY_COUNT_TWOSIDED}, " f"{BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY}, {SPOT_1}" ) - adjusted_alpha = compute_sequential_adjusted_alpha(df, **kwargs) - df = df.merge(adjusted_alpha, left_index=True, right_index=True) - df[IS_SIGNIFICANT] = df[P_VALUE] < df[ADJUSTED_ALPHA] - df[P_VALUE] = None - df[ADJUSTED_P] = None + + groups_except_ordinal = [ + column + for column in df.index.names + if kwargs[ORDINAL_GROUP_COLUMN] is not None + and column is not None + and (column != kwargs[ORDINAL_GROUP_COLUMN] or kwargs[FINAL_EXPECTED_SAMPLE_SIZE] is None) + ] + df = groupbyApplyParallel( + df.groupby(groups_except_ordinal + [kwargs[METHOD], "level_1", "level_2"], as_index=False, sort=False), + lambda df: compute_sequential_adjusted_alpha(df, **kwargs), + ) elif kwargs[CORRECTION_METHOD] in [ HOLM, HOMMEL, @@ -194,9 +203,14 @@ def add_adjusted_p_and_is_significant(df: DataFrame, **kwargs: Dict) -> DataFram return df -def compute_sequential_adjusted_alpha(df: DataFrame, **kwargs: Dict) -> Series: +def compute_sequential_adjusted_alpha(df: DataFrame, **kwargs: Dict) -> DataFrame: if df[kwargs[METHOD]].isin([ZTEST, ZTESTLINREG]).all(): - return confidence_computers[ZTEST].compute_sequential_adjusted_alpha(df, **kwargs) + adjusted_alpha = confidence_computers[ZTEST].compute_sequential_adjusted_alpha(df, **kwargs) + df = df.merge(adjusted_alpha, left_index=True, right_index=True) + df[IS_SIGNIFICANT] = df[P_VALUE] < df[ADJUSTED_ALPHA] + df[P_VALUE] = None + df[ADJUSTED_P] = None + return df else: raise NotImplementedError("Sequential testing is only supported for z-test and z-testlinreg") @@ -254,9 +268,9 @@ def set_alpha_and_adjust_preference(df: DataFrame, **kwargs: Dict) -> DataFrame: return df.assign( **{ ALPHA: df.apply( - lambda row: 2 * alpha_0 - if kwargs[CORRECTION_METHOD] == SPOT_1 and row[PREFERENCE] != TWO_SIDED - else alpha_0, + lambda row: ( + 2 * alpha_0 if kwargs[CORRECTION_METHOD] == SPOT_1 and row[PREFERENCE] != TWO_SIDED else alpha_0 + ), axis=1, ) } diff --git a/spotify_confidence/analysis/frequentist/nims_and_mdes.py b/spotify_confidence/analysis/frequentist/nims_and_mdes.py index 5ffdbb5..7bfc20f 100644 --- a/spotify_confidence/analysis/frequentist/nims_and_mdes.py +++ b/spotify_confidence/analysis/frequentist/nims_and_mdes.py @@ -29,9 +29,11 @@ def add_nim_input_columns_from_tuple_or_dict(df, nims: NIM_TYPE, mde_column: str elif nims is None or not nims: return df.assign(**{NIM_COLUMN_DEFAULT: None}).assign( **{ - PREFERRED_DIRECTION_COLUMN_DEFAULT: None - if PREFERRED_DIRECTION_COLUMN_DEFAULT not 
in df or mde_column is None - else df[PREFERRED_DIRECTION_COLUMN_DEFAULT] + PREFERRED_DIRECTION_COLUMN_DEFAULT: ( + None + if PREFERRED_DIRECTION_COLUMN_DEFAULT not in df or mde_column is None + else df[PREFERRED_DIRECTION_COLUMN_DEFAULT] + ) } ) else: diff --git a/spotify_confidence/samplesize/sample_size_calculator.py b/spotify_confidence/samplesize/sample_size_calculator.py index 5fdd151..10b77d5 100644 --- a/spotify_confidence/samplesize/sample_size_calculator.py +++ b/spotify_confidence/samplesize/sample_size_calculator.py @@ -61,7 +61,7 @@ def continuous( treatment_allocations=default_treatment_allocations, bonferroni_correction=default_bonferroni, ): - """Calculate the required sample size for a binomial metric. + """Calculate the required sample size for a continuous metric. Args: average_absolute_mde (float): Average absolute minimal detectable diff --git a/tests/frequentist/test_freqsamplesizecalculator.py b/tests/frequentist/test_freqsamplesizecalculator.py index be77024..1eaf148 100644 --- a/tests/frequentist/test_freqsamplesizecalculator.py +++ b/tests/frequentist/test_freqsamplesizecalculator.py @@ -819,6 +819,6 @@ def test_sample_size_with_nan(self): ) assert len(ss) == len(df) - assert ss[REQUIRED_SAMPLE_SIZE_METRIC].values[0] is None + assert ss[REQUIRED_SAMPLE_SIZE_METRIC].isna()[0] assert 0.999 < ss[REQUIRED_SAMPLE_SIZE_METRIC].values[1] / 95459 < 1.001 assert ss[CI_WIDTH].isna().all() diff --git a/tests/frequentist/test_ztest.py b/tests/frequentist/test_ztest.py index 7c3a16f..b03b4f1 100644 --- a/tests/frequentist/test_ztest.py +++ b/tests/frequentist/test_ztest.py @@ -14,6 +14,7 @@ ADJUSTED_UPPER, DIFFERENCE, BONFERRONI, + BONFERRONI_ONLY_COUNT_TWOSIDED, BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY, CORRECTION_METHODS, SPOT_1, @@ -21,6 +22,7 @@ POWERED_EFFECT, REQUIRED_SAMPLE_SIZE, ) +from statsmodels.stats.multitest import multipletests class TestPoweredEffectContinuousSingleMetric(object): @@ -203,8 +205,8 @@ def test_powered_effect(self): assert np.isclose(powered_effect[POWERED_EFFECT][1], 0.5291, atol=0.001) assert np.isclose(powered_effect[POWERED_EFFECT][2], 0.4596, atol=0.001) assert np.isclose(powered_effect[POWERED_EFFECT][3], 0.4869, atol=0.001) - assert powered_effect[REQUIRED_SAMPLE_SIZE][0] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][1] is None + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[0] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[1] assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][2], 16487886, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][3], 3083846, atol=100) @@ -329,10 +331,10 @@ def test_powered_effect(self): assert np.isclose(powered_effect[POWERED_EFFECT][6], 0.4995, atol=0.001) assert np.isclose(powered_effect[POWERED_EFFECT][7], 0.5291, atol=0.001) - assert powered_effect[REQUIRED_SAMPLE_SIZE][0] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][1] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][2] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][3] is None + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[0] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[1] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[2] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[3] assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][4], 19475238, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][5], 3642591, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][6], 19475238, atol=100) @@ -733,12 +735,12 @@ def test_powered_effect(self): # assert 
np.isclose(powered_effect[POWERED_EFFECT][10], 0.2663, atol=0.001) # assert np.isclose(powered_effect[POWERED_EFFECT][11], 0.2479, atol=0.001) - assert powered_effect[REQUIRED_SAMPLE_SIZE][0] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][1] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][2] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][3] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][4] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][5] is None + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[0] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[1] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[2] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[3] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[4] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[5] assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][6], 260541, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][7], 361863, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][8], 326159, atol=100) @@ -1535,7 +1537,12 @@ def test_difference_groupby(self, correction_method): @pytest.mark.parametrize("correction_method", CORRECTION_METHODS, ids=lambda x: f"correction method: {x}") def test_multiple_difference(self, correction_method): self.test._confidence_computer._correction_method = correction_method - if BONFERRONI in correction_method: + if correction_method in [ + BONFERRONI, + BONFERRONI_ONLY_COUNT_TWOSIDED, + BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY, + SPOT_1, + ]: difference_df = self.test.multiple_difference( level=("control", 1), groupby="country", level_as_reference=True ) @@ -1569,10 +1576,32 @@ def test_multiple_difference(self, correction_method): ) ) + if correction_method.startswith("spot-"): + corr_method = correction_method[7:] + else: + corr_method = correction_method + + _, adjusted_p, _, _ = multipletests( + pvals=difference_df["p-value"], + alpha=1 - self.test._confidence_computer._interval_size, + method=corr_method, + ) + + assert np.allclose( + adjusted_p, + difference_df["adjusted p-value"], + rtol=0.01, + ) + @pytest.mark.parametrize("correction_method", CORRECTION_METHODS, ids=lambda x: f"correction method: {x}") def test_multiple_difference_groupby(self, correction_method): self.test._confidence_computer._correction_method = correction_method - if BONFERRONI in correction_method: + if correction_method in [ + BONFERRONI, + BONFERRONI_ONLY_COUNT_TWOSIDED, + BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY, + SPOT_1, + ]: difference_df = self.test.multiple_difference( level="control", groupby=["days_since_reg", "country"], level_as_reference=True ) @@ -1603,8 +1632,25 @@ def test_multiple_difference_groupby(self, correction_method): if correction_method in CORRECTION_METHODS_THAT_SUPPORT_CI: assert not any(difference_df[ADJUSTED_LOWER].isna()) + if correction_method.startswith("spot-"): + corr_method = correction_method[7:] + else: + corr_method = correction_method + + _, adjusted_p, _, _ = multipletests( + pvals=difference_df["p-value"], + alpha=1 - self.test._confidence_computer._interval_size, + method=corr_method, + ) + + assert np.allclose( + adjusted_p, + difference_df["adjusted p-value"], + rtol=0.01, + ) + @pytest.mark.parametrize("correction_method", CORRECTION_METHODS, ids=lambda x: f"correction method: {x}") - def test_differece_with_nims(self, correction_method): + def test_difference_with_nims(self, correction_method): self.test._confidence_computer._correction_method = correction_method df = self.test.difference( 
level_1=("test", "us"), diff --git a/tests/frequentist/test_ztest_linreg.py b/tests/frequentist/test_ztest_linreg.py index 6044f2c..eecd4ad 100644 --- a/tests/frequentist/test_ztest_linreg.py +++ b/tests/frequentist/test_ztest_linreg.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest import spotify_confidence from spotify_confidence.analysis.constants import REGRESSION_PARAM, DECREASE_PREFFERED, METHOD_COLUMN_NAME @@ -510,5 +511,104 @@ def test_parameters_univariate_required_sample_size(self): ) -# TODO: Test for sequential data (w/ ordinal column) -# TODO: Test for segmentation +class TestUnivariateSingleMetricWithBadPreExposureData(object): + def setup(self): + np.random.seed(123) + n = 10000 + d = np.random.randint(2, size=n) + y = 0.5 * d + np.random.standard_normal(size=n) + data = pd.DataFrame({"variation_name": list(map(str, d)), "y": y}) + data = ( + data.assign(y2=y**2) + .groupby(["variation_name"]) + .agg({"y": ["sum", "count"], "y2": "sum"}) + .assign(**{"x_sum": 0.0, "x2_sum": 0.0, "xy_sum": 0.0}) + .reset_index() + ) + + data.columns = data.columns.map("_".join).str.strip("_") + data = data.assign(**{"metric_name": "metricA"}) + self.n = n + self.y = y + self.d = d + self.data = data + + self.test = spotify_confidence.ZTestLinreg( + data_frame=data, + numerator_column="y_sum", + numerator_sum_squares_column="y2_sum", + denominator_column="y_count", + categorical_group_columns=["variation_name"], + feature_column="x_sum", + feature_sum_squares_column="x2_sum", + feature_cross_sum_column="xy_sum", + interval_size=0.99, + correction_method="bonferroni", + metric_column="metric_name", + ) + + def test_summary(self): + summary_df = self.test.summary(verbose=True) + print(summary_df) + assert len(summary_df) == len(self.data) + + def test_parameters_univariate(self): + summary_df = self.test.summary(verbose=True) + assert np.allclose(0.0, summary_df[REGRESSION_PARAM][0], rtol=0.0001) + + diff = self.test.difference(level_1="0", level_2="1", verbose=True, groupby="metric_name") + y = self.y + d = self.d + assert np.allclose(y[d == 1].mean() - y[d == 0].mean(), diff["difference"]) + + v0 = np.var(y[d == 0]) + v1 = np.var(y[d == 1]) + n0 = y[d == 0].size + n1 = y[d == 1].size + assert np.allclose(diff["std_err"], np.sqrt(v0 / n0 + v1 / n1), rtol=1e-3) + + +class TestUnivariateSingleMetricNegativeVariance(object): + def setup(self): + self.data = pd.DataFrame( + [ + { + "group": "1", + "count": 17512, + "sum": 16544, + "sum_of_squares": 16044, + "sum_2": 6625, + "sum_of_squares_2": 6455, + "sum_of_squares_x": 3513, + "metric_name": "metricA", + }, + { + "group": "2", + "count": 159142, + "sum": 150364, + "sum_of_squares": 145794, + "sum_2": 60540, + "sum_of_squares_2": 59047, + "sum_of_squares_x": 32398, + "metric_name": "metricA", + }, + ] + ) + + self.test = spotify_confidence.ZTestLinreg( + data_frame=self.data, + numerator_column="sum", + numerator_sum_squares_column="sum_of_squares", + denominator_column="count", + categorical_group_columns=["group"], + feature_column="sum_2", + feature_sum_squares_column="sum_of_squares_2", + feature_cross_sum_column="sum_of_squares_x", + interval_size=0.99, + correction_method="bonferroni", + metric_column="metric_name", + ) + + def test_setup_that_will_fail_with_negative_variance(self): + with pytest.raises(ValueError): + self.test.summary(verbose=True) diff --git a/tox.ini b/tox.ini index 81273da..fc4e692 100644 --- a/tox.ini +++ b/tox.ini @@ -1,19 +1,19 @@ [tox] -envlist = python3.8, python3.9, python3.10 +envlist = 
python3.9, python3.10, python3.11 skipsdist = True usedevelop = True [travis] python = - 3.8: python3.8 3.9: python3.9 3.10: python3.10 + 3.11: python3.11 [gh-actions] python = - 3.8: python3.8 3.9: python3.9 3.10: python3.10 + 3.11: python3.11 [testenv] setenv =