diff --git a/.github/workflows/confidence.yml b/.github/workflows/confidence.yml index 90b377a..d657dc6 100644 --- a/.github/workflows/confidence.yml +++ b/.github/workflows/confidence.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v1 diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index fcfc6d6..4f2e205 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -22,7 +22,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.9' - name: Install dependencies run: | python -m pip install --upgrade pip @@ -39,4 +39,4 @@ jobs: with: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ \ No newline at end of file + repository_url: https://test.pypi.org/legacy/ diff --git a/HISTORY.rst b/HISTORY.rst index 2c7848b..5549064 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,28 @@ History ======= +4.0.0 (2024-11-24) +------------------ +* Update minimum requirement for Chartify to avoid a cropping bug in `chrome-webdriver` +* Dropping support for Python 3.8 + + +3.0.4 (2023-10-12) +------------------ +* Fixing so that multiple correction methods other than Bonferroni are applied correctly. + + +3.0.3 (2023-10-12) +------------------ +* Relaxing version requirements for scipy and pandas to allow versions 2.x + + +3.0.2 (2023-08-08) +------------------ +* Added docstring to Experiment +* When using variance reduction and there is no pre-exposure data, so that the corresponding matrix is not invertible, we now fall back to using a matrix of zeroes, leading to no variance reduction for this case instead of crashing. + + 3.0.1 (2023-04-20) ------------------ * Added reference level point estimate to the hover box of difference plots diff --git a/README.md b/README.md index 3677894..ee73955 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,17 @@ Spotify Confidence ======== ![Status](https://img.shields.io/badge/Status-Beta-blue.svg) -![Latest release](https://img.shields.io/badge/release-3.0.1-green.svg "Latest release: 3.0.1") -![Python](https://img.shields.io/badge/Python-3.7-blue.svg "Python") -![Python](https://img.shields.io/badge/Python-3.8-blue.svg "Python") +![Latest release](https://img.shields.io/badge/release-4.0.0-green.svg "Latest release: 4.0.0") ![Python](https://img.shields.io/badge/Python-3.9-blue.svg "Python") ![Python](https://img.shields.io/badge/Python-3.10-blue.svg "Python") +![Python](https://img.shields.io/badge/Python-3.11-blue.svg "Python") Python library for AB test analysis. Why use Spotify Confidence? ----------------- -Spotify Confidence provides convinience wrappers around statsmodel's various functions for computing p-values and confidence intervalls. +Spotify Confidence provides convenience wrappers around statsmodels' various functions for computing p-values and confidence intervals. With Spotify Confidence it's easy to compute several p-values and confidence bounds in one go, e.g. one for each country or for each date. 
Each function comes in two versions: - one that return a pandas dataframe, diff --git a/requirements_dev.txt b/requirements_dev.txt index 51b4ffc..cf02b2e 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -17,3 +17,4 @@ ipywidgets>=7.1.0 black==23.1.0 ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability +tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/setup.cfg b/setup.cfg index 23d452e..4186a42 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = spotify-confidence -version = 3.0.1 +version = 4.0.0 author = Per Sillren author_email = pers@spotify.com description = Package for calculating and visualising confidence intervals, e.g. for A/B test analysis. @@ -18,13 +18,13 @@ classifiers = package_dir = = . packages = find: -python_requires = >=3.8 +python_requires = >=3.9 install_requires = numpy>=1.20.0,<2.0.0 - scipy>=1.6.0,<1.8.0 - pandas>=1.2.0,<2.0.0 + scipy>=1.6.0 + pandas>=1.2.0 statsmodels>=0.13.0,<1.0.0 - chartify>=4.0.3 + chartify>=5.0.1 ipywidgets>=8.0.0 [options.packages.find] diff --git a/spotify_confidence/analysis/frequentist/chartify_grapher.py b/spotify_confidence/analysis/frequentist/chartify_grapher.py index 3a297c1..874a5da 100644 --- a/spotify_confidence/analysis/frequentist/chartify_grapher.py +++ b/spotify_confidence/analysis/frequentist/chartify_grapher.py @@ -278,12 +278,14 @@ def _categorical_difference_chart( df[~df[NIM].isna()] .assign( color_column=lambda df: df.apply( - lambda row: "red" - if row[LOWER] < row[NULL_HYPOTHESIS] - and row[PREFERENCE] == "increase" - or row[NULL_HYPOTHESIS] < row[UPPER] - and row[PREFERENCE] == "decrease" - else "green", + lambda row: ( + "red" + if row[LOWER] < row[NULL_HYPOTHESIS] + and row[PREFERENCE] == "increase" + or row[NULL_HYPOTHESIS] < row[UPPER] + and row[PREFERENCE] == "decrease" + else "green" + ), axis=1, ) ) diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py index da64a3a..4c47c22 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/confidence_computer.py @@ -524,6 +524,14 @@ def join(df: DataFrame) -> DataFrame: ), lambda df: _compute_comparisons(df, **kwargs), ) + comparison_df = comparison_df.pipe(add_adjusted_p_and_is_significant, **kwargs) + comparison_df = groupbyApplyParallel( + comparison_df.groupby( + groups_except_ordinal + [self._method_column, "level_1", "level_2"], as_index=False, sort=False + ), + lambda df: _add_ci_and_adjust_if_absolute(df, **kwargs), + ) + return comparison_df def achieved_power(self, level_1, level_2, mde, alpha, groupby): @@ -557,9 +565,8 @@ def _compute_comparisons(df: DataFrame, **kwargs: Dict) -> DataFrame: return ( df.assign(**{DIFFERENCE: lambda df: df[POINT_ESTIMATE + SFX2] - df[POINT_ESTIMATE + SFX1]}) .assign(**{STD_ERR: confidence_computers[df[kwargs[METHOD]].values[0]].std_err(df, **kwargs)}) - .pipe(_add_p_value_and_ci, **kwargs) + .pipe(_add_p_value, **kwargs) .pipe(_powered_effect_and_required_sample_size_from_difference_df, **kwargs) - .pipe(_adjust_if_absolute, absolute=kwargs[ABSOLUTE]) .assign(**{PREFERENCE: lambda df: df[PREFERENCE].map(PREFERENCE_DICT)}) .pipe(_add_variance_reduction_rate, **kwargs) ) @@ -580,15 +587,16 @@ def 
_add_variance_reduction_rate(df: DataFrame, **kwargs: Dict) -> DataFrame: return df -def _add_p_value_and_ci(df: DataFrame, **kwargs: Dict) -> DataFrame: - return ( - df.pipe(set_alpha_and_adjust_preference, **kwargs) - .assign(**{P_VALUE: lambda df: df.pipe(_p_value, **kwargs)}) - .pipe(add_adjusted_p_and_is_significant, **kwargs) - .pipe(add_ci, **kwargs) +def _add_p_value(df: DataFrame, **kwargs: Dict) -> DataFrame: + return df.pipe(set_alpha_and_adjust_preference, **kwargs).assign( + **{P_VALUE: lambda df: df.pipe(_p_value, **kwargs)} ) +def _add_ci_and_adjust_if_absolute(df: DataFrame, **kwargs: Dict) -> DataFrame: + return df.pipe(add_ci, **kwargs).pipe(_adjust_if_absolute, absolute=kwargs[ABSOLUTE]) + + def _adjust_if_absolute(df: DataFrame, absolute: bool) -> DataFrame: if absolute: return df.assign(absolute_difference=absolute) diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py index bbe923f..7f0395d 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_computer.py @@ -4,7 +4,11 @@ from pandas import DataFrame, Series from scipy import optimize from scipy import stats as st -from scipy.stats.stats import _unequal_var_ttest_denom + +try: + from scipy.stats._stats_py import _unequal_var_ttest_denom +except ImportError: # Fallback for scipy<1.8.0 + from scipy.stats.stats import _unequal_var_ttest_denom from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic @@ -137,9 +141,11 @@ def adjusted_alphas_for_group(grp: DataFrame) -> Series: .assign( **{ ADJUSTED_ALPHA: lambda df: df.apply( - lambda row: 2 * (1 - st.norm.cdf(row["zb"])) - if (grp[PREFERENCE_TEST] == TWO_SIDED).all() - else 1 - st.norm.cdf(row["zb"]), + lambda row: ( + 2 * (1 - st.norm.cdf(row["zb"])) + if (grp[PREFERENCE_TEST] == TWO_SIDED).all() + else 1 - st.norm.cdf(row["zb"]) + ), axis=1, ) } diff --git a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py index 8c812bb..cc8f1da 100644 --- a/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py +++ b/spotify_confidence/analysis/frequentist/confidence_computers/z_test_linreg_computer.py @@ -38,7 +38,10 @@ def dimension(x): Xy0[0,] = col_sum(df[kwargs[NUMERATOR]]) Xy0[1 : (k + 1),] = np.atleast_2d(col_sum(df[kwargs[FEATURE_CROSS]])).reshape(-1, 1) - b = np.matmul(np.linalg.inv(XX0), Xy0) + try: + b = np.matmul(np.linalg.inv(XX0), Xy0) + except np.linalg.LinAlgError: + b = np.zeros((k + 1, 1)) out = b[1 : (k + 1)] if out.size == 1: out = out.item() @@ -88,9 +91,11 @@ def lin_reg_variance_delta(row, **kwargs): def variance(df: DataFrame, **kwargs) -> Series: variance1 = z_test_computer.variance(df, **kwargs) - if kwargs[FEATURE] in df: - return variance1 + df.apply(lin_reg_variance_delta, axis=1, **kwargs) + computed_variances = variance1 + df.apply(lin_reg_variance_delta, axis=1, **kwargs) + if (computed_variances < 0).any(): + raise ValueError("Computed variance is negative, please check sufficient " "statistics.") + return computed_variances else: return variance1 diff --git a/spotify_confidence/analysis/frequentist/experiment.py b/spotify_confidence/analysis/frequentist/experiment.py index 6fae40f..276dad7 100644 --- 
a/spotify_confidence/analysis/frequentist/experiment.py +++ b/spotify_confidence/analysis/frequentist/experiment.py @@ -33,6 +33,37 @@ class Experiment(ConfidenceABC): + """ + This class represents an experiment which might include several metrics and treatment groups, as well as other + dimensions to break down the results by. It provides several methods to analyze and visualize the results of the + experiment. The experiment is based on data provided as a DataFrame with sufficient statistics, such as information about + the numerator and denominator (number of units in the experiment) of the tested metrics, and their grouping columns. + + Attributes: + data_frame (DataFrame): DataFrame that contains the experimental data. + numerator_column (str): Name of the DataFrame column that contains the numerator of the tested metric. + numerator_sum_squares_column (Union[str, None]): Name of the DataFrame column that contains the sum of squares + of the numerator. + denominator_column (str): Name of the DataFrame column that contains the denominator of the tested metric. + categorical_group_columns (Union[str, Iterable]): Column(s) that categorically group the data. + ordinal_group_column (Union[str, None]): Column that ordinally groups the data. Values need to be of + type int or datetime. + interval_size (float): Size of the confidence interval. Defaults to 0.95. + correction_method (str): Method for multiple comparison correction. Defaults to "bonferroni". + confidence_computer (ConfidenceComputerABC): ConfidenceComputerABC object to compute confidence intervals. + confidence_grapher (ConfidenceGrapherABC): ConfidenceGrapherABC object to plot confidence intervals. + method_column (str): Column that contains the statistical test method, e.g. "z-test", "t-test". + bootstrap_samples_column (str): Column that contains the bootstrap samples when method is "bootstrap". + metric_column (str): Column that contains the names of metrics. + treatment_column (str): Column that contains the names of treatment and control groups. + power (float): Desired statistical power. Defaults to 0.8. + feature_column (str): Column that contains the features when method is "z-test-linreg". + feature_sum_squares_column (str): Column that contains the sum of squares of the features when method is + "z-test-linreg". + feature_cross_sum_column (str): Column that contains the cross product sum of the features when method is + "z-test-linreg". 
+ """ + def __init__( self, data_frame: DataFrame, diff --git a/spotify_confidence/analysis/frequentist/multiple_comparison.py b/spotify_confidence/analysis/frequentist/multiple_comparison.py index 60cee96..65cdc46 100644 --- a/spotify_confidence/analysis/frequentist/multiple_comparison.py +++ b/spotify_confidence/analysis/frequentist/multiple_comparison.py @@ -1,9 +1,10 @@ from _warnings import warn from typing import Iterable, Dict -from pandas import DataFrame, Series +from pandas import DataFrame from statsmodels.stats.multitest import multipletests +from spotify_confidence.analysis.confidence_utils import groupbyApplyParallel from spotify_confidence.analysis.constants import ( BONFERRONI, BONFERRONI_ONLY_COUNT_TWOSIDED, @@ -32,6 +33,7 @@ NIM, NUMBER_OF_COMPARISONS, FINAL_EXPECTED_SAMPLE_SIZE, + ORDINAL_GROUP_COLUMN, CORRECTION_METHOD, METHOD, IS_SIGNIFICANT, @@ -144,11 +146,18 @@ def add_adjusted_p_and_is_significant(df: DataFrame, **kwargs: Dict) -> DataFram f"{BONFERRONI}, {BONFERRONI_ONLY_COUNT_TWOSIDED}, " f"{BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY}, {SPOT_1}" ) - adjusted_alpha = compute_sequential_adjusted_alpha(df, **kwargs) - df = df.merge(adjusted_alpha, left_index=True, right_index=True) - df[IS_SIGNIFICANT] = df[P_VALUE] < df[ADJUSTED_ALPHA] - df[P_VALUE] = None - df[ADJUSTED_P] = None + + groups_except_ordinal = [ + column + for column in df.index.names + if kwargs[ORDINAL_GROUP_COLUMN] is not None + and column is not None + and (column != kwargs[ORDINAL_GROUP_COLUMN] or kwargs[FINAL_EXPECTED_SAMPLE_SIZE] is None) + ] + df = groupbyApplyParallel( + df.groupby(groups_except_ordinal + [kwargs[METHOD], "level_1", "level_2"], as_index=False, sort=False), + lambda df: compute_sequential_adjusted_alpha(df, **kwargs), + ) elif kwargs[CORRECTION_METHOD] in [ HOLM, HOMMEL, @@ -194,9 +203,14 @@ def add_adjusted_p_and_is_significant(df: DataFrame, **kwargs: Dict) -> DataFram return df -def compute_sequential_adjusted_alpha(df: DataFrame, **kwargs: Dict) -> Series: +def compute_sequential_adjusted_alpha(df: DataFrame, **kwargs: Dict) -> DataFrame: if df[kwargs[METHOD]].isin([ZTEST, ZTESTLINREG]).all(): - return confidence_computers[ZTEST].compute_sequential_adjusted_alpha(df, **kwargs) + adjusted_alpha = confidence_computers[ZTEST].compute_sequential_adjusted_alpha(df, **kwargs) + df = df.merge(adjusted_alpha, left_index=True, right_index=True) + df[IS_SIGNIFICANT] = df[P_VALUE] < df[ADJUSTED_ALPHA] + df[P_VALUE] = None + df[ADJUSTED_P] = None + return df else: raise NotImplementedError("Sequential testing is only supported for z-test and z-testlinreg") @@ -254,9 +268,9 @@ def set_alpha_and_adjust_preference(df: DataFrame, **kwargs: Dict) -> DataFrame: return df.assign( **{ ALPHA: df.apply( - lambda row: 2 * alpha_0 - if kwargs[CORRECTION_METHOD] == SPOT_1 and row[PREFERENCE] != TWO_SIDED - else alpha_0, + lambda row: ( + 2 * alpha_0 if kwargs[CORRECTION_METHOD] == SPOT_1 and row[PREFERENCE] != TWO_SIDED else alpha_0 + ), axis=1, ) } diff --git a/spotify_confidence/analysis/frequentist/nims_and_mdes.py b/spotify_confidence/analysis/frequentist/nims_and_mdes.py index 5ffdbb5..7bfc20f 100644 --- a/spotify_confidence/analysis/frequentist/nims_and_mdes.py +++ b/spotify_confidence/analysis/frequentist/nims_and_mdes.py @@ -29,9 +29,11 @@ def add_nim_input_columns_from_tuple_or_dict(df, nims: NIM_TYPE, mde_column: str elif nims is None or not nims: return df.assign(**{NIM_COLUMN_DEFAULT: None}).assign( **{ - PREFERRED_DIRECTION_COLUMN_DEFAULT: None - if PREFERRED_DIRECTION_COLUMN_DEFAULT not 
in df or mde_column is None - else df[PREFERRED_DIRECTION_COLUMN_DEFAULT] + PREFERRED_DIRECTION_COLUMN_DEFAULT: ( + None + if PREFERRED_DIRECTION_COLUMN_DEFAULT not in df or mde_column is None + else df[PREFERRED_DIRECTION_COLUMN_DEFAULT] + ) } ) else: diff --git a/spotify_confidence/samplesize/sample_size_calculator.py b/spotify_confidence/samplesize/sample_size_calculator.py index 5fdd151..10b77d5 100644 --- a/spotify_confidence/samplesize/sample_size_calculator.py +++ b/spotify_confidence/samplesize/sample_size_calculator.py @@ -61,7 +61,7 @@ def continuous( treatment_allocations=default_treatment_allocations, bonferroni_correction=default_bonferroni, ): - """Calculate the required sample size for a binomial metric. + """Calculate the required sample size for a continuous metric. Args: average_absolute_mde (float): Average absolute minimal detectable diff --git a/tests/frequentist/test_freqsamplesizecalculator.py b/tests/frequentist/test_freqsamplesizecalculator.py index be77024..1eaf148 100644 --- a/tests/frequentist/test_freqsamplesizecalculator.py +++ b/tests/frequentist/test_freqsamplesizecalculator.py @@ -819,6 +819,6 @@ def test_sample_size_with_nan(self): ) assert len(ss) == len(df) - assert ss[REQUIRED_SAMPLE_SIZE_METRIC].values[0] is None + assert ss[REQUIRED_SAMPLE_SIZE_METRIC].isna()[0] assert 0.999 < ss[REQUIRED_SAMPLE_SIZE_METRIC].values[1] / 95459 < 1.001 assert ss[CI_WIDTH].isna().all() diff --git a/tests/frequentist/test_ztest.py b/tests/frequentist/test_ztest.py index 7c3a16f..b03b4f1 100644 --- a/tests/frequentist/test_ztest.py +++ b/tests/frequentist/test_ztest.py @@ -14,6 +14,7 @@ ADJUSTED_UPPER, DIFFERENCE, BONFERRONI, + BONFERRONI_ONLY_COUNT_TWOSIDED, BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY, CORRECTION_METHODS, SPOT_1, @@ -21,6 +22,7 @@ POWERED_EFFECT, REQUIRED_SAMPLE_SIZE, ) +from statsmodels.stats.multitest import multipletests class TestPoweredEffectContinuousSingleMetric(object): @@ -203,8 +205,8 @@ def test_powered_effect(self): assert np.isclose(powered_effect[POWERED_EFFECT][1], 0.5291, atol=0.001) assert np.isclose(powered_effect[POWERED_EFFECT][2], 0.4596, atol=0.001) assert np.isclose(powered_effect[POWERED_EFFECT][3], 0.4869, atol=0.001) - assert powered_effect[REQUIRED_SAMPLE_SIZE][0] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][1] is None + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[0] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[1] assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][2], 16487886, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][3], 3083846, atol=100) @@ -329,10 +331,10 @@ def test_powered_effect(self): assert np.isclose(powered_effect[POWERED_EFFECT][6], 0.4995, atol=0.001) assert np.isclose(powered_effect[POWERED_EFFECT][7], 0.5291, atol=0.001) - assert powered_effect[REQUIRED_SAMPLE_SIZE][0] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][1] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][2] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][3] is None + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[0] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[1] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[2] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[3] assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][4], 19475238, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][5], 3642591, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][6], 19475238, atol=100) @@ -733,12 +735,12 @@ def test_powered_effect(self): # assert 
np.isclose(powered_effect[POWERED_EFFECT][10], 0.2663, atol=0.001) # assert np.isclose(powered_effect[POWERED_EFFECT][11], 0.2479, atol=0.001) - assert powered_effect[REQUIRED_SAMPLE_SIZE][0] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][1] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][2] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][3] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][4] is None - assert powered_effect[REQUIRED_SAMPLE_SIZE][5] is None + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[0] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[1] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[2] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[3] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[4] + assert powered_effect[REQUIRED_SAMPLE_SIZE].isna()[5] assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][6], 260541, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][7], 361863, atol=100) assert np.isclose(powered_effect[REQUIRED_SAMPLE_SIZE][8], 326159, atol=100) @@ -1535,7 +1537,12 @@ def test_difference_groupby(self, correction_method): @pytest.mark.parametrize("correction_method", CORRECTION_METHODS, ids=lambda x: f"correction method: {x}") def test_multiple_difference(self, correction_method): self.test._confidence_computer._correction_method = correction_method - if BONFERRONI in correction_method: + if correction_method in [ + BONFERRONI, + BONFERRONI_ONLY_COUNT_TWOSIDED, + BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY, + SPOT_1, + ]: difference_df = self.test.multiple_difference( level=("control", 1), groupby="country", level_as_reference=True ) @@ -1569,10 +1576,32 @@ def test_multiple_difference(self, correction_method): ) ) + if correction_method.startswith("spot-"): + corr_method = correction_method[7:] + else: + corr_method = correction_method + + _, adjusted_p, _, _ = multipletests( + pvals=difference_df["p-value"], + alpha=1 - self.test._confidence_computer._interval_size, + method=corr_method, + ) + + assert np.allclose( + adjusted_p, + difference_df["adjusted p-value"], + rtol=0.01, + ) + @pytest.mark.parametrize("correction_method", CORRECTION_METHODS, ids=lambda x: f"correction method: {x}") def test_multiple_difference_groupby(self, correction_method): self.test._confidence_computer._correction_method = correction_method - if BONFERRONI in correction_method: + if correction_method in [ + BONFERRONI, + BONFERRONI_ONLY_COUNT_TWOSIDED, + BONFERRONI_DO_NOT_COUNT_NON_INFERIORITY, + SPOT_1, + ]: difference_df = self.test.multiple_difference( level="control", groupby=["days_since_reg", "country"], level_as_reference=True ) @@ -1603,8 +1632,25 @@ def test_multiple_difference_groupby(self, correction_method): if correction_method in CORRECTION_METHODS_THAT_SUPPORT_CI: assert not any(difference_df[ADJUSTED_LOWER].isna()) + if correction_method.startswith("spot-"): + corr_method = correction_method[7:] + else: + corr_method = correction_method + + _, adjusted_p, _, _ = multipletests( + pvals=difference_df["p-value"], + alpha=1 - self.test._confidence_computer._interval_size, + method=corr_method, + ) + + assert np.allclose( + adjusted_p, + difference_df["adjusted p-value"], + rtol=0.01, + ) + @pytest.mark.parametrize("correction_method", CORRECTION_METHODS, ids=lambda x: f"correction method: {x}") - def test_differece_with_nims(self, correction_method): + def test_difference_with_nims(self, correction_method): self.test._confidence_computer._correction_method = correction_method df = self.test.difference( 
level_1=("test", "us"), diff --git a/tests/frequentist/test_ztest_linreg.py b/tests/frequentist/test_ztest_linreg.py index 6044f2c..eecd4ad 100644 --- a/tests/frequentist/test_ztest_linreg.py +++ b/tests/frequentist/test_ztest_linreg.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest import spotify_confidence from spotify_confidence.analysis.constants import REGRESSION_PARAM, DECREASE_PREFFERED, METHOD_COLUMN_NAME @@ -510,5 +511,104 @@ def test_parameters_univariate_required_sample_size(self): ) -# TODO: Test for sequential data (w/ ordinal column) -# TODO: Test for segmentation +class TestUnivariateSingleMetricWithBadPreExposureData(object): + def setup(self): + np.random.seed(123) + n = 10000 + d = np.random.randint(2, size=n) + y = 0.5 * d + np.random.standard_normal(size=n) + data = pd.DataFrame({"variation_name": list(map(str, d)), "y": y}) + data = ( + data.assign(y2=y**2) + .groupby(["variation_name"]) + .agg({"y": ["sum", "count"], "y2": "sum"}) + .assign(**{"x_sum": 0.0, "x2_sum": 0.0, "xy_sum": 0.0}) + .reset_index() + ) + + data.columns = data.columns.map("_".join).str.strip("_") + data = data.assign(**{"metric_name": "metricA"}) + self.n = n + self.y = y + self.d = d + self.data = data + + self.test = spotify_confidence.ZTestLinreg( + data_frame=data, + numerator_column="y_sum", + numerator_sum_squares_column="y2_sum", + denominator_column="y_count", + categorical_group_columns=["variation_name"], + feature_column="x_sum", + feature_sum_squares_column="x2_sum", + feature_cross_sum_column="xy_sum", + interval_size=0.99, + correction_method="bonferroni", + metric_column="metric_name", + ) + + def test_summary(self): + summary_df = self.test.summary(verbose=True) + print(summary_df) + assert len(summary_df) == len(self.data) + + def test_parameters_univariate(self): + summary_df = self.test.summary(verbose=True) + assert np.allclose(0.0, summary_df[REGRESSION_PARAM][0], rtol=0.0001) + + diff = self.test.difference(level_1="0", level_2="1", verbose=True, groupby="metric_name") + y = self.y + d = self.d + assert np.allclose(y[d == 1].mean() - y[d == 0].mean(), diff["difference"]) + + v0 = np.var(y[d == 0]) + v1 = np.var(y[d == 1]) + n0 = y[d == 0].size + n1 = y[d == 1].size + assert np.allclose(diff["std_err"], np.sqrt(v0 / n0 + v1 / n1), rtol=1e-3) + + +class TestUnivariateSingleMetricNegativeVariance(object): + def setup(self): + self.data = pd.DataFrame( + [ + { + "group": "1", + "count": 17512, + "sum": 16544, + "sum_of_squares": 16044, + "sum_2": 6625, + "sum_of_squares_2": 6455, + "sum_of_squares_x": 3513, + "metric_name": "metricA", + }, + { + "group": "2", + "count": 159142, + "sum": 150364, + "sum_of_squares": 145794, + "sum_2": 60540, + "sum_of_squares_2": 59047, + "sum_of_squares_x": 32398, + "metric_name": "metricA", + }, + ] + ) + + self.test = spotify_confidence.ZTestLinreg( + data_frame=self.data, + numerator_column="sum", + numerator_sum_squares_column="sum_of_squares", + denominator_column="count", + categorical_group_columns=["group"], + feature_column="sum_2", + feature_sum_squares_column="sum_of_squares_2", + feature_cross_sum_column="sum_of_squares_x", + interval_size=0.99, + correction_method="bonferroni", + metric_column="metric_name", + ) + + def test_setup_that_will_fail_with_negative_variance(self): + with pytest.raises(ValueError): + self.test.summary(verbose=True) diff --git a/tox.ini b/tox.ini index 81273da..fc4e692 100644 --- a/tox.ini +++ b/tox.ini @@ -1,19 +1,19 @@ [tox] -envlist = python3.8, python3.9, python3.10 +envlist = 
python3.9, python3.10, python3.11 skipsdist = True usedevelop = True [travis] python = - 3.8: python3.8 3.9: python3.9 3.10: python3.10 + 3.11: python3.11 [gh-actions] python = - 3.8: python3.8 3.9: python3.9 3.10: python3.10 + 3.11: python3.11 [testenv] setenv =