Add invalid_score and refactor

valteresj2 · Jul 28, 2020 · 8baafaa · 8baafaa
1 parent d8375f4
commit 8baafaa
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -86,7 +86,7 @@ sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
 
 ## API
 
-### ppscore.score(df, x, y, sample=5_000, cross_validation=4, random_seed=None)
+### ppscore.score(df, x, y, sample=5_000, cross_validation=4, random_seed=None, invalid_score=0)
 
 Calculate the Predictive Power Score (PPS) for "x predicts y"
 
@@ -116,6 +116,9 @@ Calculate the Predictive Power Score (PPS) for "x predicts y"
 - __random_seed__ : int or ``None``
     - Random seed for the parts of the calculation that require random numbers, e.g. shuffling or sampling.
     If the value is set, the results will be reproducible. If the value is ``None`` a new random number is drawn at the start of each calculation.
+- __invalid_score__ : any
+    - The score that is returned when a calculation is not valid, e.g. because the data type was not supported.
+
 #### Returns
 
 - __Dict__:

diff --git a/src/ppscore/calculation.py b/src/ppscore/calculation.py
@@ -115,7 +115,7 @@ def _f1_normalizer(df, y, model_score, random_seed):
     return ppscore, baseline_score
 
 
-CASES = {
+VALID_CALCULATIONS = {
     "regression": {
         "type": "regression",
         "is_valid_calculation": True,
@@ -182,42 +182,14 @@ def _f1_normalizer(df, y, model_score, random_seed):
         "model": None,
         "score_normalizer": None,
     },
-    # cases that are invalid_calculations
-    "target_is_datetime": {
-        "type": "target_is_datetime",
-        "is_valid_calculation": False,
-        "model_score": 0,
-        "baseline_score": 0,
-        "ppscore": 0,
-        "metric_name": None,
-        "metric_key": None,
-        "model": None,
-        "score_normalizer": None,
-    },
-    "target_data_type_not_supported": {
-        "type": "target_data_type_not_supported",
-        "is_valid_calculation": False,
-        "model_score": 0,
-        "baseline_score": 0,
-        "ppscore": 0,
-        "metric_name": None,
-        "metric_key": None,
-        "model": None,
-        "score_normalizer": None,
-    },
-    "empty_dataframe_after_dropping_na": {
-        "type": "empty_dataframe_after_dropping_na",
-        "is_valid_calculation": False,
-        "model_score": 0,
-        "baseline_score": 0,
-        "ppscore": 0,
-        "metric_name": None,
-        "metric_key": None,
-        "model": None,
-        "score_normalizer": None,
-    },
 }
 
+INVALID_CALCULATIONS = [
+    "target_is_datetime",
+    "target_data_type_not_supported",
+    "empty_dataframe_after_dropping_na",
+]
+
 
 def _dtype_represents_categories(series) -> bool:
     "Determines if the dtype of the series represents categorical values"
@@ -322,6 +294,7 @@ def score(
     sample=5_000,
     cross_validation=4,
     random_seed=None,
+    invalid_score=0,
 ):
     """
     Calculate the Predictive Power Score (PPS) for "x predicts y"
@@ -348,6 +321,8 @@ def score(
     random_seed : int or ``None``
         Random seed for the parts of the calculation that require random numbers, e.g. shuffling or sampling.
         If the value is set, the results will be reproducible. If the value is ``None`` a new random number is drawn at the start of each calculation.
+    invalid_score : any
+        The score that is returned when a calculation is invalid, e.g. because the data type was not supported.
 
     Returns
     -------
@@ -386,8 +361,10 @@ def score(
 
         random_seed = int(random() * 1000)
 
-    df, case_type = _determine_case_and_prepare_df(df, x, y, sample=sample, random_seed=random_seed)
-    task = CASES[case_type]
+    df, case_type = _determine_case_and_prepare_df(
+        df, x, y, sample=sample, random_seed=random_seed
+    )
+    task = _get_task(case_type, invalid_score)
 
     if case_type in ["classification", "regression"]:
         model_score = _calculate_model_cv_score_(
@@ -421,6 +398,24 @@ def score(
     }
 
 
+def _get_task(case_type, invalid_score):
+    """
+    Look up the task definition for ``case_type``
+
+    Parameters
+    ----------
+    case_type : str
+        Name of the case, i.e. a key of ``VALID_CALCULATIONS`` or an entry of ``INVALID_CALCULATIONS``
+    invalid_score : any
+        Value that is used as ``model_score``, ``baseline_score`` and ``ppscore``
+        when ``case_type`` is one of the ``INVALID_CALCULATIONS``
+
+    Returns
+    -------
+    Dict
+        For a valid case the entry of ``VALID_CALCULATIONS`` (the shared module-level dict, not a copy).
+        For an invalid case a newly built dict with ``is_valid_calculation`` set to ``False``
+
+    Raises
+    ------
+    ValueError
+        If ``case_type`` is neither a valid nor an invalid calculation
+    """
+    # membership on the dict itself already checks its keys
+    if case_type in VALID_CALCULATIONS:
+        return VALID_CALCULATIONS[case_type]
+    if case_type in INVALID_CALCULATIONS:
+        # invalid cases have no model or metric, only the caller-chosen placeholder score
+        return {
+            "type": case_type,
+            "is_valid_calculation": False,
+            "model_score": invalid_score,
+            "baseline_score": invalid_score,
+            "ppscore": invalid_score,
+            "metric_name": None,
+            "metric_key": None,
+            "model": None,
+            "score_normalizer": None,
+        }
+    # ValueError is a subclass of Exception, so existing ``except Exception`` handlers keep working
+    raise ValueError(f"case_type {case_type} is not supported")
+
+
 def _format_list_of_dicts(scores, output, sorted):
     """
     Format list of score dicts ``scores``
@@ -466,7 +461,7 @@ def predictors(df, y, output="df", sorted=True, **kwargs):
         Whether or not to sort the output dataframe/list by the ppscore
     kwargs:
         Other key-word arguments that shall be forwarded to the pps.score method,
-        e.g. ``sample``, ``cross_validation``, or ``random_seed``
+        e.g. ``sample``, ``cross_validation``, ``random_seed``, ``invalid_score``
 
     Returns
     -------
@@ -514,7 +509,7 @@ def matrix(df, output="df", sorted=False, **kwargs):
         Whether or not to sort the output dataframe/list by the ppscore
     kwargs:
         Other key-word arguments that shall be forwarded to the pps.score method,
-        e.g. ``sample``, ``cross_validation``, or ``random_seed``
+        e.g. ``sample``, ``cross_validation``, ``random_seed``, ``invalid_score``
 
     Returns
     -------

diff --git a/tests/test_calculation.py b/tests/test_calculation.py
@@ -49,17 +49,36 @@ def test__determine_case_and_prepare_df():
     assert _determine_case_and_prepare_df(df, "x", "Pclass_integer")[1] == "regression"
 
     # check classification
-    assert _determine_case_and_prepare_df(df, "x", "Pclass_category")[1] == "classification"
-    assert _determine_case_and_prepare_df(df, "x", "Survived_boolean")[1] == "classification"
-    assert _determine_case_and_prepare_df(df, "x", "Ticket_object")[1] == "classification"
-    assert _determine_case_and_prepare_df(df, "x", "Cabin_string")[1] == "classification"
+    assert (
+        _determine_case_and_prepare_df(df, "x", "Pclass_category")[1]
+        == "classification"
+    )
+    assert (
+        _determine_case_and_prepare_df(df, "x", "Survived_boolean")[1]
+        == "classification"
+    )
+    assert (
+        _determine_case_and_prepare_df(df, "x", "Ticket_object")[1] == "classification"
+    )
+    assert (
+        _determine_case_and_prepare_df(df, "x", "Cabin_string")[1] == "classification"
+    )
 
     # check special cases
-    assert _determine_case_and_prepare_df(df, "Name_object_id", "x")[1] == "feature_is_id"
+    assert (
+        _determine_case_and_prepare_df(df, "Name_object_id", "x")[1] == "feature_is_id"
+    )
     assert _determine_case_and_prepare_df(df, "x", "x")[1] == "predict_itself"
-    assert _determine_case_and_prepare_df(df, "x", "constant")[1] == "target_is_constant"
-    assert _determine_case_and_prepare_df(df, "x", "Name_object_id")[1] == "target_is_id"
-    assert _determine_case_and_prepare_df(df, "x", "Pclass_datetime")[1] == "target_is_datetime"
+    assert (
+        _determine_case_and_prepare_df(df, "x", "constant")[1] == "target_is_constant"
+    )
+    assert (
+        _determine_case_and_prepare_df(df, "x", "Name_object_id")[1] == "target_is_id"
+    )
+    assert (
+        _determine_case_and_prepare_df(df, "x", "Pclass_datetime")[1]
+        == "target_is_datetime"
+    )
 
 
 def test__maybe_sample():
@@ -144,6 +163,10 @@ def test_score():
     # the automatically drawn random seed is always below 1000, so it differs from 123_456
     assert pps.score(df, "x", "y") != pps.score(df, "x", "y", random_seed=123_456)
 
+    # check invalid_score
+    invalid_score = -99
+    assert pps.score(df, "nan", "y", invalid_score=invalid_score)["ppscore"] == invalid_score
+
     # check case discrimination
     assert pps.score(df, "x", "y")["case"] == "regression"
     assert pps.score(df, "x", "x_greater_0_string")["case"] == "classification"