Skip to content

Commit

Permalink
Add invalid_score and refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
FlorianWetschoreck committed Jul 28, 2020
1 parent d8375f4 commit 8baafaa
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 48 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)

## API

### ppscore.score(df, x, y, sample=5_000, cross_validation=4, random_seed=None)
### ppscore.score(df, x, y, sample=5_000, cross_validation=4, random_seed=None, invalid_score=0)

Calculate the Predictive Power Score (PPS) for "x predicts y"

Expand Down Expand Up @@ -116,6 +116,9 @@ Calculate the Predictive Power Score (PPS) for "x predicts y"
- __random_seed__ : int or ``None``
- Random seed for the parts of the calculation that require random numbers, e.g. shuffling or sampling.
If the value is set, the results will be reproducible. If the value is ``None``, a new random number is drawn at the start of each calculation.
- __invalid_score__ : any
- The score that is returned when a calculation is not valid, e.g. because the data type was not supported.

#### Returns

- __Dict__:
Expand Down
73 changes: 34 additions & 39 deletions src/ppscore/calculation.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def _f1_normalizer(df, y, model_score, random_seed):
return ppscore, baseline_score


CASES = {
VALID_CALCULATIONS = {
"regression": {
"type": "regression",
"is_valid_calculation": True,
Expand Down Expand Up @@ -182,42 +182,14 @@ def _f1_normalizer(df, y, model_score, random_seed):
"model": None,
"score_normalizer": None,
},
# cases that are invalid_calculations
"target_is_datetime": {
"type": "target_is_datetime",
"is_valid_calculation": False,
"model_score": 0,
"baseline_score": 0,
"ppscore": 0,
"metric_name": None,
"metric_key": None,
"model": None,
"score_normalizer": None,
},
"target_data_type_not_supported": {
"type": "target_data_type_not_supported",
"is_valid_calculation": False,
"model_score": 0,
"baseline_score": 0,
"ppscore": 0,
"metric_name": None,
"metric_key": None,
"model": None,
"score_normalizer": None,
},
"empty_dataframe_after_dropping_na": {
"type": "empty_dataframe_after_dropping_na",
"is_valid_calculation": False,
"model_score": 0,
"baseline_score": 0,
"ppscore": 0,
"metric_name": None,
"metric_key": None,
"model": None,
"score_normalizer": None,
},
}

# Case types for which no score can be calculated. They carry no per-case
# settings; _get_task builds their task dict from the caller's invalid_score.
INVALID_CALCULATIONS = [
"target_is_datetime",
"target_data_type_not_supported",
"empty_dataframe_after_dropping_na",
]


def _dtype_represents_categories(series) -> bool:
"Determines if the dtype of the series represents categorical values"
Expand Down Expand Up @@ -322,6 +294,7 @@ def score(
sample=5_000,
cross_validation=4,
random_seed=None,
invalid_score=0,
):
"""
Calculate the Predictive Power Score (PPS) for "x predicts y"
Expand All @@ -348,6 +321,8 @@ def score(
random_seed : int or ``None``
Random seed for the parts of the calculation that require random numbers, e.g. shuffling or sampling.
If the value is set, the results will be reproducible. If the value is ``None``, a new random number is drawn at the start of each calculation.
invalid_score : any
The score that is returned when a calculation is invalid, e.g. because the data type was not supported.
Returns
-------
Expand Down Expand Up @@ -386,8 +361,10 @@ def score(

random_seed = int(random() * 1000)

df, case_type = _determine_case_and_prepare_df(df, x, y, sample=sample, random_seed=random_seed)
task = CASES[case_type]
df, case_type = _determine_case_and_prepare_df(
df, x, y, sample=sample, random_seed=random_seed
)
task = _get_task(case_type, invalid_score)

if case_type in ["classification", "regression"]:
model_score = _calculate_model_cv_score_(
Expand Down Expand Up @@ -421,6 +398,24 @@ def score(
}


def _get_task(case_type, invalid_score):
    """
    Return the task definition that belongs to ``case_type``

    Parameters
    ----------
    case_type : str
        Name of the case, either a key of ``VALID_CALCULATIONS`` or an
        entry of ``INVALID_CALCULATIONS``
    invalid_score : any
        Value that is used for ``model_score``, ``baseline_score`` and
        ``ppscore`` when the case is an invalid calculation

    Returns
    -------
    Dict
        The entry of ``VALID_CALCULATIONS`` for a valid case (the stored
        dict itself, not a copy), otherwise a newly built dict that marks
        the calculation as invalid

    Raises
    ------
    Exception
        If ``case_type`` is neither a valid nor an invalid calculation
    """
    # valid cases are fully described by their predefined entry
    if case_type in VALID_CALCULATIONS:
        return VALID_CALCULATIONS[case_type]

    if case_type not in INVALID_CALCULATIONS:
        raise Exception(f"case_type {case_type} is not supported")

    # invalid cases all share one shape: every score field carries the
    # caller-supplied invalid_score and there is no metric or model
    task = {"type": case_type, "is_valid_calculation": False}
    task.update(
        dict.fromkeys(("model_score", "baseline_score", "ppscore"), invalid_score)
    )
    task.update(
        dict.fromkeys(("metric_name", "metric_key", "model", "score_normalizer"))
    )
    return task


def _format_list_of_dicts(scores, output, sorted):
"""
Format list of score dicts ``scores``
Expand Down Expand Up @@ -466,7 +461,7 @@ def predictors(df, y, output="df", sorted=True, **kwargs):
Whether or not to sort the output dataframe/list by the ppscore
kwargs:
Other key-word arguments that shall be forwarded to the pps.score method,
e.g. ``sample``, ``cross_validation``, or ``random_seed``
e.g. ``sample``, ``cross_validation``, ``random_seed``, ``invalid_score``
Returns
-------
Expand Down Expand Up @@ -514,7 +509,7 @@ def matrix(df, output="df", sorted=False, **kwargs):
Whether or not to sort the output dataframe/list by the ppscore
kwargs:
Other key-word arguments that shall be forwarded to the pps.score method,
e.g. ``sample``, ``cross_validation``, or ``random_seed``
e.g. ``sample``, ``cross_validation``, ``random_seed``, ``invalid_score``
Returns
-------
Expand Down
39 changes: 31 additions & 8 deletions tests/test_calculation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,36 @@ def test__determine_case_and_prepare_df():
assert _determine_case_and_prepare_df(df, "x", "Pclass_integer")[1] == "regression"

# check classification
assert _determine_case_and_prepare_df(df, "x", "Pclass_category")[1] == "classification"
assert _determine_case_and_prepare_df(df, "x", "Survived_boolean")[1] == "classification"
assert _determine_case_and_prepare_df(df, "x", "Ticket_object")[1] == "classification"
assert _determine_case_and_prepare_df(df, "x", "Cabin_string")[1] == "classification"
assert (
_determine_case_and_prepare_df(df, "x", "Pclass_category")[1]
== "classification"
)
assert (
_determine_case_and_prepare_df(df, "x", "Survived_boolean")[1]
== "classification"
)
assert (
_determine_case_and_prepare_df(df, "x", "Ticket_object")[1] == "classification"
)
assert (
_determine_case_and_prepare_df(df, "x", "Cabin_string")[1] == "classification"
)

# check special cases
assert _determine_case_and_prepare_df(df, "Name_object_id", "x")[1] == "feature_is_id"
assert (
_determine_case_and_prepare_df(df, "Name_object_id", "x")[1] == "feature_is_id"
)
assert _determine_case_and_prepare_df(df, "x", "x")[1] == "predict_itself"
assert _determine_case_and_prepare_df(df, "x", "constant")[1] == "target_is_constant"
assert _determine_case_and_prepare_df(df, "x", "Name_object_id")[1] == "target_is_id"
assert _determine_case_and_prepare_df(df, "x", "Pclass_datetime")[1] == "target_is_datetime"
assert (
_determine_case_and_prepare_df(df, "x", "constant")[1] == "target_is_constant"
)
assert (
_determine_case_and_prepare_df(df, "x", "Name_object_id")[1] == "target_is_id"
)
assert (
_determine_case_and_prepare_df(df, "x", "Pclass_datetime")[1]
== "target_is_datetime"
)


def test__maybe_sample():
Expand Down Expand Up @@ -144,6 +163,10 @@ def test_score():
# the random seed that is drawn automatically is smaller than 1000
assert pps.score(df, "x", "y") != pps.score(df, "x", "y", random_seed=123_456)

# check invalid_score
invalid_score = -99
assert pps.score(df, "nan", "y", invalid_score=invalid_score)["ppscore"] == invalid_score

# check case discrimination
assert pps.score(df, "x", "y")["case"] == "regression"
assert pps.score(df, "x", "x_greater_0_string")["case"] == "classification"
Expand Down

0 comments on commit 8baafaa

Please sign in to comment.