Skip to content

Commit

Permalink
Much-improved support for NaN values
Browse files Browse the repository at this point in the history
  • Loading branch information
fbdesignpro committed Jul 20, 2020
1 parent 7ce4ee7 commit 5e68147
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 10 deletions.
2 changes: 1 addition & 1 deletion sweetviz/graph_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(self, which_graph: str, to_process: FeatureToProcess):
axs.xaxis.tick_top()
elif is_detail:
height = config["Graphs"].getfloat("detail_graph_height_base") \
+ config["Graphs"].getfloat("detail_graph_height_per_elem") * len(plot_data_series)
+ config["Graphs"].getfloat("detail_graph_height_per_elem") * max(1, len(plot_data_series))
if height > config["Graphs"].getfloat("detail_graph_categorical_max_height"):
# Shrink height to fit, past a certain number
height = config["Graphs"].getfloat("detail_graph_categorical_max_height")
Expand Down
14 changes: 11 additions & 3 deletions sweetviz/graph_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,23 @@ def __init__(self, which_graph: str, to_process: FeatureToProcess):
np.seterr(all='raise')
# WORKAROUND histogram warnings
cleaned_source = to_process.source[~np.isnan(to_process.source)]
if len(cleaned_source):
norm_source = np.full(len(cleaned_source), 1.0 / len(cleaned_source))
else:
norm_source = []
if to_process.compare is not None:
# COMPARE
cleaned_compare = to_process.compare[~np.isnan(to_process.compare)]
plot_data = (cleaned_source, cleaned_compare)
normalizing_weights = (np.full(len(cleaned_source), 1.0 / len(cleaned_source)),
np.full(len(cleaned_compare), 1.0 / len(cleaned_compare)))
if len(cleaned_compare):
norm_compare = np.full(len(cleaned_compare), 1.0 / len(cleaned_compare))
else:
norm_compare = []
normalizing_weights = (norm_source, norm_compare)

else:
plot_data = cleaned_source
normalizing_weights = np.full(len(cleaned_source), 1.0 / len(cleaned_source))
normalizing_weights = norm_source

gap_percent = config["Graphs"].getfloat("summary_graph_categorical_gap")

Expand Down
26 changes: 22 additions & 4 deletions sweetviz/series_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def analyze_feature_to_dictionary(to_process: FeatureToProcess) -> dict:
to_process.source_counts = get_counts(to_process.source)
returned_feature_dict["type"] = determine_feature_type(to_process.source, to_process.source_counts,
to_process.predetermined_type, "SOURCE")
source_type = returned_feature_dict["type"]

# Determine COMPARED feature type & initialize
compare_dict = None
Expand All @@ -91,14 +92,31 @@ def analyze_feature_to_dictionary(to_process: FeatureToProcess) -> dict:
compare_type = determine_feature_type(to_process.compare,
to_process.compare_counts,
returned_feature_dict["type"], "COMPARED")
# Explicitly show missing categories on each set
if compare_type == FeatureType.TYPE_CAT or compare_type == FeatureType.TYPE_BOOL:
fill_out_missing_counts_in_other_series(to_process.compare_counts, to_process.source_counts)
fill_out_missing_counts_in_other_series(to_process.source_counts, to_process.compare_counts)
if compare_type != FeatureType.TYPE_ALL_NAN and \
source_type != FeatureType.TYPE_ALL_NAN:
# Explicitly show missing categories on each set
if compare_type == FeatureType.TYPE_CAT or compare_type == FeatureType.TYPE_BOOL:
fill_out_missing_counts_in_other_series(to_process.compare_counts, to_process.source_counts)
fill_out_missing_counts_in_other_series(to_process.source_counts, to_process.compare_counts)
returned_feature_dict["compare"] = dict()
compare_dict = returned_feature_dict["compare"]
compare_dict["type"] = compare_type

# Settle all-NaN series, depending on source versus compared
if to_process.compare is not None:
# Settle all-NaN WITH COMPARE: Must consider all cases between source and compare
if compare_type == FeatureType.TYPE_ALL_NAN and source_type == FeatureType.TYPE_ALL_NAN:
returned_feature_dict["type"] = FeatureType.TYPE_TEXT
compare_dict["type"] = FeatureType.TYPE_TEXT
elif compare_type == FeatureType.TYPE_ALL_NAN:
compare_dict["type"] = source_type
elif source_type == FeatureType.TYPE_ALL_NAN:
returned_feature_dict["type"] = compare_type
else:
# Settle all-NaN WITHOUT COMPARE (trivial: consider as TEXT)
if source_type == FeatureType.TYPE_ALL_NAN:
returned_feature_dict["type"] = FeatureType.TYPE_TEXT

# Establish base stats
add_series_base_stats_to_dict(to_process.source, to_process.source_counts, returned_feature_dict)
if to_process.compare is not None:
Expand Down
1 change: 1 addition & 0 deletions sweetviz/sv_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class FeatureType(Enum):
TYPE_NUM = "NUM"
TYPE_TEXT = "TEXT"
TYPE_UNSUPPORTED = "UNSUPPORTED"
TYPE_ALL_NAN = "ALL_NAN"
TYPE_UNKNOWN = "UNKNOWN"
TYPE_SKIPPED = "SKIPPED"

Expand Down
7 changes: 5 additions & 2 deletions sweetviz/type_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def determine_feature_type(series: pd.Series, counts: dict,
# TODO: must_be_this_type ENFORCING
if counts["distinct_count_without_nan"] == 0:
# Empty
var_type = FeatureType.TYPE_UNSUPPORTED
var_type = FeatureType.TYPE_ALL_NAN
# var_type = FeatureType.TYPE_UNSUPPORTED
elif is_boolean(series, counts):
var_type = FeatureType.TYPE_BOOL
elif is_numeric(series, counts):
Expand All @@ -48,7 +49,9 @@ def determine_feature_type(series: pd.Series, counts: dict,
# NUM -> CAT
# NUM -> TEXT
if must_be_this_type != FeatureType.TYPE_UNKNOWN and \
must_be_this_type != var_type:
must_be_this_type != var_type and \
must_be_this_type != FeatureType.TYPE_ALL_NAN and \
var_type != FeatureType.TYPE_ALL_NAN:
if var_type == FeatureType.TYPE_TEXT and must_be_this_type == FeatureType.TYPE_CAT:
var_type = FeatureType.TYPE_CAT
elif (var_type == FeatureType.TYPE_CAT or var_type == FeatureType.TYPE_BOOL ) and \
Expand Down

0 comments on commit 5e68147

Please sign in to comment.