Much-improved support for NaN values

XrosLiang · Jul 20, 2020 · 5e68147 · 5e68147
1 parent 7ce4ee7
commit 5e68147
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 10 deletions.
diff --git a/sweetviz/graph_cat.py b/sweetviz/graph_cat.py
@@ -83,7 +83,7 @@ def __init__(self, which_graph: str, to_process: FeatureToProcess):
  axs.xaxis.tick_top()
  elif is_detail:
  height = config["Graphs"].getfloat("detail_graph_height_base") \
- + config["Graphs"].getfloat("detail_graph_height_per_elem") * len(plot_data_series)
+ + config["Graphs"].getfloat("detail_graph_height_per_elem") * max(1, len(plot_data_series))
  if height > config["Graphs"].getfloat("detail_graph_categorical_max_height"):
  # Shrink height to fit, past a certain number
  height = config["Graphs"].getfloat("detail_graph_categorical_max_height")

diff --git a/sweetviz/graph_numeric.py b/sweetviz/graph_numeric.py
@@ -47,15 +47,23 @@ def __init__(self, which_graph: str, to_process: FeatureToProcess):
  np.seterr(all='raise')
  # WORKAROUND histogram warnings
  cleaned_source = to_process.source[~np.isnan(to_process.source)]
+ if len(cleaned_source):
+ norm_source = np.full(len(cleaned_source), 1.0 / len(cleaned_source))
+ else:
+ norm_source = []
  if to_process.compare is not None:
  # COMPARE
  cleaned_compare = to_process.compare[~np.isnan(to_process.compare)]
  plot_data = (cleaned_source, cleaned_compare)
- normalizing_weights = (np.full(len(cleaned_source), 1.0 / len(cleaned_source)),
- np.full(len(cleaned_compare), 1.0 / len(cleaned_compare)))
+ if len(cleaned_compare):
+ norm_compare = np.full(len(cleaned_compare), 1.0 / len(cleaned_compare))
+ else:
+ norm_compare = []
+ normalizing_weights = (norm_source, norm_compare)
+
  else:
  plot_data = cleaned_source
- normalizing_weights = np.full(len(cleaned_source), 1.0 / len(cleaned_source))
+ normalizing_weights = norm_source
 
  gap_percent = config["Graphs"].getfloat("summary_graph_categorical_gap")
 

diff --git a/sweetviz/series_analyzer.py b/sweetviz/series_analyzer.py
@@ -83,6 +83,7 @@ def analyze_feature_to_dictionary(to_process: FeatureToProcess) -> dict:
  to_process.source_counts = get_counts(to_process.source)
  returned_feature_dict["type"] = determine_feature_type(to_process.source, to_process.source_counts,
  to_process.predetermined_type, "SOURCE")
+ source_type = returned_feature_dict["type"]
 
  # Determine COMPARED feature type & initialize
  compare_dict = None
@@ -91,14 +92,31 @@ def analyze_feature_to_dictionary(to_process: FeatureToProcess) -> dict:
  compare_type = determine_feature_type(to_process.compare,
  to_process.compare_counts,
  returned_feature_dict["type"], "COMPARED")
- # Explicitly show missing categories on each set
- if compare_type == FeatureType.TYPE_CAT or compare_type == FeatureType.TYPE_BOOL:
- fill_out_missing_counts_in_other_series(to_process.compare_counts, to_process.source_counts)
- fill_out_missing_counts_in_other_series(to_process.source_counts, to_process.compare_counts)
+ if compare_type != FeatureType.TYPE_ALL_NAN and \
+ source_type != FeatureType.TYPE_ALL_NAN:
+ # Explicitly show missing categories on each set
+ if compare_type == FeatureType.TYPE_CAT or compare_type == FeatureType.TYPE_BOOL:
+ fill_out_missing_counts_in_other_series(to_process.compare_counts, to_process.source_counts)
+ fill_out_missing_counts_in_other_series(to_process.source_counts, to_process.compare_counts)
  returned_feature_dict["compare"] = dict()
  compare_dict = returned_feature_dict["compare"]
  compare_dict["type"] = compare_type
 
+ # Settle all-NaN series, depending on source versus compared
+ if to_process.compare is not None:
+ # Settle all-NaN WITH COMPARE: must consider all cases between source and compare
+ if compare_type == FeatureType.TYPE_ALL_NAN and source_type == FeatureType.TYPE_ALL_NAN:
+ returned_feature_dict["type"] = FeatureType.TYPE_TEXT
+ compare_dict["type"] = FeatureType.TYPE_TEXT
+ elif compare_type == FeatureType.TYPE_ALL_NAN:
+ compare_dict["type"] = source_type
+ elif source_type == FeatureType.TYPE_ALL_NAN:
+ returned_feature_dict["type"] = compare_type
+ else:
+ # Settle all-NaN WITHOUT COMPARE (trivial: consider as TEXT)
+ if source_type == FeatureType.TYPE_ALL_NAN:
+ returned_feature_dict["type"] = FeatureType.TYPE_TEXT
+
  # Establish base stats
  add_series_base_stats_to_dict(to_process.source, to_process.source_counts, returned_feature_dict)
  if to_process.compare is not None:

diff --git a/sweetviz/sv_types.py b/sweetviz/sv_types.py
@@ -12,6 +12,7 @@ class FeatureType(Enum):
  TYPE_NUM = "NUM"
  TYPE_TEXT = "TEXT"
  TYPE_UNSUPPORTED = "UNSUPPORTED"
+ TYPE_ALL_NAN = "ALL_NAN"
  TYPE_UNKNOWN = "UNKNOWN"
  TYPE_SKIPPED = "SKIPPED"
 

diff --git a/sweetviz/type_detection.py b/sweetviz/type_detection.py
@@ -29,7 +29,8 @@ def determine_feature_type(series: pd.Series, counts: dict,
  # TODO: must_be_this_type ENFORCING
  if counts["distinct_count_without_nan"] == 0:
  # Empty
- var_type = FeatureType.TYPE_UNSUPPORTED
+ var_type = FeatureType.TYPE_ALL_NAN
+ # var_type = FeatureType.TYPE_UNSUPPORTED
  elif is_boolean(series, counts):
  var_type = FeatureType.TYPE_BOOL
  elif is_numeric(series, counts):
@@ -48,7 +49,9 @@ def determine_feature_type(series: pd.Series, counts: dict,
  # NUM -> CAT
  # NUM -> TEXT
  if must_be_this_type != FeatureType.TYPE_UNKNOWN and \
- must_be_this_type != var_type:
+ must_be_this_type != var_type and \
+ must_be_this_type != FeatureType.TYPE_ALL_NAN and \
+ var_type != FeatureType.TYPE_ALL_NAN:
  if var_type == FeatureType.TYPE_TEXT and must_be_this_type == FeatureType.TYPE_CAT:
  var_type = FeatureType.TYPE_CAT
  elif (var_type == FeatureType.TYPE_CAT or var_type == FeatureType.TYPE_BOOL ) and \