Skip to content

Commit

Permalink
Add include null to valid_count and invalid_count and percentage vers…
Browse files Browse the repository at this point in the history
…ion. (#2186)

* Add include null to valid_count and invalid_count and percentage version.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix missing configuration.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Milan Lukac <[email protected]>
  • Loading branch information
3 people authored Nov 28, 2024
1 parent 067c535 commit d030911
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 4 deletions.
23 changes: 19 additions & 4 deletions soda/core/soda/execution/metric/numeric_query_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ def get_sql_aggregation_expression(self) -> str | None:
missing_condition = self.build_missing_condition()
valid_condition = self.build_valid_condition()
invalid_condition = self.build_invalid_condition()
include_null = self.build_include_null()
if valid_condition:
condition = f"NOT ({missing_condition}) AND NOT ({valid_condition})"
condition = f"NOT ({missing_condition}) AND NOT ({valid_condition}){include_null}"
elif invalid_condition:
condition = f"NOT ({missing_condition}) AND ({invalid_condition})"
condition = f"NOT ({missing_condition}) AND ({invalid_condition}){include_null}"
else:
self.logs.warning(
f'Counting invalid without valid or invalid specification does not make sense. ("{self.check.check_cfg.source_line}" @ {self.check.check_cfg.location})'
Expand Down Expand Up @@ -272,13 +273,22 @@ def build_non_missing_and_valid_condition(self):
missing_condition = self.build_missing_condition()
valid_condition = self.build_valid_condition()
invalid_condition = self.build_invalid_condition()
include_null = self.build_include_null()
if valid_condition:
return f"NOT ({missing_condition}) AND ({valid_condition})"
return f"NOT ({missing_condition}) AND ({valid_condition}){include_null}"
elif invalid_condition:
return f"NOT ({missing_condition}) AND NOT ({invalid_condition})"
return f"NOT ({missing_condition}) AND NOT ({invalid_condition}){include_null}"
else:
return f"NOT ({missing_condition})"

def build_include_null(self) -> str:
    """Return the SQL fragment that adds NULL rows to a valid/invalid condition.

    Returns:
        ``" OR <column_name> IS NULL"`` when the missing-and-valid
        configuration sets ``include_null`` to ``True``. An empty string
        when there is no configuration, or when ``include_null`` is
        ``None`` or ``False``.
    """
    cfg = self.missing_and_valid_cfg

    # `include_null` is tri-state (None / False / True); only an explicit
    # True opts in, so test identity rather than comparing with `== True`.
    if cfg is None or cfg.include_null is not True:
        return ""

    # The leading space lets callers append the fragment directly to an
    # existing condition string.
    # NOTE(review): the fragment is not parenthesised; SQL AND binds tighter
    # than OR, so callers that AND-join further clauses should confirm the
    # resulting grouping is the intended one.
    return f" OR {self.column_name} IS NULL"

def get_numeric_format(self) -> str | None:
if self.missing_and_valid_cfg and FormatHelper.is_numeric(self.missing_and_valid_cfg.valid_format):
return self.missing_and_valid_cfg.valid_format
Expand All @@ -297,6 +307,7 @@ def create_failed_rows_sample_query(self) -> SampleQuery | None:
and isinstance(self.value, Number)
and self.value > 0
):
include_null = None
where_clauses = []
passing_where_clauses = []
partition_filter = self.partition.sql_partition_filter
Expand All @@ -309,6 +320,7 @@ def create_failed_rows_sample_query(self) -> SampleQuery | None:
where_clauses.append(f"({self.build_missing_condition()})")
passing_where_clauses.append(f"NOT ({self.build_missing_condition()})")
elif self.name == "invalid_count":
include_null = self.build_include_null()
where_clauses.append(f"NOT ({self.build_missing_condition()})")
passing_where_clauses.append(f"NOT ({self.build_missing_condition()})")

Expand All @@ -329,6 +341,9 @@ def create_failed_rows_sample_query(self) -> SampleQuery | None:
where_sql = " AND ".join(where_clauses)
passing_where_sql = " AND ".join(passing_where_clauses)

if include_null is not None:
where_sql += include_null

sql = self.data_source_scan.data_source.sql_select_all(
self.partition.table.table_name, self.samples_limit, where_sql
)
Expand Down
10 changes: 10 additions & 0 deletions soda/core/soda/sodacl/missing_and_valid_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
CFG_MISSING_FORMAT = "missing format"
CFG_MISSING_REGEX = "missing regex"

CFG_INCLUDE_NULL = "include null"

CFG_MISSING_VALID_ALL = [
CFG_VALID_VALUES,
CFG_INVALID_VALUES,
Expand All @@ -33,6 +35,7 @@
CFG_MISSING_VALUES,
CFG_MISSING_FORMAT,
CFG_MISSING_REGEX,
CFG_INCLUDE_NULL,
]


Expand Down Expand Up @@ -68,6 +71,8 @@ def __init__(self):
self.valid_min_location: Location | None = None
self.valid_max: float | None = None
self.valid_max_location: Location | None = None
self.include_null: bool | None = None
self.include_null_location: Location | None = None
# TODO
# self.valid_expr: Optional[str] = None

Expand All @@ -86,6 +91,7 @@ def get_identity_parts(self) -> list:
Identity.property("valid_max_length", self.valid_max_length),
Identity.property("valid_min", self.valid_min),
Identity.property("valid_max", self.valid_max),
Identity.property("include_null", self.include_null),
]

def is_empty(self) -> bool:
Expand Down Expand Up @@ -161,3 +167,7 @@ def __merge(self, other: MissingAndValidCfg):
if self.valid_max is None and other.valid_max is not None:
self.valid_max = other.valid_max
self.valid_max_location = other.valid_max_location

if self.include_null is None and other.include_null is not None:
self.include_null = other.include_null
self.include_null_location = other.include_null_location
8 changes: 8 additions & 0 deletions soda/core/soda/sodacl/sodacl_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1640,6 +1640,14 @@ def set_configuration_value(value):
f"{configuration_type} must be a string, but was {type(configuration_value).__name__}",
location=self.location,
)
elif configuration_type in ["include null"]:
if isinstance(configuration_value, bool):
set_configuration_value(configuration_value)
else:
self.logs.error(
f"{configuration_type} must be a boolean, but was '{type(configuration_value).__name__}'",
location=self.location,
)

def __parse_table_filter_section(self, antlr_table_filter_header, header_str, header_content):
from soda.sodacl.partition_cfg import PartitionCfg
Expand Down
36 changes: 36 additions & 0 deletions soda/core/tests/data_source/test_invalid.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,39 @@ def test_valid_with_invalid_config(check: str, data_source_fixture: DataSourceFi
scan.execute()

scan.assert_all_checks_pass()


def test_invalid_include_null(data_source_fixture: DataSourceFixture):
# Checks that `invalid_count` on column `pct` equals 3 when the check lists
# two invalid values and sets `include null: True`.
# NOTE(review): presumably 3 = rows matching the listed invalid values plus
# NULL row(s) in `pct` — confirm against customers_test_table.
table_name = data_source_fixture.ensure_test_table(customers_test_table)

# Row count is 10
scan = data_source_fixture.create_test_scan()
# SodaCL check under test; `include null` is the configuration key being exercised.
scan.add_sodacl_yaml_str(
f"""
checks for {table_name}:
- invalid_count(pct) = 3:
invalid values: ["error", "No value"]
include null: True
"""
)
scan.execute()

# The test passes only if every check in the scan passed.
scan.assert_all_checks_pass()


def test_valid_include_null(data_source_fixture: DataSourceFixture):
# Checks that `valid_count` on column `pct` equals 8 when validity is
# defined through `invalid values` and `include null: True` is set.
# NOTE(review): presumably 8 = rows that are neither missing nor invalid,
# plus NULL row(s) in `pct` — confirm against customers_test_table.
table_name = data_source_fixture.ensure_test_table(customers_test_table)

# Row count is 10
scan = data_source_fixture.create_test_scan()
# SodaCL check under test; `include null` is the configuration key being exercised.
scan.add_sodacl_yaml_str(
f"""
checks for {table_name}:
- valid_count(pct) = 8:
invalid values: ["error", "No value"]
include null: True
"""
)
scan.execute()

# The test passes only if every check in the scan passed.
scan.assert_all_checks_pass()

0 comments on commit d030911

Please sign in to comment.