From d9427f5daeea17b746a00382d07023ddc02ca922 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 10:52:59 -0300 Subject: [PATCH 01/12] add custom mask functionalities --- .../utilities/data_masking/base.py | 129 ++++++++++++++++-- .../utilities/data_masking/provider/base.py | 73 +++++++++- 2 files changed, 186 insertions(+), 16 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 9b80e50bd58..4cebcef37cb 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import functools import logging import warnings @@ -94,8 +95,41 @@ def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... @overload def erase(self, data: dict, fields: list[str]) -> dict: ... - def erase(self, data: Sequence | Mapping, fields: list[str] | None = None) -> str | list[str] | tuple[str] | dict: - return self._apply_action(data=data, fields=fields, action=self.provider.erase) + @overload + def erase( + self, + data: dict, + fields: list[str], + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + ) -> dict: ... + + def erase( + self, + data: Sequence | Mapping, + fields: list[str] | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + masking_rules: dict | None = None, + ) -> str | list[str] | tuple[str] | dict: + if not data: + return data + if masking_rules: + return self._apply_masking_rules(data, masking_rules) + else: + return self._apply_action( + data=data, + fields=fields, + action=self.provider.erase, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + ) def _apply_action( self, @@ -103,6 +137,10 @@ def _apply_action( fields: list[str] | None, action: Callable, provider_options: dict | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context: str, ): """ @@ -136,11 +174,23 @@ def _apply_action( fields=fields, action=action, provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, **encryption_context, ) else: logger.debug(f"Running action {action.__name__} with the entire data") - return action(data=data, provider_options=provider_options, **encryption_context) + return action( + data=data, + provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + **encryption_context, + ) def _apply_action_to_fields( self, @@ -148,6 +198,10 @@ def _apply_action_to_fields( fields: list, action: Callable, provider_options: dict | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context: str, ) -> dict | str: """ @@ -194,6 +248,8 @@ def _apply_action_to_fields( new_dict = {'a': {'b': {'c': '*****'}}, 'x': {'y': '*****'}} ``` """ + if not fields: + raise ValueError("Fields parameter cannot be empty") data_parsed: dict = self._normalize_data_to_parse(fields, data) @@ -204,6 +260,10 @@ def _apply_action_to_fields( self._call_action, action=action, provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, **encryption_context, # type: ignore[arg-type] ) @@ -225,12 +285,6 @@ def _apply_action_to_fields( # For in-place updates, json_parse accepts a callback function # that receives 3 args: field_value, fields, field_name # We create a partial callback to pre-populate known provider options (action, provider opts, enc ctx) - update_callback = functools.partial( - self._call_action, - action=action, - provider_options=provider_options, - **encryption_context, # type: ignore[arg-type] - ) json_parse.update( data_parsed, @@ -239,6 +293,49 @@ def _apply_action_to_fields( return data_parsed + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + """ + Apply masking rules to data, supporting different rules for each field. + """ + result = data.copy() + + for path, rule in masking_rules.items(): + try: + # Handle nested paths (e.g., 'address.street') + parts = path.split(".") + current = result + + for part in parts[:-1]: + if isinstance(current[part], str) and current[part].startswith("{"): + try: + current[part] = ast.literal_eval(current[part]) + except (ValueError, SyntaxError): + continue + current = current[part] + + final_field = parts[-1] + + # Apply masking rule to the target field + if final_field in current: + current[final_field] = self.provider.erase(str(current[final_field]), **rule) + + except (KeyError, TypeError, AttributeError): + # Log warning if field not found or invalid path + warnings.warn(f"Could not apply masking rule for path: {path}", stacklevel=2) + continue + + return result + + def _mask_nested_field(self, data: dict, field_path: str, mask_function): + keys = field_path.split(".") + current = data + for key in keys[:-1]: + current = current.get(key, {}) + if not isinstance(current, dict): + return # Caminho inválido + if keys[-1] in current: + current[keys[-1]] = mask_function(current[keys[-1]]) + @staticmethod def _call_action( field_value: Any, @@ -246,6 +343,10 @@ def _call_action( field_name: str, action: Callable, provider_options: dict[str, Any] | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context, ) -> None: """ @@ -263,7 +364,15 @@ def _call_action( Returns: - fields[field_name]: Returns the processed field value """ - fields[field_name] = action(field_value, provider_options=provider_options, **encryption_context) + fields[field_name] = action( + field_value, + provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + **encryption_context, + ) return fields[field_name] def _normalize_data_to_parse(self, fields: list, data: str | dict) -> dict: diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 28bc8384f8d..6a5d806f056 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -2,10 +2,14 @@ import functools import json +import re from typing import Any, Callable, Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING +PRESERVE_CHARS = set("-_. ") +_regex_cache = {} + class BaseProvider: """ @@ -63,7 +67,16 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte """ raise NotImplementedError("Subclasses must implement decrypt()") - def erase(self, data, **kwargs) -> Iterable[str]: + def erase( + self, + data, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + masking_rules: dict | None = None, + **kwargs, + ) -> Iterable[str]: """ This method irreversibly erases data. @@ -72,10 +85,58 @@ def erase(self, data, **kwargs) -> Iterable[str]: If the data to be erased is of an iterable type like `list`, `tuple`, or `set`, this method will return a new object of the same type as the - input data but with each element replaced by the string "*****". + input data but with each element replaced by the string "*****" or following one of the custom masks. """ - if isinstance(data, (str, dict, bytes)): + result = DATA_MASKING_STRING + + if data: + if isinstance(data, str): + if custom_mask: + if mask_pattern: + result = self._pattern_mask(data, mask_pattern) + elif regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) + else: + result = self._custom_erase(data, **kwargs) + elif isinstance(data, dict): + if masking_rules: + result = self._apply_masking_rules(data, masking_rules) + elif isinstance(data, (list, tuple, set)): + result = type(data)( + self.erase( + item, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) + for item in data + ) + + return result + + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + return { + key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) + for key, value in data.items() + } + + def _pattern_mask(self, data: str, pattern: str) -> str: + return pattern[: len(data)] if len(pattern) >= len(data) else pattern + + def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: + try: + if regex_pattern not in _regex_cache: + _regex_cache[regex_pattern] = re.compile(regex_pattern) + return _regex_cache[regex_pattern].sub(mask_format, data) + except re.error: return DATA_MASKING_STRING - elif isinstance(data, (list, tuple, set)): - return type(data)([DATA_MASKING_STRING] * len(data)) - return DATA_MASKING_STRING + + def _custom_erase(self, data: str, **kwargs) -> str: + if not data: + return "" + + # Use join with list comprehension instead of building list incrementally + return "".join("*" if char not in PRESERVE_CHARS else char for char in data) From 796bd898de8e95c8342994449e157ccad15d1ce7 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 14:05:39 -0300 Subject: [PATCH 02/12] change flags name to more intuitive --- .../utilities/data_masking/base.py | 30 +++++++++---------- .../utilities/data_masking/provider/base.py | 17 +++++------ 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 4cebcef37cb..f08e10371f7 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -100,8 +100,8 @@ def erase( self, data: dict, fields: list[str], - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, ) -> dict: ... @@ -110,8 +110,8 @@ def erase( self, data: Sequence | Mapping, fields: list[str] | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, @@ -125,8 +125,8 @@ def erase( data=data, fields=fields, action=self.provider.erase, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, ) @@ -137,8 +137,8 @@ def _apply_action( fields: list[str] | None, action: Callable, provider_options: dict | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context: str, @@ -174,8 +174,8 @@ def _apply_action( fields=fields, action=action, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, @@ -185,8 +185,8 @@ def _apply_action( return action( data=data, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, @@ -198,8 +198,8 @@ def _apply_action_to_fields( fields: list, action: Callable, provider_options: dict | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context: str, @@ -260,8 +260,8 @@ def _apply_action_to_fields( self._call_action, action=action, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, # type: ignore[arg-type] @@ -343,8 +343,8 @@ def _call_action( field_name: str, action: Callable, provider_options: dict[str, Any] | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context, @@ -367,8 +367,8 @@ def _call_action( fields[field_name] = action( field_value, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 6a5d806f056..4337a0e6502 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -70,8 +70,8 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte def erase( self, data, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, @@ -91,13 +91,12 @@ def erase( if data: if isinstance(data, str): + if dynamic_mask: + result = self._custom_erase(data, **kwargs) if custom_mask: - if mask_pattern: - result = self._pattern_mask(data, mask_pattern) - elif regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - else: - result = self._custom_erase(data, **kwargs) + result = self._pattern_mask(data, custom_mask) + if regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) elif isinstance(data, dict): if masking_rules: result = self._apply_masking_rules(data, masking_rules) @@ -105,8 +104,8 @@ def erase( result = type(data)( self.erase( item, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, masking_rules=masking_rules, From d9319179fa9bacde7f10e0e301a980b4c4ebf0c1 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 18:50:30 -0300 Subject: [PATCH 03/12] fix type check error --- .../utilities/data_masking/provider/base.py | 81 ++++++++++++------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 4337a0e6502..382264c220e 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -3,8 +3,9 @@ import functools import json import re -from typing import Any, Callable, Iterable +from typing import Any, Callable +# , Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING PRESERVE_CHARS = set("-_. ") @@ -69,14 +70,14 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte def erase( self, - data, + data: Any, dynamic_mask: bool | None = None, custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, **kwargs, - ) -> Iterable[str]: + ) -> str | dict | list | tuple | set: """ This method irreversibly erases data. @@ -85,47 +86,68 @@ def erase( If the data to be erased is of an iterable type like `list`, `tuple`, or `set`, this method will return a new object of the same type as the - input data but with each element replaced by the string "*****" or following one of the custom masks. + input data but with each element masked according to the specified rules. """ - result = DATA_MASKING_STRING - - if data: - if isinstance(data, str): - if dynamic_mask: - result = self._custom_erase(data, **kwargs) - if custom_mask: - result = self._pattern_mask(data, custom_mask) - if regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - elif isinstance(data, dict): - if masking_rules: - result = self._apply_masking_rules(data, masking_rules) - elif isinstance(data, (list, tuple, set)): - result = type(data)( - self.erase( - item, - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - for item in data + result = None + + # Handle empty or None data + if not data: + result = DATA_MASKING_STRING if isinstance(data, (str, bytes)) else data + + # Handle string data + elif isinstance(data, str): + if regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) + elif custom_mask: + result = self._pattern_mask(data, custom_mask) + elif dynamic_mask: + result = self._custom_erase(data, **kwargs) + else: + result = DATA_MASKING_STRING + + # Handle dictionary data + elif isinstance(data, dict): + result = ( + self._apply_masking_rules(data, masking_rules) + if masking_rules + else {k: DATA_MASKING_STRING for k in data} + ) + + # Handle iterable data (list, tuple, set) + elif isinstance(data, (list, tuple, set)): + masked_data = ( + self.erase( + item, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, ) + for item in data + ) + result = type(data)(masked_data) + + # Default case + else: + result = DATA_MASKING_STRING return result def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + """Apply masking rules to dictionary data.""" return { key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) for key, value in data.items() } def _pattern_mask(self, data: str, pattern: str) -> str: + """Apply pattern masking to string data.""" return pattern[: len(data)] if len(pattern) >= len(data) else pattern def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: + """Apply regex masking to string data.""" try: if regex_pattern not in _regex_cache: _regex_cache[regex_pattern] = re.compile(regex_pattern) @@ -137,5 +159,4 @@ def _custom_erase(self, data: str, **kwargs) -> str: if not data: return "" - # Use join with list comprehension instead of building list incrementally return "".join("*" if char not in PRESERVE_CHARS else char for char in data) From 4c0070c30050749d4b570f0fc10916624ee52081 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 8 Jan 2025 13:55:54 -0300 Subject: [PATCH 04/12] add draft documentation --- docs/utilities/data_masking.md | 21 +++++++++- .../data_masking/src/custom_data_masking.py | 38 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 examples/data_masking/src/custom_data_masking.py diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index 162292e79a0..b1485dac6df 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -43,7 +43,7 @@ stateDiagram-v2 ## Terminology -**Erasing** replaces sensitive information **irreversibly** with a non-sensitive placeholder _(`*****`)_. This operation replaces data in-memory, making it a one-way action. +**Erasing** replaces sensitive information **irreversibly** with a non-sensitive placeholder _(`*****`)_, or with a customized mask. This operation replaces data in-memory, making it a one-way action. **Encrypting** transforms plaintext into ciphertext using an encryption algorithm and a cryptographic key. It allows you to encrypt any sensitive data, so only allowed personnel to decrypt it. Learn more about encryption [here](https://aws.amazon.com/blogs/security/importance-of-encryption-and-how-aws-can-help/){target="_blank"}. @@ -117,6 +117,25 @@ Erasing will remove the original data and replace it with a `*****`. This means --8<-- "examples/data_masking/src/getting_started_erase_data_output.json" ``` +The `erase` method also supports additional flags for more advanced and flexible masking: + +| Flag | Behavior | +| ---------------- | ----------------------------------------------------------| +| `dynamic_mask`(bool) | When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking.| +| `custom_mask`(str) | Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX".| +| `regex_pattern`(str) | Defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`.| +| `mask_format`(str) | Specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved.| +| `masking_rules`(dict) | Allows you to apply different masking rules (flags) for each data field.| + +=== "custom_data_masking.py" + ```python hl_lines="13 17 21 25 36" + --8<-- "examples/data_masking/src/custom_data_masking.py" + ``` +=== "generic_data_input.json" + ```json hl_lines="6 7 9 12" + --8<-- "examples/data_masking/src/generic_data_input.json" + ``` + ### Encrypting data ???+ note "About static typing and encryption" diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py new file mode 100644 index 00000000000..a99b9045cac --- /dev/null +++ b/examples/data_masking/src/custom_data_masking.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from aws_lambda_powertools.utilities.data_masking import DataMasking +from aws_lambda_powertools.utilities.typing import LambdaContext + +data_masker = DataMasking() + + +def lambda_handler(event: dict, context: LambdaContext) -> dict: + data: dict = event.get("body", {}) + + # Default erase (*****) + default_erased = data_masker.erase(data, fields=["address.zip"]) + # 'street': '*****' + + # dynamic_mask + dynamic_mask = data_masker.erase(data, fields=["address.zip"], dynamic_mask=True) + #'street': '*** **** **' + + # custom_mask + custom_mask = data_masker.erase(data, fields=["address.zip"], custom_mask="XX") + #'zip': 'XX' + + # regex_pattern and mask_format + regex_pattern = data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3") + #'email': 'j****@example.com' + + # Masking rules for each field + masking_rules = { + "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, + "age": {"dynamic_mask": True}, + "address.zip": {"dynamic_mask": True, "custom_mask": "xxx"}, + "address.street": {"dynamic_mask": False}, + } + + masking_rules_erase = data_masker.erase(data, masking_rules=masking_rules) + + return default_erased, dynamic_mask, custom_mask, regex_pattern, masking_rules_erase From ae81dce2d47967145b9f2a223356996a986ea0fb Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 8 Jan 2025 16:43:16 -0300 Subject: [PATCH 05/12] change doc examples --- .../utilities/data_masking/base.py | 3 ++ docs/utilities/data_masking.md | 43 +++++++++++++------ .../data_masking/src/custom_data_masking.py | 20 +-------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index f08e10371f7..7695b41bd6b 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -95,6 +95,9 @@ def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... @overload def erase(self, data: dict, fields: list[str]) -> dict: ... + @overload + def erase(self, data: dict[Any, Any], *, masking_rules: dict[str, object]) -> dict[Any, Any]: ... + @overload def erase( self, diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index b1485dac6df..c90abfc236e 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -119,22 +119,37 @@ Erasing will remove the original data and replace it with a `*****`. This means The `erase` method also supports additional flags for more advanced and flexible masking: -| Flag | Behavior | -| ---------------- | ----------------------------------------------------------| -| `dynamic_mask`(bool) | When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking.| -| `custom_mask`(str) | Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX".| -| `regex_pattern`(str) | Defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`.| -| `mask_format`(str) | Specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved.| -| `masking_rules`(dict) | Allows you to apply different masking rules (flags) for each data field.| - -=== "custom_data_masking.py" - ```python hl_lines="13 17 21 25 36" +=== "dynamic_mask" + + (bool) When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking. + + > Expression: `data_masker.erase(data, fields=["address.zip"], dynamic_mask=True)` + + > Field result: `'street': '*** **** **'` + +=== "custom_mask" + + (str) Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX". + + > Expression: `data_masker.erase(data, fields=["address.zip"], custom_mask="XX")` + + > Field result: `'zip': 'XX'` + +=== "regex_pattern & mask_format" + + (str) `regex_pattern` defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`. + `mask_format` specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved. + + > Expression: `data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3")` + + > Field result: `'email': 'j****@example.com'` + +=== "masking_rules" + + (dict) Allows you to apply different masking rules (flags) for each data field. + ```python hl_lines="20" --8<-- "examples/data_masking/src/custom_data_masking.py" ``` -=== "generic_data_input.json" - ```json hl_lines="6 7 9 12" - --8<-- "examples/data_masking/src/generic_data_input.json" - ``` ### Encrypting data diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py index a99b9045cac..24a5d51bc81 100644 --- a/examples/data_masking/src/custom_data_masking.py +++ b/examples/data_masking/src/custom_data_masking.py @@ -9,22 +9,6 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: data: dict = event.get("body", {}) - # Default erase (*****) - default_erased = data_masker.erase(data, fields=["address.zip"]) - # 'street': '*****' - - # dynamic_mask - dynamic_mask = data_masker.erase(data, fields=["address.zip"], dynamic_mask=True) - #'street': '*** **** **' - - # custom_mask - custom_mask = data_masker.erase(data, fields=["address.zip"], custom_mask="XX") - #'zip': 'XX' - - # regex_pattern and mask_format - regex_pattern = data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3") - #'email': 'j****@example.com' - # Masking rules for each field masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, @@ -33,6 +17,6 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: "address.street": {"dynamic_mask": False}, } - masking_rules_erase = data_masker.erase(data, masking_rules=masking_rules) + result = data_masker.erase(data, masking_rules=masking_rules) - return default_erased, dynamic_mask, custom_mask, regex_pattern, masking_rules_erase + return result From 7630b068ed0f9bffa2894f7dc8f0c86c04bba134 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 08:30:26 -0300 Subject: [PATCH 06/12] style: format code with black --- .../utilities/data_masking/base.py | 2 +- .../utilities/data_masking/provider/base.py | 35 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 7695b41bd6b..8136c8bcaaf 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -335,7 +335,7 @@ def _mask_nested_field(self, data: dict, field_path: str, mask_function): for key in keys[:-1]: current = current.get(key, {}) if not isinstance(current, dict): - return # Caminho inválido + return if keys[-1] in current: current[keys[-1]] = mask_function(current[keys[-1]]) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 382264c220e..6fa5648e7bc 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -91,11 +91,11 @@ def erase( result = None # Handle empty or None data - if not data: - result = DATA_MASKING_STRING if isinstance(data, (str, bytes)) else data + if data is None or (isinstance(data, (str, list, dict)) and not data): + return data # Handle string data - elif isinstance(data, str): + if isinstance(data, str): if regex_pattern and mask_format: result = self._regex_mask(data, regex_pattern, mask_format) elif custom_mask: @@ -107,15 +107,24 @@ def erase( # Handle dictionary data elif isinstance(data, dict): - result = ( - self._apply_masking_rules(data, masking_rules) - if masking_rules - else {k: DATA_MASKING_STRING for k in data} - ) + if masking_rules: + result = self._apply_masking_rules(data, masking_rules) + else: + result = {} + for k, v in data.items(): + result[str(k)] = self.erase( + str(v), + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) # Handle iterable data (list, tuple, set) elif isinstance(data, (list, tuple, set)): - masked_data = ( + masked_data = [ self.erase( item, dynamic_mask=dynamic_mask, @@ -126,16 +135,16 @@ def erase( **kwargs, ) for item in data - ) + ] result = type(data)(masked_data) - # Default case + # Handle other types (int, float, bool, etc.) else: - result = DATA_MASKING_STRING + result = str(data) return result - def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: """Apply masking rules to dictionary data.""" return { key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) From 6e2ec354612b44ceede4c56bc4d56bb602ecc7e4 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 08:49:17 -0300 Subject: [PATCH 07/12] fix format base --- .../utilities/data_masking/provider/base.py | 138 +++++++++++------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 6fa5648e7bc..47079c42484 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -5,7 +5,6 @@ import re from typing import Any, Callable -# , Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING PRESERVE_CHARS = set("-_. ") @@ -77,56 +76,72 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, **kwargs, - ) -> str | dict | list | tuple | set: - """ - This method irreversibly erases data. - - If the data to be erased is of type `str`, `dict`, or `bytes`, - this method will return an erased string, i.e. "*****". - - If the data to be erased is of an iterable type like `list`, `tuple`, - or `set`, this method will return a new object of the same type as the - input data but with each element masked according to the specified rules. - """ - result = None - + ) -> Any: # Handle empty or None data if data is None or (isinstance(data, (str, list, dict)) and not data): return data - # Handle string data - if isinstance(data, str): - if regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - elif custom_mask: - result = self._pattern_mask(data, custom_mask) - elif dynamic_mask: - result = self._custom_erase(data, **kwargs) - else: - result = DATA_MASKING_STRING - - # Handle dictionary data + result = data # Default to returning the original data + + if isinstance(data, (str, int, float)): + result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) elif isinstance(data, dict): - if masking_rules: - result = self._apply_masking_rules(data, masking_rules) - else: - result = {} - for k, v in data.items(): - result[str(k)] = self.erase( - str(v), - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - - # Handle iterable data (list, tuple, set) + result = self._mask_dict( + data, + dynamic_mask, + custom_mask, + regex_pattern, + mask_format, + masking_rules, + **kwargs, + ) elif isinstance(data, (list, tuple, set)): - masked_data = [ - self.erase( - item, + result = self._mask_iterable( + data, + dynamic_mask, + custom_mask, + regex_pattern, + mask_format, + masking_rules, + **kwargs, + ) + + return result + + def _mask_primitive( + self, + data: str, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + **kwargs, + ) -> str: + if regex_pattern and mask_format: + return self._regex_mask(data, regex_pattern, mask_format) + elif custom_mask: + return self._pattern_mask(data, custom_mask) + elif dynamic_mask: + return self._custom_erase(data, **kwargs) + else: + return DATA_MASKING_STRING + + def _mask_dict( + self, + data: dict, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + masking_rules: dict | None, + **kwargs, + ) -> dict: + if masking_rules: + return self._apply_masking_rules(data, masking_rules) + else: + return { + k: self.erase( + v, dynamic_mask=dynamic_mask, custom_mask=custom_mask, regex_pattern=regex_pattern, @@ -134,15 +149,32 @@ def erase( masking_rules=masking_rules, **kwargs, ) - for item in data - ] - result = type(data)(masked_data) + for k, v in data.items() + } - # Handle other types (int, float, bool, etc.) - else: - result = str(data) - - return result + def _mask_iterable( + self, + data: list | tuple | set, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + masking_rules: dict | None, + **kwargs, + ) -> list | tuple | set: + masked_data = [ + self.erase( + item, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) + for item in data + ] + return type(data)(masked_data) def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: """Apply masking rules to dictionary data.""" From 93c1544fd31cb282ba15fc86de3b76fd53fbf02c Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 11:39:39 -0300 Subject: [PATCH 08/12] add tests for new masks --- .../utilities/data_masking/base.py | 2 - .../utilities/data_masking/provider/base.py | 15 ++++--- .../test_unit_data_masking.py | 43 +++++++++++++++++++ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 8136c8bcaaf..23b7a684dde 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -119,8 +119,6 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, ) -> str | list[str] | tuple[str] | dict: - if not data: - return data if masking_rules: return self._apply_masking_rules(data, masking_rules) else: diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 47079c42484..02e6406b862 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -77,11 +77,16 @@ def erase( masking_rules: dict | None = None, **kwargs, ) -> Any: - # Handle empty or None data - if data is None or (isinstance(data, (str, list, dict)) and not data): - return data - result = data # Default to returning the original data + result = DATA_MASKING_STRING + + if not any([dynamic_mask, custom_mask, regex_pattern, mask_format, masking_rules]): + if isinstance(data, (str, int, float, dict, bytes)): + return DATA_MASKING_STRING + elif isinstance(data, (list, tuple, set)): + return type(data)([DATA_MASKING_STRING] * len(data)) + else: + return DATA_MASKING_STRING if isinstance(data, (str, int, float)): result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) @@ -194,7 +199,7 @@ def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: _regex_cache[regex_pattern] = re.compile(regex_pattern) return _regex_cache[regex_pattern].sub(mask_format, data) except re.error: - return DATA_MASKING_STRING + return data def _custom_erase(self, data: str, **kwargs) -> str: if not data: diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 4fbbc188ceb..cd728904cc7 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -25,6 +25,16 @@ def test_erase_int(data_masker): assert erased_string == DATA_MASKING_STRING +def test_erase_int_custom_mask(data_masker): + # GIVEN an int data type + + # WHEN erase is called with no fields argument + erased_string = data_masker.erase(42, custom_mask="XX") + + # THEN the result is the data masked + assert erased_string == "XX" + + def test_erase_float(data_masker): # GIVEN a float data type @@ -205,3 +215,36 @@ def test_parsing_nonexistent_fields_warning_on_missing_field(): # THEN the "erased" payload is the same of the original assert masked_json_string == data + + +def test_regex_mask(data_masker): + data = "Hello! My name is Fulano Ciclano" + regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" + mask_format = "XXXX XXXX" + + result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + + assert result == "Hello! My name is XXXX XXXX" + + +def test_erase_json_dict_with_fields_and_masks(data_masker): + # GIVEN the data type is a json representation of a dictionary + data = json.dumps( + { + "a": { + "1": {"None": "hello", "four": "world"}, + "b": {"3": {"4": "goodbye", "e": "world"}}, + }, + }, + ) + + # WHEN erase is called with a list of fields specified + masked_json_string = data_masker.erase(data, fields=["a.'1'.None", "a..'4'"], dynamic_mask=True) + + # THEN the result is only the specified fields are erased + assert masked_json_string == { + "a": { + "1": {"None": "*****", "four": "world"}, + "b": {"3": {"4": "*******", "e": "world"}}, + }, + } From 92d474020b65100fb23d4903fd51e42a5f38ce0c Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 11:22:39 -0300 Subject: [PATCH 09/12] sub header for custom mask in docs --- docs/utilities/data_masking.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index c90abfc236e..596fa2c3fa3 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -117,6 +117,8 @@ Erasing will remove the original data and replace it with a `*****`. This means --8<-- "examples/data_masking/src/getting_started_erase_data_output.json" ``` +#### Custom masking + The `erase` method also supports additional flags for more advanced and flexible masking: === "dynamic_mask" From d9535d6ff78ee638c5ea9b4813c16b64c8dd8bdc Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 13:03:04 -0300 Subject: [PATCH 10/12] masking rules to handle complex nest --- .../utilities/data_masking/base.py | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 23b7a684dde..0dd41522d61 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import ast import functools import logging import warnings @@ -296,33 +295,55 @@ def _apply_action_to_fields( def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: """ - Apply masking rules to data, supporting different rules for each field. + Apply masking rules to data, supporting both simple field names and complex path expressions. + + Args: + data: The dictionary containing data to mask + masking_rules: Dictionary mapping field names or path expressions to masking rules + + Returns: + dict: The masked data dictionary """ result = data.copy() for path, rule in masking_rules.items(): try: - # Handle nested paths (e.g., 'address.street') - parts = path.split(".") - current = result - - for part in parts[:-1]: - if isinstance(current[part], str) and current[part].startswith("{"): - try: - current[part] = ast.literal_eval(current[part]) - except (ValueError, SyntaxError): - continue - current = current[part] - - final_field = parts[-1] - - # Apply masking rule to the target field - if final_field in current: - current[final_field] = self.provider.erase(str(current[final_field]), **rule) - - except (KeyError, TypeError, AttributeError): - # Log warning if field not found or invalid path - warnings.warn(f"Could not apply masking rule for path: {path}", stacklevel=2) + if ".." in path: + # Handle recursive descent paths (e.g., "address..name") + base_path, field = path.split("..") + jsonpath_expr = parse(f"$.{base_path}..{field}") + elif "[" in path: + # Handle array notation paths (e.g., "address[*].street") + jsonpath_expr = parse(f"$.{path}") + else: + # Handle simple field names (e.g., "email") + jsonpath_expr = parse(f"$.{path}") + + matches = jsonpath_expr.find(result) + + if not matches: + warnings.warn(f"No matches found for path: {path}", stacklevel=2) + continue + + for match in matches: + try: + value = match.value + if value is not None: + if isinstance(value, dict): + # Handle dictionary values by masking each field + for k, v in value.items(): + if v is not None: + value[k] = self.provider.erase(str(v), **rule) + else: + masked_value = self.provider.erase(str(value), **rule) + match.full_path.update(result, masked_value) + + except Exception as e: + warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) + continue + + except Exception as e: + warnings.warn(f"Error processing path {path}: {str(e)}", stacklevel=2) continue return result From 9dc2b562e685d1ec1f37e691403c32eb7644f8c0 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 13:18:25 -0300 Subject: [PATCH 11/12] add test for masking rules --- .../test_unit_data_masking.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index cd728904cc7..8eb0f955958 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -248,3 +248,37 @@ def test_erase_json_dict_with_fields_and_masks(data_masker): "b": {"3": {"4": "*******", "e": "world"}}, }, } + + +def test_erase_json_dict_with_complex_masking_rules(data_masker): + # GIVEN the data type is a json representation of a dictionary with nested and filtered paths + data = json.dumps( + { + "email": "john.doe@example.com", + "age": 30, + "addres": [ + {"postcode": 13000, "street": "123 Main St", "details": {"name": "Home", "type": "Primary"}}, + {"postcode": 14000, "street": "456 Other Street", "details": {"name": "Office", "type": "Secondary"}}, + ], + }, + ) + + # WHEN erase is called with complex masking rules + masking_rules = { + "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, + "age": {"dynamic_mask": True}, + "addres..name": {"custom_mask": "xxx"}, + "addres[?(@.postcode > 12000)]": {"dynamic_mask": True}, + } + + masked_json_string = data_masker.erase(data, masking_rules=masking_rules) + + # THEN the result should have all specified fields masked according to their rules + assert masked_json_string == { + "email": "j****@example.com", + "age": "*****", + "addres": [ + {"postcode": "*****", "street": "*** *** **", "details": {"name": "xxx", "type": "*******"}}, + {"postcode": "*****", "street": "*** ***** ******", "details": {"name": "xxx", "type": "********"}}, + ], + } From 63c7918876c596c1b1d890885b2cee19a9d80a29 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Fri, 31 Jan 2025 09:33:55 -0300 Subject: [PATCH 12/12] modifications based on the feedback --- .../utilities/data_masking/base.py | 25 +++----------- docs/utilities/data_masking.md | 12 ++++++- .../data_masking/src/custom_data_masking.py | 4 +-- .../src/output_custom_masking.json | 29 ++++++++++++++++ .../src/payload_custom_masking.json | 34 +++++++++++++++++++ .../test_unit_data_masking.py | 6 +++- 6 files changed, 86 insertions(+), 24 deletions(-) create mode 100644 examples/data_masking/src/output_custom_masking.json create mode 100644 examples/data_masking/src/payload_custom_masking.json diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 0dd41522d61..00650789696 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -3,6 +3,7 @@ import functools import logging import warnings +from copy import deepcopy from typing import TYPE_CHECKING, Any, Callable, Mapping, Sequence, overload from jsonpath_ng.ext import parse @@ -304,21 +305,11 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: Returns: dict: The masked data dictionary """ - result = data.copy() + result = deepcopy(data) for path, rule in masking_rules.items(): try: - if ".." in path: - # Handle recursive descent paths (e.g., "address..name") - base_path, field = path.split("..") - jsonpath_expr = parse(f"$.{base_path}..{field}") - elif "[" in path: - # Handle array notation paths (e.g., "address[*].street") - jsonpath_expr = parse(f"$.{path}") - else: - # Handle simple field names (e.g., "email") - jsonpath_expr = parse(f"$.{path}") - + jsonpath_expr = parse(f"$.{path}") matches = jsonpath_expr.find(result) if not matches: @@ -329,14 +320,8 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: try: value = match.value if value is not None: - if isinstance(value, dict): - # Handle dictionary values by masking each field - for k, v in value.items(): - if v is not None: - value[k] = self.provider.erase(str(v), **rule) - else: - masked_value = self.provider.erase(str(value), **rule) - match.full_path.update(result, masked_value) + masked_value = self.provider.erase(str(value), **rule) + match.full_path.update(result, masked_value) except Exception as e: warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index 596fa2c3fa3..94e470aa965 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -123,7 +123,7 @@ The `erase` method also supports additional flags for more advanced and flexible === "dynamic_mask" - (bool) When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking. + (bool) Enables dynamic masking behavior when set to `True`, by maintaining the original length and structure of the text replacing with *. > Expression: `data_masker.erase(data, fields=["address.zip"], dynamic_mask=True)` @@ -152,6 +152,16 @@ The `erase` method also supports additional flags for more advanced and flexible ```python hl_lines="20" --8<-- "examples/data_masking/src/custom_data_masking.py" ``` +=== "Input example" + + ```json + --8<-- "examples/data_masking/src/payload_custom_masking.json" + ``` +=== "Masking rules output example" + + ```json hl_lines="4 5 10 21" + --8<-- "examples/data_masking/src/output_custom_masking.json" + ``` ### Encrypting data diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py index 24a5d51bc81..7b96f6f379f 100644 --- a/examples/data_masking/src/custom_data_masking.py +++ b/examples/data_masking/src/custom_data_masking.py @@ -13,8 +13,8 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, "age": {"dynamic_mask": True}, - "address.zip": {"dynamic_mask": True, "custom_mask": "xxx"}, - "address.street": {"dynamic_mask": False}, + "address.zip": {"custom_mask": "xxx"}, + "$.other_address[?(@.postcode > 12000)]": {"custom_mask": "Masked"}, } result = data_masker.erase(data, masking_rules=masking_rules) diff --git a/examples/data_masking/src/output_custom_masking.json b/examples/data_masking/src/output_custom_masking.json new file mode 100644 index 00000000000..0571da99808 --- /dev/null +++ b/examples/data_masking/src/output_custom_masking.json @@ -0,0 +1,29 @@ +{ + "id": 1, + "name": "John Doe", + "age": "**", + "email": "j****@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "zip": "xxx", + "postcode": 12345, + "product": { + "name": "Car" + } + }, + "other_address": [ + { + "postcode": 11345, + "street": "123 Any Drive" + }, + "Masked" + ], + "company_address": { + "street": "456 ACME Ave", + "city": "Anytown", + "state": "CA", + "zip": "12345" + } +} \ No newline at end of file diff --git a/examples/data_masking/src/payload_custom_masking.json b/examples/data_masking/src/payload_custom_masking.json new file mode 100644 index 00000000000..d50b715ffa4 --- /dev/null +++ b/examples/data_masking/src/payload_custom_masking.json @@ -0,0 +1,34 @@ +{ + "body": { + "id": 1, + "name": "Jane Doe", + "age": 30, + "email": "janedoe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "zip": "12345", + "postcode": 12345, + "product": { + "name": "Car" + } + }, + "other_address": [ + { + "postcode": 11345, + "street": "123 Any Drive" + }, + { + "postcode": 67890, + "street": "100 Main Street," + } + ], + "company_address": { + "street": "456 ACME Ave", + "city": "Anytown", + "state": "CA", + "zip": "12345" + } + } +} \ No newline at end of file diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 8eb0f955958..93588445034 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -218,12 +218,16 @@ def test_parsing_nonexistent_fields_warning_on_missing_field(): def test_regex_mask(data_masker): - data = "Hello! My name is Fulano Ciclano" + # GIVEN a str data type + data = "Hello! My name is John Doe" + + # WHEN erase is called with regex pattern and mask format regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" mask_format = "XXXX XXXX" result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + # THEN the result is the regex part masked by the masked format assert result == "Hello! My name is XXXX XXXX"