Skip to content

Commit

Permalink
[Python | ES Number] Fix detection of "3 millones y medio" to match .NET implementation (microsoft#3023)
Browse files Browse the repository at this point in the history
  • Loading branch information
rbrennangen authored Aug 16, 2022
1 parent c490a0a commit a49a113
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> None:
    """Return ``None``: this configuration supplies no round-multiplier pattern."""
    return None

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.Chinese)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,14 @@ def written_fraction_separator_texts(self) -> List[str]:
def non_standard_separator_variants(self) -> List[str]:
return self._non_standard_separator_variants

@property
def is_multi_decimal_separator_culture(self) -> bool:
    """Return the flag stored in ``_is_multi_decimal_separator_culture``."""
    return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> Pattern:
    """Return the round-multiplier pattern held in ``_round_multiplier_regex``."""
    return self._round_multiplier_regex

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.English)
Expand Down Expand Up @@ -115,6 +120,8 @@ def __init__(self, culture_info=None):
EnglishNumeric.HalfADozenRegex)
self._digital_number_regex = RegExpUtility.get_safe_reg_exp(
EnglishNumeric.DigitalNumberRegex)
self._round_multiplier_regex = RegExpUtility.get_safe_reg_exp(
EnglishNumeric.RoundMultiplierRegex)

def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[str]:
frac_words: List[str] = list()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> Pattern:
    """Return the round-multiplier pattern held in ``_round_multiplier_regex``."""
    return self._round_multiplier_regex

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.French)
Expand Down Expand Up @@ -117,6 +121,8 @@ def __init__(self, culture_info=None):
FrenchNumeric.HalfADozenRegex)
self._digital_number_regex = RegExpUtility.get_safe_reg_exp(
FrenchNumeric.DigitalNumberRegex)
self._round_multiplier_regex = RegExpUtility.get_safe_reg_exp(
FrenchNumeric.RoundMultiplierRegex)

def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[str]:
return tokens
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> Pattern:
    """Return the round-multiplier pattern held in ``_round_multiplier_regex``."""
    return self._round_multiplier_regex

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.German)
Expand Down Expand Up @@ -116,6 +120,8 @@ def __init__(self, culture_info=None):
GermanNumeric.HalfADozenRegex)
self._digital_number_regex = RegExpUtility.get_safe_reg_exp(
GermanNumeric.DigitalNumberRegex)
self._round_multiplier_regex = RegExpUtility.get_safe_reg_exp(
GermanNumeric.RoundMultiplierRegex)

def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[str]:
frac_words: List[str] = list()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> Pattern:
    """Return the round-multiplier pattern held in ``_round_multiplier_regex``."""
    return self._round_multiplier_regex

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.Italian)
Expand Down Expand Up @@ -116,6 +120,8 @@ def __init__(self, culture_info=None):
ItalianNumeric.HalfADozenRegex)
self._digital_number_regex = RegExpUtility.get_safe_reg_exp(
ItalianNumeric.DigitalNumberRegex)
self._round_multiplier_regex = RegExpUtility.get_safe_reg_exp(
ItalianNumeric.RoundMultiplierRegex)

def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[str]:
frac_words: List[str] = list()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> None:
    """Return ``None``: this configuration supplies no round-multiplier pattern."""
    return None

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.Japanese)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,20 +121,25 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
pass

@property
@abstractmethod
def round_multiplier_regex(self) -> Pattern:
    """Abstract accessor for the round-multiplier pattern.

    Concrete configurations override this; the abstract body itself
    does nothing and yields ``None``.
    """
    pass


class BaseNumberParser(Parser):
def __init__(self, config: NumberParserConfiguration):
self.config: NumberParserConfiguration = config
self.supported_types: List[str] = list()

single_int_frac = f'{self.config.word_separator_token}| -|{self._get_key_regex(self.config.cardinal_number_map.keys())}|{self._get_key_regex(self.config.ordinal_number_map.keys())}'
single_int_frac = f'{self.config.word_separator_token}| -|{self._get_key_regex(self.config.cardinal_number_map.keys())}|{self._get_key_regex(self.config.ordinal_number_map.keys())}|\\d+'
self.text_number_regex: Pattern = self._get_text_number_regex(single_int_frac)
self.arabic_number_regex: Pattern = RegExpUtility.get_safe_reg_exp(
r'\d+', flags=regex.I | regex.S)
self.round_number_set: List[str] = list(
self.config.round_number_map.keys())
self.is_non_standard_separator_variant = self.config.culture_info.code in \
self.config.non_standard_separator_variants
self.config.non_standard_separator_variants

def parse(self, source: ExtractResult) -> Optional[ParseResult]:
# Check if the parser is configured to support specific types
Expand Down Expand Up @@ -247,6 +252,15 @@ def _frac_like_number_parse(self, ext_result: ExtractResult) -> ParseResult:

result.value = small_value / big_value
else:
# Look for a round multiplier in the text (the commit's motivating input is
# "3 millones y medio"). When one is found it is removed from result_text and
# its numeric value is taken from round_number_map, so the remaining words
# can be parsed as a plain fraction and scaled afterwards.
is_fraction_multiplier = False
multiplier = 1
round_multiplier_regex = self.config.round_multiplier_regex
# Configurations without a round-multiplier pattern expose None here.
if round_multiplier_regex is not None:
    match = round_multiplier_regex.search(result_text)
    if match is not None:
        result_text = result_text.replace(match.group(0), "")
        multiplier = self.config.round_number_map[match.group("multiplier")]
        # Test the named group itself. Match.groups(default) always returns
        # a tuple, so comparing its result to None was unconditionally true
        # and the flag could never be False after a match. groupdict().get()
        # also tolerates patterns that do not define the group at all.
        is_fraction_multiplier = match.groupdict().get("fracMultiplier") is not None

words = list(filter(lambda x: x, result_text.split(' ')))
frac_words = self.config.normalize_token_set(words, result)

Expand All @@ -258,7 +272,7 @@ def _frac_like_number_parse(self, ext_result: ExtractResult) -> ParseResult:

# for case like "half"
if len(frac_words) == 1:
result.value = 1 / self.__get_int_value(frac_words)
result.value = (1 / self.__get_int_value(frac_words)) * multiplier
return result

for split_index in range(len(frac_words) - 2, -1, -1):
Expand Down Expand Up @@ -290,7 +304,8 @@ def _frac_like_number_parse(self, ext_result: ExtractResult) -> ParseResult:
# frac[i+1] % 100 and frac[i] % 100 = 0
if (self.config.resolve_composite_number(frac_words[split_index]) >= sm_hundreds
and not frac_words[split_index + 1] in self.config.written_fraction_separator_texts
and self.config.resolve_composite_number(frac_words[split_index + 1]) < sm_hundreds):
and self.config.resolve_composite_number(
frac_words[split_index + 1]) < sm_hundreds):
split_index += 1
break
split_index += 1
Expand Down Expand Up @@ -330,12 +345,11 @@ def _frac_like_number_parse(self, ext_result: ExtractResult) -> ParseResult:
int_value = self.__get_int_value(self.__get_matches(int_str))

# Find mixed number
if (mixed_index != len(frac_words) and numer_value < denomi_value):
# int_value + numer_value / denomi_value
result.value = int_value + numer_value / denomi_value
if mixed_index != len(frac_words) and numer_value < denomi_value:
result.value = (int_value + (numer_value / denomi_value)) * multiplier if is_fraction_multiplier else \
int_value + (multiplier * numer_value / denomi_value)
else:
# (int_value + numer_value) / denomi_value
result.value = (int_value + numer_value) / denomi_value
result.value = multiplier * (int_value + numer_value) / denomi_value

# Convert to float for fixed float point vs. exponential notation consistency /w C#/TS/JS
result.value = float(result.value)
Expand Down Expand Up @@ -405,7 +419,7 @@ def _power_number_parse(self, ext_result: ExtractResult) -> ParseResult:
negative = not negative
elif c == '+':
continue
if i == len(handle)-1:
if i == len(handle) - 1:
if negative:
call_stack.append(-tmp)
else:
Expand Down Expand Up @@ -447,7 +461,7 @@ def __get_int_value(self, matches: List[str]) -> Decimal:
end_flag = 1

# Scan from end to start, find the end word
for i in range(len(matches)-1, 0, -1):
for i in range(len(matches) - 1, 0, -1):
if matches[i] in self.round_number_set:
# if false,then continue, you will meet hundred first, then thousand.
if end_flag > self.config.round_number_map[matches[i]]:
Expand Down Expand Up @@ -495,6 +509,8 @@ def __get_int_value(self, matches: List[str]) -> Decimal:
tmp = tmp_stack.pop() + match_value
tmp += tmp_stack.pop()
tmp_stack.append(tmp)
elif match.isdigit():
tmp_stack.append(int(match))
else:
complex_val = self.config.resolve_composite_number(match)
if complex_val != 0:
Expand Down Expand Up @@ -534,7 +550,7 @@ def __get_point_value(self, matches: List[str]) -> Decimal:
scale = Decimal(0.1)
for match in matches:
result += scale * \
Decimal(self.config.cardinal_number_map[match])
Decimal(self.config.cardinal_number_map[match])
scale *= Decimal(0.1)

return result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> Pattern:
    """Return the round-multiplier pattern held in ``_round_multiplier_regex``."""
    return self._round_multiplier_regex

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.Portuguese)
Expand Down Expand Up @@ -117,6 +121,8 @@ def __init__(self, culture_info=None):
PortugueseNumeric.HalfADozenRegex)
self._digital_number_regex = RegExpUtility.get_safe_reg_exp(
PortugueseNumeric.DigitalNumberRegex)
self._round_multiplier_regex = RegExpUtility.get_safe_reg_exp(
PortugueseNumeric.RoundMultiplierRegex)

def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[str]:
result = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def non_standard_separator_variants(self) -> List[str]:
def is_multi_decimal_separator_culture(self) -> bool:
return self._is_multi_decimal_separator_culture

@property
def round_multiplier_regex(self) -> Pattern:
    """Return the round-multiplier pattern held in ``_round_multiplier_regex``."""
    return self._round_multiplier_regex

def __init__(self, culture_info=None):
if culture_info is None:
culture_info = CultureInfo(Culture.Spanish)
Expand Down Expand Up @@ -126,6 +130,8 @@ def __init__(self, culture_info=None):
SpanishNumeric.HalfADozenRegex)
self._digital_number_regex = RegExpUtility.get_safe_reg_exp(
SpanishNumeric.DigitalNumberRegex)
self._round_multiplier_regex = RegExpUtility.get_safe_reg_exp(
SpanishNumeric.RoundMultiplierRegex)

def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[str]:
result: List[str] = list()
Expand All @@ -149,6 +155,16 @@ def normalize_token_set(self, tokens: List[str], context: ParseResult) -> List[s
continue
result.append(token)

# The following piece of code is needed to compute the fraction pattern number+'y medio'
# e.g. 'cinco y medio' ('five and a half') where the numerator is omitted in Spanish.
# It works by inserting the numerator 'un' ('a') in the list fracWords
# so that the pattern is correctly processed.
if len(result) > 2:
if result[len(result) - 1] == SpanishNumeric.OneHalfTokens[1] and \
result[len(result) - 2] == SpanishNumeric.WordSeparatorToken:
result[len(result) - 2] = SpanishNumeric.WrittenFractionSeparatorTexts[0]
result.insert(len(result) - 1, SpanishNumeric.OneHalfTokens[0])

return result

def resolve_composite_number(self, number_str: str) -> int:
Expand Down
12 changes: 6 additions & 6 deletions Specs/Number/Spanish/NumberModel.json
Original file line number Diff line number Diff line change
Expand Up @@ -2596,7 +2596,7 @@
},
{
"Input": "once con uno y medio",
"NotSupported": "java, javascript, python",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "once con uno y medio",
Expand Down Expand Up @@ -3013,7 +3013,7 @@
},
{
"Input": "el resultado es cinco y medio",
"NotSupported": "java, javascript, python",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "cinco y medio",
Expand All @@ -3029,7 +3029,7 @@
},
{
"Input": "La población es 2 millones y medio",
"NotSupported": "java, javascript, python",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "2 millones y medio",
Expand All @@ -3045,7 +3045,7 @@
},
{
"Input": "Ellos obtuvieron tres millones y medio",
"NotSupported": "java, javascript, python",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "tres millones y medio",
Expand Down Expand Up @@ -3077,7 +3077,7 @@
},
{
"Input": "El número es cinco billones con tres cuartos.",
"NotSupported": "java, javascript, python",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "cinco billones con tres cuartos",
Expand All @@ -3093,7 +3093,7 @@
},
{
"Input": "Conté seis millones con dos tercios",
"NotSupported": "java, javascript, python",
"NotSupported": "java, javascript",
"Results": [
{
"Text": "seis millones con dos tercios",
Expand Down

0 comments on commit a49a113

Please sign in to comment.