From 4201b79119fb2b9b994f68b82dddea134a0708c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=B6rje=20Karlsson?= Date: Sat, 29 May 2021 15:45:32 +0800 Subject: [PATCH] Improvements to handling multiple number formats (#2623) * Recognize "hundert" and "tausend" by themselves as numbers in German. * FP filter for year in float number. * Adding sample lists of variant cultures that differ from default in handling number separators. * Cleanup of multi decimal separators code. * Fixing bug in assigning subtype for numbers like "1.2b". * Exposing Spanish.Mexican as culture as workaround before refactoring of model config for culture/requestedCulture. * Extra separator variant support and test cases in es-MX. * Re-gen resources across platforms. --- .../BaseNumbers.cs | 4 +- .../Dutch/DateTimeDefinitions.cs | 4 +- .../English/DateTimeDefinitions.cs | 1 + .../English/NumbersDefinitions.cs | 6 + .../French/DateTimeDefinitions.cs | 1 + .../German/DateTimeDefinitions.cs | 1 + .../German/NumbersDefinitions.cs | 12 +- .../Italian/DateTimeDefinitions.cs | 1 + .../Portuguese/DateTimeDefinitions.cs | 1 + .../Spanish/DateTimeDefinitions.cs | 1 + .../Spanish/NumbersDefinitions.cs | 17 +- .../Number/LongFormTestConfiguration.cs | 3 + .../Number/TestNumber_SpanishMexican.cs | 39 ++++ .../Config/IDateTimeOptionsConfiguration.cs | 4 +- .../DateTimeRecognizer.cs | 8 + .../Config/INumberOptionsConfiguration.cs | 4 +- .../EnglishNumberParserConfiguration.cs | 3 + .../NumberRecognizer.cs | 28 +++ .../Parsers/BaseCJKNumberParser.cs | 2 +- .../Parsers/BaseIndianNumberParser.cs | 40 ++--- .../Parsers/BaseNumberParser.cs | 115 ++++++++---- .../Parsers/BasePercentageParser.cs | 1 - .../Parsers/INumberParserConfiguration.cs | 12 +- .../SpanishNumberParserConfiguration.cs | 2 + .../NumberWithUnitRecognizer.cs | 40 +++++ .../Config/IConfiguration.cs | 4 +- .NET/Microsoft.Recognizers.Text/Culture.cs | 2 + .../Extractors/Metadata.cs | 3 + .../datetime/resources/ChineseDateTime.java | 12 ++ .../datetime/resources/EnglishDateTime.java | 1 + .../datetime/resources/FrenchDateTime.java | 11 +- .../resources/PortugueseDateTime.java | 36 +++- .../datetime/resources/SpanishDateTime.java | 21 ++- .../text/number/resources/BaseNumbers.java | 4 +- .../text/number/resources/EnglishNumeric.java | 2 + .../text/number/resources/GermanNumeric.java | 12 +- .../text/number/resources/SpanishNumeric.java | 8 +- .../src/resources/chineseDateTime.ts | 6 + .../src/resources/englishDateTime.ts | 2 +- .../src/resources/frenchDateTime.ts | 7 +- .../src/resources/portugueseDateTime.ts | 17 +- .../src/resources/spanishDateTime.ts | 16 +- .../src/resources/baseNumbers.ts | 4 +- .../src/resources/baseNumbers.ts | 4 +- .../src/resources/englishNumeric.ts | 1 + .../src/resources/spanishNumeric.ts | 7 +- Patterns/Base-Numbers.yaml | 4 +- Patterns/Dutch/Dutch-DateTime.yaml | 4 +- Patterns/English/English-DateTime.yaml | 1 + Patterns/English/English-Numbers.yaml | 7 + Patterns/French/French-DateTime.yaml | 1 + Patterns/German/German-DateTime.yaml | 1 + Patterns/German/German-Numbers.yaml | 12 +- Patterns/Italian/Italian-DateTime.yaml | 1 + Patterns/Portuguese/Portuguese-DateTime.yaml | 1 + Patterns/Spanish/Spanish-DateTime.yaml | 1 + Patterns/Spanish/Spanish-Numbers.yaml | 18 +- .../resources/chinese_date_time.py | 6 + .../resources/english_date_time.py | 3 +- .../resources/french_date_time.py | 8 +- .../resources/portuguese_date_time.py | 19 +- .../resources/spanish_date_time.py | 17 +- .../resources/base_numbers.py | 4 +- .../resources/base_numbers.py | 4 +- .../resources/english_numeric.py | 1 + .../resources/german_numeric.py | 12 +- .../resources/spanish_numeric.py | 7 +- Specs/DateTime/English/DateTimeModel.json | 8 + Specs/Number/English/NumberModel.json | 34 +++- .../English/NumberModelExperimentalMode.json | 4 +- Specs/Number/German/NumberModel.json | 30 ++++ Specs/Number/Spanish/NumberModel.json | 60 ++++++- Specs/Number/SpanishMexican/NumberModel.json | 170 ++++++++++++++++++ Specs/Number/Swedish/NumberModel.json | 8 +- 74 files changed, 785 insertions(+), 191 deletions(-) create mode 100644 .NET/Microsoft.Recognizers.Text.DataDrivenTests/Number/TestNumber_SpanishMexican.cs create mode 100644 Specs/Number/SpanishMexican/NumberModel.json diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/BaseNumbers.cs b/.NET/Microsoft.Recognizers.Definitions.Common/BaseNumbers.cs index 627f459165..0ea19e6a07 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/BaseNumbers.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/BaseNumbers.cs @@ -23,9 +23,9 @@ public static class BaseNumbers { public const string NumberReplaceToken = @"@builtin.num"; public const string FractionNumberReplaceToken = @"@builtin.num.fraction"; - public static readonly Func IntegerRegexDefinition = (placeholder, thousandsmark) => $@"(((? IntegerRegexDefinition = (placeholder, thousandsmark) => $@"(((? DoubleRegexDefinition = (placeholder, thousandsmark, decimalmark) => $@"(((? DoubleRegexDefinition = (placeholder, thousandsmark, decimalmark) => $@"(((? AmbiguityFiltersDict = new Dictionary { - { @"\bmorning|afternoon|evening|night|day\b", @"\b(good\s+(morning|afternoon|evening|night|day))|(nighty\s+night)\b" }, - { @"\bnow\b", @"\b(^now,)|\b((is|are)\s+now\s+for|for\s+now)\b" }, - { @"\bmay\b", @"\b((^may i)|(i|you|he|she|we|they)\s+may|(may\s+((((also|not|(also not)|well)\s+)?(be|ask|contain|constitute|email|e-mail|take|have|result|involve|get|work|reply|differ))|(or may not))))\b" }, + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"^(jan|feb|mar|mrt|apr|mei|jun|jul|aug|sept?|oct|okt|nov|dec)$", @"([$%£&!?@#])(jan|feb|mar|mrt|apr|mei|jun|jul|aug|sept?|oct|okt|nov|dec)|(jan|feb|mar|mrt|apr|mei|jun|jul|aug|sept?|oct|okt|nov|dec)([$%£&@#])" } }; public static readonly IList MorningTermList = new List diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs index 51308908bc..7adee914bd 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/English/DateTimeDefinitions.cs @@ -737,6 +737,7 @@ public static class DateTimeDefinitions public static readonly string[] DurationDateRestrictions = { @"today", @"now" }; public static readonly Dictionary AmbiguityFiltersDict = new Dictionary { + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"^(morning|afternoon|evening|night|day)\b", @"\b(good\s+(morning|afternoon|evening|night|day))|(nighty\s+night)\b" }, { @"\bnow\b", @"\b(^now,)|\b((is|are)\s+now\s+for|for\s+now)\b" }, { @"\bmay\b", @"\b((((!|\.|\?|,|;|)\s+|^)may i)|(i|you|he|she|we|they)\s+may|(may\s+((((also|not|(also not)|well)\s+)?(be|ask|contain|constitute|e-?mail|take|have|result|involve|get|work|reply|differ))|(or may not))))\b" }, diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/English/NumbersDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/English/NumbersDefinitions.cs index 045d174483..44b701e727 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/English/NumbersDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/English/NumbersDefinitions.cs @@ -24,6 +24,12 @@ public static class NumbersDefinitions public const string LangMarker = @"Eng"; public const bool CompoundNumberLanguage = false; public const bool MultiDecimalSeparatorCulture = true; + public static readonly IList NonStandardSeparatorVariants = new List + { + @"en-za", + @"en-na", + @"en-zw" + }; public const string RoundNumberIntegerRegex = @"(?:hundred|thousand|million|mln|billion|bln|trillion|tln|lakh|crore)s?"; public const string ZeroToNineIntegerRegex = @"(?:three|seven|eight|four|five|zero|nine|one|two|six)"; public const string TwoToNineIntegerRegex = @"(?:three|seven|eight|four|five|nine|two|six)"; diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/French/DateTimeDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/French/DateTimeDefinitions.cs index fd9d483e02..b584aba61a 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/French/DateTimeDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/French/DateTimeDefinitions.cs @@ -719,6 +719,7 @@ public static class DateTimeDefinitions public static readonly string[] DurationDateRestrictions = { }; public static readonly Dictionary AmbiguityFiltersDict = new Dictionary { + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"^([eé]t[eé])$", @"(? AmbiguityFiltersDict = new Dictionary { + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"\b(morgen|nachmittag|abend|nacht|tag)\b", @"\b(gut(en?)?\s+(morgen|nachmittag|abend|nacht|tag))\b" }, { @"^(apr|aug|dez|feb|j[äa]n|jul|jun|märz|mai|nov|okt|sept?)$", @"([$%£&!?@#])(apr|aug|dez|feb|j[äa]n|jul|jun|märz|mai|nov|okt|sept?)|(apr|aug|dez|feb|j[äa]n|jul|jun|märz|mai|nov|okt|sept?)([$%£&@#])" }, { @"^(mo|di|mi|do|fr|sa|so)$", @"\b(mo|di|mi|do|fr|sa|so)\b" } diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/German/NumbersDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/German/NumbersDefinitions.cs index f644b77b2a..37fe7a0ce3 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/German/NumbersDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/German/NumbersDefinitions.cs @@ -28,7 +28,7 @@ public static class NumbersDefinitions public const string RoundNumberIntegerRegex = @"((ein)?hundert|tausend|(\s*(million(en)?|mio|milliarden?|mrd|billion(en)?)\s*))"; public const string AnIntRegex = @"(eine?)(?=\s)"; public const string TenToNineteenIntegerRegex = @"(siebzehn|dreizehn|vierzehn|achtzehn|neunzehn|fuenfzehn|sechzehn|elf|zwoelf|zwölf|zehn)"; - public const string TensNumberIntegerRegex = @"(siebzig|zwanzig|dreißig|achtzig|neunzig|vierzig|fuenfzig|fünfzig|sechzig)"; + public const string TensNumberIntegerRegex = @"(siebzig|zwanzig|dreißig|achtzig|neunzig|vierzig|fuenfzig|fünfzig|sechzig|hundert|tausend)"; public const string NegativeNumberTermsRegex = @"^[.]"; public static readonly string NegativeNumberSignRegex = $@"^({NegativeNumberTermsRegex}\s+).*"; public static readonly string SeparaIntRegex = $@"((({TenToNineteenIntegerRegex}|({ZeroToNineIntegerRegex}und{TensNumberIntegerRegex})|{TensNumberIntegerRegex}|{ZeroToNineIntegerRegex})(\s*{RoundNumberIntegerRegex})*))|(({AnIntRegex}(\s*{RoundNumberIntegerRegex})+))"; @@ -54,10 +54,10 @@ public static class NumbersDefinitions public const string FractionNotationWithSpacesRegex = @"(((?<=\W|^)-\s*)|(?<=\b))\d+\s+\d+[/]\d+(?=(\b[^/]|$))"; public static readonly string FractionNotationRegex = $@"{BaseNumbers.FractionNotationRegex}"; public const string FractionUnitsRegex = @"((?anderthalb|einundhalb)|(?dreiviertel))"; - public const string FractionHalfRegex = @"(einhalb)$"; - public static readonly string[] OneHalfTokens = { @"ein", @"halb" }; - public static readonly string FractionNounRegex = $@"(?<=\b)(({AllIntRegex})(\s*|\s*-\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(er|e|es)?|hälfte)|{FractionUnitsRegex})(?=\b)"; - public static readonly string FractionNounWithArticleRegex = $@"(?<=\b)(({AllIntRegex}\s+(und\s+)?)?eine?(\s+|\s*-\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(er|e|es)?|hälfte))|{AllIntRegex}ein(halb))(?=\b)"; + public const string FractionHalfRegex = @"(einhalb(es)?)$"; + public static readonly string[] OneHalfTokens = { @"ein", @"halb", @"halbes" }; + public static readonly string FractionNounRegex = $@"(?<=\b)(({AllIntRegex})(\s*|\s*-\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(e[rs]?)?|hälfte)|{FractionUnitsRegex})(?=\b)"; + public static readonly string FractionNounWithArticleRegex = $@"(?<=\b)(({AllIntRegex}\s+(und\s+)?)?eine?(\s+|\s*-\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(e[rs]?)?|hälfte))|{AllIntRegex}ein(halb))(?=\b)"; public static readonly string FractionPrepositionRegex = $@"(?({AllIntRegex})|((?({AllIntRegex})|(\d+)(?!\.))(?=\b)"; public static readonly string AllPointRegex = $@"((\s*{ZeroToNineIntegerRegex})+|(\s*{SeparaIntRegex}))"; public static readonly string AllFloatRegex = $@"({AllIntRegex}(\s*komma\s*){AllPointRegex})"; @@ -411,7 +411,7 @@ public static class NumbersDefinitions }; public static readonly Dictionary AmbiguityFiltersDict = new Dictionary { - { @"^[.]", @"" } + { @"^(tausend|hundert)$", @"(ed(ward(\s+m(\.)?)?)?|mary(\s+c(\.)?)?|joachim|claudia|franz|maria|klaus|prof(\.|essor)?|dr(\.)?|herr|fr[äa]u(lein)?|frl?\.)\s+(tausend|hundert)" } }; public static readonly Dictionary RelativeReferenceOffsetMap = new Dictionary { diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/Italian/DateTimeDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/Italian/DateTimeDefinitions.cs index 66c2849bea..17d2fd1ac0 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/Italian/DateTimeDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/Italian/DateTimeDefinitions.cs @@ -648,6 +648,7 @@ public static class DateTimeDefinitions public static readonly string[] DurationDateRestrictions = { }; public static readonly Dictionary AmbiguityFiltersDict = new Dictionary { + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"\bgiorno|pomeriggio|sera|notte\b", @"\b(buona?\s*(giorno|pomeriggio|sera|notte))\b" }, { @"^(apr|ago|dic|feb|gen|lug|giu|mar|mag|nov|ott|sett?)$", @"([$%£&!?@#])(apr|ago|dic|feb|gen|lug|giu|mar|mag|nov|ott|sett?)|(apr|ago|dic|feb|gen|lug|giu|mar|mag|nov|ott|sett?)([$%£&@#])" } }; diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/Portuguese/DateTimeDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/Portuguese/DateTimeDefinitions.cs index 2fa0f8f08d..699c7a6d89 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/Portuguese/DateTimeDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/Portuguese/DateTimeDefinitions.cs @@ -535,6 +535,7 @@ public static class DateTimeDefinitions public static readonly string[] DurationDateRestrictions = { }; public static readonly Dictionary AmbiguityFiltersDict = new Dictionary { + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$", @"([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])" } }; public static readonly IList EarlyMorningTermList = new List diff --git a/.NET/Microsoft.Recognizers.Definitions.Common/Spanish/DateTimeDefinitions.cs b/.NET/Microsoft.Recognizers.Definitions.Common/Spanish/DateTimeDefinitions.cs index cbb2441a61..dfc62c9cc6 100644 --- a/.NET/Microsoft.Recognizers.Definitions.Common/Spanish/DateTimeDefinitions.cs +++ b/.NET/Microsoft.Recognizers.Definitions.Common/Spanish/DateTimeDefinitions.cs @@ -603,6 +603,7 @@ public static class DateTimeDefinitions public static readonly string[] DurationDateRestrictions = { @"hoy" }; public static readonly Dictionary AmbiguityFiltersDict = new Dictionary { + { @"^\d{4}$", @"(\d\.\d{4}|\d{4}\.\d)" }, { @"^(este\s+)?mi(\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\s+viene))?$", @"\b(este\s+)?mi(\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\s+viene))?\b" }, { @"^a[nñ]o$", @"(? NonStandardSeparatorVariants = new List + { + @"es-mx", + @"es-do", + @"es-sv", + @"es-gt", + @"es-hn", + @"es-ni", + @"es-pa", + @"es-pr" + }; public const string HundredsNumberIntegerRegex = @"(cuatrocient[ao]s|trescient[ao]s|seiscient[ao]s|setecient[ao]s|ochocient[ao]s|novecient[ao]s|doscient[ao]s|quinient[ao]s|(?(? NonStandardSeparatorVariants => Enumerable.Empty(); + public IEnumerable NormalizeTokenSet(IEnumerable tokens, ParseResult context) { throw new NotImplementedException(); diff --git a/.NET/Microsoft.Recognizers.Text.DataDrivenTests/Number/TestNumber_SpanishMexican.cs b/.NET/Microsoft.Recognizers.Text.DataDrivenTests/Number/TestNumber_SpanishMexican.cs new file mode 100644 index 0000000000..7a84dcdd5e --- /dev/null +++ b/.NET/Microsoft.Recognizers.Text.DataDrivenTests/Number/TestNumber_SpanishMexican.cs @@ -0,0 +1,39 @@ +using Microsoft.Recognizers.Text.DataDrivenTests; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Microsoft.Recognizers.Text.Number.Tests +{ + [TestClass] + public class TestNumber_SpanishMexican : TestBase + { + [NetCoreTestDataSource] + [TestMethod] + public void NumberModel(TestModel testSpec) + { + TestNumber(testSpec); + } + + /* + [NetCoreTestDataSource] + [TestMethod] + public void OrdinalModel(TestModel testSpec) + { + TestNumber(testSpec); + } + + [NetCoreTestDataSource] + [TestMethod] + public void PercentModel(TestModel testSpec) + { + TestNumber(testSpec); + } + + [NetCoreTestDataSource] + [TestMethod] + public void NumberRangeModel(TestModel testSpec) + { + TestNumber(testSpec); + } + */ + } +} diff --git a/.NET/Microsoft.Recognizers.Text.DateTime/Config/IDateTimeOptionsConfiguration.cs b/.NET/Microsoft.Recognizers.Text.DateTime/Config/IDateTimeOptionsConfiguration.cs index 972795fb8c..c023d30390 100644 --- a/.NET/Microsoft.Recognizers.Text.DateTime/Config/IDateTimeOptionsConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text.DateTime/Config/IDateTimeOptionsConfiguration.cs @@ -1,6 +1,4 @@ -using Microsoft.Recognizers.Text.Config; - -namespace Microsoft.Recognizers.Text.DateTime +namespace Microsoft.Recognizers.Text.DateTime { public interface IDateTimeOptionsConfiguration : IConfiguration { diff --git a/.NET/Microsoft.Recognizers.Text.DateTime/DateTimeRecognizer.cs b/.NET/Microsoft.Recognizers.Text.DateTime/DateTimeRecognizer.cs index 0fbb1edc4b..2ebacecbf5 100644 --- a/.NET/Microsoft.Recognizers.Text.DateTime/DateTimeRecognizer.cs +++ b/.NET/Microsoft.Recognizers.Text.DateTime/DateTimeRecognizer.cs @@ -83,6 +83,14 @@ protected override void InitializeConfiguration() new BaseMergedDateTimeExtractor( new SpanishMergedExtractorConfiguration(new BaseDateTimeOptionsConfiguration(Culture.Spanish, options))))); + RegisterModel( + Culture.SpanishMexican, + options => new DateTimeModel( + new BaseMergedDateTimeParser( + new SpanishMergedParserConfiguration(new BaseDateTimeOptionsConfiguration(Culture.SpanishMexican, options))), + new BaseMergedDateTimeExtractor( + new SpanishMergedExtractorConfiguration(new BaseDateTimeOptionsConfiguration(Culture.SpanishMexican, options))))); + RegisterModel( Culture.French, options => new DateTimeModel( diff --git a/.NET/Microsoft.Recognizers.Text.Number/Config/INumberOptionsConfiguration.cs b/.NET/Microsoft.Recognizers.Text.Number/Config/INumberOptionsConfiguration.cs index 824b08b32a..e2305ec46a 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Config/INumberOptionsConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Config/INumberOptionsConfiguration.cs @@ -1,6 +1,4 @@ -using Microsoft.Recognizers.Text.Config; - -namespace Microsoft.Recognizers.Text.Number +namespace Microsoft.Recognizers.Text.Number { public interface INumberOptionsConfiguration : IConfiguration { diff --git a/.NET/Microsoft.Recognizers.Text.Number/English/Parsers/EnglishNumberParserConfiguration.cs b/.NET/Microsoft.Recognizers.Text.Number/English/Parsers/EnglishNumberParserConfiguration.cs index 45489063a7..a0d34559c1 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/English/Parsers/EnglishNumberParserConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/English/Parsers/EnglishNumberParserConfiguration.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Collections.Immutable; using System.Globalization; using System.Text.RegularExpressions; @@ -54,5 +55,7 @@ public EnglishNumberParserConfiguration(INumberOptionsConfiguration config) } public string NonDecimalSeparatorText { get; private set; } + + public override IEnumerable NonStandardSeparatorVariants => NumbersDefinitions.NonStandardSeparatorVariants; } } diff --git a/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs b/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs index e4ae863b6c..f998994ab5 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/NumberRecognizer.cs @@ -194,6 +194,34 @@ protected override void InitializeConfiguration() new BaseNumberOptionsConfiguration(Culture.Spanish, options))), new Spanish.NumberRangeExtractor(new BaseNumberOptionsConfiguration(Culture.Spanish, options)))); + RegisterModel( + Culture.SpanishMexican, + (options) => new NumberModel( + AgnosticNumberParserFactory.GetParser(AgnosticNumberParserType.Number, new SpanishNumberParserConfiguration( + new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options))), + Spanish.NumberExtractor.GetInstance(new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options, NumberMode.PureNumber)))); + + RegisterModel( + Culture.SpanishMexican, + (options) => new OrdinalModel( + AgnosticNumberParserFactory.GetParser(AgnosticNumberParserType.Ordinal, new SpanishNumberParserConfiguration( + new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options))), + Spanish.OrdinalExtractor.GetInstance(new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options)))); + + RegisterModel( + Culture.SpanishMexican, + (options) => new PercentModel( + AgnosticNumberParserFactory.GetParser(AgnosticNumberParserType.Percentage, new SpanishNumberParserConfiguration( + new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options))), + new Spanish.PercentageExtractor(new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options)))); + + RegisterModel( + Culture.SpanishMexican, + (options) => new NumberRangeModel( + new BaseNumberRangeParser(new SpanishNumberRangeParserConfiguration( + new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options))), + new Spanish.NumberRangeExtractor(new BaseNumberOptionsConfiguration(Culture.SpanishMexican, options)))); + RegisterModel( Culture.Portuguese, (options) => new NumberModel( diff --git a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseCJKNumberParser.cs b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseCJKNumberParser.cs index 7e942fe001..fb2752aacd 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseCJKNumberParser.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseCJKNumberParser.cs @@ -114,7 +114,7 @@ public override ParseResult Parse(ExtractResult extResult) // TODO: @Refactor this check to determine the subtype for JA and KO if ((Config.CultureInfo.Name == "ja-JP" || Config.CultureInfo.Name == "ko-KR") && ret != null) { - ret.Type = DetermineType(extResult); + ret.Type = DetermineType(extResult, ret); ret.Text = ret.Text.ToLowerInvariant(); } diff --git a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseIndianNumberParser.cs b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseIndianNumberParser.cs index dbc4a8fc2d..1e703eb2f2 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseIndianNumberParser.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseIndianNumberParser.cs @@ -45,12 +45,12 @@ public override ParseResult FracLikeNumberParse(ExtractResult extResult) var denominator = match.Groups["denominator"].Value; var smallValue = char.IsDigit(numerator[0]) ? - GetDigitalValue(numerator, 1) : - GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, numerator)); + GetDigitalValue(numerator, 1) : + GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, numerator)); var bigValue = char.IsDigit(denominator[0]) ? - GetDigitalValue(denominator, 1) : - GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, denominator)); + GetDigitalValue(denominator, 1) : + GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, denominator)); result.Value = smallValue / bigValue; } @@ -64,12 +64,12 @@ public override ParseResult FracLikeNumberParse(ExtractResult extResult) var denominator = match.Groups["denominator"].Value; var smallValue = char.IsDigit(numerator[0]) ? - GetDigitalValue(numerator, 1) : - GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, numerator)); + GetDigitalValue(numerator, 1) : + GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, numerator)); var bigValue = char.IsDigit(denominator[0]) ? - GetDigitalValue(denominator, 1) : - GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, denominator)); + GetDigitalValue(denominator, 1) : + GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, denominator)); result.Value = smallValue / bigValue; } @@ -274,28 +274,14 @@ public override double GetDigitalValue(string digitsStr, double power) { double temp = 0; double scale = 10; - var decimalSeparator = false; + var decimalSeparatorFound = false; var strLength = digitsStr.Length; var isNegative = false; var isFrac = digitsStr.Contains('/'); - var lastDecimalSeparator = -1; - var lastNonDecimalSeparator = -1; var hasSingleSeparator = false; - if (Config.IsMultiDecimalSeparatorCulture) - { - lastDecimalSeparator = digitsStr.LastIndexOf(Config.DecimalSeparatorChar); - lastNonDecimalSeparator = digitsStr.LastIndexOf(Config.NonDecimalSeparatorChar); - - if ((lastDecimalSeparator < 0 && lastNonDecimalSeparator >= 0) || - (lastNonDecimalSeparator < 0 && lastDecimalSeparator >= 0)) - { - hasSingleSeparator = true; - } - } - var calStack = new Stack(); for (var i = 0; i < digitsStr.Length; i++) @@ -303,7 +289,7 @@ public override double GetDigitalValue(string digitsStr, double power) var ch = digitsStr[i]; var prevCh = (i > 0) ? digitsStr[i - 1] : '\0'; - var skippableNonDecimal = SkipNonDecimalSeparator(ch, strLength - i, hasSingleSeparator, prevCh); + var skippableNonDecimal = SkipNonDecimalSeparator(ch, strLength - i, i, hasSingleSeparator, prevCh, Config.NonDecimalSeparatorChar); if (!isFrac && (ch == ' ' || ch == Constants.NO_BREAK_SPACE || skippableNonDecimal)) { @@ -317,7 +303,7 @@ public override double GetDigitalValue(string digitsStr, double power) } else if (ch >= '0' && ch <= '9') { - if (decimalSeparator) + if (decimalSeparatorFound) { temp += scale * (ch - '0'); scale *= 0.1; @@ -329,7 +315,7 @@ public override double GetDigitalValue(string digitsStr, double power) } else if (ch == Config.DecimalSeparatorChar || (!skippableNonDecimal && ch == Config.NonDecimalSeparatorChar)) { - decimalSeparator = true; + decimalSeparatorFound = true; scale = 0.1; } else if (ch == '-') @@ -341,7 +327,7 @@ public override double GetDigitalValue(string digitsStr, double power) // handle Devanagari numerals defined in ZeroToNineMap if (char.IsDigit(ch)) { - if (decimalSeparator) + if (decimalSeparatorFound) { temp += Config.ZeroToNineMap[ch] * scale; scale *= 0.1; diff --git a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseNumberParser.cs b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseNumberParser.cs index 7488811a78..57170b6254 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseNumberParser.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BaseNumberParser.cs @@ -9,10 +9,12 @@ namespace Microsoft.Recognizers.Text.Number public class BaseNumberParser : IParser { private static readonly Regex LongFormRegex = - new Regex(@"\d+", RegexOptions.Singleline); + new Regex(@"\d+", RegexOptions.Singleline | RegexOptions.Compiled); private readonly bool isMultiDecimalSeparatorCulture = false; + private readonly bool isNonStandardSeparatorVariant = false; + private readonly bool isCompoundNumberLanguage = false; public BaseNumberParser(INumberParserConfiguration config) @@ -29,6 +31,8 @@ public BaseNumberParser(INumberParserConfiguration config) { RoundNumberSet.Add(roundNumber); } + + isNonStandardSeparatorVariant = Config.NonStandardSeparatorVariants.Contains(Config.CultureInfo.Name.ToLowerInvariant()); } internal IEnumerable SupportedTypes { get; set; } @@ -131,6 +135,7 @@ public virtual ParseResult Parse(ExtractResult extResult) } else if (ret?.Value != null) { + if (isNegative) { // Recover the original extracted Text @@ -168,7 +173,7 @@ public virtual ParseResult Parse(ExtractResult extResult) if (ret != null) { - ret.Type = DetermineType(extResult); + ret.Type = DetermineType(extResult, ret); ret.Text = ret.Text.ToLowerInvariant(); } @@ -193,7 +198,7 @@ public virtual ParseResult PowerNumberParse(ExtractResult extResult) var calStack = new Queue(); double scale = 10; - var dot = false; + var decimalSeparatorFound = false; var isNegative = false; double tmp = 0; for (var i = 0; i < handle.Length; i++) @@ -212,12 +217,12 @@ public virtual ParseResult PowerNumberParse(ExtractResult extResult) tmp = 0; scale = 10; - dot = false; + decimalSeparatorFound = false; isNegative = false; } else if (ch >= '0' && ch <= '9') { - if (dot) + if (decimalSeparatorFound) { tmp += scale * (ch - '0'); scale *= 0.1; @@ -229,7 +234,7 @@ public virtual ParseResult PowerNumberParse(ExtractResult extResult) } else if (ch == Config.DecimalSeparatorChar) { - dot = true; + decimalSeparatorFound = true; scale = 0.1; } else if (ch == '-') @@ -352,12 +357,12 @@ public virtual ParseResult FracLikeNumberParse(ExtractResult extResult) var denominator = match.Groups["denominator"].Value; var smallValue = char.IsDigit(numerator[0]) ? - GetDigitalValue(numerator, 1) : - GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, numerator)); + GetDigitalValue(numerator, 1) : + GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, numerator)); var bigValue = char.IsDigit(denominator[0]) ? - GetDigitalValue(denominator, 1) : - GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, denominator)); + GetDigitalValue(denominator, 1) : + GetIntValue(Utilities.RegExpUtility.GetMatches(this.TextNumberRegex, denominator)); result.Value = smallValue / bigValue; } @@ -516,7 +521,7 @@ public virtual ParseResult DigitNumberParse(ExtractResult extResult) Length = extResult.Length, Text = extResult.Text, Type = extResult.Type, - Metadata = extResult.Metadata, + Metadata = extResult.Metadata != null ? extResult.Metadata : new Metadata(), }; // [1] 24 @@ -551,7 +556,9 @@ public virtual ParseResult DigitNumberParse(ExtractResult extResult) } // Scale used in calculating double - result.Value = GetDigitalValue(extText, power); + var value = GetDigitalValue(extText, power); + result.Value = value; + result.Metadata.TreatAsInteger = (value % 1) == 0; return result; } @@ -560,32 +567,69 @@ public virtual double GetDigitalValue(string digitsStr, double power) { double temp = 0; double scale = 10; - var decimalSeparator = false; - var strLength = digitsStr.Length; + var hasDecimalSeparator = false; var isNegative = false; + var strLength = digitsStr.Length; var isFrac = digitsStr.Contains('/'); + // As some languages use different separators depending on variant, some pre-processing is required to allow for unified processing. + + // Default separators from general language config + var decimalSeparator = Config.DecimalSeparatorChar; + var nonDecimalSeparator = Config.NonDecimalSeparatorChar; + var lastDecimalSeparator = -1; var lastNonDecimalSeparator = -1; + var firstNonDecimalSeparator = int.MaxValue; var hasSingleSeparator = false; if (Config.IsMultiDecimalSeparatorCulture) { - lastDecimalSeparator = digitsStr.LastIndexOf(Config.DecimalSeparatorChar); - lastNonDecimalSeparator = digitsStr.LastIndexOf(Config.NonDecimalSeparatorChar); - if ((lastDecimalSeparator < 0 && lastNonDecimalSeparator >= 0) || - (lastNonDecimalSeparator < 0 && lastDecimalSeparator >= 0)) + if (isNonStandardSeparatorVariant) + { + // Reverse separators + decimalSeparator = Config.NonDecimalSeparatorChar; + nonDecimalSeparator = Config.DecimalSeparatorChar; + } + + for (int i = 0; i < strLength; i++) + { + var ch = digitsStr[i]; + if (ch == decimalSeparator) + { + lastDecimalSeparator = i; + } + else if (ch == nonDecimalSeparator) + { + lastNonDecimalSeparator = i; + if (firstNonDecimalSeparator == int.MaxValue) + { + firstNonDecimalSeparator = i; + } + } + } + + if (((lastDecimalSeparator < 0 && lastNonDecimalSeparator >= 0) || (lastNonDecimalSeparator < 0 && lastDecimalSeparator >= 0)) && + firstNonDecimalSeparator == lastNonDecimalSeparator) { hasSingleSeparator = true; } + else if ((lastDecimalSeparator < lastNonDecimalSeparator) && !(lastDecimalSeparator == -1 || lastNonDecimalSeparator == -1)) + { + // Switch separators + var aux = decimalSeparator; + decimalSeparator = nonDecimalSeparator; + nonDecimalSeparator = aux; + } + } // Try to parse vulgar fraction chars - if (digitsStr.Length == 1 && !char.IsDigit(digitsStr.ToCharArray()[0])) + if (!isFrac && strLength == 1 && !char.IsDigit(digitsStr[0])) { - double fracResult = char.GetNumericValue(digitsStr.ToCharArray()[0]); + double fracResult = char.GetNumericValue(digitsStr, 0); if (fracResult != -1.0) { @@ -595,12 +639,12 @@ public virtual double GetDigitalValue(string digitsStr, double power) var calStack = new Stack(); - for (var i = 0; i < digitsStr.Length; i++) + for (var i = 0; i < strLength; i++) { var ch = digitsStr[i]; var prevCh = (i > 0) ? digitsStr[i - 1] : '\0'; - var skippableNonDecimal = SkipNonDecimalSeparator(ch, strLength - i, hasSingleSeparator, prevCh); + var skippableNonDecimal = SkipNonDecimalSeparator(ch, strLength - i, i, hasSingleSeparator, prevCh, nonDecimalSeparator); if (!isFrac && (ch == ' ' || ch == Constants.NO_BREAK_SPACE || skippableNonDecimal)) { @@ -614,7 +658,7 @@ public virtual double GetDigitalValue(string digitsStr, double power) } else if (ch >= '0' && ch <= '9') { - if (decimalSeparator) + if (hasDecimalSeparator) { temp += scale * (ch - '0'); scale *= 0.1; @@ -624,9 +668,9 @@ public virtual double GetDigitalValue(string digitsStr, double power) temp = (temp * scale) + (ch - '0'); } } - else if (ch == Config.DecimalSeparatorChar || (!skippableNonDecimal && ch == Config.NonDecimalSeparatorChar)) + else if (ch == decimalSeparator || (!skippableNonDecimal && ch == nonDecimalSeparator)) { - decimalSeparator = true; + hasDecimalSeparator = true; scale = 0.1; } else if (ch == '-') @@ -827,7 +871,7 @@ protected static string GetKeyRegex(IEnumerable keyCollection) return string.Join("|", sortKeys); } - protected static string DetermineType(ExtractResult er) + protected static string DetermineType(ExtractResult er, ParseResult pr) { if (!string.IsNullOrEmpty(er.Type) && er.Type.Contains(Constants.MODEL_ORDINAL)) { @@ -849,11 +893,11 @@ protected static string DetermineType(ExtractResult er) } else if (data.StartsWith(Constants.INTEGER_PREFIX, StringComparison.Ordinal)) { - subType = Constants.INTEGER; + subType = (pr.Metadata == null || pr.Metadata.TreatAsInteger) ? Constants.INTEGER : Constants.DECIMAL; } else if (data.StartsWith(Constants.DOUBLE_PREFIX, StringComparison.Ordinal)) { - subType = Constants.DECIMAL; + subType = (pr.Metadata == null || !pr.Metadata.TreatAsInteger) ? Constants.DECIMAL : Constants.INTEGER; } } @@ -895,17 +939,20 @@ protected string GetResolutionStr(object value) // "1.000" can be ambiguous and should return "1000" by default // If only one separator and not three digits to the right, interpret as decimal separator // "100.00" = "100,00" -> "100" - protected bool SkipNonDecimalSeparator(char ch, int distance, bool hasSingleSeparator, char prevCh) + protected bool SkipNonDecimalSeparator(char ch, int distanceEnd, int distanceStart, bool hasSingleSeparator, char prevCh, char nonDecimalSeparator) { - const int decimalLength = 3; - bool result = false; - if (ch == Config.NonDecimalSeparatorChar) + const int decimalLength = 1 + 3; + + if (ch == nonDecimalSeparator) { - if (!(isMultiDecimalSeparatorCulture && (distance <= decimalLength))) + result = true; + + if (isMultiDecimalSeparatorCulture && hasSingleSeparator && + (distanceEnd != decimalLength || (prevCh == '0' && distanceStart == 1) || distanceStart > 3)) { - result = true; + result = false; } } diff --git a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BasePercentageParser.cs b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BasePercentageParser.cs index 28bbe0527e..f470c94220 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Parsers/BasePercentageParser.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Parsers/BasePercentageParser.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Globalization; namespace Microsoft.Recognizers.Text.Number { diff --git a/.NET/Microsoft.Recognizers.Text.Number/Parsers/INumberParserConfiguration.cs b/.NET/Microsoft.Recognizers.Text.Number/Parsers/INumberParserConfiguration.cs index 76d6b23271..730adae44b 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Parsers/INumberParserConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Parsers/INumberParserConfiguration.cs @@ -42,6 +42,10 @@ public interface INumberParserConfiguration char DecimalSeparatorChar { get; } + bool IsMultiDecimalSeparatorCulture { get; } + + IEnumerable NonStandardSeparatorVariants { get; } + string WordSeparatorToken { get; } IEnumerable WrittenDecimalSeparatorTexts { get; } @@ -56,8 +60,6 @@ public interface INumberParserConfiguration bool IsCompoundNumberLanguage { get; } - bool IsMultiDecimalSeparatorCulture { get; } - /// /// Used when requiring to normalize a token to a valid expression supported by the ImmutableDictionaries (language dictionaries). /// @@ -121,6 +123,10 @@ public class BaseNumberParserConfiguration : INumberParserConfiguration public char DecimalSeparatorChar { get; set; } + public bool IsMultiDecimalSeparatorCulture { get; set; } + + public virtual IEnumerable NonStandardSeparatorVariants => Enumerable.Empty(); + public string WordSeparatorToken { get; set; } public IEnumerable WrittenDecimalSeparatorTexts { get; set; } @@ -135,8 +141,6 @@ public class BaseNumberParserConfiguration : INumberParserConfiguration public bool IsCompoundNumberLanguage { get; set; } - public bool IsMultiDecimalSeparatorCulture { get; set; } - public virtual long ResolveCompositeNumber(string numberStr) { if (numberStr.Contains("-")) diff --git a/.NET/Microsoft.Recognizers.Text.Number/Spanish/Parsers/SpanishNumberParserConfiguration.cs b/.NET/Microsoft.Recognizers.Text.Number/Spanish/Parsers/SpanishNumberParserConfiguration.cs index 73a29fa76a..b848e51ec5 100644 --- a/.NET/Microsoft.Recognizers.Text.Number/Spanish/Parsers/SpanishNumberParserConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text.Number/Spanish/Parsers/SpanishNumberParserConfiguration.cs @@ -53,6 +53,8 @@ public SpanishNumberParserConfiguration(INumberOptionsConfiguration config) public string NonDecimalSeparatorText { get; private set; } + public override IEnumerable NonStandardSeparatorVariants => NumbersDefinitions.NonStandardSeparatorVariants; + public override IEnumerable NormalizeTokenSet(IEnumerable tokens, ParseResult context) { var result = new List(); diff --git a/.NET/Microsoft.Recognizers.Text.NumberWithUnit/NumberWithUnitRecognizer.cs b/.NET/Microsoft.Recognizers.Text.NumberWithUnit/NumberWithUnitRecognizer.cs index a5da1a09ea..a7b051d218 100644 --- a/.NET/Microsoft.Recognizers.Text.NumberWithUnit/NumberWithUnitRecognizer.cs +++ b/.NET/Microsoft.Recognizers.Text.NumberWithUnit/NumberWithUnitRecognizer.cs @@ -527,6 +527,46 @@ protected override void InitializeConfiguration() }, })); + RegisterModel( + Culture.SpanishMexican, + (options) => new CurrencyModel(new Dictionary + { + { + new BaseMergedUnitExtractor(new Spanish.CurrencyExtractorConfiguration()), + new BaseMergedUnitParser(new Spanish.CurrencyParserConfiguration()) + }, + })); + + RegisterModel( + Culture.SpanishMexican, + (options) => new TemperatureModel(new Dictionary + { + { + new NumberWithUnitExtractor(new Spanish.TemperatureExtractorConfiguration()), + new NumberWithUnitParser(new Spanish.TemperatureParserConfiguration()) + }, + })); + + RegisterModel( + Culture.SpanishMexican, + (options) => new DimensionModel(new Dictionary + { + { + new NumberWithUnitExtractor(new Spanish.DimensionExtractorConfiguration()), + new NumberWithUnitParser(new Spanish.DimensionParserConfiguration()) + }, + })); + + RegisterModel( + Culture.SpanishMexican, + (options) => new AgeModel(new Dictionary + { + { + new NumberWithUnitExtractor(new Spanish.AgeExtractorConfiguration()), + new NumberWithUnitParser(new Spanish.AgeParserConfiguration()) + }, + })); + RegisterModel( Culture.Swedish, (options) => new AgeModel(new Dictionary diff --git a/.NET/Microsoft.Recognizers.Text/Config/IConfiguration.cs b/.NET/Microsoft.Recognizers.Text/Config/IConfiguration.cs index 1524e00c33..8672c947ad 100644 --- a/.NET/Microsoft.Recognizers.Text/Config/IConfiguration.cs +++ b/.NET/Microsoft.Recognizers.Text/Config/IConfiguration.cs @@ -1,9 +1,11 @@ -namespace Microsoft.Recognizers.Text.Config +namespace Microsoft.Recognizers.Text { public interface IConfiguration { string Culture { get; } + // string RequestedCulture { get; } + } } diff --git a/.NET/Microsoft.Recognizers.Text/Culture.cs b/.NET/Microsoft.Recognizers.Text/Culture.cs index d9e7461930..dbe320c103 100644 --- a/.NET/Microsoft.Recognizers.Text/Culture.cs +++ b/.NET/Microsoft.Recognizers.Text/Culture.cs @@ -10,6 +10,7 @@ public sealed class Culture public const string EnglishOthers = "en-*"; public const string Chinese = "zh-cn"; public const string Spanish = "es-es"; + public const string SpanishMexican = "es-mx"; // Temporary workaround for language variant config issue public const string Portuguese = "pt-br"; public const string French = "fr-fr"; public const string German = "de-de"; @@ -29,6 +30,7 @@ public sealed class Culture new Culture("English", English), new Culture("Chinese", Chinese), new Culture("Spanish", Spanish), + new Culture("SpanishMexican", SpanishMexican), new Culture("Portuguese", Portuguese), new Culture("French", French), new Culture("German", German), diff --git a/.NET/Microsoft.Recognizers.Text/Extractors/Metadata.cs b/.NET/Microsoft.Recognizers.Text/Extractors/Metadata.cs index 8ee125ab57..b1b3e1b408 100644 --- a/.NET/Microsoft.Recognizers.Text/Extractors/Metadata.cs +++ b/.NET/Microsoft.Recognizers.Text/Extractors/Metadata.cs @@ -24,6 +24,9 @@ public class Metadata public bool IsMealtime { get; set; } = false; + // For cases where a language has variations in handling decimal separators + public bool TreatAsInteger { get; set; } = false; + public Metadata Clone() { return (Metadata)MemberwiseClone(); diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/ChineseDateTime.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/ChineseDateTime.java index 9fb4924648..54ac68eabe 100644 --- a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/ChineseDateTime.java +++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/ChineseDateTime.java @@ -332,6 +332,18 @@ public class ChineseDateTime { public static final String DateTimePeriodNumberCombinedWithUnit = "\\b(?\\d+(\\.\\d*)?){DateTimePeriodUnitRegex}" .replace("{DateTimePeriodUnitRegex}", DateTimePeriodUnitRegex); + public static final String DurationAllRegex = "^[.]"; + + public static final String DurationHalfRegex = "^[.]"; + + public static final String DurationRelativeDurationUnitRegex = "^[.]"; + + public static final String DurationDuringRegex = "^[.]"; + + public static final String DurationSomeRegex = "^[.]"; + + public static final String DurationMoreOrLessRegex = "^[.]"; + public static final String DurationYearRegex = "((\\d{3,4})|0\\d|两千)\\s*年"; public static final String DurationHalfSuffixRegex = "半"; diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/EnglishDateTime.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/EnglishDateTime.java index 8365145b49..95d42de217 100644 --- a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/EnglishDateTime.java +++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/EnglishDateTime.java @@ -1400,6 +1400,7 @@ public class EnglishDateTime { public static final List DurationDateRestrictions = Arrays.asList("today", "now"); public static final ImmutableMap AmbiguityFiltersDict = ImmutableMap.builder() + .put("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)") .put("^(morning|afternoon|evening|night|day)\\b", "\\b(good\\s+(morning|afternoon|evening|night|day))|(nighty\\s+night)\\b") .put("\\bnow\\b", "\\b(^now,)|\\b((is|are)\\s+now\\s+for|for\\s+now)\\b") .put("\\bmay\\b", "\\b((((!|\\.|\\?|,|;|)\\s+|^)may i)|(i|you|he|she|we|they)\\s+may|(may\\s+((((also|not|(also not)|well)\\s+)?(be|ask|contain|constitute|e-?mail|take|have|result|involve|get|work|reply|differ))|(or may not))))\\b") diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/FrenchDateTime.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/FrenchDateTime.java index 2ad4ca224e..344d2b680a 100644 --- a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/FrenchDateTime.java +++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/FrenchDateTime.java @@ -41,6 +41,8 @@ public class FrenchDateTime { public static final String DayRegex = "(?(?:3[0-1]|[1-2]\\d|0?[1-9])(e(r)?)?)(?=\\b|t)"; + public static final String WrittenDayRegex = "(?((vingt|trente)(\\s*-\\s*|\\s+)et(\\s*-\\s*|\\s+))?un|(vingt(\\s*-\\s*|\\s+))?(deux|trois|quatre|cinq|six|sept|huit|neuf)|dix|onze|douze|treize|quatorze|quinze|seize|dix-(sept|huit|neuf)|vingt|trente)"; + public static final String MonthNumRegex = "(?1[0-2]|(0)?[1-9])\\b"; public static final String SpecialDescRegex = "(p\\b)"; @@ -331,7 +333,7 @@ public class FrenchDateTime { public static final String MidmorningRegex = "(?milieu\\s*d[ue]\\s*{MorningRegex})" .replace("{MorningRegex}", MorningRegex); - public static final String MiddayRegex = "(?milieu(\\s*|-)d[eu]\\s*(jour|midi)|apr[eè]s(-|\\s*)midi)"; + public static final String MiddayRegex = "(?milieu(\\s*|-)d[eu]\\s*(jour|midi)|apr[eè]s(-|\\s*)midi|(?<=\\bà\\s+)midi)"; public static final String MidafternoonRegex = "(?milieu\\s*d'+{AfternoonRegex})" .replace("{AfternoonRegex}", AfternoonRegex); @@ -677,9 +679,11 @@ public class FrenchDateTime { public static final String ForTheRegex = "\\b(((pour le {FlexibleDayRegex})|(dans (le\\s+)?{FlexibleDayRegex}(?<=(st|nd|rd|th))))(?\\s*(,|\\.|!|\\?|$)))" .replace("{FlexibleDayRegex}", FlexibleDayRegex); - public static final String WeekDayAndDayOfMonthRegex = "\\b{WeekDayRegex}\\s+(le\\s+{FlexibleDayRegex})\\b" + public static final String WeekDayAndDayOfMonthRegex = "\\b({WeekDayRegex}\\s+(le\\s+{FlexibleDayRegex})|le\\s+(?{DayRegex}|{WrittenDayRegex})\\s+{WeekDayRegex})\\b" .replace("{WeekDayRegex}", WeekDayRegex) - .replace("{FlexibleDayRegex}", FlexibleDayRegex); + .replace("{FlexibleDayRegex}", FlexibleDayRegex) + .replace("{DayRegex}", DayRegex) + .replace("{WrittenDayRegex}", WrittenDayRegex); public static final String WeekDayAndDayRegex = "\\b{WeekDayRegex}\\s+(?!(the)){DayRegex}(?!([-:]|(\\s+({AmDescRegex}|{PmDescRegex}|{OclockRegex}))))\\b" .replace("{WeekDayRegex}", WeekDayRegex) @@ -1202,6 +1206,7 @@ public class FrenchDateTime { public static final List DurationDateRestrictions = Arrays.asList(); public static final ImmutableMap AmbiguityFiltersDict = ImmutableMap.builder() + .put("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)") .put("^([eé]t[eé])$", "(?(?:3[0-1]|[1-2]\\d|0?[1-9]))(?=\\b|t)"; + public static final String WrittenDayRegex = "(?(vinte\\s+e\\s+)?(um|dois|tr[eê]s|quatro|cinco|seis|sete|oito|nove)|dez|onze|doze|treze|(c|qu)atorze|quinze|dez[ae](s(seis|sete)|nove)|dezoito|vinte|trinta(\\s+e\\s+um)?)"; + public static final String MonthNumRegex = "(?1[0-2]|(0)?[1-9])\\b"; public static final String AmDescRegex = "({BaseDateTime.BaseAmDescRegex})" @@ -197,7 +199,13 @@ public class PortugueseDateTime { public static final String ForTheRegex = ".^"; - public static final String WeekDayAndDayOfMonthRegex = ".^"; + public static final String FlexibleDayRegex = "(?([a-z]+\\s)?({WrittenDayRegex}|{DayRegex}))" + .replace("{WrittenDayRegex}", WrittenDayRegex) + .replace("{DayRegex}", DayRegex); + + public static final String WeekDayAndDayOfMonthRegex = "\\b{WeekDayRegex}\\s+(dia\\s+{FlexibleDayRegex})\\b" + .replace("{WeekDayRegex}", WeekDayRegex) + .replace("{FlexibleDayRegex}", FlexibleDayRegex); public static final String WeekDayAndDayRegex = "\\b{WeekDayRegex}\\s+({DayRegex})(?!([-:/]|\\.\\d|(\\s+({AmDescRegex}|{PmDescRegex}|{OclockRegex}))))\\b" .replace("{WeekDayRegex}", WeekDayRegex) @@ -337,11 +345,29 @@ public class PortugueseDateTime { .replace("{BaseDateTime.MinuteRegex}", BaseDateTime.MinuteRegex) .replace("{BaseDateTime.SecondRegex}", BaseDateTime.SecondRegex); - public static final String AtRegex = "\\b((?<=\\b([aà]s?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b)?|(?<=\\b(s(er)?[aã]o|v[aã]o\\s+ser|^[eé]h?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b))(\\s+{OclockRegex})?\\b" + public static final String MidnightRegex = "(?meia\\s*(-\\s*)?noite)"; + + public static final String MidmorningRegex = "(?meio\\s+da\\s+manhã)"; + + public static final String MidEarlyMorning = "(?meio\\s+da\\s+madrugada)"; + + public static final String MidafternoonRegex = "(?meio\\s+da\\s+tarde)"; + + public static final String MiddayRegex = "(?meio\\s*(-\\s*)?dia)"; + + public static final String MidTimeRegex = "(?({MidnightRegex}|{MidmorningRegex}|{MidEarlyMorning}|{MidafternoonRegex}|{MiddayRegex}))" + .replace("{MidnightRegex}", MidnightRegex) + .replace("{MidmorningRegex}", MidmorningRegex) + .replace("{MidafternoonRegex}", MidafternoonRegex) + .replace("{MiddayRegex}", MiddayRegex) + .replace("{MidEarlyMorning}", MidEarlyMorning); + + public static final String AtRegex = "\\b(((?<=\\b([aà]s?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b)?|(?<=\\b(s(er)?[aã]o|v[aã]o\\s+ser|^[eé]h?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b))(\\s+{OclockRegex})?|{MidTimeRegex})\\b" .replace("{HourNumRegex}", HourNumRegex) .replace("{BaseDateTime.HourRegex}", BaseDateTime.HourRegex) .replace("{WrittenTimeRegex}", WrittenTimeRegex) - .replace("{OclockRegex}", OclockRegex); + .replace("{OclockRegex}", OclockRegex) + .replace("{MidTimeRegex}", MidTimeRegex); public static final String ConnectNumRegex = "({BaseDateTime.HourRegex}(?[0-5][0-9])\\s*{DescRegex})" .replace("{BaseDateTime.HourRegex}", BaseDateTime.HourRegex) @@ -403,8 +429,6 @@ public class PortugueseDateTime { .replace("{TensTimeRegex}", TensTimeRegex) .replace("{MinuteNumRegex}", MinuteNumRegex); - public static final String TimeRegex10 = "(\\b([àa]|ao?)|na|de|da|pela)\\s+(madrugada|manh[ãa]|meio\\s*dia|meia\\s*noite|tarde|noite)"; - public static final String TimeRegex11 = "\\b({WrittenTimeRegex})(\\s+{DescRegex})?\\b" .replace("{WrittenTimeRegex}", WrittenTimeRegex) .replace("{DescRegex}", DescRegex); @@ -836,6 +860,7 @@ public class PortugueseDateTime { .put("vinte e oito", 28) .put("vinte e nove", 29) .put("trinta", 30) + .put("trinta e um", 31) .build(); public static final ImmutableMap HolidayNames = ImmutableMap.builder() @@ -946,6 +971,7 @@ public class PortugueseDateTime { public static final List DurationDateRestrictions = Arrays.asList(); public static final ImmutableMap AmbiguityFiltersDict = ImmutableMap.builder() + .put("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)") .put("^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$", "([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])") .build(); diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/SpanishDateTime.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/SpanishDateTime.java index c981e993d5..75c771394f 100644 --- a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/SpanishDateTime.java +++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/resources/SpanishDateTime.java @@ -238,11 +238,11 @@ public class SpanishDateTime { public static final String SpecialDayWithNumRegex = "^[.]"; - public static final String FlexibleDayRegex = "(?([A-Za-z]+\\s)?({WrittenDayRegex}|{DayRegex}))" + public static final String FlexibleDayRegex = "(?([a-z]+\\s)?({WrittenDayRegex}|{DayRegex}))" .replace("{WrittenDayRegex}", WrittenDayRegex) .replace("{DayRegex}", DayRegex); - public static final String ForTheRegex = "\\b((((?<=para\\s+el\\s+){FlexibleDayRegex})|((?\\s*(,|\\.(?![º°ª])|!|\\?|-|$))(?!\\d))" + public static final String ForTheRegex = "\\b((((?<=para\\s+el\\s+){FlexibleDayRegex})|((?\\s*(,|\\.(?![º°ª])|!|\\?|-|$))(?!\\d))" .replace("{FlexibleDayRegex}", FlexibleDayRegex) .replace("{MonthRegex}", MonthRegex) .replace("{WeekDayRegex}", WeekDayRegex); @@ -418,7 +418,7 @@ public class SpanishDateTime { .replace("{BaseDateTime.MinuteRegex}", BaseDateTime.MinuteRegex) .replace("{BaseDateTime.SecondRegex}", BaseDateTime.SecondRegex); - public static final String MidTimeRegex = "(?((?media\\s*noche)|(?media\\s*mañana)|(?media\\s*tarde)|(?medio\\s*d[ií]a)))"; + public static final String MidTimeRegex = "(?((?media\\s*noche)|(?media\\s*madrugada)|(?media\\s*mañana)|(?media\\s*tarde)|(?medio\\s*d[ií]a)))"; public static final String AtRegex = "\\b((?<=\\b((a|de(sde)?)\\s+las?|al)\\s+)(({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})\\b(\\s*\\bh\\b)?(DescRegex)?|{MidTimeRegex})|{MidTimeRegex})" .replace("{HourNumRegex}", HourNumRegex) @@ -493,8 +493,6 @@ public class SpanishDateTime { .replace("{TensTimeRegex}", TensTimeRegex) .replace("{MinuteNumRegex}", MinuteNumRegex); - public static final String TimeRegex10 = "(a\\s+la|al)\\s+(madrugada|mañana|tarde|noche)"; - public static final String TimeRegex11 = "\\b({WrittenTimeRegex})(\\s+{DescRegex})?\\b" .replace("{WrittenTimeRegex}", WrittenTimeRegex) .replace("{DescRegex}", DescRegex); @@ -558,7 +556,7 @@ public class SpanishDateTime { public static final String UnitRegex = "(?años?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\\s+de\\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\\b"; - public static final String ConnectorRegex = "^(,|t|(para|y|a|en|por) las?|(\\s*,\\s*)?(cerca|alrededor) de las?)$"; + public static final String ConnectorRegex = "^(,|t|(para|y|a|en|por) las?|(\\s*,\\s*)?((cerca|alrededor)\\s+)?(de\\s+las?|del))$"; public static final String TimeHourNumRegex = "(?veint(i(uno|dos|tres|cuatro)|e)|cero|uno|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieci(s([eé])is|siete|ocho|nueve))"; @@ -1165,6 +1163,7 @@ public class SpanishDateTime { public static final List DurationDateRestrictions = Arrays.asList("hoy"); public static final ImmutableMap AmbiguityFiltersDict = ImmutableMap.builder() + .put("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)") .put("^(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?$", "\\b(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?\\b") .put("^a[nñ]o$", "(? EarlyMorningTermList = Arrays.asList("madrugada"); public static final List MorningTermList = Arrays.asList("mañana", "la mañana"); diff --git a/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/BaseNumbers.java b/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/BaseNumbers.java index bed1156a8a..cc3ec1ccf8 100644 --- a/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/BaseNumbers.java +++ b/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/BaseNumbers.java @@ -20,7 +20,7 @@ public class BaseNumbers { public static final String FractionNumberReplaceToken = "@builtin.num.fraction"; public static String IntegerRegexDefinition(String placeholder, String thousandsmark) { - return "(((? NonStandardSeparatorVariants = Arrays.asList("en-za", "en-na", "en-zw"); + public static final String RoundNumberIntegerRegex = "(?:hundred|thousand|million|mln|billion|bln|trillion|tln|lakh|crore)s?"; public static final String ZeroToNineIntegerRegex = "(?:three|seven|eight|four|five|zero|nine|one|two|six)"; diff --git a/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/GermanNumeric.java b/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/GermanNumeric.java index b5db0b96ce..7d612ebc02 100644 --- a/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/GermanNumeric.java +++ b/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/GermanNumeric.java @@ -33,7 +33,7 @@ public class GermanNumeric { public static final String TenToNineteenIntegerRegex = "(siebzehn|dreizehn|vierzehn|achtzehn|neunzehn|fuenfzehn|sechzehn|elf|zwoelf|zwölf|zehn)"; - public static final String TensNumberIntegerRegex = "(siebzig|zwanzig|dreißig|achtzig|neunzig|vierzig|fuenfzig|fünfzig|sechzig)"; + public static final String TensNumberIntegerRegex = "(siebzig|zwanzig|dreißig|achtzig|neunzig|vierzig|fuenfzig|fünfzig|sechzig|hundert|tausend)"; public static final String NegativeNumberTermsRegex = "^[.]"; @@ -117,17 +117,17 @@ public static String NumbersWithPlaceHolder(String placeholder) { public static final String FractionUnitsRegex = "((?anderthalb|einundhalb)|(?dreiviertel))"; - public static final String FractionHalfRegex = "(einhalb)$"; + public static final String FractionHalfRegex = "(einhalb(es)?)$"; - public static final List OneHalfTokens = Arrays.asList("ein", "halb"); + public static final List OneHalfTokens = Arrays.asList("ein", "halb", "halbes"); - public static final String FractionNounRegex = "(?<=\\b)(({AllIntRegex})(\\s*|\\s*-\\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(er|e|es)?|hälfte)|{FractionUnitsRegex})(?=\\b)" + public static final String FractionNounRegex = "(?<=\\b)(({AllIntRegex})(\\s*|\\s*-\\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(e[rs]?)?|hälfte)|{FractionUnitsRegex})(?=\\b)" .replace("{AllIntRegex}", AllIntRegex) .replace("{AllOrdinalRegex}", AllOrdinalRegex) .replace("{RoundNumberOrdinalRegex}", RoundNumberOrdinalRegex) .replace("{FractionUnitsRegex}", FractionUnitsRegex); - public static final String FractionNounWithArticleRegex = "(?<=\\b)(({AllIntRegex}\\s+(und\\s+)?)?eine?(\\s+|\\s*-\\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(er|e|es)?|hälfte))|{AllIntRegex}ein(halb))(?=\\b)" + public static final String FractionNounWithArticleRegex = "(?<=\\b)(({AllIntRegex}\\s+(und\\s+)?)?eine?(\\s+|\\s*-\\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(e[rs]?)?|hälfte))|{AllIntRegex}ein(halb))(?=\\b)" .replace("{AllIntRegex}", AllIntRegex) .replace("{AllOrdinalRegex}", AllOrdinalRegex) .replace("{RoundNumberOrdinalRegex}", RoundNumberOrdinalRegex) @@ -596,7 +596,7 @@ public static String DoubleWithoutIntegralRegex(String placeholder) { .build(); public static final ImmutableMap AmbiguityFiltersDict = ImmutableMap.builder() - .put("^[.]", "") + .put("^(tausend|hundert)$", "(ed(ward(\\s+m(\\.)?)?)?|mary(\\s+c(\\.)?)?|joachim|claudia|franz|maria|klaus|prof(\\.|essor)?|dr(\\.)?|herr|fr[äa]u(lein)?|frl?\\.)\\s+(tausend|hundert)") .build(); public static final ImmutableMap RelativeReferenceOffsetMap = ImmutableMap.builder() diff --git a/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/SpanishNumeric.java b/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/SpanishNumeric.java index c4940e4a29..94cd1afe0c 100644 --- a/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/SpanishNumeric.java +++ b/Java/libraries/recognizers-text-number/src/main/java/com/microsoft/recognizers/text/number/resources/SpanishNumeric.java @@ -25,17 +25,19 @@ public class SpanishNumeric { public static final Boolean MultiDecimalSeparatorCulture = true; + public static final List NonStandardSeparatorVariants = Arrays.asList("es-mx", "es-do", "es-sv", "es-gt", "es-hn", "es-ni", "es-pa", "es-pr"); + public static final String HundredsNumberIntegerRegex = "(cuatrocient[ao]s|trescient[ao]s|seiscient[ao]s|setecient[ao]s|ochocient[ao]s|novecient[ao]s|doscient[ao]s|quinient[ao]s|(?(?(小时|钟头|分钟|秒钟|时|分|秒))`; export const DateTimePeriodFollowedUnit = `^\\s*${DateTimePeriodUnitRegex}`; export const DateTimePeriodNumberCombinedWithUnit = `\\b(?\\d+(\\.\\d*)?)${DateTimePeriodUnitRegex}`; + export const DurationAllRegex = `^[.]`; + export const DurationHalfRegex = `^[.]`; + export const DurationRelativeDurationUnitRegex = `^[.]`; + export const DurationDuringRegex = `^[.]`; + export const DurationSomeRegex = `^[.]`; + export const DurationMoreOrLessRegex = `^[.]`; export const DurationYearRegex = `((\\d{3,4})|0\\d|两千)\\s*年`; export const DurationHalfSuffixRegex = `半`; export const DurationSuffixList: ReadonlyMap = new Map([["M", "分钟"],["S", "秒钟|秒"],["H", "个小时|小时|个钟头|钟头|时"],["D", "天"],["W", "星期|个星期|周"],["Mon", "个月"],["Y", "年"]]); diff --git a/JavaScript/packages/recognizers-date-time/src/resources/englishDateTime.ts b/JavaScript/packages/recognizers-date-time/src/resources/englishDateTime.ts index 5c43b6a95e..60fa414062 100644 --- a/JavaScript/packages/recognizers-date-time/src/resources/englishDateTime.ts +++ b/JavaScript/packages/recognizers-date-time/src/resources/englishDateTime.ts @@ -304,7 +304,7 @@ export namespace EnglishDateTime { export const DefaultLanguageFallback = `MDY`; export const SuperfluousWordList = [ "preferably","how about","maybe","perhaps","say","like" ]; export const DurationDateRestrictions = [ "today","now" ]; - export const AmbiguityFiltersDict: ReadonlyMap = new Map([["^(morning|afternoon|evening|night|day)\\b", "\\b(good\\s+(morning|afternoon|evening|night|day))|(nighty\\s+night)\\b"],["\\bnow\\b", "\\b(^now,)|\\b((is|are)\\s+now\\s+for|for\\s+now)\\b"],["\\bmay\\b", "\\b((((!|\\.|\\?|,|;|)\\s+|^)may i)|(i|you|he|she|we|they)\\s+may|(may\\s+((((also|not|(also not)|well)\\s+)?(be|ask|contain|constitute|e-?mail|take|have|result|involve|get|work|reply|differ))|(or may not))))\\b"],["\\b(a|one) second\\b", "\\b(? = new Map([["^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"],["^(morning|afternoon|evening|night|day)\\b", "\\b(good\\s+(morning|afternoon|evening|night|day))|(nighty\\s+night)\\b"],["\\bnow\\b", "\\b(^now,)|\\b((is|are)\\s+now\\s+for|for\\s+now)\\b"],["\\bmay\\b", "\\b((((!|\\.|\\?|,|;|)\\s+|^)may i)|(i|you|he|she|we|they)\\s+may|(may\\s+((((also|not|(also not)|well)\\s+)?(be|ask|contain|constitute|e-?mail|take|have|result|involve|get|work|reply|differ))|(or may not))))\\b"],["\\b(a|one) second\\b", "\\b(?ce(tte)?|au\\s+cours+(du|de))\\b`; export const RangePrefixRegex = `(du|depuis|des?|entre)`; export const DayRegex = `(?(?:3[0-1]|[1-2]\\d|0?[1-9])(e(r)?)?)(?=\\b|t)`; + export const WrittenDayRegex = `(?((vingt|trente)(\\s*-\\s*|\\s+)et(\\s*-\\s*|\\s+))?un|(vingt(\\s*-\\s*|\\s+))?(deux|trois|quatre|cinq|six|sept|huit|neuf)|dix|onze|douze|treize|quatorze|quinze|seize|dix-(sept|huit|neuf)|vingt|trente)`; export const MonthNumRegex = `(?1[0-2]|(0)?[1-9])\\b`; export const SpecialDescRegex = `(p\\b)`; export const AmDescRegex = `(h\\b|${BaseDateTime.BaseAmDescRegex})`; @@ -104,7 +105,7 @@ export namespace FrenchDateTime { export const MorningRegex = `(?matin([ée]e)?)`; export const AfternoonRegex = `(?(d'|l')?apr[eè]s(-|\\s*)midi)`; export const MidmorningRegex = `(?milieu\\s*d[ue]\\s*${MorningRegex})`; - export const MiddayRegex = `(?milieu(\\s*|-)d[eu]\\s*(jour|midi)|apr[eè]s(-|\\s*)midi)`; + export const MiddayRegex = `(?milieu(\\s*|-)d[eu]\\s*(jour|midi)|apr[eè]s(-|\\s*)midi|(?<=\\bà\\s+)midi)`; export const MidafternoonRegex = `(?milieu\\s*d'+${AfternoonRegex})`; export const MidTimeRegex = `(?(${MidnightRegex}|${MidmorningRegex}|${MidafternoonRegex}|${MiddayRegex}))`; export const AtRegex = `\\b(((?<=\\b[àa]\\s+)(${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex}|${MidTimeRegex}))|${MidTimeRegex})\\b`; @@ -219,7 +220,7 @@ export namespace FrenchDateTime { export const PrepositionSuffixRegex = `\\b(du|de|[àa]|vers|dans)$`; export const FlexibleDayRegex = `(?([A-Za-z]+\\s)?[A-Za-z\\d]+)`; export const ForTheRegex = `\\b(((pour le ${FlexibleDayRegex})|(dans (le\\s+)?${FlexibleDayRegex}(?<=(st|nd|rd|th))))(?\\s*(,|\\.|!|\\?|$)))`; - export const WeekDayAndDayOfMonthRegex = `\\b${WeekDayRegex}\\s+(le\\s+${FlexibleDayRegex})\\b`; + export const WeekDayAndDayOfMonthRegex = `\\b(${WeekDayRegex}\\s+(le\\s+${FlexibleDayRegex})|le\\s+(?${DayRegex}|${WrittenDayRegex})\\s+${WeekDayRegex})\\b`; export const WeekDayAndDayRegex = `\\b${WeekDayRegex}\\s+(?!(the))${DayRegex}(?!([-:]|(\\s+(${AmDescRegex}|${PmDescRegex}|${OclockRegex}))))\\b`; export const RestOfDateRegex = `\\b(reste|fin)\\s+(d[eu]\\s+)?((le|ce(tte)?)\\s+)?(?semaine|mois|l'ann[ée]e)\\b`; export const RestOfDateTimeRegex = `\\b(reste|fin)\\s+(d[eu]\\s+)?((le|ce(tte)?)\\s+)?(?jour)\\b`; @@ -261,7 +262,7 @@ export namespace FrenchDateTime { export const SpecialDecadeCases: ReadonlyMap = new Map([["", 0]]); export const DefaultLanguageFallback = `DMY`; export const DurationDateRestrictions = [ ]; - export const AmbiguityFiltersDict: ReadonlyMap = new Map([["^([eé]t[eé])$", "(? = new Map([["^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"],["^([eé]t[eé])$", "(? = new Map([["heures?$", "\\b(pour|durée\\s+de|pendant)\\s+(\\S+\\s+){1,2}heures?\\b"]]); export const MorningTermList = [ "matinee","matin","matinée" ]; export const AfternoonTermList = [ "apres-midi","apres midi","après midi","après-midi" ]; diff --git a/JavaScript/packages/recognizers-date-time/src/resources/portugueseDateTime.ts b/JavaScript/packages/recognizers-date-time/src/resources/portugueseDateTime.ts index c98d64bd70..4719d3b3ad 100644 --- a/JavaScript/packages/recognizers-date-time/src/resources/portugueseDateTime.ts +++ b/JavaScript/packages/recognizers-date-time/src/resources/portugueseDateTime.ts @@ -16,6 +16,7 @@ export namespace PortugueseDateTime { export const TillRegex = `(?\\b(at[eé]h?|[aà]s|ao?)\\b|--|-|—|——)(\\s+\\b(o|[aà](s)?)\\b)?`; export const RangeConnectorRegex = `(?(e\\s*(([àa]s?)|o)?)|${BaseDateTime.RangeConnectorSymbolRegex})`; export const DayRegex = `(?(?:3[0-1]|[1-2]\\d|0?[1-9]))(?=\\b|t)`; + export const WrittenDayRegex = `(?(vinte\\s+e\\s+)?(um|dois|tr[eê]s|quatro|cinco|seis|sete|oito|nove)|dez|onze|doze|treze|(c|qu)atorze|quinze|dez[ae](s(seis|sete)|nove)|dezoito|vinte|trinta(\\s+e\\s+um)?)`; export const MonthNumRegex = `(?1[0-2]|(0)?[1-9])\\b`; export const AmDescRegex = `(${BaseDateTime.BaseAmDescRegex})`; export const PmDescRegex = `(${BaseDateTime.BasePmDescRegex})`; @@ -76,7 +77,8 @@ export namespace PortugueseDateTime { export const SpecialDayRegex = `\\b((d?o\\s+)?(dia\\s+antes\\s+de\\s+ontem|antes\\s+de\\s+ontem|anteontem)|((d?o\\s+)?(dia\\s+|depois\\s+|dia\\s+depois\\s+)?de\\s+amanh[aã])|(o\\s)?dia\\s+seguinte|(o\\s)?pr[oó]ximo\\s+dia|(o\\s+)?[uú]ltimo\\s+dia|ontem|amanh[ãa]|hoje)|(do\\s+dia$)\\b`; export const SpecialDayWithNumRegex = `^[.]`; export const ForTheRegex = `.^`; - export const WeekDayAndDayOfMonthRegex = `.^`; + export const FlexibleDayRegex = `(?([a-z]+\\s)?(${WrittenDayRegex}|${DayRegex}))`; + export const WeekDayAndDayOfMonthRegex = `\\b${WeekDayRegex}\\s+(dia\\s+${FlexibleDayRegex})\\b`; export const WeekDayAndDayRegex = `\\b${WeekDayRegex}\\s+(${DayRegex})(?!([-:/]|\\.\\d|(\\s+(${AmDescRegex}|${PmDescRegex}|${OclockRegex}))))\\b`; export const WeekDayOfMonthRegex = `(?(n?[ao]\\s+)?(?primeir[ao]|1[ao]|segund[ao]|2[ao]|terceir[ao]|3[ao]|[qc]uart[ao]|4[ao]|quint[ao]|5[ao]|[uú]ltim[ao])\\s+${WeekDayRegex}\\s+${MonthSuffixRegex})`; export const RelativeWeekDayRegex = `^[.]`; @@ -112,7 +114,13 @@ export namespace PortugueseDateTime { export const TimePrefix = `(?${LessThanOneHour}(\\s+(passad[ao]s)\\s+(as)?|\\s+depois\\s+(das?|do)|\\s+pras?|\\s+(para|antes)?\\s+([àa]s?))?)`; export const TimeSuffix = `(?(${LessThanOneHour}\\s+)?(${AmRegex}|${PmRegex}|${OclockRegex}))`; export const BasicTime = `(?${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex}:${BaseDateTime.MinuteRegex}(:${BaseDateTime.SecondRegex})?|${BaseDateTime.HourRegex})`; - export const AtRegex = `\\b((?<=\\b([aà]s?)\\s+)(${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b)?|(?<=\\b(s(er)?[aã]o|v[aã]o\\s+ser|^[eé]h?)\\s+)(${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b))(\\s+${OclockRegex})?\\b`; + export const MidnightRegex = `(?meia\\s*(-\\s*)?noite)`; + export const MidmorningRegex = `(?meio\\s+da\\s+manhã)`; + export const MidEarlyMorning = `(?meio\\s+da\\s+madrugada)`; + export const MidafternoonRegex = `(?meio\\s+da\\s+tarde)`; + export const MiddayRegex = `(?meio\\s*(-\\s*)?dia)`; + export const MidTimeRegex = `(?(${MidnightRegex}|${MidmorningRegex}|${MidEarlyMorning}|${MidafternoonRegex}|${MiddayRegex}))`; + export const AtRegex = `\\b(((?<=\\b([aà]s?)\\s+)(${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b)?|(?<=\\b(s(er)?[aã]o|v[aã]o\\s+ser|^[eé]h?)\\s+)(${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b))(\\s+${OclockRegex})?|${MidTimeRegex})\\b`; export const ConnectNumRegex = `(${BaseDateTime.HourRegex}(?[0-5][0-9])\\s*${DescRegex})`; export const TimeRegex1 = `(\\b${TimePrefix}\\s+)?(${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex})\\s*(${DescRegex})`; export const TimeRegex2 = `(\\b${TimePrefix}\\s+)?(t)?${BaseDateTime.HourRegex}(\\s*)?:(\\s*)?${BaseDateTime.MinuteRegex}((\\s*)?:(\\s*)?${BaseDateTime.SecondRegex})?((\\s*${DescRegex})|\\b)`; @@ -123,7 +131,6 @@ export namespace PortugueseDateTime { export const TimeRegex7 = `\\b${TimeSuffix}\\s+[àa]s?\\s+${BasicTime}((\\s*${DescRegex})|\\b)`; export const TimeRegex8 = `\\b${TimeSuffix}\\s+${BasicTime}((\\s*${DescRegex})|\\b)`; export const TimeRegex9 = `\\b(?${HourNumRegex}\\s+(${TensTimeRegex}\\s*)(e\\s+)?${MinuteNumRegex}?)\\b`; - export const TimeRegex10 = `(\\b([àa]|ao?)|na|de|da|pela)\\s+(madrugada|manh[ãa]|meio\\s*dia|meia\\s*noite|tarde|noite)`; export const TimeRegex11 = `\\b(${WrittenTimeRegex})(\\s+${DescRegex})?\\b`; export const TimeRegex12 = `(\\b${TimePrefix}\\s+)?${BaseDateTime.HourRegex}(\\s*h\\s*)${BaseDateTime.MinuteRegex}(\\s*${DescRegex})?`; export const PrepositionRegex = `(?([àa]s?|em|por|pel[ao]|n[ao]|de|d[ao]?)?$)`; @@ -199,7 +206,7 @@ export namespace PortugueseDateTime { export const CardinalMap: ReadonlyMap = new Map([["primeiro", 1],["primeira", 1],["1o", 1],["1a", 1],["segundo", 2],["segunda", 2],["2o", 2],["2a", 2],["terceiro", 3],["terceira", 3],["3o", 3],["3a", 3],["cuarto", 4],["quarto", 4],["cuarta", 4],["quarta", 4],["4o", 4],["4a", 4],["quinto", 5],["quinta", 5],["5o", 5],["5a", 5]]); export const DayOfWeek: ReadonlyMap = new Map([["segunda-feira", 1],["segundas-feiras", 1],["segunda feira", 1],["segundas feiras", 1],["segunda", 1],["segundas", 1],["terça-feira", 2],["terças-feiras", 2],["terça feira", 2],["terças feiras", 2],["terça", 2],["terças", 2],["terca-feira", 2],["tercas-feiras", 2],["terca feira", 2],["tercas feiras", 2],["terca", 2],["tercas", 2],["quarta-feira", 3],["quartas-feiras", 3],["quarta feira", 3],["quartas feiras", 3],["quarta", 3],["quartas", 3],["quinta-feira", 4],["quintas-feiras", 4],["quinta feira", 4],["quintas feiras", 4],["quinta", 4],["quintas", 4],["sexta-feira", 5],["sextas-feiras", 5],["sexta feira", 5],["sextas feiras", 5],["sexta", 5],["sextas", 5],["sabado", 6],["sabados", 6],["sábado", 6],["sábados", 6],["domingo", 0],["domingos", 0],["seg", 1],["seg.", 1],["2a", 1],["ter", 2],["ter.", 2],["3a", 2],["qua", 3],["qua.", 3],["4a", 3],["qui", 4],["qui.", 4],["5a", 4],["sex", 5],["sex.", 5],["6a", 5],["sab", 6],["sab.", 6],["dom", 0],["dom.", 0]]); export const MonthOfYear: ReadonlyMap = new Map([["1", 1],["2", 2],["3", 3],["4", 4],["5", 5],["6", 6],["7", 7],["8", 8],["9", 9],["10", 10],["11", 11],["12", 12],["janeiro", 1],["fevereiro", 2],["março", 3],["marco", 3],["abril", 4],["maio", 5],["junho", 6],["julho", 7],["agosto", 8],["septembro", 9],["setembro", 9],["outubro", 10],["novembro", 11],["dezembro", 12],["jan", 1],["fev", 2],["mar", 3],["abr", 4],["mai", 5],["jun", 6],["jul", 7],["ago", 8],["sept", 9],["set", 9],["out", 10],["nov", 11],["dez", 12],["01", 1],["02", 2],["03", 3],["04", 4],["05", 5],["06", 6],["07", 7],["08", 8],["09", 9]]); - export const Numbers: ReadonlyMap = new Map([["zero", 0],["um", 1],["uma", 1],["dois", 2],["tres", 3],["três", 3],["quatro", 4],["cinco", 5],["seis", 6],["sete", 7],["oito", 8],["nove", 9],["dez", 10],["onze", 11],["doze", 12],["dezena", 12],["dezenas", 12],["treze", 13],["catorze", 14],["quatorze", 14],["quinze", 15],["dezesseis", 16],["dezasseis", 16],["dezessete", 17],["dezassete", 17],["dezoito", 18],["dezenove", 19],["dezanove", 19],["vinte", 20],["vinte e um", 21],["vinte e uma", 21],["vinte e dois", 22],["vinte e duas", 22],["vinte e tres", 23],["vinte e três", 23],["vinte e quatro", 24],["vinte e cinco", 25],["vinte e seis", 26],["vinte e sete", 27],["vinte e oito", 28],["vinte e nove", 29],["trinta", 30]]); + export const Numbers: ReadonlyMap = new Map([["zero", 0],["um", 1],["uma", 1],["dois", 2],["tres", 3],["três", 3],["quatro", 4],["cinco", 5],["seis", 6],["sete", 7],["oito", 8],["nove", 9],["dez", 10],["onze", 11],["doze", 12],["dezena", 12],["dezenas", 12],["treze", 13],["catorze", 14],["quatorze", 14],["quinze", 15],["dezesseis", 16],["dezasseis", 16],["dezessete", 17],["dezassete", 17],["dezoito", 18],["dezenove", 19],["dezanove", 19],["vinte", 20],["vinte e um", 21],["vinte e uma", 21],["vinte e dois", 22],["vinte e duas", 22],["vinte e tres", 23],["vinte e três", 23],["vinte e quatro", 24],["vinte e cinco", 25],["vinte e seis", 26],["vinte e sete", 27],["vinte e oito", 28],["vinte e nove", 29],["trinta", 30],["trinta e um", 31]]); export const HolidayNames: ReadonlyMap = new Map([["pai", ["diadopai","diadospais"]],["mae", ["diadamae","diadasmaes"]],["acaodegracas", ["diadegracas","diadeacaodegracas","acaodegracas"]],["trabalho", ["diadotrabalho","diadotrabalhador","diadostrabalhadores"]],["pascoa", ["diadepascoa","pascoa"]],["natal", ["natal","diadenatal"]],["vesperadenatal", ["vesperadenatal"]],["anonovo", ["anonovo","diadeanonovo","diadoanonovo"]],["vesperadeanonovo", ["vesperadeanonovo","vesperadoanonovo"]],["yuandan", ["yuandan"]],["todosossantos", ["todosossantos"]],["professor", ["diadoprofessor","diadosprofessores"]],["crianca", ["diadacrianca","diadascriancas"]],["mulher", ["diadamulher"]]]); export const VariableHolidaysTimexDictionary: ReadonlyMap = new Map([["pai", "-06-WXX-7-3"],["mae", "-05-WXX-7-2"],["acaodegracas", "-11-WXX-4-4"],["memoria", "-03-WXX-2-4"]]); export const DoubleNumbers: ReadonlyMap = new Map([["metade", 0.5],["quarto", 0.25]]); @@ -238,7 +245,7 @@ export namespace PortugueseDateTime { export const SpecialDecadeCases: ReadonlyMap = new Map([["", 0]]); export const DefaultLanguageFallback = `DMY`; export const DurationDateRestrictions = [ ]; - export const AmbiguityFiltersDict: ReadonlyMap = new Map([["^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$", "([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])"]]); + export const AmbiguityFiltersDict: ReadonlyMap = new Map([["^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"],["^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$", "([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])"]]); export const EarlyMorningTermList = [ "madrugada" ]; export const MorningTermList = [ "manha","manhã" ]; export const AfternoonTermList = [ "passado o meio dia","depois do meio dia" ]; diff --git a/JavaScript/packages/recognizers-date-time/src/resources/spanishDateTime.ts b/JavaScript/packages/recognizers-date-time/src/resources/spanishDateTime.ts index 5ded1ee043..4f8322a6f1 100644 --- a/JavaScript/packages/recognizers-date-time/src/resources/spanishDateTime.ts +++ b/JavaScript/packages/recognizers-date-time/src/resources/spanishDateTime.ts @@ -84,8 +84,8 @@ export namespace SpanishDateTime { export const RelaxedOnRegex = `(?<=\\b(en|d?el)\\s+)((?10|11|12|13|14|15|16|17|18|19|1st|20|21|22|23|24|25|26|27|28|29|2|30|31|3|4|5|6|7|8|9)s?)(?![.,]\\d)\\b`; export const SpecialDayRegex = `\\b((el\\s+)?(d[ií]a\\s+antes\\s+de\\s+ayer|anteayer)|((el\\s+)?d[ií]a\\s+(despu[eé]s\\s+)?de\\s+mañana|pasado\\s+mañana)|(el\\s)?d[ií]a\\s+(siguiente|anterior)|(el\\s)?pr[oó]ximo\\s+d[ií]a|(el\\s+)?[uú]ltimo\\s+d[ií]a|(d)?el\\s+d[ií]a(?!\\s+d)|ayer|mañana|hoy)\\b`; export const SpecialDayWithNumRegex = `^[.]`; - export const FlexibleDayRegex = `(?([A-Za-z]+\\s)?(${WrittenDayRegex}|${DayRegex}))`; - export const ForTheRegex = `\\b((((?<=para\\s+el\\s+)${FlexibleDayRegex})|((?\\s*(,|\\.(?![º°ª])|!|\\?|-|$))(?!\\d))`; + export const FlexibleDayRegex = `(?([a-z]+\\s)?(${WrittenDayRegex}|${DayRegex}))`; + export const ForTheRegex = `\\b((((?<=para\\s+el\\s+)${FlexibleDayRegex})|((?\\s*(,|\\.(?![º°ª])|!|\\?|-|$))(?!\\d))`; export const WeekDayAndDayOfMonthRegex = `\\b${WeekDayRegex}\\s+((el\\s+(d[ií]a\\s+)?)${FlexibleDayRegex})\\b`; export const WeekDayAndDayRegex = `\\b${WeekDayRegex}\\s+(${DayRegex}|${WrittenDayRegex})(?!([-:/]|\\.\\d|(\\s+(${AmDescRegex}|${PmDescRegex}|${OclockRegex}))))\\b`; export const WeekDayOfMonthRegex = `(?(el\\s+)?(?primera?|1era?|segund[ao]|2d[ao]|tercera?|3era?|cuart[ao]|4t[ao]|quint[ao]|5t[ao]|((1|2|3|4|5)(\\.)?[ºª])|[uú]ltim[ao])\\s+(semana\\s+${MonthSuffixRegex}\\s+el\\s+${WeekDayRegex}|${WeekDayRegex}\\s+${MonthSuffixRegex}))`; @@ -135,7 +135,7 @@ export namespace SpanishDateTime { export const TimeSuffix = `(?(${LessThanOneHour}\\s+)?(${AmRegex}|${PmRegex}|${OclockRegex}))`; export const GeneralDescRegex = `(${DescRegex}|(?${AmRegex}|${PmRegex}))`; export const BasicTime = `(?${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex}:${BaseDateTime.MinuteRegex}(:${BaseDateTime.SecondRegex})?|${BaseDateTime.HourRegex})`; - export const MidTimeRegex = `(?((?media\\s*noche)|(?media\\s*mañana)|(?media\\s*tarde)|(?medio\\s*d[ií]a)))`; + export const MidTimeRegex = `(?((?media\\s*noche)|(?media\\s*madrugada)|(?media\\s*mañana)|(?media\\s*tarde)|(?medio\\s*d[ií]a)))`; export const AtRegex = `\\b((?<=\\b((a|de(sde)?)\\s+las?|al)\\s+)((${WrittenTimeRegex}|${HourNumRegex}|${BaseDateTime.HourRegex})\\b(\\s*\\bh\\b)?(DescRegex)?|${MidTimeRegex})|${MidTimeRegex})`; export const ConnectNumRegex = `(${BaseDateTime.HourRegex}(?[0-5][0-9])\\s*${DescRegex})`; export const TimeRegexWithDotConnector = `(${BaseDateTime.HourRegex}\\.${BaseDateTime.MinuteRegex})`; @@ -148,7 +148,6 @@ export namespace SpanishDateTime { export const TimeRegex7 = `\\b${TimeSuffix}\\s+a\\s+las\\s+${BasicTime}((\\s*${DescRegex})|\\b)`; export const TimeRegex8 = `\\b${TimeSuffix}\\s+${BasicTime}((\\s*${DescRegex})|\\b)`; export const TimeRegex9 = `\\b(?${HourNumRegex}\\s+(${TensTimeRegex}\\s*)(y\\s+)?${MinuteNumRegex}?)\\b`; - export const TimeRegex10 = `(a\\s+la|al)\\s+(madrugada|mañana|tarde|noche)`; export const TimeRegex11 = `\\b(${WrittenTimeRegex})(\\s+${DescRegex})?\\b`; export const TimeRegex12 = `(\\b${TimePrefix}\\s+)?${BaseDateTime.HourRegex}(\\s*h\\s*)${BaseDateTime.MinuteRegex}(\\s*${DescRegex})?`; export const PrepositionRegex = `(?^(,\\s*)?(a(l)?|en|de(l)?)?(\\s*(la(s)?|el|los))?$)`; @@ -169,7 +168,7 @@ export namespace SpanishDateTime { export const PeriodTimeOfDayRegex = `\\b((en\\s+(el|la|lo)?\\s+)?(${LaterEarlyRegex}\\s+)?(est[ae]\\s+)?${DateTimeTimeOfDayRegex})\\b`; export const PeriodSpecificTimeOfDayRegex = `\\b((${LaterEarlyRegex}\\s+)?est[ae]\\s+${DateTimeTimeOfDayRegex}|(${StrictRelativeRegex}\\s+${PeriodTimeOfDayRegex})|anoche)\\b`; export const UnitRegex = `(?años?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\\s+de\\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\\b`; - export const ConnectorRegex = `^(,|t|(para|y|a|en|por) las?|(\\s*,\\s*)?(cerca|alrededor) de las?)$`; + export const ConnectorRegex = `^(,|t|(para|y|a|en|por) las?|(\\s*,\\s*)?((cerca|alrededor)\\s+)?(de\\s+las?|del))$`; export const TimeHourNumRegex = `(?veint(i(uno|dos|tres|cuatro)|e)|cero|uno|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieci(s([eé])is|siete|ocho|nueve))`; export const PureNumFromTo = `((\\b(desde|de)\\s+(la(s)?\\s+)?)?(${BaseDateTime.HourRegex}|${TimeHourNumRegex})(?!\\s+al?\\b)(\\s*(?${DescRegex}))?|(\\b(desde|de)\\s+(la(s)?\\s+)?)(${BaseDateTime.HourRegex}|${TimeHourNumRegex})(\\s*(?${DescRegex}))?)\\s*${TillRegex}\\s*(${BaseDateTime.HourRegex}|${TimeHourNumRegex})\\s*(?${PmRegex}|${AmRegex}|${DescRegex})?`; export const PureNumBetweenAnd = `(\\bentre\\s+(la(s)?\\s+)?)((${BaseDateTime.TwoDigitHourRegex}${BaseDateTime.TwoDigitMinuteRegex})|${BaseDateTime.HourRegex}|${TimeHourNumRegex})(\\s*(?${DescRegex}))?\\s*${RangeConnectorRegex}\\s*((${BaseDateTime.TwoDigitHourRegex}${BaseDateTime.TwoDigitMinuteRegex})|${BaseDateTime.HourRegex}|${TimeHourNumRegex})\\s*(?${PmRegex}|${AmRegex}|${DescRegex})?`; @@ -273,7 +272,12 @@ export namespace SpanishDateTime { export const SpecialDecadeCases: ReadonlyMap = new Map([["", 0]]); export const DefaultLanguageFallback = `DMY`; export const DurationDateRestrictions = [ "hoy" ]; - export const AmbiguityFiltersDict: ReadonlyMap = new Map([["^(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?$", "\\b(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?\\b"],["^a[nñ]o$", "(? = new Map([["^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"],["^(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?$", "\\b(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?\\b"],["^a[nñ]o$", "(? { return `(((? { return `(((? { return `(((? { return `(((? { return `(((? { return `(((? { return `(((? { return `(((?(?anderthalb|einundhalb)|(?dreiviertel)) FractionHalfRegex: !simpleRegex - def: (einhalb)$ -OneHalfTokens: [ein, halb] + def: (einhalb(es)?)$ +OneHalfTokens: [ein, halb, halbes] FractionNounRegex: !nestedRegex - def: (?<=\b)(({AllIntRegex})(\s*|\s*-\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(er|e|es)?|hälfte)|{FractionUnitsRegex})(?=\b) + def: (?<=\b)(({AllIntRegex})(\s*|\s*-\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(e[rs]?)?|hälfte)|{FractionUnitsRegex})(?=\b) references: [ AllIntRegex, AllOrdinalRegex, RoundNumberOrdinalRegex, FractionUnitsRegex ] FractionNounWithArticleRegex: !nestedRegex - def: (?<=\b)(({AllIntRegex}\s+(und\s+)?)?eine?(\s+|\s*-\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(er|e|es)?|hälfte))|{AllIntRegex}ein(halb))(?=\b) + def: (?<=\b)(({AllIntRegex}\s+(und\s+)?)?eine?(\s+|\s*-\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(e[rs]?)?|hälfte))|{AllIntRegex}ein(halb))(?=\b) references: [ AllIntRegex, AllOrdinalRegex, RoundNumberOrdinalRegex, FractionUnitsRegex ] FractionPrepositionRegex: !nestedRegex def: (?({AllIntRegex})|((?({AllIntRegex})|(\d+)(?!\.))(?=\b) @@ -518,7 +518,7 @@ RoundNumberMap: !dictionary AmbiguityFiltersDict: !dictionary types: [ string, string ] entries: - '^[.]': '' + '^(tausend|hundert)$': '(ed(ward(\s+m(\.)?)?)?|mary(\s+c(\.)?)?|joachim|claudia|franz|maria|klaus|prof(\.|essor)?|dr(\.)?|herr|fr[äa]u(lein)?|frl?\.)\s+(tausend|hundert)' RelativeReferenceOffsetMap: !dictionary types: [ string, string ] # TODO: modify below regex according to the counterpart in English diff --git a/Patterns/Italian/Italian-DateTime.yaml b/Patterns/Italian/Italian-DateTime.yaml index 75c2acfff6..ae65f0563f 100644 --- a/Patterns/Italian/Italian-DateTime.yaml +++ b/Patterns/Italian/Italian-DateTime.yaml @@ -1018,6 +1018,7 @@ DurationDateRestrictions: [] AmbiguityFiltersDict: !dictionary types: [ string, string ] entries: + '^\d{4}$': '(\d\.\d{4}|\d{4}\.\d)' '\bgiorno|pomeriggio|sera|notte\b': '\b(buona?\s*(giorno|pomeriggio|sera|notte))\b' '^(apr|ago|dic|feb|gen|lug|giu|mar|mag|nov|ott|sett?)$': '([$%£&!?@#])(apr|ago|dic|feb|gen|lug|giu|mar|mag|nov|ott|sett?)|(apr|ago|dic|feb|gen|lug|giu|mar|mag|nov|ott|sett?)([$%£&@#])' # For TimeOfDay resolution diff --git a/Patterns/Portuguese/Portuguese-DateTime.yaml b/Patterns/Portuguese/Portuguese-DateTime.yaml index 742abfda21..9b48693bfe 100644 --- a/Patterns/Portuguese/Portuguese-DateTime.yaml +++ b/Patterns/Portuguese/Portuguese-DateTime.yaml @@ -914,6 +914,7 @@ DurationDateRestrictions: [] AmbiguityFiltersDict: !dictionary types: [ string, string ] entries: + '^\d{4}$': '(\d\.\d{4}|\d{4}\.\d)' '^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$': '([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])' # For TimeOfDay resolution EarlyMorningTermList: !list diff --git a/Patterns/Spanish/Spanish-DateTime.yaml b/Patterns/Spanish/Spanish-DateTime.yaml index 0aef899d1e..002af14cd7 100644 --- a/Patterns/Spanish/Spanish-DateTime.yaml +++ b/Patterns/Spanish/Spanish-DateTime.yaml @@ -1006,6 +1006,7 @@ DurationDateRestrictions: [ hoy ] AmbiguityFiltersDict: !dictionary types: [ string, string ] entries: + '^\d{4}$': '(\d\.\d{4}|\d{4}\.\d)' '^(este\s+)?mi(\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\s+viene))?$': '\b(este\s+)?mi(\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\s+viene))?\b' '^a[nñ]o$': '(?(?(小时|钟头|分钟|秒钟|时|分|秒))' DateTimePeriodFollowedUnit = f'^\\s*{DateTimePeriodUnitRegex}' DateTimePeriodNumberCombinedWithUnit = f'\\b(?\\d+(\\.\\d*)?){DateTimePeriodUnitRegex}' + DurationAllRegex = f'^[.]' + DurationHalfRegex = f'^[.]' + DurationRelativeDurationUnitRegex = f'^[.]' + DurationDuringRegex = f'^[.]' + DurationSomeRegex = f'^[.]' + DurationMoreOrLessRegex = f'^[.]' DurationYearRegex = f'((\\d{{3,4}})|0\\d|两千)\\s*年' DurationHalfSuffixRegex = f'半' DurationSuffixList = dict([("M", "分钟"), diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/resources/english_date_time.py b/Python/libraries/recognizers-date-time/recognizers_date_time/resources/english_date_time.py index d0f2255494..6abdc07540 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/resources/english_date_time.py +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/resources/english_date_time.py @@ -678,7 +678,8 @@ class EnglishDateTime: DefaultLanguageFallback = 'MDY' SuperfluousWordList = [r'preferably', r'how about', r'maybe', r'perhaps', r'say', r'like'] DurationDateRestrictions = [r'today', r'now'] - AmbiguityFiltersDict = dict([("^(morning|afternoon|evening|night|day)\\b", "\\b(good\\s+(morning|afternoon|evening|night|day))|(nighty\\s+night)\\b"), + AmbiguityFiltersDict = dict([("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"), + ("^(morning|afternoon|evening|night|day)\\b", "\\b(good\\s+(morning|afternoon|evening|night|day))|(nighty\\s+night)\\b"), ("\\bnow\\b", "\\b(^now,)|\\b((is|are)\\s+now\\s+for|for\\s+now)\\b"), ("\\bmay\\b", "\\b((((!|\\.|\\?|,|;|)\\s+|^)may i)|(i|you|he|she|we|they)\\s+may|(may\\s+((((also|not|(also not)|well)\\s+)?(be|ask|contain|constitute|e-?mail|take|have|result|involve|get|work|reply|differ))|(or may not))))\\b"), ("\\b(a|one) second\\b", "\\b(?ce(tte)?|au\\s+cours+(du|de))\\b' RangePrefixRegex = f'(du|depuis|des?|entre)' DayRegex = f'(?(?:3[0-1]|[1-2]\\d|0?[1-9])(e(r)?)?)(?=\\b|t)' + WrittenDayRegex = f'(?((vingt|trente)(\\s*-\\s*|\\s+)et(\\s*-\\s*|\\s+))?un|(vingt(\\s*-\\s*|\\s+))?(deux|trois|quatre|cinq|six|sept|huit|neuf)|dix|onze|douze|treize|quatorze|quinze|seize|dix-(sept|huit|neuf)|vingt|trente)' MonthNumRegex = f'(?1[0-2]|(0)?[1-9])\\b' SpecialDescRegex = f'(p\\b)' AmDescRegex = f'(h\\b|{BaseDateTime.BaseAmDescRegex})' @@ -107,7 +108,7 @@ class FrenchDateTime: MorningRegex = f'(?matin([ée]e)?)' AfternoonRegex = f'(?(d\'|l\')?apr[eè]s(-|\\s*)midi)' MidmorningRegex = f'(?milieu\\s*d[ue]\\s*{MorningRegex})' - MiddayRegex = f'(?milieu(\\s*|-)d[eu]\\s*(jour|midi)|apr[eè]s(-|\\s*)midi)' + MiddayRegex = f'(?milieu(\\s*|-)d[eu]\\s*(jour|midi)|apr[eè]s(-|\\s*)midi|(?<=\\bà\\s+)midi)' MidafternoonRegex = f'(?milieu\\s*d\'+{AfternoonRegex})' MidTimeRegex = f'(?({MidnightRegex}|{MidmorningRegex}|{MidafternoonRegex}|{MiddayRegex}))' AtRegex = f'\\b(((?<=\\b[àa]\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex}|{MidTimeRegex}))|{MidTimeRegex})\\b' @@ -222,7 +223,7 @@ class FrenchDateTime: PrepositionSuffixRegex = f'\\b(du|de|[àa]|vers|dans)$' FlexibleDayRegex = f'(?([A-Za-z]+\\s)?[A-Za-z\\d]+)' ForTheRegex = f'\\b(((pour le {FlexibleDayRegex})|(dans (le\\s+)?{FlexibleDayRegex}(?<=(st|nd|rd|th))))(?\\s*(,|\\.|!|\\?|$)))' - WeekDayAndDayOfMonthRegex = f'\\b{WeekDayRegex}\\s+(le\\s+{FlexibleDayRegex})\\b' + WeekDayAndDayOfMonthRegex = f'\\b({WeekDayRegex}\\s+(le\\s+{FlexibleDayRegex})|le\\s+(?{DayRegex}|{WrittenDayRegex})\\s+{WeekDayRegex})\\b' WeekDayAndDayRegex = f'\\b{WeekDayRegex}\\s+(?!(the)){DayRegex}(?!([-:]|(\\s+({AmDescRegex}|{PmDescRegex}|{OclockRegex}))))\\b' RestOfDateRegex = f'\\b(reste|fin)\\s+(d[eu]\\s+)?((le|ce(tte)?)\\s+)?(?semaine|mois|l\'ann[ée]e)\\b' RestOfDateTimeRegex = f'\\b(reste|fin)\\s+(d[eu]\\s+)?((le|ce(tte)?)\\s+)?(?jour)\\b' @@ -667,7 +668,8 @@ class FrenchDateTime: SpecialDecadeCases = dict([("", 0)]) DefaultLanguageFallback = 'DMY' DurationDateRestrictions = [] - AmbiguityFiltersDict = dict([("^([eé]t[eé])$", "(?\\b(at[eé]h?|[aà]s|ao?)\\b|--|-|—|——)(\\s+\\b(o|[aà](s)?)\\b)?' RangeConnectorRegex = f'(?(e\\s*(([àa]s?)|o)?)|{BaseDateTime.RangeConnectorSymbolRegex})' DayRegex = f'(?(?:3[0-1]|[1-2]\\d|0?[1-9]))(?=\\b|t)' + WrittenDayRegex = f'(?(vinte\\s+e\\s+)?(um|dois|tr[eê]s|quatro|cinco|seis|sete|oito|nove)|dez|onze|doze|treze|(c|qu)atorze|quinze|dez[ae](s(seis|sete)|nove)|dezoito|vinte|trinta(\\s+e\\s+um)?)' MonthNumRegex = f'(?1[0-2]|(0)?[1-9])\\b' AmDescRegex = f'({BaseDateTime.BaseAmDescRegex})' PmDescRegex = f'({BaseDateTime.BasePmDescRegex})' @@ -79,7 +80,8 @@ class PortugueseDateTime: SpecialDayRegex = f'\\b((d?o\\s+)?(dia\\s+antes\\s+de\\s+ontem|antes\\s+de\\s+ontem|anteontem)|((d?o\\s+)?(dia\\s+|depois\\s+|dia\\s+depois\\s+)?de\\s+amanh[aã])|(o\\s)?dia\\s+seguinte|(o\\s)?pr[oó]ximo\\s+dia|(o\\s+)?[uú]ltimo\\s+dia|ontem|amanh[ãa]|hoje)|(do\\s+dia$)\\b' SpecialDayWithNumRegex = f'^[.]' ForTheRegex = f'.^' - WeekDayAndDayOfMonthRegex = f'.^' + FlexibleDayRegex = f'(?([a-z]+\\s)?({WrittenDayRegex}|{DayRegex}))' + WeekDayAndDayOfMonthRegex = f'\\b{WeekDayRegex}\\s+(dia\\s+{FlexibleDayRegex})\\b' WeekDayAndDayRegex = f'\\b{WeekDayRegex}\\s+({DayRegex})(?!([-:/]|\\.\\d|(\\s+({AmDescRegex}|{PmDescRegex}|{OclockRegex}))))\\b' WeekDayOfMonthRegex = f'(?(n?[ao]\\s+)?(?primeir[ao]|1[ao]|segund[ao]|2[ao]|terceir[ao]|3[ao]|[qc]uart[ao]|4[ao]|quint[ao]|5[ao]|[uú]ltim[ao])\\s+{WeekDayRegex}\\s+{MonthSuffixRegex})' RelativeWeekDayRegex = f'^[.]' @@ -115,7 +117,13 @@ class PortugueseDateTime: TimePrefix = f'(?{LessThanOneHour}(\\s+(passad[ao]s)\\s+(as)?|\\s+depois\\s+(das?|do)|\\s+pras?|\\s+(para|antes)?\\s+([àa]s?))?)' TimeSuffix = f'(?({LessThanOneHour}\\s+)?({AmRegex}|{PmRegex}|{OclockRegex}))' BasicTime = f'(?{WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex}:{BaseDateTime.MinuteRegex}(:{BaseDateTime.SecondRegex})?|{BaseDateTime.HourRegex})' - AtRegex = f'\\b((?<=\\b([aà]s?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b)?|(?<=\\b(s(er)?[aã]o|v[aã]o\\s+ser|^[eé]h?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b))(\\s+{OclockRegex})?\\b' + MidnightRegex = f'(?meia\\s*(-\\s*)?noite)' + MidmorningRegex = f'(?meio\\s+da\\s+manhã)' + MidEarlyMorning = f'(?meio\\s+da\\s+madrugada)' + MidafternoonRegex = f'(?meio\\s+da\\s+tarde)' + MiddayRegex = f'(?meio\\s*(-\\s*)?dia)' + MidTimeRegex = f'(?({MidnightRegex}|{MidmorningRegex}|{MidEarlyMorning}|{MidafternoonRegex}|{MiddayRegex}))' + AtRegex = f'\\b(((?<=\\b([aà]s?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b)?|(?<=\\b(s(er)?[aã]o|v[aã]o\\s+ser|^[eé]h?)\\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})(\\s+horas?|\\s*h\\b))(\\s+{OclockRegex})?|{MidTimeRegex})\\b' ConnectNumRegex = f'({BaseDateTime.HourRegex}(?[0-5][0-9])\\s*{DescRegex})' TimeRegex1 = f'(\\b{TimePrefix}\\s+)?({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})\\s*({DescRegex})' TimeRegex2 = f'(\\b{TimePrefix}\\s+)?(t)?{BaseDateTime.HourRegex}(\\s*)?:(\\s*)?{BaseDateTime.MinuteRegex}((\\s*)?:(\\s*)?{BaseDateTime.SecondRegex})?((\\s*{DescRegex})|\\b)' @@ -126,7 +134,6 @@ class PortugueseDateTime: TimeRegex7 = f'\\b{TimeSuffix}\\s+[àa]s?\\s+{BasicTime}((\\s*{DescRegex})|\\b)' TimeRegex8 = f'\\b{TimeSuffix}\\s+{BasicTime}((\\s*{DescRegex})|\\b)' TimeRegex9 = f'\\b(?{HourNumRegex}\\s+({TensTimeRegex}\\s*)(e\\s+)?{MinuteNumRegex}?)\\b' - TimeRegex10 = f'(\\b([àa]|ao?)|na|de|da|pela)\\s+(madrugada|manh[ãa]|meio\\s*dia|meia\\s*noite|tarde|noite)' TimeRegex11 = f'\\b({WrittenTimeRegex})(\\s+{DescRegex})?\\b' TimeRegex12 = f'(\\b{TimePrefix}\\s+)?{BaseDateTime.HourRegex}(\\s*h\\s*){BaseDateTime.MinuteRegex}(\\s*{DescRegex})?' PrepositionRegex = f'(?([àa]s?|em|por|pel[ao]|n[ao]|de|d[ao]?)?$)' @@ -420,7 +427,8 @@ class PortugueseDateTime: ("vinte e sete", 27), ("vinte e oito", 28), ("vinte e nove", 29), - ("trinta", 30)]) + ("trinta", 30), + ("trinta e um", 31)]) HolidayNames = dict([("pai", ["diadopai", "diadospais"]), ("mae", ["diadamae", "diadasmaes"]), ("acaodegracas", ["diadegracas", "diadeacaodegracas", "acaodegracas"]), @@ -476,7 +484,8 @@ class PortugueseDateTime: SpecialDecadeCases = dict([("", 0)]) DefaultLanguageFallback = 'DMY' DurationDateRestrictions = [] - AmbiguityFiltersDict = dict([("^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$", "([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])")]) + AmbiguityFiltersDict = dict([("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"), + ("^(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)$", "([$%£&!?@#])(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)|(abr|ago|dez|fev|jan|ju[ln]|mar|maio?|nov|out|sep?t)([$%£&@#])")]) EarlyMorningTermList = [r'madrugada'] MorningTermList = [r'manha', r'manhã'] AfternoonTermList = [r'passado o meio dia', r'depois do meio dia'] diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/resources/spanish_date_time.py b/Python/libraries/recognizers-date-time/recognizers_date_time/resources/spanish_date_time.py index 181a1ec7ab..ea7c3db5d8 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/resources/spanish_date_time.py +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/resources/spanish_date_time.py @@ -87,8 +87,8 @@ class SpanishDateTime: RelaxedOnRegex = f'(?<=\\b(en|d?el)\\s+)((?10|11|12|13|14|15|16|17|18|19|1st|20|21|22|23|24|25|26|27|28|29|2|30|31|3|4|5|6|7|8|9)s?)(?![.,]\\d)\\b' SpecialDayRegex = f'\\b((el\\s+)?(d[ií]a\\s+antes\\s+de\\s+ayer|anteayer)|((el\\s+)?d[ií]a\\s+(despu[eé]s\\s+)?de\\s+mañana|pasado\\s+mañana)|(el\\s)?d[ií]a\\s+(siguiente|anterior)|(el\\s)?pr[oó]ximo\\s+d[ií]a|(el\\s+)?[uú]ltimo\\s+d[ií]a|(d)?el\\s+d[ií]a(?!\\s+d)|ayer|mañana|hoy)\\b' SpecialDayWithNumRegex = f'^[.]' - FlexibleDayRegex = f'(?([A-Za-z]+\\s)?({WrittenDayRegex}|{DayRegex}))' - ForTheRegex = f'\\b((((?<=para\\s+el\\s+){FlexibleDayRegex})|((?\\s*(,|\\.(?![º°ª])|!|\\?|-|$))(?!\\d))' + FlexibleDayRegex = f'(?([a-z]+\\s)?({WrittenDayRegex}|{DayRegex}))' + ForTheRegex = f'\\b((((?<=para\\s+el\\s+){FlexibleDayRegex})|((?\\s*(,|\\.(?![º°ª])|!|\\?|-|$))(?!\\d))' WeekDayAndDayOfMonthRegex = f'\\b{WeekDayRegex}\\s+((el\\s+(d[ií]a\\s+)?){FlexibleDayRegex})\\b' WeekDayAndDayRegex = f'\\b{WeekDayRegex}\\s+({DayRegex}|{WrittenDayRegex})(?!([-:/]|\\.\\d|(\\s+({AmDescRegex}|{PmDescRegex}|{OclockRegex}))))\\b' WeekDayOfMonthRegex = f'(?(el\\s+)?(?primera?|1era?|segund[ao]|2d[ao]|tercera?|3era?|cuart[ao]|4t[ao]|quint[ao]|5t[ao]|((1|2|3|4|5)(\\.)?[ºª])|[uú]ltim[ao])\\s+(semana\\s+{MonthSuffixRegex}\\s+el\\s+{WeekDayRegex}|{WeekDayRegex}\\s+{MonthSuffixRegex}))' @@ -138,7 +138,7 @@ class SpanishDateTime: TimeSuffix = f'(?({LessThanOneHour}\\s+)?({AmRegex}|{PmRegex}|{OclockRegex}))' GeneralDescRegex = f'({DescRegex}|(?{AmRegex}|{PmRegex}))' BasicTime = f'(?{WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex}:{BaseDateTime.MinuteRegex}(:{BaseDateTime.SecondRegex})?|{BaseDateTime.HourRegex})' - MidTimeRegex = f'(?((?media\\s*noche)|(?media\\s*mañana)|(?media\\s*tarde)|(?medio\\s*d[ií]a)))' + MidTimeRegex = f'(?((?media\\s*noche)|(?media\\s*madrugada)|(?media\\s*mañana)|(?media\\s*tarde)|(?medio\\s*d[ií]a)))' AtRegex = f'\\b((?<=\\b((a|de(sde)?)\\s+las?|al)\\s+)(({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex})\\b(\\s*\\bh\\b)?(DescRegex)?|{MidTimeRegex})|{MidTimeRegex})' ConnectNumRegex = f'({BaseDateTime.HourRegex}(?[0-5][0-9])\\s*{DescRegex})' TimeRegexWithDotConnector = f'({BaseDateTime.HourRegex}\\.{BaseDateTime.MinuteRegex})' @@ -151,7 +151,6 @@ class SpanishDateTime: TimeRegex7 = f'\\b{TimeSuffix}\\s+a\\s+las\\s+{BasicTime}((\\s*{DescRegex})|\\b)' TimeRegex8 = f'\\b{TimeSuffix}\\s+{BasicTime}((\\s*{DescRegex})|\\b)' TimeRegex9 = f'\\b(?{HourNumRegex}\\s+({TensTimeRegex}\\s*)(y\\s+)?{MinuteNumRegex}?)\\b' - TimeRegex10 = f'(a\\s+la|al)\\s+(madrugada|mañana|tarde|noche)' TimeRegex11 = f'\\b({WrittenTimeRegex})(\\s+{DescRegex})?\\b' TimeRegex12 = f'(\\b{TimePrefix}\\s+)?{BaseDateTime.HourRegex}(\\s*h\\s*){BaseDateTime.MinuteRegex}(\\s*{DescRegex})?' PrepositionRegex = f'(?^(,\\s*)?(a(l)?|en|de(l)?)?(\\s*(la(s)?|el|los))?$)' @@ -172,7 +171,7 @@ class SpanishDateTime: PeriodTimeOfDayRegex = f'\\b((en\\s+(el|la|lo)?\\s+)?({LaterEarlyRegex}\\s+)?(est[ae]\\s+)?{DateTimeTimeOfDayRegex})\\b' PeriodSpecificTimeOfDayRegex = f'\\b(({LaterEarlyRegex}\\s+)?est[ae]\\s+{DateTimeTimeOfDayRegex}|({StrictRelativeRegex}\\s+{PeriodTimeOfDayRegex})|anoche)\\b' UnitRegex = f'(?años?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\\s+de\\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\\b' - ConnectorRegex = f'^(,|t|(para|y|a|en|por) las?|(\\s*,\\s*)?(cerca|alrededor) de las?)$' + ConnectorRegex = f'^(,|t|(para|y|a|en|por) las?|(\\s*,\\s*)?((cerca|alrededor)\\s+)?(de\\s+las?|del))$' TimeHourNumRegex = f'(?veint(i(uno|dos|tres|cuatro)|e)|cero|uno|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieci(s([eé])is|siete|ocho|nueve))' PureNumFromTo = f'((\\b(desde|de)\\s+(la(s)?\\s+)?)?({BaseDateTime.HourRegex}|{TimeHourNumRegex})(?!\\s+al?\\b)(\\s*(?{DescRegex}))?|(\\b(desde|de)\\s+(la(s)?\\s+)?)({BaseDateTime.HourRegex}|{TimeHourNumRegex})(\\s*(?{DescRegex}))?)\\s*{TillRegex}\\s*({BaseDateTime.HourRegex}|{TimeHourNumRegex})\\s*(?{PmRegex}|{AmRegex}|{DescRegex})?' PureNumBetweenAnd = f'(\\bentre\\s+(la(s)?\\s+)?)(({BaseDateTime.TwoDigitHourRegex}{BaseDateTime.TwoDigitMinuteRegex})|{BaseDateTime.HourRegex}|{TimeHourNumRegex})(\\s*(?{DescRegex}))?\\s*{RangeConnectorRegex}\\s*(({BaseDateTime.TwoDigitHourRegex}{BaseDateTime.TwoDigitMinuteRegex})|{BaseDateTime.HourRegex}|{TimeHourNumRegex})\\s*(?{PmRegex}|{AmRegex}|{DescRegex})?' @@ -553,11 +552,17 @@ class SpanishDateTime: SpecialDecadeCases = dict([("", 0)]) DefaultLanguageFallback = 'DMY' DurationDateRestrictions = [r'hoy'] - AmbiguityFiltersDict = dict([("^(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?$", "\\b(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?\\b"), + AmbiguityFiltersDict = dict([("^\\d{4}$", "(\\d\\.\\d{4}|\\d{4}\\.\\d)"), + ("^(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?$", "\\b(este\\s+)?mi(\\s+([uú]ltimo|pasado|anterior|pr[oó]ximo|siguiente|que\\s+viene))?\\b"), ("^a[nñ]o$", "(?anderthalb|einundhalb)|(?dreiviertel))' - FractionHalfRegex = f'(einhalb)$' - OneHalfTokens = [r'ein', r'halb'] - FractionNounRegex = f'(?<=\\b)(({AllIntRegex})(\\s*|\\s*-\\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(er|e|es)?|hälfte)|{FractionUnitsRegex})(?=\\b)' - FractionNounWithArticleRegex = f'(?<=\\b)(({AllIntRegex}\\s+(und\\s+)?)?eine?(\\s+|\\s*-\\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(er|e|es)?|hälfte))|{AllIntRegex}ein(halb))(?=\\b)' + FractionHalfRegex = f'(einhalb(es)?)$' + OneHalfTokens = [r'ein', r'halb', r'halbes'] + FractionNounRegex = f'(?<=\\b)(({AllIntRegex})(\\s*|\\s*-\\s*)((({AllOrdinalRegex})|({RoundNumberOrdinalRegex}))|halb(e[rs]?)?|hälfte)|{FractionUnitsRegex})(?=\\b)' + FractionNounWithArticleRegex = f'(?<=\\b)(({AllIntRegex}\\s+(und\\s+)?)?eine?(\\s+|\\s*-\\s*)({AllOrdinalRegex}|{RoundNumberOrdinalRegex}|{FractionUnitsRegex}|({AllIntRegex}ein)?(halb(e[rs]?)?|hälfte))|{AllIntRegex}ein(halb))(?=\\b)' FractionPrepositionRegex = f'(?({AllIntRegex})|((?({AllIntRegex})|(\\d+)(?!\\.))(?=\\b)' AllPointRegex = f'((\\s*{ZeroToNineIntegerRegex})+|(\\s*{SeparaIntRegex}))' AllFloatRegex = f'({AllIntRegex}(\\s*komma\\s*){AllPointRegex})' @@ -399,7 +399,7 @@ def DoubleWithoutIntegralRegex(placeholder): ("g", 1000000000), ("b", 1000000000), ("t", 1000000000000)]) - AmbiguityFiltersDict = dict([("^[.]", "")]) + AmbiguityFiltersDict = dict([("^(tausend|hundert)$", "(ed(ward(\\s+m(\\.)?)?)?|mary(\\s+c(\\.)?)?|joachim|claudia|franz|maria|klaus|prof(\\.|essor)?|dr(\\.)?|herr|fr[äa]u(lein)?|frl?\\.)\\s+(tausend|hundert)")]) RelativeReferenceOffsetMap = dict([("", "")]) RelativeReferenceRelativeToMap = dict([("", "")]) # pylint: enable=line-too-long diff --git a/Python/libraries/recognizers-number/recognizers_number/resources/spanish_numeric.py b/Python/libraries/recognizers-number/recognizers_number/resources/spanish_numeric.py index 10a14883ff..8cb8ae257a 100644 --- a/Python/libraries/recognizers-number/recognizers_number/resources/spanish_numeric.py +++ b/Python/libraries/recognizers-number/recognizers_number/resources/spanish_numeric.py @@ -17,12 +17,13 @@ class SpanishNumeric: LangMarker = 'Spa' CompoundNumberLanguage = False MultiDecimalSeparatorCulture = True + NonStandardSeparatorVariants = [r'es-mx', r'es-do', r'es-sv', r'es-gt', r'es-hn', r'es-ni', r'es-pa', r'es-pr'] HundredsNumberIntegerRegex = f'(cuatrocient[ao]s|trescient[ao]s|seiscient[ao]s|setecient[ao]s|ochocient[ao]s|novecient[ao]s|doscient[ao]s|quinient[ao]s|(?(?