ITN: SH bug fixes for telephone (NVIDIA#3592)

* apex quick fix

  Signed-off-by: ekmb <[email protected]>

* add money without hundred, fix telephone for SH

  Signed-off-by: ekmb <[email protected]>

* update cache_dir

  Signed-off-by: ekmb <[email protected]>
bogdal1993 · Feb 3, 2022 · e241767 · e241767
1 parent 9f95457
commit e241767
Show file tree

Hide file tree

Showing 9 changed files with 34 additions and 20 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -111,18 +111,18 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
           }
         }
       }
@@ -139,7 +139,7 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ &&  python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
             sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_12-10.txt || exit 1'
@@ -149,7 +149,7 @@ pipeline {
 
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -158,23 +158,23 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
           }
         }
       }

diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py
@@ -34,7 +34,6 @@
 from nemo.core.classes.exportable import Exportable
 from nemo.utils import AppState, logging
 
-
 __all__ = ['NLPModel']
 
 NEMO_NLP_TMP = os.path.join(os.path.dirname(str(TRANSFORMERS_CACHE)), "nemo_nlp_tmp")

diff --git a/nemo_text_processing/inverse_text_normalization/en/data/currency.tsv b/nemo_text_processing/inverse_text_normalization/en/data/currency.tsv
@@ -1,7 +1,7 @@
 $	dollar
 $	us dollar
 $	united states dollar
-£	pound
+£	british pound
 €	euro
 ₩	won
 nzd	new zealand dollar

diff --git a/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/en/data/measurements.tsv
@@ -18,7 +18,6 @@ kg	kilogram
 ghz	gigahertz
 khz	kilohertz
 mhz	megahertz
-lb	pound
 v	volt
 h	hour
 mc	mega coulomb

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/money.py b/nemo_text_processing/inverse_text_normalization/en/taggers/money.py
@@ -16,6 +16,7 @@
 from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path
 from nemo_text_processing.text_normalization.en.graph_utils import (
     NEMO_DIGIT,
+    NEMO_NOT_SPACE,
     NEMO_SIGMA,
     GraphFst,
     convert_space,
@@ -49,6 +50,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         # quantity, integer_part, fractional_part, currency
 
         cardinal_graph = cardinal.graph_no_exception
+        # add support for missing hundred (only for 3 digit numbers)
+        # "one fifty" -> "one hundred fifty"
+        with_hundred = pynini.compose(
+            pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA,
+            pynini.compose(cardinal_graph, NEMO_DIGIT ** 3),
+        )
+        cardinal_graph |= with_hundred
         graph_decimal_final = decimal.final_graph_wo_negative
 
         unit = pynini.string_file(get_abs_path("data/currency.tsv"))

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py
@@ -60,10 +60,13 @@ class TelephoneFst(GraphFst):
     def __init__(self, cardinal: GraphFst):
         super().__init__(name="telephone", kind="classify")
         # country code, number_part, extension
-        digit_to_str = pynini.invert(
-            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
-        ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))
+        digit_to_str = (
+            pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
+            | pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
+        )
+
         str_to_digit = pynini.invert(digit_to_str)
+
         double_digit = pynini.union(
             *[
                 pynini.cross(
@@ -83,9 +86,10 @@ def __init__(self, cardinal: GraphFst):
             pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
         )
 
-        single_or_double_digit = (double_digit_to_digit | str_to_digit).optimize()
-        single_or_double_digit = (
-            single_or_double_digit + pynini.closure(pynutil.delete(" ") + single_or_double_digit)
+        single_or_double_digit = (pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit).optimize()
+        single_or_double_digit |= (
+            single_or_double_digit
+            + pynini.closure(pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001))
         ).optimize()
 
         number_part = pynini.compose(
@@ -102,6 +106,7 @@ def __init__(self, cardinal: GraphFst):
             + ((pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit) | cardinal_option)
             + pynutil.insert("\"")
         )
+
         optional_country_code = pynini.closure(country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize()
         graph = optional_country_code + number_part
 
@@ -125,6 +130,7 @@ def __init__(self, cardinal: GraphFst):
         digit_or_double = digit_or_double.optimize()
 
         ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3
+
         graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")
         graph |= (
             pynutil.insert("number_part: \"")

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/time.py b/nemo_text_processing/inverse_text_normalization/en/taggers/time.py
@@ -85,7 +85,7 @@ def __init__(self):
         )
 
         # five o' clock
-        # two o eight, two thiry five (am/pm)
+        # two o eight, two thirty five (am/pm)
         # two pm/am
         graph_hm = (
             final_graph_hour + delete_extra_space + pynutil.insert("minutes: \"") + graph_minute + pynutil.insert("\"")

diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_money.txt
@@ -45,4 +45,5 @@ eighteen thousand one hundred twenty eight dollars~$18128
 eighteen thousand one hundred twenty five dollars~$18125
 eighteen thousand one hundred twenty four dollars~$18124
 eighteen thousand one hundred twenty nine dollars~$18129
-eighteen thousand pounds~£18000
+one thousand fifty five dollars~$1055
+one fifty five dollars~$155
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_telephone.txt
@@ -10,6 +10,7 @@ four three two double seven three two one four three two one four three double z
 one two three dot one two three dot o dot four o~123.123.0.40
 one twenty three dot one two three dot o dot four o~123.123.0.40
 two two five dot double five dot o dot four o~225.55.0.40
+two two five dot double five dot o dot forty five~225.55.0.45
 ssn is seven double nine one two three double one three~ssn is 799-12-3113
 seven nine nine~799
 a b nine~ab9