Skip to content

Commit

Permalink
ITN: SH bug fixes for telephone (NVIDIA#3592)
Browse files Browse the repository at this point in the history
* apex quick fix

Signed-off-by: ekmb <[email protected]>

* add money without hundred, fix telephone for SH

Signed-off-by: ekmb <[email protected]>

* update cache_dir

Signed-off-by: ekmb <[email protected]>
  • Loading branch information
ekmb authored Feb 3, 2022
1 parent 9f95457 commit e241767
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 20 deletions.
18 changes: 9 additions & 9 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,18 @@ pipeline {
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
}
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
}
}
}
Expand All @@ -139,7 +139,7 @@ pipeline {
parallel {
stage('L2: Eng TN') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_12-10.txt || exit 1'
Expand All @@ -149,7 +149,7 @@ pipeline {

stage('L2: Eng ITN export') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
Expand All @@ -158,23 +158,23 @@ pipeline {
stage('L2: TN with Audio (audio and raw text)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --text "The total amounts to \\$4.76." \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --text "The total amounts to \\$4.76." \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (audio and text file)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (manifest)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-2'
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-3'
}
}
}
Expand Down
1 change: 0 additions & 1 deletion nemo/collections/nlp/models/nlp_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
from nemo.core.classes.exportable import Exportable
from nemo.utils import AppState, logging


__all__ = ['NLPModel']

NEMO_NLP_TMP = os.path.join(os.path.dirname(str(TRANSFORMERS_CACHE)), "nemo_nlp_tmp")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
$ dollar
$ us dollar
$ united states dollar
£ pound
£ british pound
€ euro
₩ won
nzd new zealand dollar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ kg kilogram
ghz gigahertz
khz kilohertz
mhz megahertz
lb pound
v volt
h hour
mc mega coulomb
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from nemo_text_processing.inverse_text_normalization.en.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
GraphFst,
convert_space,
Expand Down Expand Up @@ -49,6 +50,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
# quantity, integer_part, fractional_part, currency

cardinal_graph = cardinal.graph_no_exception
# add support for missing hundred (only for 3 digit numbers)
# "one fifty" -> "one hundred fifty"
with_hundred = pynini.compose(
pynini.closure(NEMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + NEMO_SIGMA,
pynini.compose(cardinal_graph, NEMO_DIGIT ** 3),
)
cardinal_graph |= with_hundred
graph_decimal_final = decimal.final_graph_wo_negative

unit = pynini.string_file(get_abs_path("data/currency.tsv"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,13 @@ class TelephoneFst(GraphFst):
def __init__(self, cardinal: GraphFst):
super().__init__(name="telephone", kind="classify")
# country code, number_part, extension
digit_to_str = pynini.invert(
pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))
digit_to_str = (
pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
| pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
)

str_to_digit = pynini.invert(digit_to_str)

double_digit = pynini.union(
*[
pynini.cross(
Expand All @@ -83,9 +86,10 @@ def __init__(self, cardinal: GraphFst):
pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
)

single_or_double_digit = (double_digit_to_digit | str_to_digit).optimize()
single_or_double_digit = (
single_or_double_digit + pynini.closure(pynutil.delete(" ") + single_or_double_digit)
single_or_double_digit = (pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit).optimize()
single_or_double_digit |= (
single_or_double_digit
+ pynini.closure(pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001))
).optimize()

number_part = pynini.compose(
Expand All @@ -102,6 +106,7 @@ def __init__(self, cardinal: GraphFst):
+ ((pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit) | cardinal_option)
+ pynutil.insert("\"")
)

optional_country_code = pynini.closure(country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize()
graph = optional_country_code + number_part

Expand All @@ -125,6 +130,7 @@ def __init__(self, cardinal: GraphFst):
digit_or_double = digit_or_double.optimize()

ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3

graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")
graph |= (
pynutil.insert("number_part: \"")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def __init__(self):
)

# five o' clock
# two o eight, two thiry five (am/pm)
# two o eight, two thirty five (am/pm)
# two pm/am
graph_hm = (
final_graph_hour + delete_extra_space + pynutil.insert("minutes: \"") + graph_minute + pynutil.insert("\"")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,5 @@ eighteen thousand one hundred twenty eight dollars~$18128
eighteen thousand one hundred twenty five dollars~$18125
eighteen thousand one hundred twenty four dollars~$18124
eighteen thousand one hundred twenty nine dollars~$18129
eighteen thousand pounds~£18000
one thousand fifty five dollars~$1055
one fifty five dollars~$155
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ four three two double seven three two one four three two one four three double z
one two three dot one two three dot o dot four o~123.123.0.40
one twenty three dot one two three dot o dot four o~123.123.0.40
two two five dot double five dot o dot four o~225.55.0.40
two two five dot double five dot o dot forty five~225.55.0.45
ssn is seven double nine one two three double one three~ssn is 799-12-3113
seven nine nine~799
a b nine~ab9
Expand Down

0 comments on commit e241767

Please sign in to comment.