-
Notifications
You must be signed in to change notification settings - Fork 24
/
test_numerize.py
332 lines (270 loc) · 10.8 KB
/
test_numerize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
from unittest import TestCase, skipUnless
from numerizer import numerizer as num
from spacy import load
from spacy.tokens import Token
# Short alias for the function exercised by most tests in this module.
numerize = num.numerize

# Probe for the optional spaCy pipelines. An OSError from ``load`` is taken to
# mean the model is not available; the resulting flags feed the ``skipUnless``
# decorators on the spaCy-dependent tests.
try:
    nlp = load("en_core_web_sm")
except OSError:
    SPACY_MODEL_INSTALLED = False
else:
    SPACY_MODEL_INSTALLED = True

try:
    nlp_trf = load("en_core_web_trf")
except OSError:
    TRF_INSTALLED = False
else:
    TRF_INSTALLED = True
def test_init():
    """Smoke test: a plain two-word cardinal is turned into digits."""
    result = numerize("forty two")
    assert result == "42"
def test_case_insensitive():
    """Capitalised and upper-case number words numerize like lower-case ones."""
    expectations = (
        ("Forty two", "42"),
        ("FORTY TWO", "42"),
        ("FORTY Second", "42nd"),
        ("Ninety Nine", "99"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_hyphenated():
    """A hyphen-joined tens/units pair is read as a single number."""
    result = numerize("forty-two")
    assert result == "42"
def test_hundreds():
    """A hundreds phrase joined with "and" collapses into one number."""
    result = numerize("four hundred and forty two")
    assert result == "442"
def test_fraction():
    """Fraction words become ``n/d``; "<whole> and <fraction>" becomes a decimal.

    The last row checks that an alphanumeric token containing a slash is
    returned unchanged.
    """
    expectations = (
        ("half", "1/2"),
        ("quarter", "1/4"),
        ("two and a half", "2.5"),
        ("three quarters", "3/4"),
        ("two and three eighths", "2.375"),
        ("2B/2B", "2B/2B"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_straight_parsing():
    """Each spelling of a cardinal must numerize to the decimal string of its value.

    Every row is ``(value, spelling, ...)``; a row may carry several
    alternative spellings of the same value.
    """
    rows = (
        (1, "one"),
        (5, "five"),
        (10, "ten"),
        (11, "eleven"),
        (12, "twelve"),
        (13, "thirteen"),
        (14, "fourteen"),
        (15, "fifteen"),
        (16, "sixteen"),
        (17, "seventeen"),
        (18, "eighteen"),
        (19, "nineteen"),
        (20, "twenty"),
        (27, "twenty seven"),
        (31, "thirty-one"),
        (37, "thirty-seven"),
        (41, "forty one"),
        (42, "fourty two"),
        (59, "fifty nine"),
        (100, "one hundred", "a hundred"),
        (150, "one hundred and fifty", "one fifty"),
        (200, "two-hundred"),
        (500, "5 hundred"),
        (999, "nine hundred and ninety nine"),
        (1_000, "one thousand"),
        (1_200, "twelve hundred", "one thousand two hundred"),
        (17_000, "seventeen thousand"),
        (21_473, "twentyone-thousand-four-hundred-and-seventy-three"),
        (74_002, "seventy four thousand and two"),
        (99_999, "ninety nine thousand nine hundred ninety nine"),
        (100_000, "100 thousand"),
        (250_000, "two hundred fifty thousand"),
        (1_000_000, "one million", "1.0 million"),
        (1_200_000, "1.2 million"),
        (1_250_007, "one million two hundred fifty thousand and seven"),
        (1_000_000_000, "one billion"),
        (1_000_000_001, "one billion and one"),
    )
    for value, *spellings in rows:
        wanted = str(value)
        for phrase in spellings:
            assert numerize(phrase) == wanted
def test_combined_double_digits():
    """Tens and units written as one word, with no separator, are still parsed."""
    for phrase, wanted in (("twentyone", "21"), ("thirtyseven", "37")):
        assert numerize(phrase) == wanted
def test_fractions_in_words():
    """Spelled-out fractions become ``numerator/denominator`` without reduction."""
    expectations = (
        ("one half", "1/2"),
        ("1 quarter", "1/4"),
        ("one quarter", "1/4"),
        ("a quarter", "1/4"),
        ("one eighth", "1/8"),
        ("three quarters", "3/4"),
        ("two fourths", "2/4"),  # not simplified to 1/2
        ("three eighths", "3/8"),
        ("seven tenths", "7/10"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_fractional_addition():
    """"<whole> and <fraction>" is added up and rendered as a decimal."""
    expectations = (
        ("one and a quarter", "1.25"),
        ("two and three eighths", "2.375"),
        ("two and a half", "2.5"),
        ("three and a half hours", "3.5 hours"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_word_with_a_number():
    """A word that merely contains a number word ("eight") is left alone."""
    word = "pennyweight"
    assert numerize(word) == word
def test_edges():
    """A date/time string made of digits and abbreviations passes through as-is."""
    stamp = "27 Oct 2006 7:30am"
    assert numerize(stamp) == stamp
def test_multiple_slashes_should_not_be_evaluated():
    """A slash-separated date must come back unchanged."""
    date_text = "11/02/2007"
    assert numerize(date_text) == date_text
def test_compatability():
    """Digit fractions and arithmetic-looking text are preserved.

    Only the spelled-out number words inside such text are replaced.
    """
    expectations = (
        ("1/2", "1/2"),
        ("05/06", "05/06"),
        ("three and a half hours", "3.5 hours"),
        ("half an hour", "1/2 an hour"),
        ("(1/2)+2", "(1/2)+2"),
        ("(10+10)/2", "(10+10)/2"),
        ("(10+10)/two", "(10+10)/2"),
        ("2*(45+21)/6", "2*(45+21)/6"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_ordinal_strings():
    """Ordinal words become digits followed by the matching ordinal suffix."""
    expectations = (
        ("first", "1st"),
        ("second", "2nd"),
        ("third", "3rd"),
        ("fourth", "4th"),
        ("fifth", "5th"),
        ("seventh", "7th"),
        ("eighth", "8th"),
        ("tenth", "10th"),
        ("eleventh", "11th"),
        ("twelfth", "12th"),
        ("thirteenth", "13th"),
        ("sixteenth", "16th"),
        ("twentieth", "20th"),
        ("twenty-third", "23rd"),
        ("thirtieth", "30th"),
        ("thirty-first", "31st"),
        ("fourtieth", "40th"),
        ("fourty ninth", "49th"),
        ("fiftieth", "50th"),
        ("sixtieth", "60th"),
        ("seventieth", "70th"),
        ("eightieth", "80th"),
        ("ninetieth", "90th"),
        ("hundredth", "100th"),
        ("thousandth", "1000th"),
        ("millionth", "1000000th"),
        ("billionth", "1000000000th"),
        ("trillionth", "1000000000000th"),
        ("first day month two", "1st day month 2"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_ambiguous_cases():
    """Words that can be read as an ordinal or a fraction, with default settings.

    "quarter" as a coin and "second" as a time unit or verb are not covered.
    """
    expectations = (
        ("the fourth", "the 4th"),
        ("a third of", "1/3 of"),
        ("fourth", "4th"),
        ("second", "2nd"),
        ("quarter pound of beef", "1/4 pound of beef"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted

    # Pronouns are not supported yet, so these ambiguous cases stay untested:
    # assert 'I quarter' == numerize('I quarter')
    # assert 'You quarter' == numerize('You quarter')
    # assert 'I want to quarter' == numerize('I want to quarter')
    # assert 'the 1st 1/4' == numerize('the first quarter')
    # assert 'the 2nd second' == numerize('the second second')
    # assert 'the 4th second' == numerize('the fourth second')
    # assert '1 second' == numerize('one second')
    # TODO: Find way to distinguish this verb
    # assert 'I peel and quarter bananas' == numerize('I peel and quarter bananas')
def test_ignore():
    """Words passed via ``ignore`` stay as text; other words are still numerized."""
    scenarios = (
        ("the second day of march", ["second"], "the second day of march"),
        ("quarter", ["quarter"], "quarter"),
        ("the five guys", ["five"], "the five guys"),
        ("the fifty two", ["fifty"], "the fifty 2"),
    )
    for phrase, skipped, wanted in scenarios:
        assert numerize(phrase, ignore=skipped) == wanted
def test_bias_ordinal():
    """With ``bias="ordinal"`` ambiguous words resolve to ordinals.

    Phrases with an explicit numerator are still treated as fractions.
    """
    expectations = (
        ("fourth", "4th"),
        ("twelfth", "12th"),
        ("second", "2nd"),
        ("the fourth", "the 4th"),
        ("two and three fourths", "2.75"),
        ("three fifths", "3/5"),
        ("a fourth of", "a 4th of"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase, bias="ordinal") == wanted

    # Not asserted yet:
    # assert 'I quarter your home' == numerize('I quarter your home', bias='ordinal')
    # assert 'the 1st 2nd 3rd' == numerize('the first second third', bias='ordinal')
def test_bias_fractional():
    """With ``bias="fractional"`` ambiguous words resolve to fractions.

    "second" is the exception checked here: it remains "2nd".
    """
    expectations = (
        ("fourth", "1/4"),
        ("twelfth", "1/12"),
        ("second", "2nd"),
        ("two and three fourths", "2.75"),
        ("three fifths", "3/5"),
        ("a fourth of", "1/4 of"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase, bias="fractional") == wanted

    # Not asserted yet:
    # assert 'the 1/4' == numerize('the fourth', bias='fractional')
    # assert 'I 1/4 your home' == numerize('I quarter your home',
    #                                      bias='fractional')
    # assert 'the 1st second 1/3' == numerize('the first second third',
    #                                         bias='fractional')
def test_numerize_big_prefixes():
    """Drive the individual stages by hand up to ``numerize_big_prefixes``.

    The expected value still carries the ``<num>`` marker in front of the digits.
    """
    phrase = "two hundred and twenty five thousand seven hundred"
    after_numerals = num.numerize_numerals(num.preprocess(phrase))
    assert num.numerize_big_prefixes(after_numerals) == "<num>225700"
def test_misc():
    """A long phrase mixing thousands, hundreds, "and" and a hyphenated pair."""
    phrase = "two hundred twenty five thousand seven hundred and fifty-five"
    assert "225755" == numerize(phrase)
def test_andition():
    """ "and" either joins parts of one number or separates two numbers.

    When both sides are independent numbers, the word "and" is kept between
    the numerized values.
    """
    expectations = (
        ("thirty two and forty one", "32 and 41"),
        ("thirty two and forty one thousand", "32 and 41000"),
        ("one hundred and twenty three", "123"),
        ("two thousand and thirty four", "2034"),
        ("forty five and sixty seven", "45 and 67"),
        ("one hundred and twenty three thousand and forty five", "123045"),
        ("twenty five and seventy four and one", "25 and 74 and 1"),
        ("twenty five and seventy four and one thousand", "25 and 74 and 1000"),
    )
    for phrase, wanted in expectations:
        assert numerize(phrase) == wanted
def test_whitespaces():
    """Digits followed by a magnitude word are merged into one number."""
    result = numerize("55 thousand")
    assert result == "55000"
# Tests for the spaCy extensions.
# ``condt`` is the reason text passed to ``skipUnless`` on the spaCy test class.
# NOTE(review): this module only loads en_core_web_sm (and en_core_web_trf for
# one test); confirm whether the md/lg models listed below are really required.
condt = """Please install spacy models as follows:
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_md
python -m spacy download en_core_web_lg
"""
@skipUnless(SPACY_MODEL_INSTALLED, condt)
class TestSpacyExtensions(TestCase):
    """Checks for the ``numerize`` extensions reached through ``._`` on spaCy objects.

    The whole class is skipped when ``SPACY_MODEL_INSTALLED`` is false.
    """

    def test_spacy_default(self):
        """``doc._.numerize()`` yields a dict with one span -> value entry."""
        doc = nlp("The Hogwarts Express is at platform nine and three quarters.")
        result = doc._.numerize()
        assert isinstance(result, dict)
        assert len(result) == 1
        span, value = result.popitem()
        assert span.text == "nine and three quarters"
        assert value == "9.75"

    def test_entity_filters(self):
        """Restricting ``labels`` to MONEY leaves only the money entity."""
        doc = nlp(
            """
Their revenue has been a billion dollars, as of six months ago.
The next quarter is not so promising."""
        )
        result = doc._.numerize(labels=["MONEY"])
        assert len(result) == 1
        span, value = result.popitem()
        assert span.text == "a billion dollars"
        assert value == "1000000000 dollars"

    def test_retokenize(self):
        """``retokenize=True`` merges the number phrase into a single token."""
        doc = nlp("The Hogwarts Express is at platform nine and three quarters.")
        doc._.numerize(retokenize=True)
        merged = doc[-2]
        assert isinstance(merged, Token)
        assert merged.text == "nine and three quarters"
        assert merged._.numerized == "9.75"

    def test_span_token_extensions(self):
        """Spans expose ``_.numerize()`` and tokens expose ``_.numerized``."""
        doc = nlp(
            "The projected revenue for the next quarter is over two million dollars."
        )
        amount_span = doc[-4:-2]
        assert amount_span._.numerize() == "2000000"
        quarter_token = doc[6]
        assert quarter_token._.numerized == "1/4"

    def test_article(self):
        """Only "a hundred" is numerized in a sentence with several "a" articles."""
        # See: https://github.com/jaidevd/numerizer/issues/24
        doc = nlp("A cat, a baby and a hundred puppies.")
        _, value = doc._.numerize().popitem()
        assert value == "100"

    @skipUnless(TRF_INSTALLED, "python -m spacy download en_core_web_trf")
    def test_whitespace(self):
        """Output for "55 thousand" under the transformer model, with and without labels."""
        # See https://github.com/jaidevd/numerizer/issues/25
        text = "55 thousand"
        _, value = nlp_trf(text)._.numerize().popitem()
        assert value == "55 "
        # But if we ignore labels, the whole text is numerized as one string.
        assert nlp_trf(text)._.numerize(labels=False) == "55000"