Skip to content

Commit

Permalink
Correct tokenization with multi-character split (keras-team#9585)
Browse files Browse the repository at this point in the history
* Fixes keras-team#9538
* Adds new test cases for previously-failing uses
* Tested on Python 2.7 with Unicode and non-Unicode strings, and on Python 3.5
  • Loading branch information
Connor Bracewell authored and fchollet committed Mar 7, 2018
1 parent 1c9a497 commit 70ad0d6
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
17 changes: 13 additions & 4 deletions keras/preprocessing/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,21 @@ def text_to_word_sequence(text,
if lower:
text = text.lower()

if sys.version_info < (3,) and isinstance(text, unicode):
translate_map = dict((ord(c), unicode(split)) for c in filters)
if sys.version_info < (3,):
if isinstance(text, unicode):
translate_map = dict((ord(c), unicode(split)) for c in filters)
text = text.translate(translate_map)
elif len(split) == 1:
translate_map = maketrans(filters, split * len(filters))
text = text.translate(translate_map)
else:
for c in filters:
text = text.replace(c, split)
else:
translate_map = maketrans(filters, split * len(filters))
translate_dict = dict((c, split) for c in filters)
translate_map = maketrans(translate_dict)
text = text.translate(translate_map)

text = text.translate(translate_map)
seq = text.split(split)
return [i for i in seq if i]

Expand Down
10 changes: 10 additions & 0 deletions tests/keras/preprocessing/text_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,21 @@ def test_text_to_word_sequence():
assert text_to_word_sequence(text) == ['hello', 'world']


def test_text_to_word_sequence_multichar_split():
    """Splitting on a multi-character delimiter must yield only the real tokens."""
    result = text_to_word_sequence('hello!stop?world!', split='stop')
    assert result == ['hello', 'world']


def test_text_to_word_sequence_unicode():
    """Default single-char splitting must handle non-ASCII (unicode) text."""
    result = text_to_word_sequence(u'ali! veli? kırk dokuz elli')
    assert result == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']


def test_text_to_word_sequence_unicode_multichar_split():
    """Multi-character splitting must also work on unicode input."""
    result = text_to_word_sequence(u'ali!stopveli?stopkırkstopdokuzstopelli',
                                   split='stop')
    assert result == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']


def test_tokenizer_unicode():
texts = [u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz']
tokenizer = Tokenizer(num_words=5)
Expand Down

0 comments on commit 70ad0d6

Please sign in to comment.