Skip to content

Commit

Permalink
Correct tokenization with multi-character split (keras-team#9585)
Browse files Browse the repository at this point in the history
* Fixes keras-team#9538
* Adds new test cases for previously-failing uses
* Tested on Python 2.7 with Unicode and non-Unicode strings, and on Python 3.5
  • Loading branch information
Connor Bracewell authored and fchollet committed Mar 7, 2018
1 parent 1c9a497 commit 70ad0d6
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
17 changes: 13 additions & 4 deletions keras/preprocessing/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,21 @@ def text_to_word_sequence(text,
if lower:
text = text.lower()

if sys.version_info < (3,) and isinstance(text, unicode):
translate_map = dict((ord(c), unicode(split)) for c in filters)
if sys.version_info < (3,):
if isinstance(text, unicode):
translate_map = dict((ord(c), unicode(split)) for c in filters)
text = text.translate(translate_map)
elif len(split) == 1:
translate_map = maketrans(filters, split * len(filters))
text = text.translate(translate_map)
else:
for c in filters:
text = text.replace(c, split)
else:
translate_map = maketrans(filters, split * len(filters))
translate_dict = dict((c, split) for c in filters)
translate_map = maketrans(translate_dict)
text = text.translate(translate_map)

text = text.translate(translate_map)
seq = text.split(split)
return [i for i in seq if i]

Expand Down
10 changes: 10 additions & 0 deletions tests/keras/preprocessing/text_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,21 @@ def test_text_to_word_sequence():
assert text_to_word_sequence(text) == ['hello', 'world']


def test_text_to_word_sequence_multichar_split():
    """Splitting on a multi-character delimiter must yield only the real tokens."""
    result = text_to_word_sequence('hello!stop?world!', split='stop')
    assert result == ['hello', 'world']


def test_text_to_word_sequence_unicode():
    """Default single-char splitting must handle non-ASCII (unicode) text."""
    result = text_to_word_sequence(u'ali! veli? kırk dokuz elli')
    assert result == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']


def test_text_to_word_sequence_unicode_multichar_split():
    """Multi-character splitting must also work on unicode input."""
    result = text_to_word_sequence(u'ali!stopveli?stopkırkstopdokuzstopelli',
                                   split='stop')
    assert result == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']


def test_tokenizer_unicode():
texts = [u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz']
tokenizer = Tokenizer(num_words=5)
Expand Down

0 comments on commit 70ad0d6

Please sign in to comment.