Skip to content

Commit

Permalink
[Wav2Vec2] Fix special tokens for Wav2Vec2 tokenizer (huggingface#11349)
Browse files Browse the repository at this point in the history
* fix wav2vec2 tok

* up
  • Loading branch information
patrickvonplaten authored Apr 22, 2021
1 parent 6f14eab commit 880154d
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 1 deletion.
7 changes: 7 additions & 0 deletions src/transformers/models/wav2vec2/tokenization_wav2vec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,12 @@ def __init__(
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}

# make sure that tokens made of several
# characters are not split at tokenization
for token in self.encoder.keys():
if len(token) > 1:
self.unique_no_split_tokens.append(token)

@property
def word_delimiter_token(self) -> str:
"""
Expand Down Expand Up @@ -366,6 +372,7 @@ def __init__(

with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)

self.decoder = {v: k for k, v in self.encoder.items()}

@property
Expand Down
22 changes: 21 additions & 1 deletion tests/test_tokenization_wav2vec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,26 @@ def test_tokenizer_decode_added_tokens(self):

self.assertEqual(batch_tokens, ["HELLO<unk>!?!?$$$", "BYE BYE<unk>$$$"])

def test_special_characters_in_vocab(self):
    """Check that multi-character vocab entries survive an encode/decode round trip.

    Builds a vocabulary from the whitespace-separated phonemes of ``sent``
    (several of them are longer than one character), writes it to a vocab
    file under ``self.tmpdirname`` and asserts that tokenizing and then
    decoding ``sent`` returns it unchanged -- first for a tokenizer built
    directly from the vocab file, then for one reloaded through
    ``save_pretrained`` / ``from_pretrained``.
    """
    sent = "ʈʰ æ æ̃ ˧ kʰ"

    # Sort the unique phonemes so every run assigns the same ids; iterating a
    # bare set of strings yields an order that depends on hash randomization.
    vocab_dict = {phoneme: idx for idx, phoneme in enumerate(sorted(set(sent.split())))}
    vocab_file = os.path.join(self.tmpdirname, "vocab_special.json")

    # Explicit encoding so the written file does not depend on the platform default.
    with open(vocab_file, "w", encoding="utf-8") as f:
        json.dump(vocab_dict, f)

    tokenizer = Wav2Vec2CTCTokenizer(vocab_file)

    decoded_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True)
    self.assertEqual(sent, decoded_sent)

    # The round trip must also hold after saving and reloading the tokenizer.
    save_dir = os.path.join(self.tmpdirname, "special_tokenizer")
    tokenizer.save_pretrained(save_dir)
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(save_dir)

    decoded_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True)
    self.assertEqual(sent, decoded_sent)

def test_pretrained_model_lists(self):
# Intentionally a no-op: Wav2Vec2Model has no max model length => no testing
pass

0 comments on commit 880154d

Please sign in to comment.