Skip to content

Commit

Permalink
add_to_vocab now checks whether the token already exists
Browse files Browse the repository at this point in the history
  • Loading branch information
Natooz committed Jul 4, 2024
1 parent 6ed293b commit 187fa24
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions miditok/midi_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,8 @@ def add_attribute_control(self, attribute_control: AttributeControl) -> None:
"""
self.attribute_controls.append(attribute_control)
for token in attribute_control.tokens:
- self.add_to_vocab(token)
+ if token not in self.vocab:
+     self.add_to_vocab(token)

@property
def pad_token_id(self) -> int:
Expand Down Expand Up @@ -2178,7 +2179,14 @@ def add_to_vocab(
if token not in self.config.special_tokens:
self.config.special_tokens.append(token)

- if vocab_idx is not None:
+ dict_vocab = self.vocab if vocab_idx is None else self.vocab[vocab_idx]
+ if token_str in dict_vocab:
+     token_id = dict_vocab[token_str]
+     warnings.warn(
+         f"Token {token_str} is already in the vocabulary at idx {token_id}.",
+         stacklevel=2,
+     )
+ elif vocab_idx is not None:
self._vocab_base[vocab_idx][token_str] = len(self._vocab_base[vocab_idx])
self.__vocab_base_inv[vocab_idx][len(self.__vocab_base_inv[vocab_idx])] = (
token_str
Expand Down

0 comments on commit 187fa24

Please sign in to comment.