Skip to content

Commit

Permalink
precompiled_charsmap checking before adding to the normalizers' list for XLNetTokenizerFast conversion. (huggingface#24618)
Browse files Browse the repository at this point in the history

* precompiled_charsmap checking before adding to the normalizers' list.

* precompiled_charsmap checking for all SentencePiece tokenizer models

* precompiled_charsmap checking for SPM tokenizer models - correct formatting
  • Loading branch information
shahad-mahmud authored Jul 4, 2023
1 parent f4e4b4d commit cd4584e
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions src/transformers/convert_slow_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,10 @@ def normalizer(self, proto):
 list_normalizers.append(normalizers.Lowercase())

 precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
-list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
+
+if precompiled_charsmap:
+    list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
+
 list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
 return normalizers.Sequence(list_normalizers)

Expand Down Expand Up @@ -802,7 +805,10 @@ def normalizer(self, proto):
 list_normalizers.append(normalizers.Lowercase())

 precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
-list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
+
+if precompiled_charsmap:
+    list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
+
 list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
 return normalizers.Sequence(list_normalizers)

Expand Down Expand Up @@ -836,7 +842,10 @@ def normalizer(self, proto):
 list_normalizers.append(normalizers.Lowercase())

 precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
-list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
+
+if precompiled_charsmap:
+    list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
+
 return normalizers.Sequence(list_normalizers)

def post_processor(self):
Expand Down

0 comments on commit cd4584e

Please sign in to comment.