Commit: Modify Ernie Docstring (PaddlePaddle#997)
* modify ernie

* modify modeling

* modify ernie-ctm

* modify tokenizer

* modify ernie-ctm tokenizer

* modify modeling

* modify erniemodel

* modify ernie-ctm

* modify ernie-gen

* modify tokenizer

* modify ernie models

* modify ernie-gen

* fix errors

* modify erniemodel

* modify erniemodel
huhuiwen99 authored Sep 24, 2021
1 parent 375df59 commit 6b6bcf1
Showing 9 changed files with 897 additions and 716 deletions.
308 changes: 129 additions & 179 deletions paddlenlp/transformers/ernie/modeling.py

Large diffs are not rendered by default.

229 changes: 142 additions & 87 deletions paddlenlp/transformers/ernie/tokenizer.py

Large diffs are not rendered by default.

265 changes: 200 additions & 65 deletions paddlenlp/transformers/ernie_ctm/modeling.py

Large diffs are not rendered by default.

118 changes: 85 additions & 33 deletions paddlenlp/transformers/ernie_ctm/tokenizer.py
@@ -28,37 +28,49 @@

class ErnieCtmTokenizer(PretrainedTokenizer):
    r"""
-    Construct a ERNIE-CTM tokenizer. It uses a basic tokenizer to do punctuation
-    splitting, lower casing and so on, and follows a WordPiece tokenizer to
-    tokenize as subwords.
+    Construct an ERNIE-CTM tokenizer.
+
    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, optional):
+        vocab_file (str):
+            File path of the vocabulary.
+        do_lower_case (bool, optional):
            Whether or not to lowercase the input when tokenizing. Defaults to `True`.
-        do_basic_tokenize (`bool`, optional):
+        do_basic_tokenize (bool, optional):
            Whether or not to do basic tokenization before WordPiece. Defaults to `True`.
-        unk_token (`str`, optional):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead. Defaults to `"[UNK]"`.
-        sep_token (`str`, optional):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens. Defaults to `"[SEP]"`.
-        pad_token (`str`, optional):
-            The token used for padding, for example when batching sequences of different lengths. Defaults to `"[PAD]"`.
-        cls_token_template (`str`, optional):
-            The template of summary token for multiple summary placeholders. Defaults to `"[CLS{}]"`.
-        cls_num (`int`, optional):
+        unk_token (str, optional):
+            A special token representing the *unknown (out-of-vocabulary)* token.
+            An unknown token is set to be `unk_token` in order to be converted to an ID.
+            Defaults to `"[UNK]"`.
+        sep_token (str, optional):
+            A special token separating two different sentences in the same input.
+            Defaults to `"[SEP]"`.
+        pad_token (str, optional):
+            A special token used to make arrays of tokens the same size for batching purposes.
+            Defaults to `"[PAD]"`.
+        cls_token_template (str, optional):
+            The template of the summary token for multiple summary placeholders. Defaults to `"[CLS{}]"`.
+        cls_num (int, optional):
            Number of summary placeholders used in the ernie-ctm model, for capturing a sentence-level global feature.
-            Defaults to 1.
-        mask_token (`str`, optional):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict. Defaults to `"[MASK]"`.
-        strip_accents: (`bool`, optional):
+            Defaults to `1`.
+        mask_token (str, optional):
+            A special token representing a masked token. This is the token used in the masked
+            language modeling task, and the one the model will try to predict. Defaults to `"[MASK]"`.
+        strip_accents (bool, optional):
            Whether or not to strip all accents. If this option is not specified, it will be determined by the
            value of `lowercase` (as in the original BERT).
+
+    Examples:
+        .. code-block::
+
+            from paddlenlp.transformers import ErnieCtmTokenizer
+            tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+            encoded_inputs = tokenizer('He was a puppeteer')
+            # encoded_inputs:
+            # {'input_ids': [101, 98, 153, 150, 99, 168, 146, 164, 99, 146, 99, 161, 166, 161,
+            #                161, 150, 165, 150, 150, 163, 102],
+            #  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
    """
resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained
pretrained_resource_files_map = {
@@ -104,28 +116,58 @@ def __init__(self,

    @property
    def vocab_size(self):
+        """
+        Return the size of the vocabulary.
+
+        Returns:
+            int: The size of the vocabulary.
+        """
        return len(self.vocab)

    def convert_tokens_to_string(self, tokens):
-        # Converts a sequence of tokens (strings for sub-words) in a single string.
+        r"""
+        Converts a sequence of tokens (a list of strings) into a single string. Since
+        WordPiece introduces `##` to concatenate subwords, the `##` markers are also
+        removed when converting.
+
+        Args:
+            tokens (List[str]): A list of strings representing tokens to be converted.
+
+        Returns:
+            str: Converted string from the tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+                tokens = tokenizer.tokenize('He was a puppeteer')
+                strings = tokenizer.convert_tokens_to_string(tokens)
+                # he was a puppeteer
+        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string
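Aside: the detokenization rule in the body above is easy to exercise standalone. A minimal sketch, with a hypothetical WordPiece split of the docstring's example sentence:

    # WordPiece marks subword continuations with "##"; joining on spaces and
    # deleting " ##" glues the pieces back onto the preceding token.
    tokens = ["he", "was", "a", "puppet", "##eer"]  # hypothetical subword split
    out_string = " ".join(tokens).replace(" ##", "").strip()
    print(out_string)  # he was a puppeteer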

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
-        concatenating and add special tokens. A ERNIE-CTM sequence has the following format:
-        - single sequence: [CLS0][CLS1]... X [SEP]
-        - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
+        concatenating and adding special tokens.
+
+        An ERNIE-CTM sequence has the following format:
+
+        - single sequence:      [CLS0][CLS1]... X [SEP]
+        - pair of sequences:    [CLS0][CLS1]... X [SEP] X [SEP]
+
        Args:
-            token_ids_0 (`List`):
+            token_ids_0 (List):
                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List`, optional):
-                second list of IDs for sequence pairs. Defaults to ``None``.
+            token_ids_1 (List, optional):
+                Optional second list of IDs for sequence pairs. Defaults to ``None``.

        Returns:
-            List: The input IDs with the appropriate special tokens.
+            List[int]: The input IDs with the appropriate special tokens.
        """
        cls_token_ids = [
            self.convert_tokens_to_ids(self.cls_token_template.format(sid))
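The method body is truncated in this diff, but the layout the docstring promises is easy to sketch. The following is an illustrative reimplementation, not the library's exact code; `cls_token_ids` and `sep_token_id` are hypothetical stand-ins for the IDs the tokenizer would look up:

    def build_inputs_sketch(token_ids_0, token_ids_1=None,
                            cls_token_ids=(1, 2), sep_token_id=3):
        # [CLS0][CLS1]... X [SEP] for a single sequence,
        # [CLS0][CLS1]... X [SEP] X [SEP] for a pair.
        out = list(cls_token_ids) + token_ids_0 + [sep_token_id]
        if token_ids_1 is not None:
            out = out + token_ids_1 + [sep_token_id]
        return out

    build_inputs_sketch([7, 8, 9])       # [1, 2, 7, 8, 9, 3]
    build_inputs_sketch([7, 8], [4, 5])  # [1, 2, 7, 8, 3, 4, 5, 3]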
@@ -178,6 +220,7 @@ def create_token_type_ids_from_sequences(self,
                                             token_ids_1=None):
        """
        Creates a token_type mask from the input sequences.
+
        If `token_ids_1` is not `None`, then a sequence pair
        token_type mask has the following format:
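The rest of this docstring is elided in the diff, but the class example above (all-zero `token_type_ids` for a single sentence) suggests the conventional BERT-style mask. A hedged sketch, assuming the first segment (summary tokens, tokens, and the first [SEP]) maps to 0 and a second segment to 1:

    def token_type_ids_sketch(len_0, len_1=None, cls_num=1):
        # cls_num summary tokens + first sequence + [SEP] -> segment 0
        ids = [0] * (cls_num + len_0 + 1)
        if len_1 is not None:
            ids += [1] * (len_1 + 1)  # second sequence + trailing [SEP] -> segment 1
        return ids

    token_type_ids_sketch(3)     # [0, 0, 0, 0, 0]
    token_type_ids_sketch(3, 2)  # [0, 0, 0, 0, 0, 1, 1, 1]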
@@ -260,7 +303,16 @@ def tokenize(self, text, **kwargs):

        Args:
            text (str):
                The text to be tokenized.
+
+        Returns:
+            List[str]: A list of strings representing the converted tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+                tokens = tokenizer.tokenize('He was a puppeteer')
        """
        return self._tokenize(text, **kwargs)
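Taken together, the documented methods compose into the usual encoding pipeline. A usage sketch built only from calls shown in this diff:

    from paddlenlp.transformers import ErnieCtmTokenizer

    tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
    tokens = tokenizer.tokenize('He was a puppeteer')
    ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = tokenizer.build_inputs_with_special_tokens(ids)
    # or simply tokenizer('He was a puppeteer'), as in the class docstring example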
