Commit: Modify Ernie Docstring (PaddlePaddle#997)
* modify ernie

* modify modeling

* modify ernie-ctm

* modify tokenizer

* modify ernie-ctm tokenizer

* modify modeling

* modify erniemodel

* modify ernie-ctm

* modify ernie-gen

* modify tokenizer

* modify ernie models

* modify ernie-gen

* fix errors

* modify erniemodel

* modify erniemodel
huhuiwen99 authored Sep 24, 2021
1 parent 375df59 commit 6b6bcf1
Showing 9 changed files with 897 additions and 716 deletions.
308 changes: 129 additions & 179 deletions paddlenlp/transformers/ernie/modeling.py

Large diffs are not rendered by default.

229 changes: 142 additions & 87 deletions paddlenlp/transformers/ernie/tokenizer.py

Large diffs are not rendered by default.

265 changes: 200 additions & 65 deletions paddlenlp/transformers/ernie_ctm/modeling.py

Large diffs are not rendered by default.

118 changes: 85 additions & 33 deletions paddlenlp/transformers/ernie_ctm/tokenizer.py
@@ -28,37 +28,49 @@

class ErnieCtmTokenizer(PretrainedTokenizer):
    r"""
-    Construct a ERNIE-CTM tokenizer. It uses a basic tokenizer to do punctuation
-    splitting, lower casing and so on, and follows a WordPiece tokenizer to
-    tokenize as subwords.
+    Construct an ERNIE-CTM tokenizer.
+
    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, optional):
+        vocab_file (str):
+            File path of the vocabulary.
+        do_lower_case (bool, optional):
            Whether or not to lowercase the input when tokenizing. Defaults to `True`.
-        do_basic_tokenize (`bool`, optional):
+        do_basic_tokenize (bool, optional):
            Whether or not to do basic tokenization before WordPiece. Defaults to `True`.
-        unk_token (`str`, optional):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead. Defaults to `"[UNK]"`.
-        sep_token (`str`, optional):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens. Defaults to `"[SEP]"`.
-        pad_token (`str`, optional):
-            The token used for padding, for example when batching sequences of different lengths. Defaults to `"[PAD]"`.
-        cls_token_template (`str`, optional):
-            The template of summary token for multiple summary placeholders. Defaults to `"[CLS{}]"`.
-        cls_num (`int`, optional):
+        unk_token (str, optional):
+            A special token representing the *unknown (out-of-vocabulary)* token.
+            An unknown token is set to be `unk_token` in order to be converted to an ID.
+            Defaults to `"[UNK]"`.
+        sep_token (str, optional):
+            A special token separating two different sentences in the same input.
+            Defaults to `"[SEP]"`.
+        pad_token (str, optional):
+            A special token used to make arrays of tokens the same size for batching purposes.
+            Defaults to `"[PAD]"`.
+        cls_token_template (str, optional):
+            The template of the summary token for multiple summary placeholders. Defaults to `"[CLS{}]"`.
+        cls_num (int, optional):
            Number of summary placeholders used in the ernie-ctm model, for capturing a sentence-level global feature.
-            Defaults to 1.
-        mask_token (`str`, optional):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict. Defaults to `"[MASK]"`.
-        strip_accents: (`bool`, optional):
+            Defaults to `1`.
+        mask_token (str, optional):
+            A special token representing a masked token. This is the token used in the masked
+            language modeling task, and the one the model will try to predict. Defaults to `"[MASK]"`.
+        strip_accents (bool, optional):
            Whether or not to strip all accents. If this option is not specified, it will be determined by the
            value of `lowercase` (as in the original BERT).
+
+    Examples:
+        .. code-block::
+
+            from paddlenlp.transformers import ErnieCtmTokenizer
+            tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+            encoded_inputs = tokenizer('He was a puppeteer')
+            # encoded_inputs:
+            # {'input_ids': [101, 98, 153, 150, 99, 168, 146, 164, 99, 146, 99, 161, 166, 161,
+            #                161, 150, 165, 150, 150, 163, 102],
+            #  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
    """
resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained
pretrained_resource_files_map = {
@@ -104,28 +116,58 @@ def __init__(self,

    @property
    def vocab_size(self):
+        """
+        Return the size of the vocabulary.
+
+        Returns:
+            int: The size of the vocabulary.
+        """
        return len(self.vocab)

    def convert_tokens_to_string(self, tokens):
-        # Converts a sequence of tokens (strings for sub-words) in a single string.
+        r"""
+        Converts a sequence of tokens (a list of strings) into a single string. Since
+        WordPiece introduces `##` to concatenate subwords, the `##` markers are also
+        removed when converting.
+
+        Args:
+            tokens (List[str]): A list of strings representing tokens to be converted.
+
+        Returns:
+            str: Converted string from the tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+                tokens = tokenizer.tokenize('He was a puppeteer')
+                strings = tokenizer.convert_tokens_to_string(tokens)
+                # he was a puppeteer
+        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string
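Aside: the detokenization rule in the body above is easy to exercise standalone. A minimal sketch, with a hypothetical WordPiece split of the docstring's example sentence:

    # WordPiece marks subword continuations with "##"; joining on spaces and
    # deleting " ##" glues the pieces back onto the preceding token.
    tokens = ["he", "was", "a", "puppet", "##eer"]  # hypothetical subword split
    out_string = " ".join(tokens).replace(" ##", "").strip()
    print(out_string)  # he was a puppeteer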

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
-        concatenating and add special tokens. A ERNIE-CTM sequence has the following format:
-        - single sequence: [CLS0][CLS1]... X [SEP]
-        - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP]
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by
+        concatenating and adding special tokens.
+
+        An ERNIE-CTM sequence has the following format:
+
+        - single sequence:      [CLS0][CLS1]... X [SEP]
+        - pair of sequences:    [CLS0][CLS1]... X [SEP] X [SEP]
+
        Args:
-            token_ids_0 (`List`):
+            token_ids_0 (List):
                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List`, optional):
-                second list of IDs for sequence pairs. Defaults to ``None``.
+            token_ids_1 (List, optional):
+                Optional second list of IDs for sequence pairs. Defaults to ``None``.

        Returns:
-            List: The input IDs with the appropriate special tokens.
+            List[int]: The input IDs with the appropriate special tokens.
        """
        cls_token_ids = [
            self.convert_tokens_to_ids(self.cls_token_template.format(sid))
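The method body is truncated in this diff, but the layout the docstring promises is easy to sketch. The following is an illustrative reimplementation, not the library's exact code; `cls_token_ids` and `sep_token_id` are hypothetical stand-ins for the IDs the tokenizer would look up:

    def build_inputs_sketch(token_ids_0, token_ids_1=None,
                            cls_token_ids=(1, 2), sep_token_id=3):
        # [CLS0][CLS1]... X [SEP] for a single sequence,
        # [CLS0][CLS1]... X [SEP] X [SEP] for a pair.
        out = list(cls_token_ids) + token_ids_0 + [sep_token_id]
        if token_ids_1 is not None:
            out = out + token_ids_1 + [sep_token_id]
        return out

    build_inputs_sketch([7, 8, 9])       # [1, 2, 7, 8, 9, 3]
    build_inputs_sketch([7, 8], [4, 5])  # [1, 2, 7, 8, 3, 4, 5, 3]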
@@ -178,6 +220,7 @@ def create_token_type_ids_from_sequences(self,
                                             token_ids_1=None):
        """
        Creates a token_type mask from the input sequences.
+
        If `token_ids_1` is not `None`, then a sequence pair
        token_type mask has the following format:
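The rest of this docstring is elided in the diff, but the class example above (all-zero `token_type_ids` for a single sentence) suggests the conventional BERT-style mask. A hedged sketch, assuming the first segment (summary tokens, tokens, and the first [SEP]) maps to 0 and a second segment to 1:

    def token_type_ids_sketch(len_0, len_1=None, cls_num=1):
        # cls_num summary tokens + first sequence + [SEP] -> segment 0
        ids = [0] * (cls_num + len_0 + 1)
        if len_1 is not None:
            ids += [1] * (len_1 + 1)  # second sequence + trailing [SEP] -> segment 1
        return ids

    token_type_ids_sketch(3)     # [0, 0, 0, 0, 0]
    token_type_ids_sketch(3, 2)  # [0, 0, 0, 0, 0, 1, 1, 1]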
@@ -260,7 +303,16 @@ def tokenize(self, text, **kwargs):

        Args:
            text (str):
                The text to be tokenized.
+
+        Returns:
+            List[str]: A list of strings representing the converted tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import ErnieCtmTokenizer
+                tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
+                tokens = tokenizer.tokenize('He was a puppeteer')
        """
        return self._tokenize(text, **kwargs)
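Taken together, the documented methods compose into the usual encoding pipeline. A usage sketch built only from calls shown in this diff:

    from paddlenlp.transformers import ErnieCtmTokenizer

    tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm')
    tokens = tokenizer.tokenize('He was a puppeteer')
    ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = tokenizer.build_inputs_with_special_tokens(ids)
    # or simply tokenizer('He was a puppeteer'), as in the class docstring example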
