updated bert from scratch for roBERTa model
jamescalam committed Jun 21, 2021
1 parent 1f79627 commit 1c3f1d1
Showing 4 changed files with 634 additions and 161 deletions.
87 changes: 39 additions & 48 deletions course/bert_from_scratch/00_creating_a_tokenizer.ipynb
@@ -208,17 +208,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "100%|██████████| 28522082/28522082 [33:32<00:00, 14173.48it/s]\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "from tqdm.auto import tqdm\n",
  "\n",
@@ -299,7 +291,7 @@
  "outputs": [],
  "source": [
  "tokenizer.train(files=paths[:5], vocab_size=30_522, min_frequency=2,\n",
- " special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]'])"
+ " special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])"
  ]
  },
  {
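
The hunk above swaps the BERT-style special tokens for RoBERTa's. As a rough sketch of what the full training step looks like after this change (the 'oscar_it' directory name is a placeholder for wherever the plain-text shards were written earlier in the notebook; treat this as illustrative rather than a copy of the notebook cell):

    from pathlib import Path
    from tokenizers import ByteLevelBPETokenizer

    # placeholder path: the notebook builds its own list of .txt shards earlier on
    paths = [str(p) for p in Path('oscar_it').glob('*.txt')]

    # byte-level BPE tokenizer, as used elsewhere in the notebook
    tokenizer = ByteLevelBPETokenizer()

    # same call as in the diff, now registering RoBERTa's special tokens
    # ('<s>', '<pad>', '</s>', '<unk>', '<mask>') instead of BERT's bracketed set
    tokenizer.train(
        files=paths[:5],
        vocab_size=30_522,
        min_frequency=2,
        special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']
    )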
@@ -318,7 +310,7 @@
  "output_type": "execute_result",
  "data": {
  "text/plain": [
- "['./filiberto\\\\filiberto-vocab.json', './filiberto\\\\filiberto-merges.txt']"
+ "['filiberto\\\\vocab.json', 'filiberto\\\\merges.txt']"
  ]
  },
  "metadata": {},
@@ -330,7 +322,7 @@
  "\n",
  "os.mkdir('./filiberto')\n",
  "\n",
- "tokenizer.save_model('./filiberto', 'filiberto')"
+ "tokenizer.save_model('filiberto')"
  ]
  },
  {
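
Dropping the filename prefix matters here: tokenizer.save_model('filiberto') writes plain vocab.json and merges.txt into the directory, which is the layout RobertaTokenizer.from_pretrained('filiberto') looks for further down. A minimal sketch of the save step, assuming `tokenizer` is the trained ByteLevelBPETokenizer (the exist_ok guard is an addition, not part of the notebook):

    import os

    # create the output directory if it is not already there
    os.makedirs('filiberto', exist_ok=True)

    # writes filiberto/vocab.json and filiberto/merges.txt
    tokenizer.save_model('filiberto')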
@@ -351,41 +343,60 @@
  "execution_count": 5,
  "metadata": {},
  "outputs": [],
  "source": [
+ "from transformers import RobertaTokenizer\n",
+ "\n",
+ "tokenizer = RobertaTokenizer.from_pretrained('filiberto', max_len=512)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test our tokenizer on a simple sentence\n",
+ "tokens = tokenizer('ciao, come va?')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
  "from tokenizers.implementations import ByteLevelBPETokenizer\n",
  "from tokenizers.processors import BertProcessing\n",
  "\n",
  "# initialize the tokenizer using the tokenizer we initialized and saved to file\n",
  "tokenizer = ByteLevelBPETokenizer(\n",
- " './filiberto/filiberto-vocab.json',\n",
- " './filiberto/filiberto-merges.txt'\n",
+ " './filiberto/vocab.json',\n",
+ " './filiberto/merges.txt'\n",
  ")\n",
  "\n",
  "# set [CLS] and [SEP] to be added to start-end of sequences\n",
  "tokenizer._tokenizer.post_processor = BertProcessing(\n",
- " ('[SEP]', tokenizer.token_to_id('[SEP]')),\n",
- " ('[CLS]', tokenizer.token_to_id('[CLS]'))\n",
+ " ('</s>', tokenizer.token_to_id('</s>')),\n",
+ " ('<s>', tokenizer.token_to_id('<s>'))\n",
  ")\n",
  "\n",
  "# truncate anything more than 512 characters in length\n",
  "tokenizer.enable_truncation(max_length=512)\n",
- "# and enable padding to 512 too\n",
- "tokenizer.enable_padding(length=512, pad_token='[PAD]')\n",
  "\n",
  "# test our tokenizer on a simple sentence\n",
  "tokens = tokenizer.encode('ciao, come va?')"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
  "metadata": {},
  "outputs": [
  {
  "output_type": "stream",
  "name": "stdout",
  "text": [
- "Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])\n"
+ "{'input_ids': [0, 16834, 16, 488, 611, 35, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}\n"
  ]
  }
  ],
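
The updated cell keeps BertProcessing but feeds it '<s>'/'</s>' so that sequences are wrapped the way RoBERTa expects. The tokenizers library also ships a RobertaProcessing post-processor built for exactly this token layout; the sketch below shows it as a possible alternative, not what the commit uses, assuming the same saved vocab.json/merges.txt:

    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import RobertaProcessing

    tokenizer = ByteLevelBPETokenizer(
        './filiberto/vocab.json',
        './filiberto/merges.txt'
    )

    # RobertaProcessing wraps sequences as <s> ... </s>, the same template the
    # notebook reproduces manually with BertProcessing
    tokenizer._tokenizer.post_processor = RobertaProcessing(
        sep=('</s>', tokenizer.token_to_id('</s>')),
        cls=('<s>', tokenizer.token_to_id('<s>'))
    )

    tokens = tokenizer.encode('ciao, come va?')
    print(tokens.ids)  # expect an ID list starting with 0 (<s>) and ending with 2 (</s>)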
@@ -395,57 +406,37 @@
  },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "['[CLS]', 'ciao', ',', 'Ġcome', 'Ġva', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]']"
- ]
- },
- "metadata": {},
- "execution_count": 7
- }
- ],
- "source": [
- "tokens.tokens[:10]"
- ]
- },
  {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 11,
  "metadata": {},
  "outputs": [
  {
  "output_type": "execute_result",
  "data": {
  "text/plain": [
- "[1, 16834, 16, 488, 611, 35, 2, 0, 0, 0]"
+ "[0, 16834, 16, 488, 611, 35, 2]"
  ]
  },
  "metadata": {},
- "execution_count": 8
+ "execution_count": 11
  }
  ],
  "source": [
- "tokens.ids[:10]"
+ "tokens.input_ids"
  ]
  },
  {
  "source": [
- "We can see here that our **CLS** token is now placed at the beginning of our sequences using token ID *1*. At the end of the sequence we see the **SEP** token represented by *2*. Following this we have our **PAD** tokens which pad each sequence upto a length of *512*."
+ "We can see here that our **<s\\>** token is now placed at the beginning of our sequences using token ID *0*. At the end of the sequence we see the **</s\\>** token represented by *2*."
  ],
  "cell_type": "markdown",
  "metadata": {}
  }
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "ML",
+ "display_name": "ML test",
  "language": "python",
- "name": "ml"
+ "name": "ml_test"
  },
  "language_info": {
  "codemirror_mode": {
