updated bert from scratch for roBERTa model
jamescalam committed Jun 21, 2021
1 parent 1f79627 commit 1c3f1d1
Showing 4 changed files with 634 additions and 161 deletions.
87 changes: 39 additions & 48 deletions course/bert_from_scratch/00_creating_a_tokenizer.ipynb
@@ -208,17 +208,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "100%|██████████| 28522082/28522082 [33:32<00:00, 14173.48it/s]\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "from tqdm.auto import tqdm\n",
  "\n",
@@ -299,7 +291,7 @@
  "outputs": [],
  "source": [
  "tokenizer.train(files=paths[:5], vocab_size=30_522, min_frequency=2,\n",
- " special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]'])"
+ " special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])"
  ]
  },
  {
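
The hunk above swaps the BERT-style special tokens for RoBERTa's. As a rough sketch of what the full training step looks like after this change (the 'oscar_it' directory name is a placeholder for wherever the plain-text shards were written earlier in the notebook; treat this as illustrative rather than a copy of the notebook cell):

    from pathlib import Path
    from tokenizers import ByteLevelBPETokenizer

    # placeholder path: the notebook builds its own list of .txt shards earlier on
    paths = [str(p) for p in Path('oscar_it').glob('*.txt')]

    # byte-level BPE tokenizer, as used elsewhere in the notebook
    tokenizer = ByteLevelBPETokenizer()

    # same call as in the diff, now registering RoBERTa's special tokens
    # ('<s>', '<pad>', '</s>', '<unk>', '<mask>') instead of BERT's bracketed set
    tokenizer.train(
        files=paths[:5],
        vocab_size=30_522,
        min_frequency=2,
        special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']
    )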
@@ -318,7 +310,7 @@
  "output_type": "execute_result",
  "data": {
  "text/plain": [
- "['./filiberto\\\\filiberto-vocab.json', './filiberto\\\\filiberto-merges.txt']"
+ "['filiberto\\\\vocab.json', 'filiberto\\\\merges.txt']"
  ]
  },
  "metadata": {},
@@ -330,7 +322,7 @@
  "\n",
  "os.mkdir('./filiberto')\n",
  "\n",
- "tokenizer.save_model('./filiberto', 'filiberto')"
+ "tokenizer.save_model('filiberto')"
  ]
  },
  {
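
Dropping the filename prefix matters here: tokenizer.save_model('filiberto') writes plain vocab.json and merges.txt into the directory, which is the layout RobertaTokenizer.from_pretrained('filiberto') looks for further down. A minimal sketch of the save step, assuming `tokenizer` is the trained ByteLevelBPETokenizer (the exist_ok guard is an addition, not part of the notebook):

    import os

    # create the output directory if it is not already there
    os.makedirs('filiberto', exist_ok=True)

    # writes filiberto/vocab.json and filiberto/merges.txt
    tokenizer.save_model('filiberto')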
@@ -351,41 +343,60 @@
  "execution_count": 5,
  "metadata": {},
  "outputs": [],
  "source": [
+ "from transformers import RobertaTokenizer\n",
+ "\n",
+ "tokenizer = RobertaTokenizer.from_pretrained('filiberto', max_len=512)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test our tokenizer on a simple sentence\n",
+ "tokens = tokenizer('ciao, come va?')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
  "from tokenizers.implementations import ByteLevelBPETokenizer\n",
  "from tokenizers.processors import BertProcessing\n",
  "\n",
  "# initialize the tokenizer using the tokenizer we initialized and saved to file\n",
  "tokenizer = ByteLevelBPETokenizer(\n",
- " './filiberto/filiberto-vocab.json',\n",
- " './filiberto/filiberto-merges.txt'\n",
+ " './filiberto/vocab.json',\n",
+ " './filiberto/merges.txt'\n",
  ")\n",
  "\n",
  "# set [CLS] and [SEP] to be added to start-end of sequences\n",
  "tokenizer._tokenizer.post_processor = BertProcessing(\n",
- " ('[SEP]', tokenizer.token_to_id('[SEP]')),\n",
- " ('[CLS]', tokenizer.token_to_id('[CLS]'))\n",
+ " ('</s>', tokenizer.token_to_id('</s>')),\n",
+ " ('<s>', tokenizer.token_to_id('<s>'))\n",
  ")\n",
  "\n",
  "# truncate anything more than 512 characters in length\n",
  "tokenizer.enable_truncation(max_length=512)\n",
- "# and enable padding to 512 too\n",
- "tokenizer.enable_padding(length=512, pad_token='[PAD]')\n",
  "\n",
  "# test our tokenizer on a simple sentence\n",
  "tokens = tokenizer.encode('ciao, come va?')"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
  "metadata": {},
  "outputs": [
  {
  "output_type": "stream",
  "name": "stdout",
  "text": [
- "Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])\n"
+ "{'input_ids': [0, 16834, 16, 488, 611, 35, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}\n"
  ]
  }
  ],
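
The updated cell keeps BertProcessing but feeds it '<s>'/'</s>' so that sequences are wrapped the way RoBERTa expects. The tokenizers library also ships a RobertaProcessing post-processor built for exactly this token layout; the sketch below shows it as a possible alternative, not what the commit uses, assuming the same saved vocab.json/merges.txt:

    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import RobertaProcessing

    tokenizer = ByteLevelBPETokenizer(
        './filiberto/vocab.json',
        './filiberto/merges.txt'
    )

    # RobertaProcessing wraps sequences as <s> ... </s>, the same template the
    # notebook reproduces manually with BertProcessing
    tokenizer._tokenizer.post_processor = RobertaProcessing(
        sep=('</s>', tokenizer.token_to_id('</s>')),
        cls=('<s>', tokenizer.token_to_id('<s>'))
    )

    tokens = tokenizer.encode('ciao, come va?')
    print(tokens.ids)  # expect an ID list starting with 0 (<s>) and ending with 2 (</s>)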
@@ -395,57 +406,37 @@
  },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "['[CLS]', 'ciao', ',', 'Ġcome', 'Ġva', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]']"
- ]
- },
- "metadata": {},
- "execution_count": 7
- }
- ],
- "source": [
- "tokens.tokens[:10]"
- ]
- },
  {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 11,
  "metadata": {},
  "outputs": [
  {
  "output_type": "execute_result",
  "data": {
  "text/plain": [
- "[1, 16834, 16, 488, 611, 35, 2, 0, 0, 0]"
+ "[0, 16834, 16, 488, 611, 35, 2]"
  ]
  },
  "metadata": {},
- "execution_count": 8
+ "execution_count": 11
  }
  ],
  "source": [
- "tokens.ids[:10]"
+ "tokens.input_ids"
  ]
  },
  {
  "source": [
- "We can see here that our **CLS** token is now placed at the beginning of our sequences using token ID *1*. At the end of the sequence we see the **SEP** token represented by *2*. Following this we have our **PAD** tokens which pad each sequence upto a length of *512*."
+ "We can see here that our **<s\\>** token is now placed at the beginning of our sequences using token ID *0*. At the end of the sequence we see the **</s\\>** token represented by *2*."
  ],
  "cell_type": "markdown",
  "metadata": {}
  }
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "ML",
+ "display_name": "ML test",
  "language": "python",
- "name": "ml"
+ "name": "ml_test"
  },
  "language_info": {
  "codemirror_mode": {
