Updated

suresh-venkate · Apr 13, 2024 · 3d02f75 · 3d02f75
1 parent adff25b
commit 3d02f75
Showing 1 changed file with 40 additions and 13 deletions.
diff --git a/Courses/HF_Transformers_Course/Sec_3P1_HF_Datasets_PyTorch.ipynb b/Courses/HF_Transformers_Course/Sec_3P1_HF_Datasets_PyTorch.ipynb
@@ -364,7 +364,6 @@
       "execution_count": 8,
       "metadata": {
         "id": "EEJE5rGQ53wI",
-        "outputId": "042df931-edde-4660-dd3c-3a6ae57a2ea0",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 145,
@@ -414,7 +413,8 @@
             "54cf89b66cf740b298d2b3cbac881fd6",
             "702487ff9bd244a580b34c7c1c788247"
           ]
-        }
+        },
+        "outputId": "042df931-edde-4660-dd3c-3a6ae57a2ea0"
       },
       "outputs": [
         {
@@ -520,37 +520,64 @@
       ]
     },
     {
-      "cell_type": "code",
-      "source": [],
+      "cell_type": "markdown",
+      "source": [
+        "The 'token_type_ids' in the output above indicates which part of the 'input_ids' belongs to the first sentence and which part belongs to the second sentence."
+      ],
       "metadata": {
-        "id": "tVjZfe5Q-0nA"
-      },
-      "execution_count": null,
-      "outputs": []
+        "id": "XoSEQKXlAGk6"
+      }
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 10,
       "metadata": {
         "id": "n5ER5w2v53wK",
-        "outputId": "17a101f8-9066-4fea-f055-9261e22c0144"
+        "outputId": "3664b031-b789-490c-d29a-7da81a66a574",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
       },
       "outputs": [
         {
+          "output_type": "execute_result",
           "data": {
             "text/plain": [
-              "['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']"
+              "['[CLS]',\n",
+              " 'this',\n",
+              " 'is',\n",
+              " 'the',\n",
+              " 'first',\n",
+              " 'sentence',\n",
+              " '.',\n",
+              " '[SEP]',\n",
+              " 'this',\n",
+              " 'is',\n",
+              " 'the',\n",
+              " 'second',\n",
+              " 'one',\n",
+              " '.',\n",
+              " '[SEP]']"
             ]
           },
-          "execution_count": null,
           "metadata": {},
-          "output_type": "execute_result"
+          "execution_count": 10
         }
       ],
       "source": [
+        "# decode the IDs inside input_ids back to tokens:\n",
         "tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"])"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "So we see the model expects the inputs to be of the form [CLS] sentence1 [SEP] sentence2 [SEP] when there are two sentences."
+      ],
+      "metadata": {
+        "id": "71i5cCGKAZvq"
+      }
+    },
     {
       "cell_type": "code",
       "execution_count": null,