xhr_update

kioco · Dec 11, 2023 · 4856afb · 4856afb
1 parent 02ef4d0
commit 4856afb
Show file tree

Hide file tree

Showing 5 changed files with 3,092 additions and 30 deletions.
diff --git a/ChatGLM/06-ChatGLM3-6B-Lora微调.ipynb b/ChatGLM/06-ChatGLM3-6B-Lora微调.ipynb
@@ -175,6 +175,47 @@
     "tokenizer.get_command(\"[gMASK]\"), tokenizer._convert_id_to_token(+tokenizer.eos_token_id)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 调用 API 处理数据"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "instruction = (ds[0][\"instruction\"] + \"\\n\" + ds[0][\"input\"]).strip()  # user query: instruction and input separated by a newline\n",
+    "instruction = tokenizer.build_chat_input(instruction, history=[], role=\"user\")  # hand the query to the tokenizer's own chat helper (empty history, user role)\n",
+    "response = tokenizer(\"\\n\" + ds[0][\"output\"], add_special_tokens=False)  # target text, encoded without special tokens\n",
+    "input_ids = [*instruction[\"input_ids\"][0].numpy().tolist(), *response[\"input_ids\"], tokenizer.eos_token_id]  # first row of prompt ids, then response ids, then EOS\n",
+    "tokenizer.decode(input_ids)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 手动拆解数据处理"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = [tokenizer.get_command(\"<|system|>\")] + tokenizer.encode(\"现在你要扮演皇帝身边的女人--甄嬛\\n \", add_special_tokens=False)  # system role id followed by the system prompt ids\n",
+    "instruction_ = [tokenizer.get_command(\"<|user|>\")] + tokenizer.encode(\"\\n \" + \"\\n\".join([ds[0][\"instruction\"], ds[0][\"input\"]]).strip(), add_special_tokens=False, truncation=True, max_length=512) + [tokenizer.get_command(\"<|assistant|>\")]  # truncation=True makes the max_length cap explicit instead of relying on the implicit fallback\n",
+    "instruction = tokenizer.encode(prompt + instruction_)  # NOTE(review): encode is given token ids, not text; presumably used to prepend the tokenizer's special prefix tokens - confirm\n",
+    "response = tokenizer.encode(\"\\n\" + ds[0][\"output\"], add_special_tokens=False)  # target text, encoded without special tokens\n",
+    "input_ids = instruction + response + [tokenizer.eos_token_id]  # prompt ids, then response ids, then EOS\n",
+    "tokenizer.decode(input_ids)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 8,
@@ -184,10 +225,10 @@
     "def process_func(example):\n",
     "    MAX_LENGTH = 512\n",
     "    input_ids, labels = [], []\n",
-    "    instruction = tokenizer.encode(text=\"\\n\".join([\"<|system|>\", \"现在你要扮演皇帝身边的女人--甄嬛\", \"<|user|>\", \n",
-    "                                    example[\"instruction\"] + example[\"input\"] + \"<|assistant|>\"]).strip() + \"\\n\",\n",
-    "                                    add_special_tokens=True, truncation=True, max_length=MAX_LENGTH)\n",
-    "    response = tokenizer.encode(text=example[\"output\"], add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)\n",
+    "    prompt = [tokenizer.get_command(\"<|system|>\")] + tokenizer.encode(\"现在你要扮演皇帝身边的女人--甄嬛\\n \", add_special_tokens=False)\n",
+    "    instruction_ = [tokenizer.get_command(\"<|user|>\")] + tokenizer.encode(\"\\n \" + \"\\n\".join([example[\"instruction\"], example[\"input\"]]).strip(), add_special_tokens=False,max_length=512) + [tokenizer.get_command(\"<|assistant|>\")]\n",
+    "    instruction = tokenizer.encode(prompt + instruction_)\n",
+    "    response = tokenizer.encode(\"\\n\" + example[\"output\"], add_special_tokens=False)\n",
     "    input_ids = instruction + response + [tokenizer.eos_token_id]\n",
     "    labels = [tokenizer.pad_token_id] * len(instruction) + response + [tokenizer.eos_token_id]\n",
     "    pad_len = MAX_LENGTH - len(input_ids)\n",
@@ -235,40 +276,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'[gMASK]sop <|system|>\\n现在你要扮演皇帝身边的女人--甄嬛\\n<|user|>\\n这个温太医啊，也是古怪，谁不知太医不得皇命不能为皇族以外的人请脉诊病，他倒好，十天半月便往咱们府里跑。<|assistant|>\\n 你们俩话太多了，我该和温太医要一剂药，好好治治你们。'"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "tokenizer.decode(tokenized_ds[1][\"input_ids\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'你们俩话太多了，我该和温太医要一剂药，好好治治你们。'"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "tokenizer.decode(list(filter(lambda x: x != -100, tokenized_ds[1][\"labels\"])))"
    ]