Skip to content

Commit

Permalink
xhr_update
Browse files (browse the repository at this point in the history)
  • Loading branch information
Hongru0306 committed Dec 11, 2023
1 parent 02ef4d0 commit 4856afb
Show file tree
Hide file tree
Showing 5 changed files with 3,092 additions and 30 deletions.
79 changes: 49 additions & 30 deletions ChatGLM/06-ChatGLM3-6B-Lora微调.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,47 @@
"tokenizer.get_command(\"[gMASK]\"), tokenizer._convert_id_to_token(+tokenizer.eos_token_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 调用 tokenizer API(build_chat_input)处理数据"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"instruction = \"\\n\".join([ds[0][\"instruction\"], ds[0][\"input\"]]).strip() # query\n",
"instruction = tokenizer.build_chat_input(instruction, history=[], role=\"user\")\n",
"response = tokenizer(\"\\n\" + ds[0][\"output\"], add_special_tokens=False)\n",
"input_ids = instruction[\"input_ids\"][0].numpy().tolist() + response[\"input_ids\"] + [tokenizer.eos_token_id]\n",
"tokenizer.decode(input_ids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 手动拆解数据处理流程(逐段拼接角色 token 与文本编码)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompt = [tokenizer.get_command(\"<|system|>\")] + tokenizer.encode(\"现在你要扮演皇帝身边的女人--甄嬛\\n \", add_special_tokens=False)\n",
"instruction_ = [tokenizer.get_command(\"<|user|>\")] + tokenizer.encode(\"\\n \" + \"\\n\".join([ds[0][\"instruction\"], ds[0][\"input\"]]).strip(), add_special_tokens=False,max_length=512) + [tokenizer.get_command(\"<|assistant|>\")]\n",
"instruction = tokenizer.encode(prompt + instruction_)\n",
"response = tokenizer.encode(\"\\n\" + ds[0][\"output\"], add_special_tokens=False)\n",
"input_ids = instruction + response + [tokenizer.eos_token_id]\n",
"tokenizer.decode(input_ids)"
]
},
{
"cell_type": "code",
"execution_count": 8,
Expand All @@ -184,10 +225,10 @@
"def process_func(example):\n",
" MAX_LENGTH = 512\n",
" input_ids, labels = [], []\n",
" instruction = tokenizer.encode(text=\"\\n\".join([\"<|system|>\", \"现在你要扮演皇帝身边的女人--甄嬛\", \"<|user|>\", \n",
" example[\"instruction\"] + example[\"input\"] + \"<|assistant|>\"]).strip() + \"\\n\",\n",
" add_special_tokens=True, truncation=True, max_length=MAX_LENGTH)\n",
" response = tokenizer.encode(text=example[\"output\"], add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)\n",
" prompt = [tokenizer.get_command(\"<|system|>\")] + tokenizer.encode(\"现在你要扮演皇帝身边的女人--甄嬛\\n \", add_special_tokens=False)\n",
" instruction_ = [tokenizer.get_command(\"<|user|>\")] + tokenizer.encode(\"\\n \" + \"\\n\".join([example[\"instruction\"], example[\"input\"]]).strip(), add_special_tokens=False,max_length=512) + [tokenizer.get_command(\"<|assistant|>\")]\n",
" instruction = tokenizer.encode(prompt + instruction_)\n",
" response = tokenizer.encode(\"\\n\" + example[\"output\"], add_special_tokens=False)\n",
" input_ids = instruction + response + [tokenizer.eos_token_id]\n",
" labels = [tokenizer.pad_token_id] * len(instruction) + response + [tokenizer.eos_token_id]\n",
" pad_len = MAX_LENGTH - len(input_ids)\n",
Expand Down Expand Up @@ -235,40 +276,18 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[gMASK]sop <|system|>\\n现在你要扮演皇帝身边的女人--甄嬛\\n<|user|>\\n这个温太医啊,也是古怪,谁不知太医不得皇命不能为皇族以外的人请脉诊病,他倒好,十天半月便往咱们府里跑。<|assistant|>\\n 你们俩话太多了,我该和温太医要一剂药,好好治治你们。'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"tokenizer.decode(tokenized_ds[1][\"input_ids\"])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'你们俩话太多了,我该和温太医要一剂药,好好治治你们。'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"tokenizer.decode(list(filter(lambda x: x != -100, tokenized_ds[1][\"labels\"])))"
]
Expand Down
Loading

0 comments on commit 4856afb

Please sign in to comment.