Skip to content

Commit

Permalink
Merge pull request datawhalechina#297 from Zyvpeng/master
Browse files Browse the repository at this point in the history
修改之前qwen2-vl微调并没有真正输入图片的bug
  • Loading branch information
KMnO4-zx authored Nov 27, 2024
2 parents 342c2c3 + c060544 commit 1390d89
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 24 deletions.
56 changes: 44 additions & 12 deletions models/Qwen2-VL/04-Qwen2-VL-2B Lora 微调.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -244,28 +244,60 @@
" conversation = example[\"conversations\"]\n",
" input_content = conversation[0][\"value\"]\n",
" output_content = conversation[1][\"value\"]\n",
" \n",
" instruction = tokenizer(\n",
" f\"<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n{input_content}<|im_end|>\\n<|im_start|>assistant\\n\",\n",
" add_special_tokens=False,\n",
" file_path = input_content.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0] # 获取图像路径\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": f\"{file_path}\",\n",
" \"resized_height\": 280,\n",
" \"resized_width\": 280,\n",
" },\n",
" {\"type\": \"text\", \"text\": \"COCO Yes:\"},\n",
" ],\n",
" }\n",
" ]\n",
" text = processor.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" ) # 获取文本\n",
"    image_inputs, video_inputs = process_vision_info(messages)  # 获取图像数据(预处理过)\n",
" inputs = processor(\n",
" text=[text],\n",
" images=image_inputs,\n",
" videos=video_inputs,\n",
" padding=True,\n",
" return_tensors=\"pt\",\n",
" )\n",
" inputs = {key: value.tolist() for key, value in inputs.items()} #tensor -> list,为了方便拼接\n",
" instruction = inputs\n",
"\n",
" response = tokenizer(f\"{output_content}\", add_special_tokens=False)\n",
"\n",
"\n",
" input_ids = (\n",
" instruction[\"input_ids\"] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n",
" instruction[\"input_ids\"][0] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n",
" )\n",
" attention_mask = instruction[\"attention_mask\"] + response[\"attention_mask\"] + [1]\n",
"\n",
" attention_mask = instruction[\"attention_mask\"][0] + response[\"attention_mask\"] + [1]\n",
" labels = (\n",
" [-100] * len(instruction[\"input_ids\"])\n",
" + response[\"input_ids\"]\n",
" + [tokenizer.pad_token_id]\n",
" [-100] * len(instruction[\"input_ids\"][0])\n",
" + response[\"input_ids\"]\n",
" + [tokenizer.pad_token_id]\n",
" )\n",
" \n",
" if len(input_ids) > MAX_LENGTH: # 做一个截断\n",
" input_ids = input_ids[:MAX_LENGTH]\n",
" attention_mask = attention_mask[:MAX_LENGTH]\n",
" labels = labels[:MAX_LENGTH]\n",
" \n",
" return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels}\n",
"\n",
" input_ids = torch.tensor(input_ids)\n",
" attention_mask = torch.tensor(attention_mask)\n",
" labels = torch.tensor(labels)\n",
" inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])\n",
" inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0) #由(1,h,w)变换为(h,w)\n",
" return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels,\n",
" \"pixel_values\": inputs['pixel_values'], \"image_grid_thw\": inputs['image_grid_thw']}\n",
"\n",
"def predict(messages, model):\n",
" # 准备推理\n",
Expand Down
56 changes: 44 additions & 12 deletions models/Qwen2-VL/04-Qwen2-VL-2B Lora 微调.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,28 +193,60 @@ def process_func(example):
conversation = example["conversations"]
input_content = conversation[0]["value"]
output_content = conversation[1]["value"]

instruction = tokenizer(
f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_content}<|im_end|>\n<|im_start|>assistant\n",
add_special_tokens=False,
file_path = input_content.split("<|vision_start|>")[1].split("<|vision_end|>")[0] # 获取图像路径
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": f"{file_path}",
"resized_height": 280,
"resized_width": 280,
},
{"type": "text", "text": "COCO Yes:"},
],
}
]
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
) # 获取文本
    image_inputs, video_inputs = process_vision_info(messages)  # 获取图像数据(预处理过)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = {key: value.tolist() for key, value in inputs.items()} #tensor -> list,为了方便拼接
instruction = inputs

response = tokenizer(f"{output_content}", add_special_tokens=False)


input_ids = (
instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
instruction["input_ids"][0] + response["input_ids"] + [tokenizer.pad_token_id]
)
attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]

attention_mask = instruction["attention_mask"][0] + response["attention_mask"] + [1]
labels = (
[-100] * len(instruction["input_ids"])
+ response["input_ids"]
+ [tokenizer.pad_token_id]
[-100] * len(instruction["input_ids"][0])
+ response["input_ids"]
+ [tokenizer.pad_token_id]
)

if len(input_ids) > MAX_LENGTH: # 做一个截断
input_ids = input_ids[:MAX_LENGTH]
attention_mask = attention_mask[:MAX_LENGTH]
labels = labels[:MAX_LENGTH]

return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_mask)
labels = torch.tensor(labels)
inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])
inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0) #由(1,h,w)变换为(h,w)
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels,
"pixel_values": inputs['pixel_values'], "image_grid_thw": inputs['image_grid_thw']}

def predict(messages, model):
# 准备推理
Expand Down

0 comments on commit 1390d89

Please sign in to comment.