{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "1251ab7b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "2025-06-25 19:34:00,882 - modelscope - INFO - PyTorch version 2.6.0 Found.\n", "2025-06-25 19:34:00,884 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer\n", "2025-06-25 19:34:04,723 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 843fe399ac811db92736a820f07f468e and a total number of 964 components indexed\n" ] } ], "source": [ "import torch\n", "from datasets import Dataset\n", "from modelscope import snapshot_download, AutoTokenizer\n", "from swanlab.integration.transformers import SwanLabCallback\n", "from qwen_vl_utils import process_vision_info\n", "from peft import LoraConfig, TaskType, get_peft_model, PeftModel\n", "from transformers import (\n", " TrainingArguments,\n", " Trainer,\n", " DataCollatorForSeq2Seq,\n", " Qwen2VLForConditionalGeneration,\n", " AutoProcessor,\n", ")\n", "import swanlab\n", "import json" ] }, { "cell_type": "markdown", "id": "394d869a", "metadata": {}, "source": [ "# 1-加载模型" ] }, { "cell_type": "code", "execution_count": 2, "id": "d6fe9c76", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", "Loading checkpoint shards: 100%|██████████| 2/2 [01:34<00:00, 47.11s/it]\n" ] } ], "source": [ "model_dir = \"/root/PMN_WS/qwen-test/model/Qwen/Qwen2-VL-2B-Instruct\"\n", "# 使用Transformers加载模型权重\n", "tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)\n", "processor = AutoProcessor.from_pretrained(model_dir)\n", "\n", "model = Qwen2VLForConditionalGeneration.from_pretrained(\n", " model_dir, torch_dtype=torch.bfloat16, \n", " device_map=\"auto\",trust_remote_code=True\n", ")\n", "model.enable_input_require_grads() # 开启梯度检查点时,要执行该方法" ] }, { "cell_type": "markdown", "id": "43ddce3e", "metadata": {}, "source": [ "测试模型能否正常使用" ] }, { "cell_type": "code", "execution_count": null, "id": "842158a4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "text:<|im_start|>system\n", "You are a helpful assistant.<|im_end|>\n", "<|im_start|>user\n", "<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n", "<|im_start|>assistant\n", "\n", "image_inputs:[]\n", "video_inputs:None\n", "[\"The image depicts a serene beach scene with a woman and a dog. The woman is sitting on the sand, wearing a plaid shirt and black pants, and appears to be smiling. She is holding the dog's paw in a high-five gesture. The dog, which is a large breed, is sitting on the sand with its front paws raised, possibly in response to the woman's gesture. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting it might be either sunrise or sunset. 
The overall atmosphere is peaceful and joyful.\"]\n" ] } ], "source": [ "messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image\",\n", " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\",\n", " },\n", " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", " ],\n", "\n", " }\n", "]\n", "# Preparation for inference\n", "text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", ")\n", "print(f'text:{text}')\n", "image_inputs, video_inputs = process_vision_info(messages)\n", "print(f'image_inputs:{image_inputs}')\n", "print(f'video_inputs:{video_inputs}')\n", "inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", ")\n", "inputs = inputs.to(\"cuda\")\n", "\n", "# Inference: Generation of the output\n", "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")\n", "print(output_text)" ] }, { "cell_type": "markdown", "id": "cdd20a96", "metadata": {}, "source": [ "# 2-数据集预处理:\n", "2-1 拆分成训练集和测试集,保存为data_vl_train.json和data_vl_test.json" ] }, { "cell_type": "code", "execution_count": 96, "id": "e4c5f3b9", "metadata": {}, "outputs": [], "source": [ "train_json_path = \"/root/PMN_WS/qwen-test/data_vl.json\"\n", "with open(train_json_path, 'r') as f:\n", " data = json.load(f)\n", " train_data = data[:-4]\n", " test_data = data[-4:]\n", "\n", "with open(\"data_vl_train.json\", \"w\") as f:\n", " json.dump(train_data, f, indent=4, ensure_ascii=False)\n", "\n", "with open(\"data_vl_test.json\", \"w\") as f:\n", " json.dump(test_data, f, indent=4, ensure_ascii=False)" ] }, { "cell_type": "markdown", "id": "9ab267e0", "metadata": {}, "source": [ "2-2 处理数据集:读取json文件,将input和label转换成模型训练需要的形式" ] }, { "cell_type": "code", "execution_count": 97, "id": "0035b338", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Generating train split: 96 examples [00:00, 258.76 examples/s]\n", "Map: 2%|▏ | 2/96 [00:00<00:07, 12.52 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 6%|▋ | 6/96 [00:00<00:06, 13.59 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 8%|▊ | 8/96 [00:00<00:06, 13.89 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 12%|█▎ | 12/96 [00:00<00:06, 13.60 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 17%|█▋ | 16/96 [00:01<00:05, 13.91 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 19%|█▉ | 18/96 [00:01<00:05, 14.24 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 23%|██▎ | 22/96 [00:01<00:05, 13.85 examples/s]" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 25%|██▌ | 24/96 [00:01<00:05, 13.87 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 29%|██▉ | 28/96 [00:02<00:04, 13.96 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 31%|███▏ | 30/96 [00:02<00:04, 13.89 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 35%|███▌ | 34/96 [00:02<00:04, 13.97 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 38%|███▊ | 36/96 [00:02<00:04, 13.85 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 42%|████▏ | 40/96 [00:02<00:03, 14.01 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 46%|████▌ | 44/96 [00:03<00:03, 14.53 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 48%|████▊ | 46/96 [00:03<00:03, 14.57 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 52%|█████▏ | 50/96 [00:03<00:03, 13.71 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 54%|█████▍ | 52/96 [00:03<00:03, 14.10 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 58%|█████▊ | 56/96 [00:04<00:02, 14.38 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 60%|██████ | 58/96 [00:04<00:02, 13.22 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 62%|██████▎ | 60/96 [00:04<00:05, 6.39 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 65%|██████▍ | 62/96 [00:05<00:07, 4.68 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 67%|██████▋ | 64/96 [00:06<00:07, 4.49 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 69%|██████▉ | 66/96 [00:06<00:05, 5.02 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 71%|███████ | 68/96 [00:06<00:04, 6.46 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 74%|███████▍ | 71/96 [00:06<00:02, 9.21 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "Map: 76%|███████▌ | 73/96 [00:07<00:02, 10.53 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 80%|████████ | 77/96 [00:07<00:01, 12.73 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 84%|████████▍ | 81/96 [00:07<00:01, 13.80 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 86%|████████▋ | 83/96 [00:07<00:00, 14.19 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 91%|█████████ | 87/96 [00:07<00:00, 14.55 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 93%|█████████▎| 89/96 [00:08<00:00, 14.44 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 97%|█████████▋| 93/96 [00:08<00:00, 13.17 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 99%|█████████▉| 95/96 [00:08<00:00, 12.76 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 96/96 [00:09<00:00, 9.66 examples/s]\n", "Saving the dataset (1/1 shards): 100%|██████████| 96/96 [00:03<00:00, 31.86 examples/s]\n" ] } ], "source": [ "def process_func(example):\n", " \"\"\"\n", " 将数据集进行预处理\n", " \"\"\"\n", " MAX_LENGTH = 8192\n", " input_ids, attention_mask, labels = [], [], []\n", " conversation = example[\"conversations\"]\n", " input_content = conversation[0][\"value\"]\n", " output_content = conversation[1][\"value\"]\n", " file_path = input_content.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0] # 获取图像路径\n", " messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image\",\n", " \"image\": f\"{file_path}\",\n", " \"resized_height\": 280,\n", " \"resized_width\": 280,\n", " },\n", " {\n", " \n", " \"type\": \"text\", \n", " \"text\": \"COCO Yes:\"\n", " },\n", " ],\n", " }\n", " ]\n", "\n", " # 获取文本 \n", " text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", " ) \n", "\n", " image_inputs, video_inputs = process_vision_info(messages) # 获取数据数据(预处理过)\n", " inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", " )\n", " inputs = {key: value.tolist() for key, value in inputs.items()} #tensor -> list,为了方便拼接\n", " instruction = inputs \n", " # for key, value in inputs.items():\n", " # print(key,'\\n')\n", " # \"\"\"\n", " # inputs:dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])\n", " # \"\"\"\n", " \n", "\n", " response = tokenizer(f\"{output_content}\", add_special_tokens=False)\n", " # for key, value in response.items():\n", " # print(key,value)\n", " # \"\"\"\n", " # input_ids [32, 10729, 702, 6481, 22360, 12632, 323, 25904, 13]\n", " # attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1]\n", " # \"\"\"\n", " \n", " # 输入text+回答text\n", " input_ids = 
(\n", " instruction[\"input_ids\"][0] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n", " )\n", " # 哪些位置是有效内容(1有效,0无效)\n", " attention_mask = instruction[\"attention_mask\"][0] + response[\"attention_mask\"] + [1]\n", " # 模型不学习部分,用-100标记\n", " labels = (\n", " [-100] * len(instruction[\"input_ids\"][0])\n", " + response[\"input_ids\"]\n", " + [tokenizer.pad_token_id]\n", " )\n", "\n", " if len(input_ids) > MAX_LENGTH: # 做一个截断\n", " input_ids = input_ids[:MAX_LENGTH]\n", " attention_mask = attention_mask[:MAX_LENGTH]\n", " labels = labels[:MAX_LENGTH]\n", "\n", " input_ids = torch.tensor(input_ids)\n", " attention_mask = torch.tensor(attention_mask)\n", " labels = torch.tensor(labels)\n", "\n", "\n", "\n", " inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])\n", " inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0) #由(1,h,w)变换为(h,w)\n", " print(type(input_ids))\n", "\n", " return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels,\n", " \"pixel_values\": inputs['pixel_values'], \"image_grid_thw\": inputs['image_grid_thw']}\n", "\n", "train_ds = Dataset.from_json(\"data_vl_train.json\")\n", "train_dataset = train_ds.map(process_func)\n", "# 保存为本地磁盘目录,默认保存为list\n", "train_dataset.save_to_disk(\"./processed_train_dataset\")" ] }, { "cell_type": "markdown", "id": "74a4927d", "metadata": {}, "source": [ "2-3 读取数据是否保存成功" ] }, { "cell_type": "code", "execution_count": 98, "id": "6205a265", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset size: 96\n", "✅ Forward 成功!\n", "Loss: tensor(2.3434, device='cuda:0')\n" ] } ], "source": [ "from datasets import load_from_disk\n", "\n", "# 加载保存的数据集\n", "loaded_dataset = load_from_disk(\"./processed_train_dataset\")\n", "\n", "# 查看数据集大小\n", "print(f\"Dataset size: {len(loaded_dataset)}\")\n", "\n", "# 查看第一条样本内容(字段和类型)\n", "sample = loaded_dataset[0]\n", "# 需要转换的字段\n", "required_keys = [\"input_ids\", \"attention_mask\", \"labels\", \"pixel_values\", \"image_grid_thw\"]\n", "batch = {}\n", "\n", "for key in required_keys:\n", " value = sample[key]\n", "\n", " # 转成 Tensor(避免直接转 dict)\n", " value = torch.tensor(value)\n", "\n", " # 添加 batch 维度\n", " if value.dim() == 1 or (key == \"image_grid_thw\" and value.dim() == 2):\n", " value = value.unsqueeze(0)\n", "\n", " # 放到模型所在设备\n", " batch[key] = value.to(model.device)\n", "\n", "\n", "\n", "# 验证模型能否 forward\n", "model.eval()\n", "with torch.no_grad():\n", " outputs = model(**batch)\n", "\n", "# 打印结果\n", "print(\"✅ Forward 成功!\")\n", "print(\"Loss:\", outputs.loss if hasattr(outputs, \"loss\") else None)\n" ] }, { "cell_type": "markdown", "id": "efbb5025", "metadata": {}, "source": [ "# 训练配置" ] }, { "cell_type": "markdown", "id": "428dc34d", "metadata": {}, "source": [ "查看设备配置信息" ] }, { "cell_type": "code", "execution_count": 84, "id": "e0085d74", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🧠 CPU 信息:\n", "- 处理器: x86_64\n", "- 核心数: 64 物理核心 / 128 逻辑核心\n", "\n", "🧮 内存信息:\n", "- 总内存: 2163.58 GB\n", "- 已使用: 173.8 GB\n", "- 可用: 1962.2 GB\n", "\n", "🎮 GPU 信息:\n", "- GPU 0: NVIDIA A100-SXM4-80GB\n", " - 显存总量: 25.76 GB\n", " - 当前占用: 4.69 GB\n", " - 保留显存: 5.2 GB\n" ] } ], "source": [ "import torch\n", "import psutil\n", "import platform\n", "import os\n", "\n", "# CPU 信息\n", "print(\"🧠 CPU 信息:\")\n", "print(f\"- 处理器: {platform.processor()}\")\n", "print(f\"- 核心数: {psutil.cpu_count(logical=False)} 物理核心 / {psutil.cpu_count(logical=True)} 逻辑核心\")\n", "\n", "# 内存信息\n", 
"mem = psutil.virtual_memory()\n", "print(\"\\n🧮 内存信息:\")\n", "print(f\"- 总内存: {round(mem.total / 1e9, 2)} GB\")\n", "print(f\"- 已使用: {round(mem.used / 1e9, 2)} GB\")\n", "print(f\"- 可用: {round(mem.available / 1e9, 2)} GB\")\n", "\n", "# GPU 信息(需安装 NVIDIA 驱动)\n", "if torch.cuda.is_available():\n", " print(\"\\n🎮 GPU 信息:\")\n", " for i in range(torch.cuda.device_count()):\n", " print(f\"- GPU {i}: {torch.cuda.get_device_name(i)}\")\n", " print(f\" - 显存总量: {round(torch.cuda.get_device_properties(i).total_memory / 1e9, 2)} GB\")\n", " print(f\" - 当前占用: {round(torch.cuda.memory_allocated(i) / 1e9, 2)} GB\")\n", " print(f\" - 保留显存: {round(torch.cuda.memory_reserved(i) / 1e9, 2)} GB\")\n", "else:\n", " print(\"\\n🚫 当前没有可用的 CUDA GPU\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "5a5841bb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/peft/mapping_func.py:73: UserWarning: You are trying to modify a model with PEFT for a second time. If you want to reload the model with a different config, make sure to call `.unload()` before.\n", " warnings.warn(\n" ] } ], "source": [ "# 配置LoRA\n", "config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", " inference_mode=False, # 训练模式\n", " r=16, # 越大表达能力越强,参数越多\n", " lora_alpha=32, # = r 或 2r,缩放因子,调节学习率与初始化范围\n", " lora_dropout=0.05, # Dropout 比例\n", " bias=\"none\",\n", ")\n", "\n", "# 获取LoRA模型\n", "peft_model = get_peft_model(model, config)" ] }, { "cell_type": "code", "execution_count": 6, "id": "e3824dd6", "metadata": {}, "outputs": [], "source": [ "# 配置训练参数\n", "args = TrainingArguments(\n", " output_dir=\"./output/qwen2-vl-lora\",\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=8,\n", " logging_steps=10,\n", " logging_first_step=True,\n", " save_safetensors=False, # 禁用 safetensors\n", " num_train_epochs=2,\n", " save_steps=100,\n", " bf16=True, \n", " learning_rate=1e-4,\n", " save_on_each_node=True,\n", " gradient_checkpointing=True,\n", " remove_unused_columns=False, # 必须关\n", " dataloader_num_workers=2, # 防止爆内存\n", " report_to=\"none\",\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "id": "5692090c", "metadata": {}, "outputs": [], "source": [ "from datasets import load_from_disk\n", "# 设置SwanLab回调\n", "train_dataset = load_from_disk(\"./processed_train_dataset\")\n", "\n", "swanlab_callback = SwanLabCallback(\n", " project=\"Qwen2-VL-finetune\",\n", " experiment_name=\"qwen2-vl-coco2014\",\n", " config={\n", " \"model\": \"https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct\",\n", " \"dataset\": \"https://modelscope.cn/datasets/modelscope/coco_2014_caption/quickstart\",\n", " \"github\": \"https://github.com/datawhalechina/self-llm\",\n", " \"prompt\": \"COCO Yes: \",\n", " \"train_data_number\": len(train_dataset),\n", " \"lora_rank\": 64,\n", " \"lora_alpha\": 16,\n", " \"lora_dropout\": 0.1,\n", " },\n", ")\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "d16e718e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. 
Note that empty label_names list will be used instead.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Tracking run with swanlab version 0.6.4 \n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Run data will be saved locally in \u001b[35m\u001b[1m/root/PMN_WS/qwen-test/swanlog/run-20250625_194816-a3b1799d\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 👋 Hi \u001b[1m\u001b[39mpumpkin_nan\u001b[0m\u001b[0m, welcome to swanlab!\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Syncing run \u001b[33mqwen2-vl-coco2014\u001b[0m to the cloud\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/e8rv1gel0mun1e9c74942\u001b[0m\u001b[0m\n" ] }, { "data": { "text/html": [ "\n", "[SwanLab run iframe omitted from static export]\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", " warnings.warn(\n", "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [24/24 01:24, Epoch 2/2]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
13.471600
102.685700
201.818100

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/e8rv1gel0mun1e9c74942\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Waiting for uploading complete\n", " \n" ] } ], "source": [ "# 配置Trainer\n", "from transformers import Trainer\n", "from torch.nn.utils.rnn import pad_sequence\n", "\n", "\n", "class MyTrainer(Trainer):\n", " def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n", " if \"num_items_in_batch\" in kwargs:\n", " kwargs.pop(\"num_items_in_batch\")\n", " if \"num_items_in_batch\" in inputs:\n", " inputs.pop(\"num_items_in_batch\")\n", "\n", " outputs = model(**inputs)\n", " loss = outputs.loss if hasattr(outputs, \"loss\") else outputs[0]\n", " return (loss, outputs) if return_outputs else loss\n", "\n", "class MultimodalCollator:\n", " def __init__(self, tokenizer):\n", " self.tokenizer = tokenizer\n", "\n", " def __call__(self, batch):\n", " input_ids = [torch.tensor(x[\"input_ids\"]) for x in batch]\n", " attention_mask = [torch.tensor(x[\"attention_mask\"]) for x in batch]\n", " labels = [torch.tensor(x[\"labels\"]) for x in batch]\n", " pixel_values = [\n", " torch.tensor(x[\"pixel_values\"]) if not isinstance(x[\"pixel_values\"], torch.Tensor) else x[\"pixel_values\"] \n", " for x in batch\n", " ]\n", " image_grid_thw = [\n", " torch.tensor(x[\"image_grid_thw\"]) if not isinstance(x[\"image_grid_thw\"], torch.Tensor) else x[\"image_grid_thw\"] \n", " for x in batch\n", " ]\n", " # padding sequences\n", " input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)\n", " attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)\n", " labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)\n", "\n", " # stack images tensors (假设都是相同shape)\n", " pixel_values = torch.stack(pixel_values)\n", " image_grid_thw = torch.stack(image_grid_thw)\n", "\n", " return {\n", " \"input_ids\": input_ids,\n", " \"attention_mask\": attention_mask,\n", " \"labels\": labels,\n", " \"pixel_values\": pixel_values,\n", " \"image_grid_thw\": image_grid_thw,\n", " }\n", "\n", "\n", "trainer = MyTrainer(\n", " model=peft_model,\n", " args=args,\n", " train_dataset=train_dataset,\n", " data_collator=MultimodalCollator(tokenizer=tokenizer),\n", " callbacks=[swanlab_callback],\n", ")\n", "\n", "# 开启模型训练\n", "trainer.train()\n", "# 训练完成后手动保存\n", "peft_model.save_pretrained(args.output_dir)\n", "# tokenizer.save_pretrained(args.output_dir)\n", "swanlab.finish()" ] }, { "cell_type": "markdown", "id": "73c7ef64", "metadata": {}, "source": [ "# 测试模式" ] }, { "cell_type": "code", "execution_count": null, "id": "9e40dd83", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:167: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. 
Make sure to know what you are doing!\n", " warnings.warn(\n" ] } ], "source": [ "# 配置LoRA\n", "val_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", " inference_mode=True, # 推理\n", " r=16, # 越大表达能力越强,参数越多\n", " lora_alpha=32, # = r 或 2r,缩放因子,调节学习率与初始化范围\n", " lora_dropout=0.05, # Dropout 比例\n", " bias=\"none\",\n", ")\n", "\n", "\n", "# 获取测试模型\n", "val_peft_model = PeftModel.from_pretrained(model,model_id=\"/root/PMN_WS/qwen-test/output/qwen2-vl-lora/checkpoint-24\", \n", " config=val_config)\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "7637ae3e", "metadata": {}, "outputs": [], "source": [ "def predict(messages, model):\n", " # 准备推理\n", " text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", " )\n", " image_inputs, video_inputs = process_vision_info(messages)\n", " inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", " )\n", " inputs = inputs.to(\"cuda\")\n", "\n", " # 生成输出\n", " generated_ids = model.generate(**inputs, max_new_tokens=128)\n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", " )\n", " \n", " return output_text[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "040a4f9e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Tracking run with swanlab version 0.6.4 \n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Run data will be saved locally in \u001b[35m\u001b[1m/root/PMN_WS/qwen-test/swanlog/run-20250625_205322-8ce21ea3\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 👋 Hi \u001b[1m\u001b[39mpumpkin_nan\u001b[0m\u001b[0m, welcome to swanlab!\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Syncing run \u001b[33mrabbit-5\u001b[0m to the cloud\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/go1e95rfocx0951w3l8r8\u001b[0m\u001b[0m\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", " \n", " \n", " Show Iframe\n", " \n", " \n", " \n", "\n", "\n", "

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "{'role': 'assistant', 'content': 'A bathroom with a toilet and sink.'}\n", "{'role': 'assistant', 'content': 'A motorcycle parked under a roof.'}\n", "{'role': 'assistant', 'content': 'A toilet with a lid up next to a toilet brush.'}\n", "{'role': 'assistant', 'content': 'A bathroom with urinals and sinks.'}\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/go1e95rfocx0951w3l8r8\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Waiting for uploading complete\n", " \n" ] } ], "source": [ "# 读取测试数据\n", "with open(\"data_vl_test.json\", \"r\") as f:\n", " test_dataset = json.load(f)\n", "\n", "# ✅ 初始化,必须最先调用\n", "\n", "swanlab.init(\n", " project=\"Qwen2-VL-finetune\",\n", " task=\"test\",\n", " run_name=\"qwen2-vl-eval-ckpt24\",\n", ")\n", "\n", "\n", "\n", "test_image_list = []\n", "for item in test_dataset:\n", " input_image_prompt = item[\"conversations\"][0][\"value\"]\n", " # 去掉前后的<|vision_start|>和<|vision_end|>\n", " origin_image_path = input_image_prompt.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0]\n", " \n", " messages = [{\n", " \"role\": \"user\", \n", " \"content\": [\n", " {\n", " \"type\": \"image\", \n", " \"image\": origin_image_path\n", " },\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"COCO Yes:\"\n", " }\n", " ]}]\n", " \n", " response = predict(messages, val_peft_model)\n", " messages.append({\"role\": \"assistant\", \"content\": f\"{response}\"})\n", " print(messages[-1])\n", "\n", " test_image_list.append(swanlab.Image(origin_image_path, caption=response))\n", "\n", "swanlab.log({\"Prediction\": test_image_list})\n", "\n", "# 在Jupyter Notebook中运行时要停止SwanLab记录,需要调用swanlab.finish()\n", "swanlab.finish()" ] }, { "cell_type": "code", "execution_count": null, "id": "f38cb046", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['A woman in a blue shirt is sitting at a table with a child in a pink shirt. The child is holding a hoop with ice cream cones on it. The child is also holding a phone. The woman is looking at the child. The child is looking at the phone. The woman is wearing a blue shirt. The child is wearing a pink shirt. The child is holding a phone. The woman is looking at the child. The child is looking at the phone. The woman is wearing a blue shirt. The child is wearing a pink shirt. The child is holding a phone. The woman is looking at the child. 
The child']\n" ] } ], "source": [ "messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image\",\n", " \"image\": \"/root/PMN_WS/VLM_test/image/吃冰激凌.jpg\",\n", " },\n", " {\"type\": \"text\", \n", " \"text\": \"Describe this image.\"},\n", " ],\n", "\n", " }\n", "]\n", "# Preparation for inference\n", "text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", ")\n", "\n", "image_inputs, video_inputs = process_vision_info(messages)\n", "\n", "inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", ")\n", "inputs = inputs.to(\"cuda\")\n", "\n", "# Inference: Generation of the output\n", "# val_peft_model\n", "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")\n", "print(output_text)\n" ] } ], "metadata": { "kernelspec": { "display_name": "qwen2.5.3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }