{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "1251ab7b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "2025-06-25 19:34:00,882 - modelscope - INFO - PyTorch version 2.6.0 Found.\n", "2025-06-25 19:34:00,884 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer\n", "2025-06-25 19:34:04,723 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 843fe399ac811db92736a820f07f468e and a total number of 964 components indexed\n" ] } ], "source": [ "import torch\n", "from datasets import Dataset\n", "from modelscope import snapshot_download, AutoTokenizer\n", "from swanlab.integration.transformers import SwanLabCallback\n", "from qwen_vl_utils import process_vision_info\n", "from peft import LoraConfig, TaskType, get_peft_model, PeftModel\n", "from transformers import (\n", " TrainingArguments,\n", " Trainer,\n", " DataCollatorForSeq2Seq,\n", " Qwen2VLForConditionalGeneration,\n", " AutoProcessor,\n", ")\n", "import swanlab\n", "import json" ] }, { "cell_type": "markdown", "id": "394d869a", "metadata": {}, "source": [ "# 1-加载模型" ] }, { "cell_type": "code", "execution_count": 2, "id": "d6fe9c76", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", "Loading checkpoint shards: 100%|██████████| 2/2 [01:34<00:00, 47.11s/it]\n" ] } ], "source": [ "model_dir = \"/root/PMN_WS/qwen-test/model/Qwen/Qwen2-VL-2B-Instruct\"\n", "# 使用Transformers加载模型权重\n", "tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)\n", "processor = AutoProcessor.from_pretrained(model_dir)\n", "\n", "model = Qwen2VLForConditionalGeneration.from_pretrained(\n", " model_dir, torch_dtype=torch.bfloat16, \n", " device_map=\"auto\",trust_remote_code=True\n", ")\n", "model.enable_input_require_grads() # 开启梯度检查点时,要执行该方法" ] }, { "cell_type": "markdown", "id": "43ddce3e", "metadata": {}, "source": [ "测试模型能否正常使用" ] }, { "cell_type": "code", "execution_count": null, "id": "842158a4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "text:<|im_start|>system\n", "You are a helpful assistant.<|im_end|>\n", "<|im_start|>user\n", "<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n", "<|im_start|>assistant\n", "\n", "image_inputs:[]\n", "video_inputs:None\n", "[\"The image depicts a serene beach scene with a woman and a dog. The woman is sitting on the sand, wearing a plaid shirt and black pants, and appears to be smiling. She is holding the dog's paw in a high-five gesture. The dog, which is a large breed, is sitting on the sand with its front paws raised, possibly in response to the woman's gesture. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting it might be either sunrise or sunset. 
The overall atmosphere is peaceful and joyful.\"]\n" ] } ], "source": [ "messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image\",\n", " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\",\n", " },\n", " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", " ],\n", "\n", " }\n", "]\n", "# Preparation for inference\n", "text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", ")\n", "print(f'text:{text}')\n", "image_inputs, video_inputs = process_vision_info(messages)\n", "print(f'image_inputs:{image_inputs}')\n", "print(f'video_inputs:{video_inputs}')\n", "inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", ")\n", "inputs = inputs.to(\"cuda\")\n", "\n", "# Inference: Generation of the output\n", "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")\n", "print(output_text)" ] }, { "cell_type": "markdown", "id": "cdd20a96", "metadata": {}, "source": [ "# 2-数据集预处理:\n", "2-1 拆分成训练集和测试集,保存为data_vl_train.json和data_vl_test.json" ] }, { "cell_type": "code", "execution_count": 96, "id": "e4c5f3b9", "metadata": {}, "outputs": [], "source": [ "train_json_path = \"/root/PMN_WS/qwen-test/data_vl.json\"\n", "with open(train_json_path, 'r') as f:\n", " data = json.load(f)\n", " train_data = data[:-4]\n", " test_data = data[-4:]\n", "\n", "with open(\"data_vl_train.json\", \"w\") as f:\n", " json.dump(train_data, f, indent=4, ensure_ascii=False)\n", "\n", "with open(\"data_vl_test.json\", \"w\") as f:\n", " json.dump(test_data, f, indent=4, ensure_ascii=False)" ] }, { "cell_type": "markdown", "id": "9ab267e0", "metadata": {}, "source": [ "2-2 处理数据集:读取json文件,将input和label转换成模型训练需要的形式" ] }, { "cell_type": "code", "execution_count": 97, "id": "0035b338", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Generating train split: 96 examples [00:00, 258.76 examples/s]\n", "Map: 2%|▏ | 2/96 [00:00<00:07, 12.52 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 6%|▋ | 6/96 [00:00<00:06, 13.59 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 8%|▊ | 8/96 [00:00<00:06, 13.89 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 12%|█▎ | 12/96 [00:00<00:06, 13.60 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 17%|█▋ | 16/96 [00:01<00:05, 13.91 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 19%|█▉ | 18/96 [00:01<00:05, 14.24 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 23%|██▎ | 22/96 [00:01<00:05, 13.85 examples/s]" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 25%|██▌ | 24/96 [00:01<00:05, 13.87 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 29%|██▉ | 28/96 [00:02<00:04, 13.96 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 31%|███▏ | 30/96 [00:02<00:04, 13.89 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 35%|███▌ | 34/96 [00:02<00:04, 13.97 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 38%|███▊ | 36/96 [00:02<00:04, 13.85 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 42%|████▏ | 40/96 [00:02<00:03, 14.01 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 46%|████▌ | 44/96 [00:03<00:03, 14.53 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 48%|████▊ | 46/96 [00:03<00:03, 14.57 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 52%|█████▏ | 50/96 [00:03<00:03, 13.71 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 54%|█████▍ | 52/96 [00:03<00:03, 14.10 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 58%|█████▊ | 56/96 [00:04<00:02, 14.38 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 60%|██████ | 58/96 [00:04<00:02, 13.22 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 62%|██████▎ | 60/96 [00:04<00:05, 6.39 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 65%|██████▍ | 62/96 [00:05<00:07, 4.68 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 67%|██████▋ | 64/96 [00:06<00:07, 4.49 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 69%|██████▉ | 66/96 [00:06<00:05, 5.02 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 71%|███████ | 68/96 [00:06<00:04, 6.46 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 74%|███████▍ | 71/96 [00:06<00:02, 9.21 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "Map: 76%|███████▌ | 73/96 [00:07<00:02, 10.53 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 80%|████████ | 77/96 [00:07<00:01, 12.73 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 84%|████████▍ | 81/96 [00:07<00:01, 13.80 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 86%|████████▋ | 83/96 [00:07<00:00, 14.19 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 91%|█████████ | 87/96 [00:07<00:00, 14.55 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 93%|█████████▎| 89/96 [00:08<00:00, 14.44 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 97%|█████████▋| 93/96 [00:08<00:00, 13.17 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 99%|█████████▉| 95/96 [00:08<00:00, 12.76 examples/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 96/96 [00:09<00:00, 9.66 examples/s]\n", "Saving the dataset (1/1 shards): 100%|██████████| 96/96 [00:03<00:00, 31.86 examples/s]\n" ] } ], "source": [ "def process_func(example):\n", " \"\"\"\n", " 将数据集进行预处理\n", " \"\"\"\n", " MAX_LENGTH = 8192\n", " input_ids, attention_mask, labels = [], [], []\n", " conversation = example[\"conversations\"]\n", " input_content = conversation[0][\"value\"]\n", " output_content = conversation[1][\"value\"]\n", " file_path = input_content.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0] # 获取图像路径\n", " messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image\",\n", " \"image\": f\"{file_path}\",\n", " \"resized_height\": 280,\n", " \"resized_width\": 280,\n", " },\n", " {\n", " \n", " \"type\": \"text\", \n", " \"text\": \"COCO Yes:\"\n", " },\n", " ],\n", " }\n", " ]\n", "\n", " # 获取文本 \n", " text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", " ) \n", "\n", " image_inputs, video_inputs = process_vision_info(messages) # 获取数据数据(预处理过)\n", " inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", " )\n", " inputs = {key: value.tolist() for key, value in inputs.items()} #tensor -> list,为了方便拼接\n", " instruction = inputs \n", " # for key, value in inputs.items():\n", " # print(key,'\\n')\n", " # \"\"\"\n", " # inputs:dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])\n", " # \"\"\"\n", " \n", "\n", " response = tokenizer(f\"{output_content}\", add_special_tokens=False)\n", " # for key, value in response.items():\n", " # print(key,value)\n", " # \"\"\"\n", " # input_ids [32, 10729, 702, 6481, 22360, 12632, 323, 25904, 13]\n", " # attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1]\n", " # \"\"\"\n", " \n", " # 输入text+回答text\n", " input_ids = 
(\n", " instruction[\"input_ids\"][0] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n", " )\n", " # 哪些位置是有效内容(1有效,0无效)\n", " attention_mask = instruction[\"attention_mask\"][0] + response[\"attention_mask\"] + [1]\n", " # 模型不学习部分,用-100标记\n", " labels = (\n", " [-100] * len(instruction[\"input_ids\"][0])\n", " + response[\"input_ids\"]\n", " + [tokenizer.pad_token_id]\n", " )\n", "\n", " if len(input_ids) > MAX_LENGTH: # 做一个截断\n", " input_ids = input_ids[:MAX_LENGTH]\n", " attention_mask = attention_mask[:MAX_LENGTH]\n", " labels = labels[:MAX_LENGTH]\n", "\n", " input_ids = torch.tensor(input_ids)\n", " attention_mask = torch.tensor(attention_mask)\n", " labels = torch.tensor(labels)\n", "\n", "\n", "\n", " inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])\n", " inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0) #由(1,h,w)变换为(h,w)\n", " print(type(input_ids))\n", "\n", " return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels,\n", " \"pixel_values\": inputs['pixel_values'], \"image_grid_thw\": inputs['image_grid_thw']}\n", "\n", "train_ds = Dataset.from_json(\"data_vl_train.json\")\n", "train_dataset = train_ds.map(process_func)\n", "# 保存为本地磁盘目录,默认保存为list\n", "train_dataset.save_to_disk(\"./processed_train_dataset\")" ] }, { "cell_type": "markdown", "id": "74a4927d", "metadata": {}, "source": [ "2-3 读取数据是否保存成功" ] }, { "cell_type": "code", "execution_count": 98, "id": "6205a265", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset size: 96\n", "✅ Forward 成功!\n", "Loss: tensor(2.3434, device='cuda:0')\n" ] } ], "source": [ "from datasets import load_from_disk\n", "\n", "# 加载保存的数据集\n", "loaded_dataset = load_from_disk(\"./processed_train_dataset\")\n", "\n", "# 查看数据集大小\n", "print(f\"Dataset size: {len(loaded_dataset)}\")\n", "\n", "# 查看第一条样本内容(字段和类型)\n", "sample = loaded_dataset[0]\n", "# 需要转换的字段\n", "required_keys = [\"input_ids\", \"attention_mask\", \"labels\", \"pixel_values\", \"image_grid_thw\"]\n", "batch = {}\n", "\n", "for key in required_keys:\n", " value = sample[key]\n", "\n", " # 转成 Tensor(避免直接转 dict)\n", " value = torch.tensor(value)\n", "\n", " # 添加 batch 维度\n", " if value.dim() == 1 or (key == \"image_grid_thw\" and value.dim() == 2):\n", " value = value.unsqueeze(0)\n", "\n", " # 放到模型所在设备\n", " batch[key] = value.to(model.device)\n", "\n", "\n", "\n", "# 验证模型能否 forward\n", "model.eval()\n", "with torch.no_grad():\n", " outputs = model(**batch)\n", "\n", "# 打印结果\n", "print(\"✅ Forward 成功!\")\n", "print(\"Loss:\", outputs.loss if hasattr(outputs, \"loss\") else None)\n" ] }, { "cell_type": "markdown", "id": "efbb5025", "metadata": {}, "source": [ "# 训练配置" ] }, { "cell_type": "markdown", "id": "428dc34d", "metadata": {}, "source": [ "查看设备配置信息" ] }, { "cell_type": "code", "execution_count": 84, "id": "e0085d74", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🧠 CPU 信息:\n", "- 处理器: x86_64\n", "- 核心数: 64 物理核心 / 128 逻辑核心\n", "\n", "🧮 内存信息:\n", "- 总内存: 2163.58 GB\n", "- 已使用: 173.8 GB\n", "- 可用: 1962.2 GB\n", "\n", "🎮 GPU 信息:\n", "- GPU 0: NVIDIA A100-SXM4-80GB\n", " - 显存总量: 25.76 GB\n", " - 当前占用: 4.69 GB\n", " - 保留显存: 5.2 GB\n" ] } ], "source": [ "import torch\n", "import psutil\n", "import platform\n", "import os\n", "\n", "# CPU 信息\n", "print(\"🧠 CPU 信息:\")\n", "print(f\"- 处理器: {platform.processor()}\")\n", "print(f\"- 核心数: {psutil.cpu_count(logical=False)} 物理核心 / {psutil.cpu_count(logical=True)} 逻辑核心\")\n", "\n", "# 内存信息\n", 
"mem = psutil.virtual_memory()\n", "print(\"\\n🧮 内存信息:\")\n", "print(f\"- 总内存: {round(mem.total / 1e9, 2)} GB\")\n", "print(f\"- 已使用: {round(mem.used / 1e9, 2)} GB\")\n", "print(f\"- 可用: {round(mem.available / 1e9, 2)} GB\")\n", "\n", "# GPU 信息(需安装 NVIDIA 驱动)\n", "if torch.cuda.is_available():\n", " print(\"\\n🎮 GPU 信息:\")\n", " for i in range(torch.cuda.device_count()):\n", " print(f\"- GPU {i}: {torch.cuda.get_device_name(i)}\")\n", " print(f\" - 显存总量: {round(torch.cuda.get_device_properties(i).total_memory / 1e9, 2)} GB\")\n", " print(f\" - 当前占用: {round(torch.cuda.memory_allocated(i) / 1e9, 2)} GB\")\n", " print(f\" - 保留显存: {round(torch.cuda.memory_reserved(i) / 1e9, 2)} GB\")\n", "else:\n", " print(\"\\n🚫 当前没有可用的 CUDA GPU\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "5a5841bb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/peft/mapping_func.py:73: UserWarning: You are trying to modify a model with PEFT for a second time. If you want to reload the model with a different config, make sure to call `.unload()` before.\n", " warnings.warn(\n" ] } ], "source": [ "# 配置LoRA\n", "config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", " inference_mode=False, # 训练模式\n", " r=16, # 越大表达能力越强,参数越多\n", " lora_alpha=32, # = r 或 2r,缩放因子,调节学习率与初始化范围\n", " lora_dropout=0.05, # Dropout 比例\n", " bias=\"none\",\n", ")\n", "\n", "# 获取LoRA模型\n", "peft_model = get_peft_model(model, config)" ] }, { "cell_type": "code", "execution_count": 6, "id": "e3824dd6", "metadata": {}, "outputs": [], "source": [ "# 配置训练参数\n", "args = TrainingArguments(\n", " output_dir=\"./output/qwen2-vl-lora\",\n", " per_device_train_batch_size=1,\n", " gradient_accumulation_steps=8,\n", " logging_steps=10,\n", " logging_first_step=True,\n", " save_safetensors=False, # 禁用 safetensors\n", " num_train_epochs=2,\n", " save_steps=100,\n", " bf16=True, \n", " learning_rate=1e-4,\n", " save_on_each_node=True,\n", " gradient_checkpointing=True,\n", " remove_unused_columns=False, # 必须关\n", " dataloader_num_workers=2, # 防止爆内存\n", " report_to=\"none\",\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "id": "5692090c", "metadata": {}, "outputs": [], "source": [ "from datasets import load_from_disk\n", "# 设置SwanLab回调\n", "train_dataset = load_from_disk(\"./processed_train_dataset\")\n", "\n", "swanlab_callback = SwanLabCallback(\n", " project=\"Qwen2-VL-finetune\",\n", " experiment_name=\"qwen2-vl-coco2014\",\n", " config={\n", " \"model\": \"https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct\",\n", " \"dataset\": \"https://modelscope.cn/datasets/modelscope/coco_2014_caption/quickstart\",\n", " \"github\": \"https://github.com/datawhalechina/self-llm\",\n", " \"prompt\": \"COCO Yes: \",\n", " \"train_data_number\": len(train_dataset),\n", " \"lora_rank\": 64,\n", " \"lora_alpha\": 16,\n", " \"lora_dropout\": 0.1,\n", " },\n", ")\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "d16e718e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. 
Note that empty label_names list will be used instead.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Tracking run with swanlab version 0.6.4 \n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Run data will be saved locally in \u001b[35m\u001b[1m/root/PMN_WS/qwen-test/swanlog/run-20250625_194816-a3b1799d\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 👋 Hi \u001b[1m\u001b[39mpumpkin_nan\u001b[0m\u001b[0m, welcome to swanlab!\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Syncing run \u001b[33mqwen2-vl-coco2014\u001b[0m to the cloud\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/e8rv1gel0mun1e9c74942\u001b[0m\u001b[0m\n" ] }, { "data": { "text/html": [ "\n", "[SwanLab run iframe omitted from static export]\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", " warnings.warn(\n", "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [24/24 01:24, Epoch 2/2]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
13.471600
102.685700
201.818100

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/e8rv1gel0mun1e9c74942\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Waiting for uploading complete\n", " \n" ] } ], "source": [ "# 配置Trainer\n", "from transformers import Trainer\n", "from torch.nn.utils.rnn import pad_sequence\n", "\n", "\n", "class MyTrainer(Trainer):\n", " def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n", " if \"num_items_in_batch\" in kwargs:\n", " kwargs.pop(\"num_items_in_batch\")\n", " if \"num_items_in_batch\" in inputs:\n", " inputs.pop(\"num_items_in_batch\")\n", "\n", " outputs = model(**inputs)\n", " loss = outputs.loss if hasattr(outputs, \"loss\") else outputs[0]\n", " return (loss, outputs) if return_outputs else loss\n", "\n", "class MultimodalCollator:\n", " def __init__(self, tokenizer):\n", " self.tokenizer = tokenizer\n", "\n", " def __call__(self, batch):\n", " input_ids = [torch.tensor(x[\"input_ids\"]) for x in batch]\n", " attention_mask = [torch.tensor(x[\"attention_mask\"]) for x in batch]\n", " labels = [torch.tensor(x[\"labels\"]) for x in batch]\n", " pixel_values = [\n", " torch.tensor(x[\"pixel_values\"]) if not isinstance(x[\"pixel_values\"], torch.Tensor) else x[\"pixel_values\"] \n", " for x in batch\n", " ]\n", " image_grid_thw = [\n", " torch.tensor(x[\"image_grid_thw\"]) if not isinstance(x[\"image_grid_thw\"], torch.Tensor) else x[\"image_grid_thw\"] \n", " for x in batch\n", " ]\n", " # padding sequences\n", " input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)\n", " attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)\n", " labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)\n", "\n", " # stack images tensors (假设都是相同shape)\n", " pixel_values = torch.stack(pixel_values)\n", " image_grid_thw = torch.stack(image_grid_thw)\n", "\n", " return {\n", " \"input_ids\": input_ids,\n", " \"attention_mask\": attention_mask,\n", " \"labels\": labels,\n", " \"pixel_values\": pixel_values,\n", " \"image_grid_thw\": image_grid_thw,\n", " }\n", "\n", "\n", "trainer = MyTrainer(\n", " model=peft_model,\n", " args=args,\n", " train_dataset=train_dataset,\n", " data_collator=MultimodalCollator(tokenizer=tokenizer),\n", " callbacks=[swanlab_callback],\n", ")\n", "\n", "# 开启模型训练\n", "trainer.train()\n", "# 训练完成后手动保存\n", "peft_model.save_pretrained(args.output_dir)\n", "# tokenizer.save_pretrained(args.output_dir)\n", "swanlab.finish()" ] }, { "cell_type": "markdown", "id": "73c7ef64", "metadata": {}, "source": [ "# 测试模式" ] }, { "cell_type": "code", "execution_count": null, "id": "9e40dd83", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:167: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. 
Make sure to know what you are doing!\n", " warnings.warn(\n" ] } ], "source": [ "# 配置LoRA\n", "val_config = LoraConfig(\n", " task_type=TaskType.CAUSAL_LM,\n", " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", " inference_mode=True, # 推理\n", " r=16, # 越大表达能力越强,参数越多\n", " lora_alpha=32, # = r 或 2r,缩放因子,调节学习率与初始化范围\n", " lora_dropout=0.05, # Dropout 比例\n", " bias=\"none\",\n", ")\n", "\n", "\n", "# 获取测试模型\n", "val_peft_model = PeftModel.from_pretrained(model,model_id=\"/root/PMN_WS/qwen-test/output/qwen2-vl-lora/checkpoint-24\", \n", " config=val_config)\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "7637ae3e", "metadata": {}, "outputs": [], "source": [ "def predict(messages, model):\n", " # 准备推理\n", " text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", " )\n", " image_inputs, video_inputs = process_vision_info(messages)\n", " inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", " )\n", " inputs = inputs.to(\"cuda\")\n", "\n", " # 生成输出\n", " generated_ids = model.generate(**inputs, max_new_tokens=128)\n", " generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", " ]\n", " output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", " )\n", " \n", " return output_text[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "040a4f9e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Tracking run with swanlab version 0.6.4 \n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Run data will be saved locally in \u001b[35m\u001b[1m/root/PMN_WS/qwen-test/swanlog/run-20250625_205322-8ce21ea3\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 👋 Hi \u001b[1m\u001b[39mpumpkin_nan\u001b[0m\u001b[0m, welcome to swanlab!\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Syncing run \u001b[33mrabbit-5\u001b[0m to the cloud\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/go1e95rfocx0951w3l8r8\u001b[0m\u001b[0m\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", " \n", " \n", " Show Iframe\n", " \n", " \n", " \n", "\n", "\n", "

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "{'role': 'assistant', 'content': 'A bathroom with a toilet and sink.'}\n", "{'role': 'assistant', 'content': 'A motorcycle parked under a roof.'}\n", "{'role': 'assistant', 'content': 'A toilet with a lid up next to a toilet brush.'}\n", "{'role': 'assistant', 'content': 'A bathroom with urinals and sinks.'}\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/go1e95rfocx0951w3l8r8\u001b[0m\u001b[0m\n", "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Waiting for uploading complete\n", " \n" ] } ], "source": [ "# 读取测试数据\n", "with open(\"data_vl_test.json\", \"r\") as f:\n", " test_dataset = json.load(f)\n", "\n", "# ✅ 初始化,必须最先调用\n", "\n", "swanlab.init(\n", " project=\"Qwen2-VL-finetune\",\n", " task=\"test\",\n", " run_name=\"qwen2-vl-eval-ckpt24\",\n", ")\n", "\n", "\n", "\n", "test_image_list = []\n", "for item in test_dataset:\n", " input_image_prompt = item[\"conversations\"][0][\"value\"]\n", " # 去掉前后的<|vision_start|>和<|vision_end|>\n", " origin_image_path = input_image_prompt.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0]\n", " \n", " messages = [{\n", " \"role\": \"user\", \n", " \"content\": [\n", " {\n", " \"type\": \"image\", \n", " \"image\": origin_image_path\n", " },\n", " {\n", " \"type\": \"text\",\n", " \"text\": \"COCO Yes:\"\n", " }\n", " ]}]\n", " \n", " response = predict(messages, val_peft_model)\n", " messages.append({\"role\": \"assistant\", \"content\": f\"{response}\"})\n", " print(messages[-1])\n", "\n", " test_image_list.append(swanlab.Image(origin_image_path, caption=response))\n", "\n", "swanlab.log({\"Prediction\": test_image_list})\n", "\n", "# 在Jupyter Notebook中运行时要停止SwanLab记录,需要调用swanlab.finish()\n", "swanlab.finish()" ] }, { "cell_type": "code", "execution_count": null, "id": "f38cb046", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['A woman in a blue shirt is sitting at a table with a child in a pink shirt. The child is holding a hoop with ice cream cones on it. The child is also holding a phone. The woman is looking at the child. The child is looking at the phone. The woman is wearing a blue shirt. The child is wearing a pink shirt. The child is holding a phone. The woman is looking at the child. The child is looking at the phone. The woman is wearing a blue shirt. The child is wearing a pink shirt. The child is holding a phone. The woman is looking at the child. 
The child']\n" ] } ], "source": [ "messages = [\n", " {\n", " \"role\": \"user\",\n", " \"content\": [\n", " {\n", " \"type\": \"image\",\n", " \"image\": \"/root/PMN_WS/VLM_test/image/吃冰激凌.jpg\",\n", " },\n", " {\"type\": \"text\", \n", " \"text\": \"Describe this image.\"},\n", " ],\n", "\n", " }\n", "]\n", "# Preparation for inference\n", "text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True\n", ")\n", "\n", "image_inputs, video_inputs = process_vision_info(messages)\n", "\n", "inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", ")\n", "inputs = inputs.to(\"cuda\")\n", "\n", "# Inference: Generation of the output\n", "# val_peft_model\n", "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")\n", "print(output_text)\n" ] } ], "metadata": { "kernelspec": { "display_name": "qwen2.5.3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }