From f38b91ce138358f98534a5f267d4888f3521d93e Mon Sep 17 00:00:00 2001
From: pmn
Date: Thu, 26 Jun 2025 14:56:56 +0800
Subject: [PATCH] LoRA fine-tuning of qwen2-vl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md             |    9 +
 coco-2024-dataset.csv |  102 +++
 csv2json.py           |   27 +
 data2csv.py           |  146 ++++
 data_vl.json          | 1302 +++++++++++++++++++++++++++++++++
 data_vl_test.json     |   54 ++
 data_vl_train.json    | 1250 ++++++++++++++++++++++++++++++++
 model_download.py     |   69 ++
 train.ipynb           | 1599 +++++++++++++++++++++++++++++++++++++++++
 train.json            |    0
 10 files changed, 4558 insertions(+)
 create mode 100644 README.md
 create mode 100644 coco-2024-dataset.csv
 create mode 100644 csv2json.py
 create mode 100644 data2csv.py
 create mode 100644 data_vl.json
 create mode 100644 data_vl_test.json
 create mode 100644 data_vl_train.json
 create mode 100644 model_download.py
 create mode 100644 train.ipynb
 create mode 100644 train.json

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0e4907f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+https://zhuanlan.zhihu.com/p/7144893529
+For testing Qwen-VL deployment and fine-tuning
+
+data2csv.py: downloads the data and generates the CSV
+    coco-2024-dataset.csv
+csv2json.py: converts the CSV to JSON
+    data_vl.json
+train.ipynb
+    full training code
diff --git a/coco-2024-dataset.csv b/coco-2024-dataset.csv
new file mode 100644
index 0000000..0765a82
--- /dev/null
+++ b/coco-2024-dataset.csv
@@ -0,0 +1,102 @@
+image_path,caption
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000057870.jpg,A restaurant has modern wooden tables and chairs.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000384029.jpg,A man preparing desserts in a kitchen covered in frosting.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000222016.jpg,a big red telephone booth that a man is standing in
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000520950.jpg,the kitchen is full of spices on the rack
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000069675.jpg,A child and woman are cooking in the kitchen.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000547471.jpg,A black and white image of a man in a suit wearing glasses walking through a door.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000122688.jpg,The huge clock on the wall is near a wooden table.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000392136.jpg,A large bus and some people on the street.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000398494.jpg,A bicycle parked in a kitchen with a stove and cabinets.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000090570.jpg,"Two people in a food truck, one looking at an order."
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000504616.jpg,a person in white is standing in a kitchen
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000161919.jpg,A person is cutting a roast with a fork and knife.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000457732.jpg,a kitchen with a table and some chairs
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000044404.jpg,"A kitchen has wood cabinets, a dishwasher, sink, and refrigerator. "
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000004428.jpg,A chef preparing food inside of a kitchen near a window.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000170558.jpg,Adults using laptop computers while sitting at outdoor venue.
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000405613.jpg,A group of men at a table preparing food together +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000283524.jpg,a man cutting up vegetables on top of a food cart. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000037015.jpg,Chefs are preparing food at a restaurant as patrons exit. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000071631.jpg,"Dining room table set for a casual meal, with flowers." +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000491269.jpg,A very cluttered but very clean kept kitchen. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000365363.jpg,Two people flying a kite above pine trees. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000064460.jpg,A kitchen in a restaurant with food on the counter. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000581674.jpg,Several kitchen workers making dishes in commercial kitchen. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000470072.jpg,a grill that has a bunch of burgers on it +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000344806.jpg,A man laying on his stomach with a towel on his head. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000084427.jpg,A small cluttered kitchen with a window and sink. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000317237.jpg,A small child eating noodles from a bowl in a kitchen +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000409382.jpg,People on a skateboard ramp with one doing a trick and one with skateboard on his head. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000570608.jpg,a kitchen that has a microwave and some cabinets in it +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000469605.jpg,there is a man standing on a field talking on the phone +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000356702.jpg,A couple of young men sitting in front of a child's laptop. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000405207.jpg,A large bottle of wine sitting on a kitchen counter. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000472925.jpg,A stove top with pots and pans in a kitchen. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000214704.jpg,A young man that is sitting at a kitchen table is looking of to the side. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000279108.jpg,A woman feeding a man food from a spoon. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000438422.jpg,A woman giving a taste test to a man. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000257350.jpg,a group of people riding bikes stopped in front of a building +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000393493.jpg,Little girl looking down at leaves with her bicycle with training wheels parked next to her. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000062426.jpg,A sink and bath in a small room. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000019380.jpg,A bathroom with a yellow sink next to a white bath tub. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000485894.jpg,A bathroom with a white bath tub sitting in a corner of a green room. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000446014.jpg,A girl washing her hands while looking into a mirror crying. 
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000530683.jpg,A woman in a dress riding a bicycle. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000292835.jpg,A view of a very dirty bathroom that needs to be cleaned. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000262845.jpg,A picture of a broken down bathroom with two sinks. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000299411.jpg,a bathroom that has a sink and a toilet in it +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000042493.jpg,Two mountain bikers take a break on a path. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000239811.jpg,A solid white bicycle is parked next to statues on a sidewalk. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000002024.jpg,A bathroom looks new with nothing in it. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000095133.jpg,a bathroom scene with a wooden door and a sink in view +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000287541.jpg,"Two bathroom sinks mounted against a mirror, with soap in between the two sinks." +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000441488.jpg,MAN KNEELING BETWEEN TWO BICYCLES LOOKING AT HIS PHONE +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000179620.jpg,"A toilet bowl with rolls of toilet paper stacked next to it +" +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000070000.jpg,Bright loft space with large rustic dining table and bikes on the wall. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000536587.jpg,Adjustable magnifying mirror attached to a bathroom wall +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000543877.jpg,"A bathroom features white, bowl sinks and a bathtub." +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000420721.jpg,A large furry cat pulling up the bathroom carpet +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000540162.jpg,Toilet with upraised lid sitting next to bookshelves. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000218956.jpg,A man standing in front of a white toilet in a restroom. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000318574.jpg,Bicycle wheels are lined up on bicycles in a row. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000172899.jpg,"A bathroom scene complete with a tab, sink and toilet." +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000352884.jpg,"Bathroom with destroyed walls, a sink and a mirrored cabinet. " +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000394326.jpg,A black and white photo of restroom toilet with a filthy floor. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000535786.jpg,A toilet with a bow on it inside a bathroom. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000357684.jpg,Several groups of people are standing outside of a building. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000576757.jpg,An unfinished bathroom has a toilet and tools +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000165499.jpg,A bathroom shower with glass doors and tile walls. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000181104.jpg,A white bathtub sitting in a bathroom next to a sink. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000055627.jpg,A bathroom is freshly cleaned and ready for hotel patrons. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000241364.jpg,A mirror that is sitting behind a sink. 
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000209967.jpg,A sink and vanity with overhead lights a decorative piece on the wall and a commode. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000153674.jpg,A bathroom sink with the faucet on the side +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000354444.jpg,a lady sitting in a van with several seagulls landing on the top +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000098760.jpg,A dog sticks its out out the window of a car. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000311914.jpg,A school bus parked with it's stop sign closed. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000467311.jpg,A bus and a few other vehicles that appear to be traveling down the road. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000236772.jpg,In the bathroom a toilet is full of ice cubes. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000049183.jpg,A white toilet commode sits on a tile floor. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000426038.jpg,A boat with lots of seats and large windows. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000309322.jpg,The Phillips 66 clocks is in front of some posters. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000318189.jpg,An old red and yellow car with a yellow surfboard on top. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000343322.jpg,Blue and white antique car at intersection of city roadway. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000036633.jpg,A toilet with a trash can and a roll of toilet paper on top +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000213546.jpg,"A mirrored bathroom features duel, white porcelain sinks and silver faucets. " +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000378710.jpg,A wall with four mounted urinals on it. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000189993.jpg,A black bear on display in a library. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000551125.jpg,A group of three urinals mounted to a wall. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000077806.jpg,Small dog in wire basket transported on motor scooter in city. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000546451.jpg,A small restroom that is painted the color blue. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000444546.jpg,a spoon and a fork that is on a table +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000147016.jpg,"A toilet connected to a wire, next to a speaker." +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000497616.jpg,A black and white photo of a cat sitting on a chair. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000520208.jpg,a spoon sitting on some food in a bowl +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000199628.jpg,A public restroom has two sinks shaped like fancy vases. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000280980.jpg,A man in a costume and wig is using a urinal. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000167613.jpg,A bathroom with a white toilet and sink and checkered tile. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000142088.jpg,Black motorcycle sitting underneath an overhang outdoors. +/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000301778.jpg,Dirty bathroom floor with a toilet and a toilet brush next to it. 
+/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000454325.jpg,A modern restroom is equipped with fashionable sinks and urinals surrounded by architectural subway tile.
diff --git a/csv2json.py b/csv2json.py
new file mode 100644
index 0000000..054b04c
--- /dev/null
+++ b/csv2json.py
@@ -0,0 +1,27 @@
+import pandas as pd
+import json
+
+# Load the CSV file
+df = pd.read_csv('qwen-test/coco-2024-dataset.csv')
+conversations = []
+
+
+# Build one conversation entry per image/caption pair
+for i in range(len(df)):
+    conversations.append({
+        "id": f"identity_{i+1}",
+        "conversations": [
+            {
+                "from": "user",
+                "value": f"COCO Yes: <|vision_start|>{df.iloc[i]['image_path']}<|vision_end|>"
+            },
+            {
+                "from": "assistant",
+                "value": df.iloc[i]['caption']
+            }
+        ]
+    })
+
+# Save as JSON
+with open('/root/PMN_WS/qwen-test/data_vl.json', 'w', encoding='utf-8') as f:
+    json.dump(conversations, f, ensure_ascii=False, indent=2)
\ No newline at end of file
diff --git a/data2csv.py b/data2csv.py
new file mode 100644
index 0000000..d1d7314
--- /dev/null
+++ b/data2csv.py
@@ -0,0 +1,146 @@
+import os
+import pandas as pd
+import json
+
+import requests
+
+
+
+# # Inspect the JSON file
+# print_json_tree(data)
+
+class image_caption_json():
+    """
+    {
+        "info": {...},
+        "licenses": [...],
+        "images": [...],
+        "annotations": [...]
+    }
+    """
+    def __init__(self, json_path, data_number):
+        self.json_path = json_path
+        self.data_number = data_number
+        with open(self.json_path, 'r') as f:
+            self.data = json.load(f)
+
+
+    # Print the top-level JSON structure as a tree
+    def print_json_tree(self, indent=1):
+        for key, value in self.data.items():
+            print(' ' * indent + str(key), end='\n')
+
+            # If the value is a list, count its elements
+            if isinstance(value, list):
+                value_count = len(value)
+            # If the value is a dict, count its keys
+            elif isinstance(value, dict):
+                value_count = len(value.keys())
+            # Otherwise it is a single value, so the count is 1
+            else:
+                value_count = 1
+            print(value_count)
+
+
+            # if isinstance(value, dict):
+            #     print()
+            #     print_json_tree(value, indent + 1)
+            # else:
+            #     print(': ' + str(value))
+
+
+
+    def download_image(self, url, save_dir, filename, timeout=10):
+        # 1. Create the save directory if it does not exist
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir, exist_ok=True)
+        try:
+            response = requests.get(url, stream=True, timeout=timeout)
+            response.raise_for_status()  # Raise an error if the request failed
+
+            img_path = f"{save_dir}/{filename}"
+            with open(img_path, "wb") as f:
+                for chunk in response.iter_content(1024):
+                    f.write(chunk)
+            print(f"✅ Successfully saved {filename} to: {img_path} from: {url}\n")
+            return img_path
+        except Exception as e:
+            print(f"❌ Failed to download from [{url}]: {e}\n")
+            return None
+
+    def image_annotation(self, save_dir, csv_path):
+        # if os.path.exists(save_dir):
+        #     print('coco_2014_caption directory already exists, skipping data processing')
+        #     return 0
+
+
+        # Tracks the most recently successful source (0 - "coco" or 1 - "flickr")
+        LAST_SUCCESSFUL_SOURCE = 0
+        # Lists that collect the image paths and captions
+        image_paths = []
+        captions = []
+
+        # Process the first data_number images
+        for i, img_info in enumerate(self.data['images'][:self.data_number]):
+            # Get the image info and the matching caption
+            img_id = img_info['id']
+            filename = img_info['file_name']
+            coco_url = img_info['coco_url']
+            flickr_url = img_info['flickr_url']
+            # Look up the caption for this image
+            # # Match all captions
+            # match_annotation = [annotation['caption'] for annotation in self.data['annotations'] if annotation['image_id'] == img_id]
+            # Only take the first match
+            caption = next((annotation['caption'] for annotation in self.data['annotations'] if annotation['image_id'] == img_id), None)
+
图片ID: {img_id}") + print(f" 文件名: {filename}") + print(f" Caption: {caption}") + # print(f" coco_url: {coco_url}") + + # 根据 url 下载图片 + if LAST_SUCCESSFUL_SOURCE: + first_url = flickr_url + second_url = coco_url + else: + first_url = coco_url + second_url = flickr_url + + image_path = self.download_image(first_url,save_dir,filename) + if not image_path: + image_path = self.download_image(second_url,save_dir,filename) + if image_path: + LAST_SUCCESSFUL_SOURCE =1-LAST_SUCCESSFUL_SOURCE + else: + print(f"❌❌ Failed to download ]\n") + continue + + # 将路径和描述添加到列表中 + image_paths.append(image_path) + captions.append(caption) + + # 将图片路径和描述保存为CSV文件 + df = pd.DataFrame({ + 'image_path': image_paths, + 'caption': captions + }) + # 将数据保存为CSV文件 + df.to_csv(csv_path, index=False) + + print(f'数据处理完成') + + + + + + + +if __name__ == '__main__': + file_path = '/root/PMN_WS/coco_2014_caption/annotations/captions_train2014.json' + MAX_DATA_NUMBER = 100 + + image_caption = image_caption_json(file_path,MAX_DATA_NUMBER) + save_dir='/root/PMN_WS/qwen-test/coco_2014_image' + csv_path = '/root/PMN_WS/qwen-test/coco-2024-dataset.csv' + image_caption.image_annotation(save_dir,csv_path) + # image_caption.print_json_tree() diff --git a/data_vl.json b/data_vl.json new file mode 100644 index 0000000..a702f48 --- /dev/null +++ b/data_vl.json @@ -0,0 +1,1302 @@ +[ + { + "id": "identity_1", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000057870.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A restaurant has modern wooden tables and chairs." + } + ] + }, + { + "id": "identity_2", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000384029.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man preparing desserts in a kitchen covered in frosting." + } + ] + }, + { + "id": "identity_3", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000222016.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a big red telephone booth that a man is standing in" + } + ] + }, + { + "id": "identity_4", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000520950.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "the kitchen is full of spices on the rack" + } + ] + }, + { + "id": "identity_5", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000069675.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A child and woman are cooking in the kitchen." + } + ] + }, + { + "id": "identity_6", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000547471.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black and white image of a man in a suit wearing glasses walking through a door." + } + ] + }, + { + "id": "identity_7", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000122688.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "The huge clock on the wall is near a wooden table." 
+ } + ] + }, + { + "id": "identity_8", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000392136.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A large bus and some people on the street." + } + ] + }, + { + "id": "identity_9", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000398494.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bicycle parked in a kitchen with a stove and cabinets." + } + ] + }, + { + "id": "identity_10", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000090570.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two people in a food truck, one looking at an order." + } + ] + }, + { + "id": "identity_11", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000504616.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a person in white is standing in a kitchen" + } + ] + }, + { + "id": "identity_12", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000161919.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A person is cutting a roast with a fork and knife." + } + ] + }, + { + "id": "identity_13", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000457732.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a kitchen with a table and some chairs " + } + ] + }, + { + "id": "identity_14", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000044404.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A kitchen has wood cabinets, a dishwasher, sink, and refrigerator. " + } + ] + }, + { + "id": "identity_15", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000004428.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A chef preparing food inside of a kitchen near a window." + } + ] + }, + { + "id": "identity_16", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000170558.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Adults using laptop computers while sitting at outdoor venue." + } + ] + }, + { + "id": "identity_17", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000405613.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A group of men at a table preparing food together" + } + ] + }, + { + "id": "identity_18", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000283524.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a man cutting up vegetables on top of a food cart." 
+ } + ] + }, + { + "id": "identity_19", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000037015.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Chefs are preparing food at a restaurant as patrons exit." + } + ] + }, + { + "id": "identity_20", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000071631.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Dining room table set for a casual meal, with flowers." + } + ] + }, + { + "id": "identity_21", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000491269.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A very cluttered but very clean kept kitchen." + } + ] + }, + { + "id": "identity_22", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000365363.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two people flying a kite above pine trees." + } + ] + }, + { + "id": "identity_23", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000064460.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A kitchen in a restaurant with food on the counter." + } + ] + }, + { + "id": "identity_24", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000581674.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Several kitchen workers making dishes in commercial kitchen." + } + ] + }, + { + "id": "identity_25", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000470072.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a grill that has a bunch of burgers on it" + } + ] + }, + { + "id": "identity_26", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000344806.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man laying on his stomach with a towel on his head." + } + ] + }, + { + "id": "identity_27", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000084427.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A small cluttered kitchen with a window and sink." + } + ] + }, + { + "id": "identity_28", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000317237.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A small child eating noodles from a bowl in a kitchen " + } + ] + }, + { + "id": "identity_29", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000409382.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "People on a skateboard ramp with one doing a trick and one with skateboard on his head." 
+ } + ] + }, + { + "id": "identity_30", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000570608.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a kitchen that has a microwave and some cabinets in it" + } + ] + }, + { + "id": "identity_31", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000469605.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "there is a man standing on a field talking on the phone" + } + ] + }, + { + "id": "identity_32", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000356702.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A couple of young men sitting in front of a child's laptop." + } + ] + }, + { + "id": "identity_33", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000405207.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A large bottle of wine sitting on a kitchen counter." + } + ] + }, + { + "id": "identity_34", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000472925.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A stove top with pots and pans in a kitchen." + } + ] + }, + { + "id": "identity_35", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000214704.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A young man that is sitting at a kitchen table is looking of to the side." + } + ] + }, + { + "id": "identity_36", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000279108.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A woman feeding a man food from a spoon." + } + ] + }, + { + "id": "identity_37", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000438422.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A woman giving a taste test to a man. " + } + ] + }, + { + "id": "identity_38", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000257350.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a group of people riding bikes stopped in front of a building" + } + ] + }, + { + "id": "identity_39", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000393493.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Little girl looking down at leaves with her bicycle with training wheels parked next to her." + } + ] + }, + { + "id": "identity_40", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000062426.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A sink and bath in a small room." 
+ } + ] + }, + { + "id": "identity_41", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000019380.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom with a yellow sink next to a white bath tub." + } + ] + }, + { + "id": "identity_42", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000485894.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom with a white bath tub sitting in a corner of a green room." + } + ] + }, + { + "id": "identity_43", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000446014.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A girl washing her hands while looking into a mirror crying." + } + ] + }, + { + "id": "identity_44", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000530683.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A woman in a dress riding a bicycle." + } + ] + }, + { + "id": "identity_45", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000292835.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A view of a very dirty bathroom that needs to be cleaned." + } + ] + }, + { + "id": "identity_46", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000262845.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A picture of a broken down bathroom with two sinks." + } + ] + }, + { + "id": "identity_47", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000299411.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a bathroom that has a sink and a toilet in it" + } + ] + }, + { + "id": "identity_48", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000042493.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two mountain bikers take a break on a path." + } + ] + }, + { + "id": "identity_49", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000239811.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A solid white bicycle is parked next to statues on a sidewalk." + } + ] + }, + { + "id": "identity_50", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000002024.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom looks new with nothing in it." 
+ } + ] + }, + { + "id": "identity_51", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000095133.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a bathroom scene with a wooden door and a sink in view" + } + ] + }, + { + "id": "identity_52", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000287541.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two bathroom sinks mounted against a mirror, with soap in between the two sinks." + } + ] + }, + { + "id": "identity_53", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000441488.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "MAN KNEELING BETWEEN TWO BICYCLES LOOKING AT HIS PHONE" + } + ] + }, + { + "id": "identity_54", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000179620.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet bowl with rolls of toilet paper stacked next to it\n" + } + ] + }, + { + "id": "identity_55", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000070000.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Bright loft space with large rustic dining table and bikes on the wall." + } + ] + }, + { + "id": "identity_56", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000536587.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Adjustable magnifying mirror attached to a bathroom wall" + } + ] + }, + { + "id": "identity_57", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000543877.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom features white, bowl sinks and a bathtub." + } + ] + }, + { + "id": "identity_58", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000420721.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A large furry cat pulling up the bathroom carpet" + } + ] + }, + { + "id": "identity_59", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000540162.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Toilet with upraised lid sitting next to bookshelves." + } + ] + }, + { + "id": "identity_60", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000218956.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man standing in front of a white toilet in a restroom." + } + ] + }, + { + "id": "identity_61", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000318574.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Bicycle wheels are lined up on bicycles in a row. 
" + } + ] + }, + { + "id": "identity_62", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000172899.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom scene complete with a tab, sink and toilet." + } + ] + }, + { + "id": "identity_63", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000352884.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Bathroom with destroyed walls, a sink and a mirrored cabinet. " + } + ] + }, + { + "id": "identity_64", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000394326.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black and white photo of restroom toilet with a filthy floor." + } + ] + }, + { + "id": "identity_65", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000535786.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet with a bow on it inside a bathroom." + } + ] + }, + { + "id": "identity_66", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000357684.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Several groups of people are standing outside of a building." + } + ] + }, + { + "id": "identity_67", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000576757.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "An unfinished bathroom has a toilet and tools" + } + ] + }, + { + "id": "identity_68", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000165499.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom shower with glass doors and tile walls. " + } + ] + }, + { + "id": "identity_69", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000181104.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A white bathtub sitting in a bathroom next to a sink." + } + ] + }, + { + "id": "identity_70", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000055627.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom is freshly cleaned and ready for hotel patrons." + } + ] + }, + { + "id": "identity_71", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000241364.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A mirror that is sitting behind a sink." + } + ] + }, + { + "id": "identity_72", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000209967.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A sink and vanity with overhead lights a decorative piece on the wall and a commode." 
+ } + ] + }, + { + "id": "identity_73", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000153674.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom sink with the faucet on the side" + } + ] + }, + { + "id": "identity_74", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000354444.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a lady sitting in a van with several seagulls landing on the top" + } + ] + }, + { + "id": "identity_75", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000098760.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A dog sticks its out out the window of a car. " + } + ] + }, + { + "id": "identity_76", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000311914.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A school bus parked with it's stop sign closed." + } + ] + }, + { + "id": "identity_77", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000467311.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bus and a few other vehicles that appear to be traveling down the road." + } + ] + }, + { + "id": "identity_78", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000236772.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "In the bathroom a toilet is full of ice cubes." + } + ] + }, + { + "id": "identity_79", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000049183.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A white toilet commode sits on a tile floor." + } + ] + }, + { + "id": "identity_80", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000426038.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A boat with lots of seats and large windows." + } + ] + }, + { + "id": "identity_81", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000309322.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "The Phillips 66 clocks is in front of some posters." + } + ] + }, + { + "id": "identity_82", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000318189.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "An old red and yellow car with a yellow surfboard on top." + } + ] + }, + { + "id": "identity_83", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000343322.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Blue and white antique car at intersection of city roadway." 
+ } + ] + }, + { + "id": "identity_84", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000036633.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet with a trash can and a roll of toilet paper on top " + } + ] + }, + { + "id": "identity_85", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000213546.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A mirrored bathroom features duel, white porcelain sinks and silver faucets. " + } + ] + }, + { + "id": "identity_86", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000378710.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A wall with four mounted urinals on it." + } + ] + }, + { + "id": "identity_87", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000189993.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black bear on display in a library." + } + ] + }, + { + "id": "identity_88", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000551125.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A group of three urinals mounted to a wall." + } + ] + }, + { + "id": "identity_89", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000077806.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Small dog in wire basket transported on motor scooter in city." + } + ] + }, + { + "id": "identity_90", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000546451.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A small restroom that is painted the color blue." + } + ] + }, + { + "id": "identity_91", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000444546.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a spoon and a fork that is on a table" + } + ] + }, + { + "id": "identity_92", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000147016.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet connected to a wire, next to a speaker." + } + ] + }, + { + "id": "identity_93", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000497616.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black and white photo of a cat sitting on a chair." 
+ } + ] + }, + { + "id": "identity_94", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000520208.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a spoon sitting on some food in a bowl " + } + ] + }, + { + "id": "identity_95", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000199628.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A public restroom has two sinks shaped like fancy vases. " + } + ] + }, + { + "id": "identity_96", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000280980.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man in a costume and wig is using a urinal." + } + ] + }, + { + "id": "identity_97", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000167613.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom with a white toilet and sink and checkered tile." + } + ] + }, + { + "id": "identity_98", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000142088.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Black motorcycle sitting underneath an overhang outdoors. " + } + ] + }, + { + "id": "identity_99", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000301778.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Dirty bathroom floor with a toilet and a toilet brush next to it." + } + ] + }, + { + "id": "identity_100", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000454325.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A modern restroom is equipped with fashionable sinks and urinals surrounded by architectural subway tile." + } + ] + } +] \ No newline at end of file diff --git a/data_vl_test.json b/data_vl_test.json new file mode 100644 index 0000000..1680b44 --- /dev/null +++ b/data_vl_test.json @@ -0,0 +1,54 @@ +[ + { + "id": "identity_97", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000167613.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom with a white toilet and sink and checkered tile." + } + ] + }, + { + "id": "identity_98", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000142088.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Black motorcycle sitting underneath an overhang outdoors. " + } + ] + }, + { + "id": "identity_99", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000301778.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Dirty bathroom floor with a toilet and a toilet brush next to it." 
+ } + ] + }, + { + "id": "identity_100", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000454325.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A modern restroom is equipped with fashionable sinks and urinals surrounded by architectural subway tile." + } + ] + } +] \ No newline at end of file diff --git a/data_vl_train.json b/data_vl_train.json new file mode 100644 index 0000000..6a26607 --- /dev/null +++ b/data_vl_train.json @@ -0,0 +1,1250 @@ +[ + { + "id": "identity_1", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000057870.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A restaurant has modern wooden tables and chairs." + } + ] + }, + { + "id": "identity_2", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000384029.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man preparing desserts in a kitchen covered in frosting." + } + ] + }, + { + "id": "identity_3", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000222016.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a big red telephone booth that a man is standing in" + } + ] + }, + { + "id": "identity_4", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000520950.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "the kitchen is full of spices on the rack" + } + ] + }, + { + "id": "identity_5", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000069675.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A child and woman are cooking in the kitchen." + } + ] + }, + { + "id": "identity_6", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000547471.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black and white image of a man in a suit wearing glasses walking through a door." + } + ] + }, + { + "id": "identity_7", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000122688.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "The huge clock on the wall is near a wooden table." + } + ] + }, + { + "id": "identity_8", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000392136.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A large bus and some people on the street." + } + ] + }, + { + "id": "identity_9", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000398494.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bicycle parked in a kitchen with a stove and cabinets." 
+ } + ] + }, + { + "id": "identity_10", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000090570.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two people in a food truck, one looking at an order." + } + ] + }, + { + "id": "identity_11", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000504616.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a person in white is standing in a kitchen" + } + ] + }, + { + "id": "identity_12", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000161919.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A person is cutting a roast with a fork and knife." + } + ] + }, + { + "id": "identity_13", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000457732.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a kitchen with a table and some chairs " + } + ] + }, + { + "id": "identity_14", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000044404.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A kitchen has wood cabinets, a dishwasher, sink, and refrigerator. " + } + ] + }, + { + "id": "identity_15", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000004428.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A chef preparing food inside of a kitchen near a window." + } + ] + }, + { + "id": "identity_16", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000170558.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Adults using laptop computers while sitting at outdoor venue." + } + ] + }, + { + "id": "identity_17", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000405613.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A group of men at a table preparing food together" + } + ] + }, + { + "id": "identity_18", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000283524.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a man cutting up vegetables on top of a food cart." + } + ] + }, + { + "id": "identity_19", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000037015.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Chefs are preparing food at a restaurant as patrons exit." + } + ] + }, + { + "id": "identity_20", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000071631.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Dining room table set for a casual meal, with flowers." 
+ } + ] + }, + { + "id": "identity_21", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000491269.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A very cluttered but very clean kept kitchen." + } + ] + }, + { + "id": "identity_22", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000365363.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two people flying a kite above pine trees." + } + ] + }, + { + "id": "identity_23", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000064460.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A kitchen in a restaurant with food on the counter." + } + ] + }, + { + "id": "identity_24", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000581674.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Several kitchen workers making dishes in commercial kitchen." + } + ] + }, + { + "id": "identity_25", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000470072.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a grill that has a bunch of burgers on it" + } + ] + }, + { + "id": "identity_26", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000344806.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man laying on his stomach with a towel on his head." + } + ] + }, + { + "id": "identity_27", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000084427.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A small cluttered kitchen with a window and sink." + } + ] + }, + { + "id": "identity_28", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000317237.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A small child eating noodles from a bowl in a kitchen " + } + ] + }, + { + "id": "identity_29", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000409382.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "People on a skateboard ramp with one doing a trick and one with skateboard on his head." 
+ } + ] + }, + { + "id": "identity_30", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000570608.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a kitchen that has a microwave and some cabinets in it" + } + ] + }, + { + "id": "identity_31", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000469605.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "there is a man standing on a field talking on the phone" + } + ] + }, + { + "id": "identity_32", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000356702.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A couple of young men sitting in front of a child's laptop." + } + ] + }, + { + "id": "identity_33", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000405207.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A large bottle of wine sitting on a kitchen counter." + } + ] + }, + { + "id": "identity_34", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000472925.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A stove top with pots and pans in a kitchen." + } + ] + }, + { + "id": "identity_35", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000214704.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A young man that is sitting at a kitchen table is looking of to the side." + } + ] + }, + { + "id": "identity_36", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000279108.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A woman feeding a man food from a spoon." + } + ] + }, + { + "id": "identity_37", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000438422.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A woman giving a taste test to a man. " + } + ] + }, + { + "id": "identity_38", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000257350.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a group of people riding bikes stopped in front of a building" + } + ] + }, + { + "id": "identity_39", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000393493.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Little girl looking down at leaves with her bicycle with training wheels parked next to her." + } + ] + }, + { + "id": "identity_40", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000062426.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A sink and bath in a small room." 
+ } + ] + }, + { + "id": "identity_41", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000019380.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom with a yellow sink next to a white bath tub." + } + ] + }, + { + "id": "identity_42", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000485894.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom with a white bath tub sitting in a corner of a green room." + } + ] + }, + { + "id": "identity_43", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000446014.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A girl washing her hands while looking into a mirror crying." + } + ] + }, + { + "id": "identity_44", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000530683.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A woman in a dress riding a bicycle." + } + ] + }, + { + "id": "identity_45", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000292835.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A view of a very dirty bathroom that needs to be cleaned." + } + ] + }, + { + "id": "identity_46", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000262845.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A picture of a broken down bathroom with two sinks." + } + ] + }, + { + "id": "identity_47", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000299411.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a bathroom that has a sink and a toilet in it" + } + ] + }, + { + "id": "identity_48", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000042493.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two mountain bikers take a break on a path." + } + ] + }, + { + "id": "identity_49", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000239811.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A solid white bicycle is parked next to statues on a sidewalk." + } + ] + }, + { + "id": "identity_50", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000002024.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom looks new with nothing in it." 
+ } + ] + }, + { + "id": "identity_51", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000095133.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a bathroom scene with a wooden door and a sink in view" + } + ] + }, + { + "id": "identity_52", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000287541.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Two bathroom sinks mounted against a mirror, with soap in between the two sinks." + } + ] + }, + { + "id": "identity_53", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000441488.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "MAN KNEELING BETWEEN TWO BICYCLES LOOKING AT HIS PHONE" + } + ] + }, + { + "id": "identity_54", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000179620.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet bowl with rolls of toilet paper stacked next to it\n" + } + ] + }, + { + "id": "identity_55", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000070000.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Bright loft space with large rustic dining table and bikes on the wall." + } + ] + }, + { + "id": "identity_56", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000536587.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Adjustable magnifying mirror attached to a bathroom wall" + } + ] + }, + { + "id": "identity_57", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000543877.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom features white, bowl sinks and a bathtub." + } + ] + }, + { + "id": "identity_58", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000420721.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A large furry cat pulling up the bathroom carpet" + } + ] + }, + { + "id": "identity_59", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000540162.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Toilet with upraised lid sitting next to bookshelves." + } + ] + }, + { + "id": "identity_60", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000218956.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man standing in front of a white toilet in a restroom." + } + ] + }, + { + "id": "identity_61", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000318574.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Bicycle wheels are lined up on bicycles in a row. 
" + } + ] + }, + { + "id": "identity_62", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000172899.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom scene complete with a tab, sink and toilet." + } + ] + }, + { + "id": "identity_63", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000352884.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Bathroom with destroyed walls, a sink and a mirrored cabinet. " + } + ] + }, + { + "id": "identity_64", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000394326.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black and white photo of restroom toilet with a filthy floor." + } + ] + }, + { + "id": "identity_65", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000535786.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet with a bow on it inside a bathroom." + } + ] + }, + { + "id": "identity_66", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000357684.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Several groups of people are standing outside of a building." + } + ] + }, + { + "id": "identity_67", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000576757.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "An unfinished bathroom has a toilet and tools" + } + ] + }, + { + "id": "identity_68", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000165499.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom shower with glass doors and tile walls. " + } + ] + }, + { + "id": "identity_69", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000181104.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A white bathtub sitting in a bathroom next to a sink." + } + ] + }, + { + "id": "identity_70", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000055627.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom is freshly cleaned and ready for hotel patrons." + } + ] + }, + { + "id": "identity_71", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000241364.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A mirror that is sitting behind a sink." + } + ] + }, + { + "id": "identity_72", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000209967.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A sink and vanity with overhead lights a decorative piece on the wall and a commode." 
+ } + ] + }, + { + "id": "identity_73", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000153674.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bathroom sink with the faucet on the side" + } + ] + }, + { + "id": "identity_74", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000354444.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a lady sitting in a van with several seagulls landing on the top" + } + ] + }, + { + "id": "identity_75", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000098760.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A dog sticks its out out the window of a car. " + } + ] + }, + { + "id": "identity_76", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000311914.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A school bus parked with it's stop sign closed." + } + ] + }, + { + "id": "identity_77", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000467311.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A bus and a few other vehicles that appear to be traveling down the road." + } + ] + }, + { + "id": "identity_78", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000236772.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "In the bathroom a toilet is full of ice cubes." + } + ] + }, + { + "id": "identity_79", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000049183.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A white toilet commode sits on a tile floor." + } + ] + }, + { + "id": "identity_80", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000426038.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A boat with lots of seats and large windows." + } + ] + }, + { + "id": "identity_81", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000309322.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "The Phillips 66 clocks is in front of some posters." + } + ] + }, + { + "id": "identity_82", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000318189.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "An old red and yellow car with a yellow surfboard on top." + } + ] + }, + { + "id": "identity_83", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000343322.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Blue and white antique car at intersection of city roadway." 
+ } + ] + }, + { + "id": "identity_84", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000036633.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet with a trash can and a roll of toilet paper on top " + } + ] + }, + { + "id": "identity_85", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000213546.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A mirrored bathroom features duel, white porcelain sinks and silver faucets. " + } + ] + }, + { + "id": "identity_86", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000378710.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A wall with four mounted urinals on it." + } + ] + }, + { + "id": "identity_87", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000189993.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black bear on display in a library." + } + ] + }, + { + "id": "identity_88", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000551125.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A group of three urinals mounted to a wall." + } + ] + }, + { + "id": "identity_89", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000077806.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "Small dog in wire basket transported on motor scooter in city." + } + ] + }, + { + "id": "identity_90", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000546451.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A small restroom that is painted the color blue." + } + ] + }, + { + "id": "identity_91", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000444546.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a spoon and a fork that is on a table" + } + ] + }, + { + "id": "identity_92", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000147016.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A toilet connected to a wire, next to a speaker." + } + ] + }, + { + "id": "identity_93", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000497616.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A black and white photo of a cat sitting on a chair." 
+ } + ] + }, + { + "id": "identity_94", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000520208.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "a spoon sitting on some food in a bowl " + } + ] + }, + { + "id": "identity_95", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000199628.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A public restroom has two sinks shaped like fancy vases. " + } + ] + }, + { + "id": "identity_96", + "conversations": [ + { + "from": "user", + "value": "COCO Yes: <|vision_start|>/root/PMN_WS/qwen-test/coco_2014_image/COCO_train2014_000000280980.jpg<|vision_end|>" + }, + { + "from": "assistant", + "value": "A man in a costume and wig is using a urinal." + } + ] + } +] \ No newline at end of file diff --git a/model_download.py b/model_download.py new file mode 100644 index 0000000..5ae5f4c --- /dev/null +++ b/model_download.py @@ -0,0 +1,69 @@ + +import torch +from datasets import Dataset +from modelscope import snapshot_download, AutoTokenizer +from qwen_vl_utils import process_vision_info +from transformers import ( + TrainingArguments, + Trainer, + DataCollatorForSeq2Seq, + Qwen2VLForConditionalGeneration, + AutoProcessor, +) + +import json + + +# 在modelscope上下载Qwen2-VL模型到本地目录下 +# model_dir = snapshot_download( +# model_id="Qwen/Qwen2-VL-2B-Instruct", +# cache_dir=model_path, +# revision="master" +# ) +# print(f"模型已下载到: {model_dir}") + +model_dir = "/root/PMN_WS/qwen-test/model/Qwen/Qwen2-VL-2B-Instruct" +# 使用Transformers加载模型权重 +tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True) +processor = AutoProcessor.from_pretrained(model_dir) + +model = Qwen2VLForConditionalGeneration.from_pretrained( + model_dir, torch_dtype="auto", device_map="auto" +) + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + {"type": "text", "text": "Describe this image."}, + ], + } +] + +# Preparation for inference +text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True +) +image_inputs, video_inputs = process_vision_info(messages) +inputs = processor( + text=[text], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", +) +inputs = inputs.to("cuda") + +# Inference: Generation of the output +generated_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) +] +output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False +) +print(output_text) \ No newline at end of file diff --git a/train.ipynb b/train.ipynb new file mode 100644 index 0000000..a8c5d7a --- /dev/null +++ b/train.ipynb @@ -0,0 +1,1599 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1251ab7b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-06-25 19:34:00,882 - modelscope - INFO - PyTorch version 2.6.0 Found.\n", + "2025-06-25 19:34:00,884 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer\n", + "2025-06-25 19:34:04,723 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 843fe399ac811db92736a820f07f468e and a total number of 964 components indexed\n" + ] + } + ], + "source": [ + "import torch\n", + "from datasets import Dataset\n", + "from modelscope import snapshot_download, AutoTokenizer\n", + "from swanlab.integration.transformers import SwanLabCallback\n", + "from qwen_vl_utils import process_vision_info\n", + "from peft import LoraConfig, TaskType, get_peft_model, PeftModel\n", + "from transformers import (\n", + " TrainingArguments,\n", + " Trainer,\n", + " DataCollatorForSeq2Seq,\n", + " Qwen2VLForConditionalGeneration,\n", + " AutoProcessor,\n", + ")\n", + "import swanlab\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "394d869a", + "metadata": {}, + "source": [ + "# 1-加载模型" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d6fe9c76", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", + "Loading checkpoint shards: 100%|██████████| 2/2 [01:34<00:00, 47.11s/it]\n" + ] + } + ], + "source": [ + "model_dir = \"/root/PMN_WS/qwen-test/model/Qwen/Qwen2-VL-2B-Instruct\"\n", + "# 使用Transformers加载模型权重\n", + "tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)\n", + "processor = AutoProcessor.from_pretrained(model_dir)\n", + "\n", + "model = Qwen2VLForConditionalGeneration.from_pretrained(\n", + " model_dir, torch_dtype=torch.bfloat16, \n", + " device_map=\"auto\",trust_remote_code=True\n", + ")\n", + "model.enable_input_require_grads() # 开启梯度检查点时,要执行该方法" + ] + }, + { + "cell_type": "markdown", + "id": "43ddce3e", + "metadata": {}, + "source": [ + "测试模型能否正常使用" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "842158a4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "text:<|im_start|>system\n", + "You are a helpful assistant.<|im_end|>\n", + "<|im_start|>user\n", + "<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n", + "<|im_start|>assistant\n", + "\n", + "image_inputs:[]\n", + "video_inputs:None\n", + "[\"The image depicts a serene beach scene with a woman and a dog. The woman is sitting on the sand, wearing a plaid shirt and black pants, and appears to be smiling. She is holding the dog's paw in a high-five gesture. The dog, which is a large breed, is sitting on the sand with its front paws raised, possibly in response to the woman's gesture. The background shows the ocean with gentle waves, and the sky is clear with a soft light, suggesting it might be either sunrise or sunset. 
The overall atmosphere is peaceful and joyful.\"]\n" + ] + } + ], + "source": [ + "messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\",\n", + " },\n", + " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", + " ],\n", + "\n", + " }\n", + "]\n", + "# Preparation for inference\n", + "text = processor.apply_chat_template(\n", + " messages, tokenize=False, add_generation_prompt=True\n", + ")\n", + "print(f'text:{text}')\n", + "image_inputs, video_inputs = process_vision_info(messages)\n", + "print(f'image_inputs:{image_inputs}')\n", + "print(f'video_inputs:{video_inputs}')\n", + "inputs = processor(\n", + " text=[text],\n", + " images=image_inputs,\n", + " videos=video_inputs,\n", + " padding=True,\n", + " return_tensors=\"pt\",\n", + ")\n", + "inputs = inputs.to(\"cuda\")\n", + "\n", + "# Inference: Generation of the output\n", + "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", + "generated_ids_trimmed = [\n", + " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", + "]\n", + "output_text = processor.batch_decode(\n", + " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", + ")\n", + "print(output_text)" + ] + }, + { + "cell_type": "markdown", + "id": "cdd20a96", + "metadata": {}, + "source": [ + "# 2-数据集预处理:\n", + "2-1 拆分成训练集和测试集,保存为data_vl_train.json和data_vl_test.json" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "e4c5f3b9", + "metadata": {}, + "outputs": [], + "source": [ + "train_json_path = \"/root/PMN_WS/qwen-test/data_vl.json\"\n", + "with open(train_json_path, 'r') as f:\n", + " data = json.load(f)\n", + " train_data = data[:-4]\n", + " test_data = data[-4:]\n", + "\n", + "with open(\"data_vl_train.json\", \"w\") as f:\n", + " json.dump(train_data, f, indent=4, ensure_ascii=False)\n", + "\n", + "with open(\"data_vl_test.json\", \"w\") as f:\n", + " json.dump(test_data, f, indent=4, ensure_ascii=False)" + ] + }, + { + "cell_type": "markdown", + "id": "9ab267e0", + "metadata": {}, + "source": [ + "2-2 处理数据集:读取json文件,将input和label转换成模型训练需要的形式" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "0035b338", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating train split: 96 examples [00:00, 258.76 examples/s]\n", + "Map: 2%|▏ | 2/96 [00:00<00:07, 12.52 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 6%|▋ | 6/96 [00:00<00:06, 13.59 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 8%|▊ | 8/96 [00:00<00:06, 13.89 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 12%|█▎ | 12/96 [00:00<00:06, 13.60 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 17%|█▋ | 16/96 [00:01<00:05, 13.91 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n" + ] + 
}, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 19%|█▉ | 18/96 [00:01<00:05, 14.24 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 23%|██▎ | 22/96 [00:01<00:05, 13.85 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 25%|██▌ | 24/96 [00:01<00:05, 13.87 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 29%|██▉ | 28/96 [00:02<00:04, 13.96 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 31%|███▏ | 30/96 [00:02<00:04, 13.89 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 35%|███▌ | 34/96 [00:02<00:04, 13.97 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 38%|███▊ | 36/96 [00:02<00:04, 13.85 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 42%|████▏ | 40/96 [00:02<00:03, 14.01 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 46%|████▌ | 44/96 [00:03<00:03, 14.53 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 48%|████▊ | 46/96 [00:03<00:03, 14.57 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 52%|█████▏ | 50/96 [00:03<00:03, 13.71 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 54%|█████▍ | 52/96 [00:03<00:03, 14.10 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 58%|█████▊ | 56/96 [00:04<00:02, 14.38 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 60%|██████ | 58/96 [00:04<00:02, 13.22 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 62%|██████▎ | 60/96 [00:04<00:05, 6.39 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 65%|██████▍ | 62/96 [00:05<00:07, 4.68 examples/s]" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 67%|██████▋ | 64/96 [00:06<00:07, 4.49 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 69%|██████▉ | 66/96 [00:06<00:05, 5.02 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 71%|███████ | 68/96 [00:06<00:04, 6.46 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 74%|███████▍ | 71/96 [00:06<00:02, 9.21 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 76%|███████▌ | 73/96 [00:07<00:02, 10.53 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 80%|████████ | 77/96 [00:07<00:01, 12.73 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 84%|████████▍ | 81/96 [00:07<00:01, 13.80 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 86%|████████▋ | 83/96 [00:07<00:00, 14.19 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 91%|█████████ | 87/96 [00:07<00:00, 14.55 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 93%|█████████▎| 89/96 [00:08<00:00, 14.44 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 97%|█████████▋| 93/96 [00:08<00:00, 13.17 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 99%|█████████▉| 95/96 [00:08<00:00, 12.76 examples/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 100%|██████████| 96/96 [00:09<00:00, 9.66 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|██████████| 96/96 [00:03<00:00, 31.86 examples/s]\n" + ] + } + ], + "source": [ + "def process_func(example):\n", + " \"\"\"\n", + " 将数据集进行预处理\n", + " \"\"\"\n", + " MAX_LENGTH = 8192\n", + " input_ids, attention_mask, labels = [], [], []\n", + " conversation = example[\"conversations\"]\n", + " input_content = conversation[0][\"value\"]\n", + " output_content = conversation[1][\"value\"]\n", + " file_path = input_content.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0] # 获取图像路径\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " 
\"type\": \"image\",\n", + " \"image\": f\"{file_path}\",\n", + " \"resized_height\": 280,\n", + " \"resized_width\": 280,\n", + " },\n", + " {\n", + " \n", + " \"type\": \"text\", \n", + " \"text\": \"COCO Yes:\"\n", + " },\n", + " ],\n", + " }\n", + " ]\n", + "\n", + " # 获取文本 \n", + " text = processor.apply_chat_template(\n", + " messages, tokenize=False, add_generation_prompt=True\n", + " ) \n", + "\n", + " image_inputs, video_inputs = process_vision_info(messages) # 获取数据数据(预处理过)\n", + " inputs = processor(\n", + " text=[text],\n", + " images=image_inputs,\n", + " videos=video_inputs,\n", + " padding=True,\n", + " return_tensors=\"pt\",\n", + " )\n", + " inputs = {key: value.tolist() for key, value in inputs.items()} #tensor -> list,为了方便拼接\n", + " instruction = inputs \n", + " # for key, value in inputs.items():\n", + " # print(key,'\\n')\n", + " # \"\"\"\n", + " # inputs:dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])\n", + " # \"\"\"\n", + " \n", + "\n", + " response = tokenizer(f\"{output_content}\", add_special_tokens=False)\n", + " # for key, value in response.items():\n", + " # print(key,value)\n", + " # \"\"\"\n", + " # input_ids [32, 10729, 702, 6481, 22360, 12632, 323, 25904, 13]\n", + " # attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1]\n", + " # \"\"\"\n", + " \n", + " # 输入text+回答text\n", + " input_ids = (\n", + " instruction[\"input_ids\"][0] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n", + " )\n", + " # 哪些位置是有效内容(1有效,0无效)\n", + " attention_mask = instruction[\"attention_mask\"][0] + response[\"attention_mask\"] + [1]\n", + " # 模型不学习部分,用-100标记\n", + " labels = (\n", + " [-100] * len(instruction[\"input_ids\"][0])\n", + " + response[\"input_ids\"]\n", + " + [tokenizer.pad_token_id]\n", + " )\n", + "\n", + " if len(input_ids) > MAX_LENGTH: # 做一个截断\n", + " input_ids = input_ids[:MAX_LENGTH]\n", + " attention_mask = attention_mask[:MAX_LENGTH]\n", + " labels = labels[:MAX_LENGTH]\n", + "\n", + " input_ids = torch.tensor(input_ids)\n", + " attention_mask = torch.tensor(attention_mask)\n", + " labels = torch.tensor(labels)\n", + "\n", + "\n", + "\n", + " inputs['pixel_values'] = torch.tensor(inputs['pixel_values'])\n", + " inputs['image_grid_thw'] = torch.tensor(inputs['image_grid_thw']).squeeze(0) #由(1,h,w)变换为(h,w)\n", + " print(type(input_ids))\n", + "\n", + " return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels,\n", + " \"pixel_values\": inputs['pixel_values'], \"image_grid_thw\": inputs['image_grid_thw']}\n", + "\n", + "train_ds = Dataset.from_json(\"data_vl_train.json\")\n", + "train_dataset = train_ds.map(process_func)\n", + "# 保存为本地磁盘目录,默认保存为list\n", + "train_dataset.save_to_disk(\"./processed_train_dataset\")" + ] + }, + { + "cell_type": "markdown", + "id": "74a4927d", + "metadata": {}, + "source": [ + "2-3 读取数据是否保存成功" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "6205a265", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset size: 96\n", + "✅ Forward 成功!\n", + "Loss: tensor(2.3434, device='cuda:0')\n" + ] + } + ], + "source": [ + "from datasets import load_from_disk\n", + "\n", + "# 加载保存的数据集\n", + "loaded_dataset = load_from_disk(\"./processed_train_dataset\")\n", + "\n", + "# 查看数据集大小\n", + "print(f\"Dataset size: {len(loaded_dataset)}\")\n", + "\n", + "# 查看第一条样本内容(字段和类型)\n", + "sample = loaded_dataset[0]\n", + "# 需要转换的字段\n", + "required_keys = [\"input_ids\", \"attention_mask\", \"labels\", \"pixel_values\", 
\"image_grid_thw\"]\n", + "batch = {}\n", + "\n", + "for key in required_keys:\n", + " value = sample[key]\n", + "\n", + " # 转成 Tensor(避免直接转 dict)\n", + " value = torch.tensor(value)\n", + "\n", + " # 添加 batch 维度\n", + " if value.dim() == 1 or (key == \"image_grid_thw\" and value.dim() == 2):\n", + " value = value.unsqueeze(0)\n", + "\n", + " # 放到模型所在设备\n", + " batch[key] = value.to(model.device)\n", + "\n", + "\n", + "\n", + "# 验证模型能否 forward\n", + "model.eval()\n", + "with torch.no_grad():\n", + " outputs = model(**batch)\n", + "\n", + "# 打印结果\n", + "print(\"✅ Forward 成功!\")\n", + "print(\"Loss:\", outputs.loss if hasattr(outputs, \"loss\") else None)\n" + ] + }, + { + "cell_type": "markdown", + "id": "efbb5025", + "metadata": {}, + "source": [ + "# 训练配置" + ] + }, + { + "cell_type": "markdown", + "id": "428dc34d", + "metadata": {}, + "source": [ + "查看设备配置信息" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "e0085d74", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🧠 CPU 信息:\n", + "- 处理器: x86_64\n", + "- 核心数: 64 物理核心 / 128 逻辑核心\n", + "\n", + "🧮 内存信息:\n", + "- 总内存: 2163.58 GB\n", + "- 已使用: 173.8 GB\n", + "- 可用: 1962.2 GB\n", + "\n", + "🎮 GPU 信息:\n", + "- GPU 0: NVIDIA A100-SXM4-80GB\n", + " - 显存总量: 25.76 GB\n", + " - 当前占用: 4.69 GB\n", + " - 保留显存: 5.2 GB\n" + ] + } + ], + "source": [ + "import torch\n", + "import psutil\n", + "import platform\n", + "import os\n", + "\n", + "# CPU 信息\n", + "print(\"🧠 CPU 信息:\")\n", + "print(f\"- 处理器: {platform.processor()}\")\n", + "print(f\"- 核心数: {psutil.cpu_count(logical=False)} 物理核心 / {psutil.cpu_count(logical=True)} 逻辑核心\")\n", + "\n", + "# 内存信息\n", + "mem = psutil.virtual_memory()\n", + "print(\"\\n🧮 内存信息:\")\n", + "print(f\"- 总内存: {round(mem.total / 1e9, 2)} GB\")\n", + "print(f\"- 已使用: {round(mem.used / 1e9, 2)} GB\")\n", + "print(f\"- 可用: {round(mem.available / 1e9, 2)} GB\")\n", + "\n", + "# GPU 信息(需安装 NVIDIA 驱动)\n", + "if torch.cuda.is_available():\n", + " print(\"\\n🎮 GPU 信息:\")\n", + " for i in range(torch.cuda.device_count()):\n", + " print(f\"- GPU {i}: {torch.cuda.get_device_name(i)}\")\n", + " print(f\" - 显存总量: {round(torch.cuda.get_device_properties(i).total_memory / 1e9, 2)} GB\")\n", + " print(f\" - 当前占用: {round(torch.cuda.memory_allocated(i) / 1e9, 2)} GB\")\n", + " print(f\" - 保留显存: {round(torch.cuda.memory_reserved(i) / 1e9, 2)} GB\")\n", + "else:\n", + " print(\"\\n🚫 当前没有可用的 CUDA GPU\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5a5841bb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/peft/mapping_func.py:73: UserWarning: You are trying to modify a model with PEFT for a second time. 
If you want to reload the model with a different config, make sure to call `.unload()` before.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# 配置LoRA\n", + "config = LoraConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", + " inference_mode=False, # 训练模式\n", + " r=16, # 越大表达能力越强,参数越多\n", + " lora_alpha=32, # = r 或 2r,缩放因子,调节学习率与初始化范围\n", + " lora_dropout=0.05, # Dropout 比例\n", + " bias=\"none\",\n", + ")\n", + "\n", + "# 获取LoRA模型\n", + "peft_model = get_peft_model(model, config)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e3824dd6", + "metadata": {}, + "outputs": [], + "source": [ + "# 配置训练参数\n", + "args = TrainingArguments(\n", + " output_dir=\"./output/qwen2-vl-lora\",\n", + " per_device_train_batch_size=1,\n", + " gradient_accumulation_steps=8,\n", + " logging_steps=10,\n", + " logging_first_step=True,\n", + " save_safetensors=False, # 禁用 safetensors\n", + " num_train_epochs=2,\n", + " save_steps=100,\n", + " bf16=True, \n", + " learning_rate=1e-4,\n", + " save_on_each_node=True,\n", + " gradient_checkpointing=True,\n", + " remove_unused_columns=False, # 必须关\n", + " dataloader_num_workers=2, # 防止爆内存\n", + " report_to=\"none\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5692090c", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_from_disk\n", + "# 设置SwanLab回调\n", + "train_dataset = load_from_disk(\"./processed_train_dataset\")\n", + "\n", + "swanlab_callback = SwanLabCallback(\n", + " project=\"Qwen2-VL-finetune\",\n", + " experiment_name=\"qwen2-vl-coco2014\",\n", + " config={\n", + " \"model\": \"https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct\",\n", + " \"dataset\": \"https://modelscope.cn/datasets/modelscope/coco_2014_caption/quickstart\",\n", + " \"github\": \"https://github.com/datawhalechina/self-llm\",\n", + " \"prompt\": \"COCO Yes: \",\n", + " \"train_data_number\": len(train_dataset),\n", + " \"lora_rank\": 64,\n", + " \"lora_alpha\": 16,\n", + " \"lora_dropout\": 0.1,\n", + " },\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d16e718e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. 
Note that empty label_names list will be used instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Tracking run with swanlab version 0.6.4 \n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Run data will be saved locally in \u001b[35m\u001b[1m/root/PMN_WS/qwen-test/swanlog/run-20250625_194816-a3b1799d\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 👋 Hi \u001b[1m\u001b[39mpumpkin_nan\u001b[0m\u001b[0m, welcome to swanlab!\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Syncing run \u001b[33mqwen2-vl-coco2014\u001b[0m to the cloud\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/e8rv1gel0mun1e9c74942\u001b[0m\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " Show Iframe\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", + " warnings.warn(\n", + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [24/24 01:24, Epoch 2/2]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
13.471600
102.685700
201.818100

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/e8rv1gel0mun1e9c74942\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Waiting for uploading complete\n", + " \n" + ] + } + ], + "source": [ + "# 配置Trainer\n", + "from transformers import Trainer\n", + "from torch.nn.utils.rnn import pad_sequence\n", + "\n", + "\n", + "class MyTrainer(Trainer):\n", + " def compute_loss(self, model, inputs, return_outputs=False, **kwargs):\n", + " if \"num_items_in_batch\" in kwargs:\n", + " kwargs.pop(\"num_items_in_batch\")\n", + " if \"num_items_in_batch\" in inputs:\n", + " inputs.pop(\"num_items_in_batch\")\n", + "\n", + " outputs = model(**inputs)\n", + " loss = outputs.loss if hasattr(outputs, \"loss\") else outputs[0]\n", + " return (loss, outputs) if return_outputs else loss\n", + "\n", + "class MultimodalCollator:\n", + " def __init__(self, tokenizer):\n", + " self.tokenizer = tokenizer\n", + "\n", + " def __call__(self, batch):\n", + " input_ids = [torch.tensor(x[\"input_ids\"]) for x in batch]\n", + " attention_mask = [torch.tensor(x[\"attention_mask\"]) for x in batch]\n", + " labels = [torch.tensor(x[\"labels\"]) for x in batch]\n", + " pixel_values = [\n", + " torch.tensor(x[\"pixel_values\"]) if not isinstance(x[\"pixel_values\"], torch.Tensor) else x[\"pixel_values\"] \n", + " for x in batch\n", + " ]\n", + " image_grid_thw = [\n", + " torch.tensor(x[\"image_grid_thw\"]) if not isinstance(x[\"image_grid_thw\"], torch.Tensor) else x[\"image_grid_thw\"] \n", + " for x in batch\n", + " ]\n", + " # padding sequences\n", + " input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)\n", + " attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)\n", + " labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)\n", + "\n", + " # stack images tensors (假设都是相同shape)\n", + " pixel_values = torch.stack(pixel_values)\n", + " image_grid_thw = torch.stack(image_grid_thw)\n", + "\n", + " return {\n", + " \"input_ids\": input_ids,\n", + " \"attention_mask\": attention_mask,\n", + " \"labels\": labels,\n", + " \"pixel_values\": pixel_values,\n", + " \"image_grid_thw\": image_grid_thw,\n", + " }\n", + "\n", + "\n", + "trainer = MyTrainer(\n", + " model=peft_model,\n", + " args=args,\n", + " train_dataset=train_dataset,\n", + " data_collator=MultimodalCollator(tokenizer=tokenizer),\n", + " callbacks=[swanlab_callback],\n", + ")\n", + "\n", + "# 开启模型训练\n", + "trainer.train()\n", + "# 训练完成后手动保存\n", + "peft_model.save_pretrained(args.output_dir)\n", + "# tokenizer.save_pretrained(args.output_dir)\n", + "swanlab.finish()" + ] + }, + { + "cell_type": "markdown", + "id": "73c7ef64", + "metadata": {}, + "source": [ + "# 测试模式" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e40dd83", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.conda/envs/qwen2.5.3/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:167: UserWarning: Already found a `peft_config` attribute in the model. 
This will lead to having multiple adapters in the model. Make sure to know what you are doing!\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# 配置LoRA\n", + "val_config = LoraConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n", + " inference_mode=True, # 推理\n", + " r=16, # 越大表达能力越强,参数越多\n", + " lora_alpha=32, # = r 或 2r,缩放因子,调节学习率与初始化范围\n", + " lora_dropout=0.05, # Dropout 比例\n", + " bias=\"none\",\n", + ")\n", + "\n", + "\n", + "# 获取测试模型\n", + "val_peft_model = PeftModel.from_pretrained(model,model_id=\"/root/PMN_WS/qwen-test/output/qwen2-vl-lora/checkpoint-24\", \n", + " config=val_config)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7637ae3e", + "metadata": {}, + "outputs": [], + "source": [ + "def predict(messages, model):\n", + " # 准备推理\n", + " text = processor.apply_chat_template(\n", + " messages, tokenize=False, add_generation_prompt=True\n", + " )\n", + " image_inputs, video_inputs = process_vision_info(messages)\n", + " inputs = processor(\n", + " text=[text],\n", + " images=image_inputs,\n", + " videos=video_inputs,\n", + " padding=True,\n", + " return_tensors=\"pt\",\n", + " )\n", + " inputs = inputs.to(\"cuda\")\n", + "\n", + " # 生成输出\n", + " generated_ids = model.generate(**inputs, max_new_tokens=128)\n", + " generated_ids_trimmed = [\n", + " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", + " ]\n", + " output_text = processor.batch_decode(\n", + " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", + " )\n", + " \n", + " return output_text[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "040a4f9e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Tracking run with swanlab version 0.6.4 \n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Run data will be saved locally in \u001b[35m\u001b[1m/root/PMN_WS/qwen-test/swanlog/run-20250625_205322-8ce21ea3\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 👋 Hi \u001b[1m\u001b[39mpumpkin_nan\u001b[0m\u001b[0m, welcome to swanlab!\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Syncing run \u001b[33mrabbit-5\u001b[0m to the cloud\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/go1e95rfocx0951w3l8r8\u001b[0m\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " Show Iframe\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'role': 'assistant', 'content': 'A bathroom with a toilet and sink.'}\n", + "{'role': 'assistant', 'content': 'A motorcycle parked under a roof.'}\n", + "{'role': 'assistant', 'content': 'A toilet with a lid up next to a toilet brush.'}\n", + "{'role': 'assistant', 'content': 'A bathroom with urinals and sinks.'}\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🏠 View project at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://swanlab.cn/@pumpkin_nan/Qwen2-VL-finetune/runs/go1e95rfocx0951w3l8r8\u001b[0m\u001b[0m\n", + "\u001b[1m\u001b[34mswanlab\u001b[0m\u001b[0m: Waiting for uploading complete\n", + " \n" + ] + } + ], + "source": [ + "# 读取测试数据\n", + "with open(\"data_vl_test.json\", \"r\") as f:\n", + " test_dataset = json.load(f)\n", + "\n", + "# ✅ 初始化,必须最先调用\n", + "\n", + "swanlab.init(\n", + " project=\"Qwen2-VL-finetune\",\n", + " task=\"test\",\n", + " run_name=\"qwen2-vl-eval-ckpt24\",\n", + ")\n", + "\n", + "\n", + "\n", + "test_image_list = []\n", + "for item in test_dataset:\n", + " input_image_prompt = item[\"conversations\"][0][\"value\"]\n", + " # 去掉前后的<|vision_start|>和<|vision_end|>\n", + " origin_image_path = input_image_prompt.split(\"<|vision_start|>\")[1].split(\"<|vision_end|>\")[0]\n", + " \n", + " messages = [{\n", + " \"role\": \"user\", \n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\", \n", + " \"image\": origin_image_path\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": \"COCO Yes:\"\n", + " }\n", + " ]}]\n", + " \n", + " response = predict(messages, val_peft_model)\n", + " messages.append({\"role\": \"assistant\", \"content\": f\"{response}\"})\n", + " print(messages[-1])\n", + "\n", + " test_image_list.append(swanlab.Image(origin_image_path, caption=response))\n", + "\n", + "swanlab.log({\"Prediction\": test_image_list})\n", + "\n", + "# 在Jupyter Notebook中运行时要停止SwanLab记录,需要调用swanlab.finish()\n", + "swanlab.finish()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f38cb046", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['A woman in a blue shirt is sitting at a table with a child in a pink shirt. The child is holding a hoop with ice cream cones on it. The child is also holding a phone. The woman is looking at the child. The child is looking at the phone. The woman is wearing a blue shirt. The child is wearing a pink shirt. The child is holding a phone. The woman is looking at the child. The child is looking at the phone. The woman is wearing a blue shirt. The child is wearing a pink shirt. The child is holding a phone. The woman is looking at the child. 
The child']\n" + ] + } + ], + "source": [ + "messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": \"/root/PMN_WS/VLM_test/image/吃冰激凌.jpg\",\n", + " },\n", + " {\"type\": \"text\", \n", + " \"text\": \"Describe this image.\"},\n", + " ],\n", + "\n", + " }\n", + "]\n", + "# Preparation for inference\n", + "text = processor.apply_chat_template(\n", + " messages, tokenize=False, add_generation_prompt=True\n", + ")\n", + "\n", + "image_inputs, video_inputs = process_vision_info(messages)\n", + "\n", + "inputs = processor(\n", + " text=[text],\n", + " images=image_inputs,\n", + " videos=video_inputs,\n", + " padding=True,\n", + " return_tensors=\"pt\",\n", + ")\n", + "inputs = inputs.to(\"cuda\")\n", + "\n", + "# Inference: Generation of the output\n", + "# val_peft_model\n", + "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", + "generated_ids_trimmed = [\n", + " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", + "]\n", + "output_text = processor.batch_decode(\n", + " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", + ")\n", + "print(output_text)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "qwen2.5.3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train.json b/train.json new file mode 100644 index 0000000..e69de29
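A possible follow-up once the adapter in checkpoint-24 evaluates well: merge the LoRA weights into the base model so it can be served without a peft dependency. This is only a minimal sketch, not part of the patch; BASE_MODEL and MERGED_DIR below are assumed placeholders to adjust, while ADAPTER_DIR is the checkpoint directory used in the test cells above.

# merge_lora.py -- minimal sketch; BASE_MODEL and MERGED_DIR are assumptions, adjust as needed
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel

BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct"  # assumption: replace with the base checkpoint actually used
ADAPTER_DIR = "/root/PMN_WS/qwen-test/output/qwen2-vl-lora/checkpoint-24"
MERGED_DIR = "/root/PMN_WS/qwen-test/output/qwen2-vl-lora-merged"  # hypothetical output path

# Load the base model, attach the trained adapter, and fold the LoRA deltas into the base weights
base = Qwen2VLForConditionalGeneration.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, ADAPTER_DIR).merge_and_unload()
merged.save_pretrained(MERGED_DIR)

# Save the processor alongside the merged weights so the model can be loaded standalone
AutoProcessor.from_pretrained(BASE_MODEL).save_pretrained(MERGED_DIR)

The merged directory can then be loaded directly with Qwen2VLForConditionalGeneration.from_pretrained(MERGED_DIR), with no LoRA wrapping at inference time.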