import json
import os

import pandas as pd
import requests


class image_caption_json():
    """Load a COCO-style caption JSON file and export an image/caption CSV.

    The JSON document is expected to be a dictionary shaped like::

        {
            "info": {...},
            "licenses": [...],
            "images": [...],
            "annotations": [...]
        }

    Only ``images`` (entries with ``id``, ``file_name``, ``coco_url`` and
    ``flickr_url``) and ``annotations`` (entries with ``image_id`` and
    ``caption``) are read by this class.
    """

    def __init__(self, json_path, data_number):
        """Read the whole annotation file into memory.

        Args:
            json_path: Path of the caption JSON file.
            data_number: Maximum number of images (taken from the start of
                the ``images`` list) that ``image_annotation`` processes.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        self.json_path = json_path
        self.data_number = data_number
        # JSON text is UTF-8 by specification; do not rely on the locale.
        with open(self.json_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

    def print_json_tree(self, indent=1):
        """Print every top-level key followed by the size of its value.

        The size is the length of a list, the number of keys of a dict, and
        1 for any other value.

        Args:
            indent: Number of spaces printed before each key.
        """
        for key, value in self.data.items():
            print(' ' * indent + str(key), end='\n')
            if isinstance(value, (list, dict)):
                value_count = len(value)
            else:
                # A scalar counts as a single value.
                value_count = 1
            print(value_count)

    def download_image(self, url, save_dir, filename, timeout=10):
        """Download ``url`` into ``save_dir/filename``.

        The body is streamed to a temporary ``.part`` file that is renamed
        over the final path only after the download completed, so a failed
        transfer never leaves a truncated image behind.

        Args:
            url: Address of the image.
            save_dir: Target directory; created when it does not exist.
            filename: File name used inside ``save_dir``.
            timeout: Timeout in seconds handed to ``requests.get``.

        Returns:
            The path of the saved image, or ``None`` when the download
            failed (the reason is printed).
        """
        # 1. Create the target directory.
        os.makedirs(save_dir, exist_ok=True)

        img_path = f"{save_dir}/{filename}"
        part_path = f"{img_path}.part"
        try:
            # The context manager releases the streamed connection even when
            # reading the body fails half-way.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Fail on HTTP error statuses.
                with open(part_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            os.replace(part_path, img_path)
        except Exception as e:
            # Deliberately broad: a single bad image must not abort the
            # whole export, the caller falls back to another URL instead.
            try:
                os.remove(part_path)
            except OSError:
                pass  # Nothing was written yet (or it is already gone).
            print(f"❌ Failed to download from[{url}]: {e}\n")
            return None

        print(f"✅ Successfully saved {filename} to: {img_path} from: {url}\n")
        return img_path

    def image_annotation(self, save_dir, csv_path):
        """Download the first ``data_number`` images and write a CSV.

        For every image the first matching caption in ``annotations`` is
        used (``None`` when the image has no annotation). Each image is
        fetched from the source that succeeded most recently, falling back
        to the other source on failure. Images that cannot be downloaded
        from either source are skipped.

        Args:
            save_dir: Directory that receives the downloaded images.
            csv_path: Destination CSV with the columns ``image_path`` and
                ``caption``.
        """
        # Source that worked last: 0 - "coco", 1 - "flickr".
        last_successful_source = 0

        images = self.data['images'][:self.data_number]

        # Index the first caption of every requested image in one pass
        # instead of rescanning all annotations for each image.
        wanted_ids = {img_info['id'] for img_info in images}
        first_caption = {}
        for annotation in self.data['annotations']:
            image_id = annotation['image_id']
            if image_id in wanted_ids and image_id not in first_caption:
                first_caption[image_id] = annotation['caption']

        image_paths = []
        captions = []

        for i, img_info in enumerate(images):
            img_id = img_info['id']
            filename = img_info['file_name']
            # A missing URL is handled like a failed download of that source.
            coco_url = img_info.get('coco_url')
            flickr_url = img_info.get('flickr_url')

            caption = first_caption.get(img_id)

            print(f"{i+1}. 图片ID: {img_id}")
            print(f" 文件名: {filename}")
            print(f" Caption: {caption}")

            # Try the source that succeeded last time first.
            if last_successful_source:
                first_url, second_url = flickr_url, coco_url
            else:
                first_url, second_url = coco_url, flickr_url

            image_path = self.download_image(first_url, save_dir, filename)
            if not image_path:
                image_path = self.download_image(second_url, save_dir, filename)
                if image_path:
                    # The fallback worked, so prefer it from now on.
                    last_successful_source = 1 - last_successful_source
                else:
                    print(f"❌❌ Failed to download [{filename}]\n")
                    continue

            image_paths.append(image_path)
            captions.append(caption)

        df = pd.DataFrame({
            'image_path': image_paths,
            'caption': captions
        })

        # Make sure the CSV's directory exists before writing.
        csv_dir = os.path.dirname(csv_path)
        if csv_dir:
            os.makedirs(csv_dir, exist_ok=True)
        df.to_csv(csv_path, index=False)

        print('数据处理完成')


if __name__ == '__main__':
    file_path = '/root/PMN_WS/coco_2014_caption/annotations/captions_train2014.json'
    MAX_DATA_NUMBER = 100
    image_caption = image_caption_json(file_path, MAX_DATA_NUMBER)

    save_dir = '/root/PMN_WS/qwen-test/coco_2014_image'
    csv_path = '/root/PMN_WS/qwen-test/coco-2024-dataset.csv'
    image_caption.image_annotation(save_dir, csv_path)
    # To inspect the top-level structure of the JSON file instead:
    # image_caption.print_json_tree()