1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
| import os import json import random import shutil from tqdm import tqdm import yaml
# Root of the source dataset; main() reads <root>/train/ann and <root>/train/img.
RDD2022_ROOT = './RDD2022'

# Root of the generated YOLO dataset (images/, labels/ and dataset.yaml).
YOLO_DATASET_ROOT = './RDD2022_YOLO'

# Fraction of the annotation files that goes to the validation split.
VAL_SPLIT_RATIO = 0.2

# Class titles as they appear in the annotations; the list index is the
# YOLO class id, so the order must not change once labels are generated.
CLASSES = [
    'longitudinal crack',
    'transverse crack',
    'alligator crack',
    'pothole',
    # NOTE(review): 'ther corruption' looks like a truncated duplicate of
    # 'other corruption'. It may be a typo here or a mislabeled class title
    # in the dataset itself -- confirm before removing, because dropping it
    # shifts the id of 'other corruption'.
    'ther corruption',
    'other corruption',
]
def convert_supervisely_to_yolo(img_size, points):
    """Convert a Supervisely rectangle into a normalized YOLO box.

    Args:
        img_size: Image size as ``(width, height)`` in pixels.
        points: Two opposite rectangle corners ``[[x1, y1], [x2, y2]]`` in
            pixels. The corners may be given in any order.

    Returns:
        A tuple ``(x_center, y_center, width, height)`` where every value is
        normalized by the image size and lies in ``[0, 1]``.

    Raises:
        ValueError: If the image width or height is not positive.
    """
    img_width, img_height = img_size
    if img_width <= 0 or img_height <= 0:
        raise ValueError(f"invalid image size: {img_width}x{img_height}")

    xs = (points[0][0], points[1][0])
    ys = (points[0][1], points[1][1])

    # Clamp the corners to the image: annotations that spill over the border
    # would otherwise yield normalized values outside [0, 1], which YOLO
    # loaders treat as a corrupt label.
    xmin = min(max(min(xs), 0), img_width)
    xmax = min(max(max(xs), 0), img_width)
    ymin = min(max(min(ys), 0), img_height)
    ymax = min(max(max(ys), 0), img_height)

    dw = 1. / img_width
    dh = 1. / img_height
    x_center = (xmin + xmax) / 2.0 * dw
    y_center = (ymin + ymax) / 2.0 * dh
    width = (xmax - xmin) * dw
    height = (ymax - ymin) * dh
    return (x_center, y_center, width, height)
def process_json_file(json_path, class_map):
    """Parse one Supervisely JSON annotation file into YOLO label lines.

    Only objects whose ``geometryType`` is ``'rectangle'`` are converted.
    Objects whose ``classTitle`` is missing from ``class_map`` are skipped
    with a printed warning.

    Args:
        json_path: Path to the Supervisely annotation file.
        class_map: Mapping from class title to YOLO class id.

    Returns:
        A list of strings, one per converted object, each formatted as
        ``"<class_id> <x_center> <y_center> <width> <height>"`` with the
        coordinates printed to six decimal places. The list is empty when no
        object qualifies.
    """
    # Read as UTF-8 explicitly: the platform default encoding (e.g. a Windows
    # code page) fails on annotation files containing non-ASCII text.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    img_size = (data['size']['width'], data['size']['height'])

    yolo_labels = []
    for obj in data['objects']:
        if obj['geometryType'] != 'rectangle':
            continue

        cls_name = obj['classTitle']
        if cls_name not in class_map:
            print(f"警告: 在 {os.path.basename(json_path)} 中发现未知类别 '{cls_name}',已跳过。")
            continue

        cls_id = class_map[cls_name]
        yolo_box = convert_supervisely_to_yolo(img_size, obj['points']['exterior'])
        coords = ' '.join(f'{coord:.6f}' for coord in yolo_box)
        yolo_labels.append(f"{cls_id} {coords}")

    return yolo_labels
def main(seed=42):
    """Convert the RDD2022 Supervisely dataset into a YOLO dataset on disk.

    Steps: scan the annotation directory, split the annotation files into
    train/val, create the YOLO directory layout, convert every annotation and
    copy its image, then write ``dataset.yaml``.

    Images whose annotation yields no YOLO label are not copied. A failure on
    one file is reported and does not stop the run.

    Args:
        seed: Seed for the train/val shuffle. The default keeps the split
            identical across runs, so re-running into an existing output
            directory cannot place the same image in both splits. Pass
            ``None`` for a different random split on every run.
    """
    print("--- 开始清洗和转换 RDD2022 (Supervisely JSON 格式) 数据集 ---")

    class_to_id = {cls: i for i, cls in enumerate(CLASSES)}
    ann_dir = os.path.join(RDD2022_ROOT, 'train', 'ann')
    img_dir = os.path.join(RDD2022_ROOT, 'train', 'img')

    print("步骤 1/5: 正在扫描所有 JSON 标注文件...")
    if not os.path.isdir(ann_dir):
        print(f"错误: 标注目录 '{ann_dir}' 不存在。请检查 RDD2022_ROOT 路径是否正确。")
        return
    # os.listdir order is arbitrary; sort so the seeded shuffle below is
    # reproducible across machines and filesystems.
    all_json_files = sorted(f for f in os.listdir(ann_dir) if f.endswith('.json'))
    if not all_json_files:
        print(f"错误: 在 '{ann_dir}' 目录下没有找到任何 JSON 文件。")
        return
    print(f"完成。共找到 {len(all_json_files)} 个标注文件。")

    print("步骤 2/5: 正在随机划分训练集和验证集...")
    # A private Random instance keeps the split deterministic without
    # touching the global random state.
    random.Random(seed).shuffle(all_json_files)
    split_index = int(len(all_json_files) * (1 - VAL_SPLIT_RATIO))
    train_files = all_json_files[:split_index]
    val_files = all_json_files[split_index:]
    datasets = {
        'train': train_files,
        'val': val_files,
    }
    print(f"划分完成: {len(train_files)} (训练), {len(val_files)} (验证)")

    print("步骤 3/5: 正在创建 YOLO 目录结构...")
    for split in ['train', 'val']:
        os.makedirs(os.path.join(YOLO_DATASET_ROOT, 'images', split), exist_ok=True)
        os.makedirs(os.path.join(YOLO_DATASET_ROOT, 'labels', split), exist_ok=True)
    print("目录创建完成。")

    print("步骤 4/5: 正在转换标注并复制文件...")
    for split, files in datasets.items():
        print(f"\n处理 {split} 集...")
        for json_filename in tqdm(files, desc=f"Processing {split} files"):
            # Best-effort per file: report the problem and continue.
            try:
                json_path = os.path.join(ann_dir, json_filename)
                # Stripping ".json" from the annotation name gives the image
                # file name used to look the image up in img_dir.
                base_img_filename = os.path.splitext(json_filename)[0]
                src_image_path = os.path.join(img_dir, base_img_filename)

                if not os.path.exists(src_image_path):
                    print(f"警告: 找不到对应的图片 {src_image_path},跳过 {json_filename}")
                    continue

                yolo_content = process_json_file(json_path, class_to_id)
                if not yolo_content:
                    continue

                dest_image_path = os.path.join(
                    YOLO_DATASET_ROOT, 'images', split, base_img_filename)
                # The label shares the image's stem, with a .txt extension.
                label_filename = f"{os.path.splitext(base_img_filename)[0]}.txt"
                dest_label_path = os.path.join(
                    YOLO_DATASET_ROOT, 'labels', split, label_filename)

                shutil.copy(src_image_path, dest_image_path)
                with open(dest_label_path, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(yolo_content))
            except Exception as e:
                print(f"处理文件 {json_filename} 时发生错误: {e}")

    print("\n步骤 5/5: 正在创建 dataset.yaml 文件...")
    yaml_data = {
        'path': os.path.abspath(YOLO_DATASET_ROOT),
        'train': 'images/train',
        'val': 'images/val',
        'nc': len(CLASSES),
        'names': CLASSES,
    }

    yaml_path = os.path.join(YOLO_DATASET_ROOT, 'dataset.yaml')
    # allow_unicode=True emits non-ASCII characters verbatim (e.g. in the
    # absolute path), so the file must be written as UTF-8 rather than in
    # the platform default encoding.
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(yaml_data, f, sort_keys=False, allow_unicode=True)
    print(f"dataset.yaml 文件已创建于: {yaml_path}")
    print("\n--- 所有任务完成!数据集已准备好用于 YOLO 训练。 ---")
if __name__ == '__main__':
    # Check the third-party dependencies before starting, so a missing
    # package produces an install hint instead of a traceback.
    # NOTE(review): the top of the file appears to import yaml and tqdm
    # unconditionally; if so, a missing package fails there first and this
    # check is never reached -- confirm.
    dependencies_ok = True
    try:
        import yaml
        from tqdm import tqdm
    except ImportError:
        dependencies_ok = False

    if dependencies_ok:
        main()
    else:
        print("错误: 缺少必要的库。请运行 'pip install pyyaml tqdm'")
|