Table of Contents
Import the required libraries
Define the dataset paths
Create the output directories
Read the JSON annotation file
Shuffle the image list randomly
Compute the split sizes
Copy images to the corresponding folders
Complete code
Import the required libraries
We need the following Python libraries:
os: handle file paths.
json: read and write JSON files.
numpy: shuffle the image list randomly.
shutil: copy image files.
import os
import json
import numpy as np
import shutil
Define the dataset paths
Set the dataset root directory, the image folder, and the annotation file path.
Root directory: "D:\\dataset"
Image folder: "D:\\dataset\\images"
Annotation file: "D:\\dataset\\annotations.json"
# Dataset paths (adjust to your setup)
dataset_root = "D:\\dataset"
images_folder = os.path.join(dataset_root, "images")
annotations_path = os.path.join(dataset_root, "annotations.json")
Create the output directories
Create an output folder under the root directory, with out_train, out_val, and out_test subfolders inside it.
# Output paths
output_root = os.path.join(dataset_root, "output")
os.makedirs(output_root, exist_ok=True)
train_folder = os.path.join(output_root, "out_train")
val_folder = os.path.join(output_root, "out_val")
test_folder = os.path.join(output_root, "out_test")
os.makedirs(train_folder, exist_ok=True)
os.makedirs(val_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)
Read the JSON annotation file
Load the COCO-format JSON file and extract images (image information), annotations (annotation information), and categories (category information). A minimal illustration of the expected structure is shown after the snippet.
# Read the annotation file
with open(annotations_path, "r") as f:
    annotations_data = json.load(f)
# Extract the data
images = annotations_data["images"]
annotations = annotations_data["annotations"]
categories = annotations_data["categories"]
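For reference, a COCO-style annotation file has three top-level lists. The sketch below is purely illustrative: the field values are made up and do not come from an actual dataset.

# Illustration only: minimal COCO-style structure (all values are hypothetical)
example_coco = {
    "images":      [{"id": 1, "file_name": "0001.jpg", "width": 640, "height": 480}],
    "annotations": [{"id": 1, "image_id": 1, "category_id": 1, "bbox": [10, 20, 50, 60], "area": 3000, "iscrowd": 0}],
    "categories":  [{"id": 1, "name": "cat"}],
}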
Shuffle the image list randomly
Use numpy to shuffle the image list so that the split is random. If you need a reproducible split, see the optional sketch after the snippet.
# Shuffle the image list
np.random.shuffle(images)
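If the split should be reproducible across runs, a seed can be fixed before shuffling. This is an optional sketch; the seed value 42 is an arbitrary example, not part of the original script.

# Optional: seed the generator before shuffling for a reproducible split
np.random.seed(42)  # 42 is an arbitrary example value
np.random.shuffle(images)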
Compute the split sizes
Compute the training and test set sizes from the total number of images and the ratios:
Suppose the total number of images is N.
Training set: N * 0.8.
Validation set: N * 0.0 = 0.
Test set: N * 0.2.
# Define the split ratios
train_ratio, val_ratio, test_ratio = 0.8, 0, 0.2
# Compute the sizes
num_images = len(images)
num_train = int(num_images * train_ratio)
num_val = int(num_images * val_ratio)  # will be 0
# Split the images
train_images = images[:num_train]
val_images = images[num_train:num_train + num_val]  # empty list
test_images = images[num_train + num_val:]
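As a quick sanity check, you can print the resulting sizes; with 1000 images and the 0.8/0/0.2 ratios above this should report 800, 0, and 200. An optional one-liner:

# Optional sanity check: report the split sizes
print(f"train: {len(train_images)}, val: {len(val_images)}, test: {len(test_images)}")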
Copy images to the corresponding folders
Copy the training set and test set images into their corresponding folders (the validation loop never runs, since val_images is empty). A more defensive variant that skips missing files is sketched after the snippet.
# Copy the images
for img in train_images:
    shutil.copy(os.path.join(images_folder, img["file_name"]),
                os.path.join(train_folder, img["file_name"]))
for img in val_images:  # never executed, since val_images is empty
    shutil.copy(os.path.join(images_folder, img["file_name"]),
                os.path.join(val_folder, img["file_name"]))
for img in test_images:
    shutil.copy(os.path.join(images_folder, img["file_name"]),
                os.path.join(test_folder, img["file_name"]))
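If some files listed in the JSON might be missing on disk, the copy step can be made more defensive. This is only a sketch of one possible variant; the copy_split helper below is not part of the original script.

def copy_split(image_entries, dst_folder):
    """Copy one split's images, skipping entries whose source file is missing."""
    for img in image_entries:
        src = os.path.join(images_folder, img["file_name"])
        if os.path.exists(src):
            shutil.copy(src, os.path.join(dst_folder, img["file_name"]))
        else:
            print(f"warning: {src} not found, skipped")

copy_split(train_images, train_folder)
copy_split(val_images, val_folder)
copy_split(test_images, test_folder)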
Complete code
Here is the complete Python script. Besides the steps above, it also filters the annotations for each split and writes out_train.json, out_val.json, and out_test.json:
import os
import json
import numpy as np
import shutil
# Dataset paths (adjust to your setup)
dataset_root = "D:\\dataset"
images_folder = os.path.join(dataset_root, "images")
annotations_path = os.path.join(dataset_root, "annotations.json")
# Output paths
output_root = os.path.join(dataset_root, "output")
os.makedirs(output_root, exist_ok=True)
train_folder = os.path.join(output_root, "out_train")
val_folder = os.path.join(output_root, "out_val")
test_folder = os.path.join(output_root, "out_test")
os.makedirs(train_folder, exist_ok=True)
os.makedirs(val_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)
# Read the annotation file
with open(annotations_path, "r") as f:
    annotations_data = json.load(f)
# Extract the data
images = annotations_data["images"]
annotations = annotations_data["annotations"]
categories = annotations_data["categories"]
# Shuffle the image list
np.random.shuffle(images)
# Define the split ratios
train_ratio, val_ratio, test_ratio = 0.8, 0, 0.2
# Compute the sizes
num_images = len(images)
num_train = int(num_images * train_ratio)
num_val = int(num_images * val_ratio)
# Split the images
train_images = images[:num_train]
val_images = images[num_train:num_train + num_val]
test_images = images[num_train + num_val:]
# Copy the images
for img in train_images:
    shutil.copy(os.path.join(images_folder, img["file_name"]),
                os.path.join(train_folder, img["file_name"]))
for img in val_images:
    shutil.copy(os.path.join(images_folder, img["file_name"]),
                os.path.join(val_folder, img["file_name"]))
for img in test_images:
    shutil.copy(os.path.join(images_folder, img["file_name"]),
                os.path.join(test_folder, img["file_name"]))
# Helper: filter annotations by image id
def filter_annotations(annotations, image_ids):
    return [ann for ann in annotations if ann["image_id"] in image_ids]
# Collect the image ids of each split
train_image_ids = [img["id"] for img in train_images]
val_image_ids = [img["id"] for img in val_images]
test_image_ids = [img["id"] for img in test_images]
# Filter the annotations
train_ann = filter_annotations(annotations, train_image_ids)
val_ann = filter_annotations(annotations, val_image_ids)
test_ann = filter_annotations(annotations, test_image_ids)
# Build the JSON dictionaries
train_json = {"images": train_images, "annotations": train_ann, "categories": categories}
val_json = {"images": val_images, "annotations": val_ann, "categories": categories}
test_json = {"images": test_images, "annotations": test_ann, "categories": categories}
# Write the JSON files
with open(os.path.join(output_root, "out_train.json"), "w") as f:
    json.dump(train_json, f)
with open(os.path.join(output_root, "out_val.json"), "w") as f:
    json.dump(val_json, f)
with open(os.path.join(output_root, "out_test.json"), "w") as f:
    json.dump(test_json, f)
print("Dataset split complete!")
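After running the script, the output can be verified by reloading one of the generated files and checking its contents. A small optional check, not part of the script above:

# Optional verification: reload one generated file and report its contents
with open(os.path.join(output_root, "out_train.json"), "r") as f:
    check = json.load(f)
print(len(check["images"]), "images,", len(check["annotations"]), "annotations in out_train.json")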