目标检测——清洗数据
清洗VOC格式数据集代码示例
import os
import xml.etree.ElementTree as ET
def process_annotations(image_folder, annotation_folder, keep_class='person'):
    """Filter a VOC-style dataset in place so only one object class remains.

    Every ``.xml`` file in ``annotation_folder`` is parsed. ``<object>``
    elements that are direct children of the root and whose ``<name>`` is not
    ``keep_class`` are removed. If at least one object of the kept class
    remains, the annotation is rewritten in place; otherwise the annotation
    file is deleted together with the image named by its ``<filename>``
    element (when that image exists in ``image_folder``).

    WARNING: this deletes and overwrites files. Run it on a copy of the data.

    Args:
        image_folder: Directory containing the images.
        annotation_folder: Directory containing the XML annotation files.
        keep_class: Object class name to keep. Defaults to ``'person'``.

    Raises:
        xml.etree.ElementTree.ParseError: If an annotation is not valid XML.
        OSError: If a file cannot be read, written or removed.
    """
    for xml_file in os.listdir(annotation_folder):
        if not xml_file.endswith('.xml'):
            continue

        xml_path = os.path.join(annotation_folder, xml_file)
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Whether at least one object of the wanted class was found.
        keep_file = False
        # findall() returns a list, so removing children while looping is safe.
        for obj in root.findall('object'):
            # findtext() yields None when <name> is missing and '' when it is
            # empty; both are treated as "not the kept class" instead of
            # crashing with AttributeError halfway through the dataset.
            name = (obj.findtext('name') or '').strip()
            if name == keep_class:
                keep_file = True
            else:
                root.remove(obj)

        if keep_file:
            # Persist the annotation with the unwanted objects stripped out.
            tree.write(xml_path)
            continue

        # No object of the kept class: drop the image and its annotation.
        image_name = (root.findtext('filename') or '').strip()
        if image_name:
            image_path = os.path.join(image_folder, image_name)
            # isfile() rather than exists(): never hand a directory to remove().
            if os.path.isfile(image_path):
                os.remove(image_path)
        # Without a usable <filename> the image cannot be identified, so only
        # the annotation is removed.
        os.remove(xml_path)
# Example usage: clean the VOC dataset in place. process_annotations deletes
# images/annotations and rewrites XML files, so run it on a backup copy.
image_folder = r'D:\BaiduNetdiskDownload\VOCdevkit\VOCdevkit\VOC2007\JPEGImages' # replace with your image folder path
annotation_folder = r'D:\BaiduNetdiskDownload\VOCdevkit\VOCdevkit\VOC2007\Annotations' # replace with your annotation (XML) folder path
process_annotations(image_folder, annotation_folder)
注意:需根据自己的数据集修改要保留的类别名称(示例中为 'person')以及图片、标签文件夹路径!!!
清洗YOLO格式数据集代码示例
import os
def process_labels(image_folder, label_folder, keep_class='0',
                   image_exts=('.png', '.jpg', '.jpeg', '.bmp')):
    """Filter a YOLO-style dataset in place so only one class ID remains.

    Every ``.txt`` file in ``label_folder`` is read line by line and only the
    lines whose first whitespace-separated token equals ``keep_class`` are
    kept. If any line survives, the label file is rewritten with just those
    lines; otherwise the label file is deleted together with every image in
    ``image_folder`` that has the same base name and one of ``image_exts``.

    WARNING: this deletes and overwrites files. Run it on a copy of the data.

    Args:
        image_folder: Directory containing the images.
        label_folder: Directory containing the ``.txt`` label files.
        keep_class: Class ID to keep, as a string or int. Defaults to ``'0'``.
        image_exts: Image file extensions (with leading dot) to look for when
            deleting the image that belongs to a discarded label file.

    Raises:
        OSError: If a file cannot be read, written or removed.
    """
    target = str(keep_class)

    for label_file in os.listdir(label_folder):
        if not label_file.endswith('.txt'):
            continue

        label_path = os.path.join(label_folder, label_file)
        stem = os.path.splitext(label_file)[0]

        with open(label_path, 'r') as f:
            # split()[:1] is [] for a blank line, so blank lines are simply
            # dropped instead of raising IndexError.
            kept_lines = [line for line in f if line.split()[:1] == [target]]

        if kept_lines:
            # Persist the label file with the unwanted classes stripped out.
            with open(label_path, 'w') as f:
                f.writelines(kept_lines)
            continue

        # No line of the kept class: drop the image(s) and the label file.
        for ext in image_exts:
            image_path = os.path.join(image_folder, stem + ext)
            if os.path.isfile(image_path):
                os.remove(image_path)
        os.remove(label_path)
# Example usage: clean the YOLO dataset in place. process_labels deletes
# images/label files and rewrites label files, so run it on a backup copy.
label_folder = r'D:\BaiduNetdiskDownload\annotations_trainval2017\txt' # replace with your label (.txt) folder path
image_folder = r'D:\BaiduNetdiskDownload\val2017\val2017' # replace with your image folder path
process_labels(image_folder, label_folder)
注意:需根据自己的数据集修改要保留的类别编号(示例中为 '0')以及图片、标签文件夹路径!!!