统计数据集的TXT、XML及JSON标注文件中各类别/每个标签的数量
在计算机视觉和深度学习领域,标注文件是模型训练的重要组成部分。无论是图像分类、目标检测还是图像分割,正确的标注能够显著提升模型的性能。在实际应用中,我们需要快速了解每个类别的样本数量,以便进行数据分析、平衡类别分布或优化模型训练。
以下是针对各种格式标注文件的统计代码,输出均按照标签数量从多到少排序。其中统计TXT文件时,需要先将代码中的类别列表(string_table)按类别索引顺序修改为自己数据集的标签。
统计YOLO格式的TXT文件
import os
from collections import Counter

string_table = ['hat', 'nohat']  # Edit: class names, listed in YOLO class-index order
folder_path = r' '  # Edit: folder that contains the YOLO .txt label files


def count_yolo_labels(label_dir, class_names):
    """Count labelled objects per class across the YOLO ``.txt`` files in a folder.

    Every non-blank line of a label file must start with an integer class
    index; the remainder of the line is ignored.

    Args:
        label_dir: Folder to scan (not recursive). Only entries whose name
            ends in ``.txt`` are read.
        class_names: Class names, where position ``i`` names class index ``i``.

    Returns:
        A ``Counter`` mapping class name to the number of lines carrying that
        class. Indices that do not address ``class_names`` (too large or
        negative) are not counted.

    Raises:
        OSError: If ``label_dir`` or one of its label files cannot be read.
        ValueError: If a non-blank line does not start with an integer.
    """
    counter = Counter()
    for filename in os.listdir(label_dir):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(label_dir, filename), 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. a trailing newline) carry no annotation.
                    continue
                category_index = int(fields[0])
                # The lower bound matters: a negative index would otherwise
                # wrap around and be counted under the wrong class.
                if 0 <= category_index < len(class_names):
                    counter[class_names[category_index]] += 1
    return counter


if __name__ == "__main__":
    category_counter = count_yolo_labels(folder_path, string_table)
    print("各类别数量:")
    # Most frequent class first. Classes without any sample are still listed
    # (with 0); ties keep their string_table order because sorted() is stable.
    for category in sorted(string_table, key=lambda name: category_counter[name], reverse=True):
        count = category_counter[category]
        print(f"{category}: {count}")
统计VOC格式的XML文件
import os
import xml.etree.ElementTree as ET
from collections import Counter

folder_path = r' '  # Edit: folder that contains the VOC .xml annotation files


def count_voc_labels(xml_dir):
    """Count annotated objects per class across the VOC ``.xml`` files in a folder.

    For every ``<object>`` element directly under the document root, the text
    of its ``<name>`` child is used as the class label.

    Args:
        xml_dir: Folder to scan (not recursive). Only entries whose name ends
            in ``.xml`` are parsed.

    Returns:
        A ``Counter`` mapping class label to number of objects. Objects whose
        ``<name>`` is missing or empty are tallied under the key ``None`` so
        that unlabeled annotations stay visible instead of aborting the run.

    Raises:
        OSError: If ``xml_dir`` or one of its files cannot be read.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    counts = Counter()
    for filename in os.listdir(xml_dir):
        if not filename.endswith('.xml'):
            continue
        root = ET.parse(os.path.join(xml_dir, filename)).getroot()
        for obj in root.findall('object'):
            name_node = obj.find('name')
            # find() returns None when <name> is absent; guard before .text.
            label = name_node.text if name_node is not None else None
            counts[label] += 1
    return counts


if __name__ == "__main__":
    class_count = count_voc_labels(folder_path)
    # most_common() with no argument returns every (label, count) pair,
    # ordered from the highest count to the lowest.
    sorted_class_count = class_count.most_common()
    print("各类别数量:")
    for name, count in sorted_class_count:
        print(f"{name}: {count}")
统计JSON文件
import os
import json
from collections import Counter

json_folder = r' '  # Edit: folder that contains the JSON annotation files


def _load_json(path):
    """Parse one JSON file, trying UTF-8 first and the locale encoding second."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the platform default encoding, which is
        # what a plain open(path, 'r') would have used.
        with open(path, 'r') as f:
            return json.load(f)


def count_json_labels(folder):
    """Count annotated shapes per label across the ``.json`` files in a folder.

    Each file is expected to hold an object with a ``"shapes"`` list whose
    items each carry a ``"label"`` entry.

    Args:
        folder: Folder to scan (not recursive). Only entries whose name ends
            in ``.json`` are read.

    Returns:
        A ``Counter`` mapping label to number of shapes. Files whose top level
        is not an object, or that have no ``"shapes"`` entry, contribute
        nothing.

    Raises:
        OSError: If ``folder`` or one of its files cannot be read.
        json.JSONDecodeError: If a file is not valid JSON.
        KeyError: If a shape has no ``"label"`` entry.
    """
    counter = Counter()
    for json_file in os.listdir(folder):
        if not json_file.endswith('.json'):
            continue
        data = _load_json(os.path.join(folder, json_file))
        # Unrelated JSON files may sit in the same folder; skip anything that
        # has no shapes list rather than failing on it.
        shapes = data.get('shapes') if isinstance(data, dict) else None
        counter.update(shape['label'] for shape in shapes or ())
    return counter


if __name__ == "__main__":
    category_counter = count_json_labels(json_folder)
    # most_common() with no argument returns every (label, count) pair,
    # ordered from the highest count to the lowest.
    sorted_category_count = category_counter.most_common()
    print("各类别数量:")
    for category, count in sorted_category_count:
        print(f"{category}: {count}")