当前位置：首页 > article >正文

图片处理datasets示例（COCO）

article 2025/3/6 16:57:26

在这个数据驱动的时代，计算机视觉作为人工智能的一个重要分支，正以前所未有的速度改变着我们的生活。从人脸识别到自动驾驶，从医疗影像分析到增强现实，计算机视觉的应用无处不在。而这一切的基石，正是那些精心构建、标注丰富的图片处理数据集。今天，让我们一起探索几个在计算机视觉领域具有里程碑意义的图片处理datasets，了解它们如何推动技术的边界，以及如何利用这些资源开展自己的研究项目。

COCO（Common Objects in Context）

简介：COCO是一个大型、复杂的图像数据集，专注于日常场景中常见对象的检测、分割和图像描述生成。它包含超过30万张图像和250万个对象实例，覆盖了91个类别。COCO强调上下文信息的利用，鼓励算法不仅识别对象，还要理解它们之间的关系。

应用：COCO数据集对于提升复杂场景下的目标检测和分割能力至关重要。它推动了YOLOv3、DETR等先进检测算法的发展，在图像描述生成领域也发挥着重要作用。

下面是一段基于PyTorch的数据集（Dataset）加载代码示例，包含图像填充、缩放以及带标签数据的读取：

import glob  
import random  
import os  
import sys  
import numpy as np  
from PIL import Image  
import torch  
import torch.nn.functional as F  
from utils.augmentations import horisontal_flip  # 从utils.augmentations模块导入水平翻转函数  
from torch.utils.data import Dataset  
import torchvision.transforms as transforms  
  
# 将图像填充为正方形  
def pad_to_square(img, pad_value):
    """Pad an image tensor with a constant so its last two dims are equal.

    Args:
        img: Tensor whose shape unpacks as ``(channels, height, width)``.
        pad_value: Fill value used for the added border.

    Returns:
        A ``(padded_img, pad)`` pair. ``pad`` is the 4-tuple handed to
        ``F.pad``, i.e. ``(left, right, top, bottom)`` for the last two
        dimensions, expressed as plain Python ints.
    """
    _, height, width = img.shape
    # Builtin abs keeps everything a Python int; np.abs would turn the pad
    # amounts into numpy scalars that then leak into F.pad and to callers.
    gap = abs(int(height) - int(width))
    # When the gap is odd, the extra pixel goes on the trailing side.
    leading = gap // 2
    trailing = gap - leading
    if height <= width:
        # Wider than tall (or already square): grow top and bottom.
        pad = (0, 0, leading, trailing)
    else:
        # Taller than wide: grow left and right.
        pad = (leading, trailing, 0, 0)
    img = F.pad(img, pad, "constant", value=pad_value)
    return img, pad
  
# 调整图像大小  
def resize(image, size):
    """Resize a single un-batched image with nearest-neighbour interpolation.

    A leading batch dimension is added for ``F.interpolate`` and removed
    again afterwards, so the result has the same rank as ``image``.

    Args:
        image: Image tensor without a batch dimension
            (assumes ``(C, H, W)`` -- TODO confirm against callers).
        size: Target spatial size forwarded to ``F.interpolate``.

    Returns:
        The resized image tensor.
    """
    batched = image[None]
    resized = F.interpolate(batched, size=size, mode="nearest")
    return resized[0]
  
# 随机调整图像大小，用于多尺度训练  
def random_resize(images, min_size=288, max_size=448):
    """Resize a batch to a randomly chosen square size (multi-scale training).

    The target size is drawn from ``min_size, min_size + 32, ...`` up to and
    including ``max_size`` when it lies on that grid.

    Args:
        images: Batched tensor accepted by ``F.interpolate``
            (assumes ``(N, C, H, W)`` -- TODO confirm against callers).
        min_size: Smallest candidate size.
        max_size: Upper bound for candidate sizes.

    Returns:
        The batch resized with nearest-neighbour interpolation.
    """
    candidates = list(range(min_size, max_size + 1, 32))
    (new_size,) = random.sample(candidates, 1)
    return F.interpolate(images, size=new_size, mode="nearest")
  
# 继承自Dataset的图像文件夹类，用于加载文件夹中的图像  
class ImageFolder(Dataset):
    """Dataset over every ``*.*`` file in a folder, without labels.

    Each item is the file path together with the image as a square tensor
    of side ``img_size``.

    Args:
        folder_path: Directory that is globbed for ``*.*`` files.
        img_size: Side length the padded image is resized to.
    """

    def __init__(self, folder_path, img_size=416):
        # Sorted so that indices map to files deterministically.
        self.files = sorted(glob.glob("%s/*.*" % folder_path))
        self.img_size = img_size

    def __getitem__(self, index):
        """Return ``(img_path, img)`` for ``index`` (wrapped modulo length)."""
        img_path = self.files[index % len(self.files)]
        # Force RGB so grayscale / RGBA files also produce 3-channel tensors,
        # matching what ListDataset does; the context manager closes the file.
        with Image.open(img_path) as picture:
            img = transforms.ToTensor()(picture.convert("RGB"))
        img, _ = pad_to_square(img, 0)
        img = resize(img, self.img_size)
        return img_path, img

    def __len__(self):
        """Return the number of files found in the folder."""
        return len(self.files)
  
# 继承自Dataset的列表数据集类，用于加载带有标签的图像  
class ListDataset(Dataset):
    """Dataset of images with box labels, driven by a list file.

    ``list_path`` is a text file with one image path per line. The label
    path for an image is derived from its image path by replacing
    ``"images"`` with ``"labels"`` and the ``.png`` / ``.jpg`` extension with
    ``.txt``. Every label row holds five numbers: class index, box centre x,
    box centre y, box width, box height.

    Args:
        list_path: Text file listing one image path per line.
        img_size: Initial side length images are resized to in ``collate_fn``.
        augment: When true, images are horizontally flipped with
            probability 0.5.
        multiscale: When true, ``collate_fn`` re-draws ``img_size`` every
            tenth batch from ``img_size +/- 96`` in steps of 32.
        normalized_labels: When true, label coordinates are treated as
            fractions of the original image width / height; otherwise as
            pixels.
        root_dir: Optional directory prepended to every image and label
            path. ``None`` (the default) uses the paths exactly as listed.
    """

    def __init__(self, list_path, img_size=416, augment=True, multiscale=True,
                 normalized_labels=True, root_dir=None):
        with open(list_path, "r") as file:
            # Lines keep their trailing newline; it is stripped on access.
            self.img_files = file.readlines()

        # Derive the label file path from each image path.
        self.label_files = [
            path.replace("images", "labels").replace(".png", ".txt").replace(".jpg", ".txt")
            for path in self.img_files
        ]
        self.root_dir = root_dir
        self.img_size = img_size
        self.max_objects = 100  # not used within this class
        self.augment = augment
        self.multiscale = multiscale
        self.normalized_labels = normalized_labels
        self.min_size = self.img_size - 3 * 32  # smallest multi-scale size
        self.max_size = self.img_size + 3 * 32  # largest multi-scale size
        self.batch_count = 0  # number of batches collated so far

    def _resolve(self, path):
        """Return ``path`` joined onto ``root_dir`` when one was given."""
        if self.root_dir:
            return os.path.join(self.root_dir, path)
        return path

    def __getitem__(self, index):
        """Return ``(img_path, img, targets)`` for ``index``.

        ``img`` is the image padded to a square (not yet resized).
        ``targets`` is ``None`` when the label file does not exist, otherwise
        a ``(num_boxes, 6)`` tensor whose column 0 is left at zero (filled in
        by ``collate_fn``) and whose columns 1-5 are class, centre x,
        centre y, width, height relative to the padded image.
        """
        index %= len(self.img_files)

        img_path = self._resolve(self.img_files[index].rstrip())
        # convert("RGB") always yields three channels, so no separate
        # handling of grayscale input is needed afterwards.
        with Image.open(img_path) as picture:
            img = transforms.ToTensor()(picture.convert("RGB"))

        _, h, w = img.shape
        # Scale factors that turn label coordinates into pixels.
        h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1)

        img, pad = pad_to_square(img, 0)
        _, padded_h, padded_w = img.shape

        label_path = self._resolve(self.label_files[index].rstrip())
        targets = None
        if os.path.exists(label_path):
            boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))
            # Box centres in pixels of the padded image. Padding moves a
            # point by the leading pad only (pad[0] = left, pad[2] = top);
            # the trailing pad does not shift existing pixels.
            center_x = w_factor * boxes[:, 1] + pad[0]
            center_y = h_factor * boxes[:, 2] + pad[2]
            # Re-normalise everything by the padded image size.
            boxes[:, 1] = center_x / padded_w
            boxes[:, 2] = center_y / padded_h
            boxes[:, 3] *= w_factor / padded_w
            boxes[:, 4] *= h_factor / padded_h

            targets = torch.zeros((len(boxes), 6))
            targets[:, 1:] = boxes

        if self.augment:
            if np.random.random() < 0.5:
                # NOTE(review): targets is None when the label file is
                # missing -- confirm horisontal_flip tolerates that.
                img, targets = horisontal_flip(img, targets)

        return img_path, img, targets

    def collate_fn(self, batch):
        """Merge a list of ``__getitem__`` results into one batch.

        Returns:
            ``(paths, imgs, targets)`` where ``imgs`` is the stack of images
            resized to the current ``img_size`` and ``targets`` is the
            concatenation of all per-image target tensors, with column 0 set
            to the image's position inside the batch. ``targets`` has shape
            ``(0, 6)`` when no image in the batch has labels.
        """
        paths, imgs, targets = list(zip(*batch))

        # Tag boxes with the position of their image in the batch *before*
        # dropping unlabeled samples, so indices stay aligned with ``imgs``.
        labelled = []
        for sample_idx, boxes in enumerate(targets):
            if boxes is None:
                continue
            boxes[:, 0] = sample_idx
            labelled.append(boxes)
        # torch.cat rejects an empty list, so fall back to an empty tensor.
        targets = torch.cat(labelled, 0) if labelled else torch.zeros((0, 6))

        # Pick a new training resolution every tenth batch.
        if self.multiscale and self.batch_count % 10 == 0:
            self.img_size = random.choice(range(self.min_size, self.max_size + 1, 32))
        imgs = torch.stack([resize(img, self.img_size) for img in imgs])
        self.batch_count += 1
        return paths, imgs, targets

    def __len__(self):
        """Return the number of lines read from the list file."""
        return len(self.img_files)

结语

以上只是计算机视觉领域众多优秀数据集的一小部分，每个数据集都有其独特的价值和应用场景。选择合适的数据集对于项目的成功至关重要，它不仅决定了模型能学到什么，还直接影响到最终结果的准确性和泛化能力。随着技术的不断进步和数据的持续积累，未来我们将看到更多高质量、多样化的图片处理数据集，为计算机视觉的发展注入新的活力。如果你正投身于这一领域，不妨从这些经典数据集开始，开启你的探索之旅吧！

查看全文

http://www.kler.cn/a/369588.html