当前位置: 首页 > article >正文

梯度检查点技术的使用

文章目录

  • 1、PyTorch
    • 使用例子1
    • 使用例子2
    • 使用例子3
  • 2、Hugging Face
    • 使用例子
  • 3、DeepSpeed

1、PyTorch

  • torch.utils.checkpoint官方文档
  • PyTorch Training Performance Guide 中关于 Gradient Checkpoints 的介绍
  • 参考博客

使用例子1

见博客。

使用例子2

import torchvision
from torch.utils.data import DataLoader
import torch
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter

import numpy as np

import os

# Per-channel CIFAR-10 normalization statistics (mean / std), shared by both pipelines.
_CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random horizontal flip for augmentation, then tensor + normalize.
transform_train = torchvision.transforms.Compose([
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
])
# Evaluation pipeline: no augmentation, only tensor conversion + normalization.
transform_test = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
])

# Local dataset root; download=False assumes the archives are already present.
_DATA_ROOT = "/home/zjma/dataset/cifar10/"

train_dataset = torchvision.datasets.CIFAR10(_DATA_ROOT, train=True, transform=transform_train, download=False)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_dataset = torchvision.datasets.CIFAR10(_DATA_ROOT, train=False, transform=transform_test, download=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

class CIFAR10Model_Original(nn.Module):
    """Baseline CIFAR-10 CNN classifier (no gradient checkpointing).

    Two conv blocks (each: conv -> ReLU -> conv -> ReLU -> 2x2 max-pool ->
    dropout) reduce a 3x32x32 image to 64x8x8, followed by a two-layer
    classification head producing 10 logits.
    """

    def __init__(self):
        super().__init__()
        self.cnn_block_1 = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.25),
        )
        self.cnn_block_2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.25),
        )
        # nn.Flatten(start_dim=1) is equivalent to torch.flatten(x, 1) but,
        # unlike a lambda stored on the module, is a registered submodule
        # that survives pickling and TorchScript.
        self.flatten = nn.Flatten()
        self.head = nn.Sequential(
            nn.Linear(64 * 8 * 8, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10),
        )

    def forward(self, X):
        """Return (batch, 10) class logits for a (batch, 3, 32, 32) input."""
        X = self.cnn_block_1(X)
        X = self.cnn_block_2(X)
        X = self.flatten(X)
        X = self.head(X)
        return X

class CIFAR10Model_Optimized(nn.Module):
    """CIFAR-10 CNN with gradient checkpointing on the two conv blocks.

    Identical architecture to CIFAR10Model_Original, but the conv blocks'
    activations are not stored during forward; they are recomputed during
    backward, trading compute for peak memory. Dropout layers are kept
    outside the checkpointed segments so the random ops are not part of the
    re-executed region.
    """

    def __init__(self):
        super().__init__()
        self.cnn_block_1 = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.dropout_1 = nn.Dropout(0.25)
        self.cnn_block_2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.dropout_2 = nn.Dropout(0.25)
        # nn.Flatten(start_dim=1) replaces the lambda: same result, but a
        # proper registered submodule (pickle/TorchScript friendly).
        self.flatten = nn.Flatten()
        self.linearize = nn.Sequential(
            nn.Linear(64 * 8 * 8, 512),
            nn.ReLU(),
        )
        self.dropout_3 = nn.Dropout(0.5)
        self.out = nn.Linear(512, 10)

    def forward(self, X):
        """Return (batch, 10) class logits for a (batch, 3, 32, 32) input."""
        # use_reentrant=False is required here: the legacy reentrant
        # implementation silently computes NO parameter gradients when the
        # segment's input does not require grad — which is exactly the case
        # for cnn_block_1, whose input is the raw data batch. The
        # non-reentrant variant handles this correctly and is the
        # recommended (non-deprecated) mode.
        X = torch.utils.checkpoint.checkpoint(self.cnn_block_1, X, use_reentrant=False)
        X = self.dropout_1(X)
        X = torch.utils.checkpoint.checkpoint(self.cnn_block_2, X, use_reentrant=False)
        X = self.dropout_2(X)
        X = self.flatten(X)
        X = self.linearize(X)
        X = self.dropout_3(X)
        X = self.out(X)
        return X

# Select the checkpointed model; swap the comments to benchmark the baseline.
# clf = CIFAR10Model_Original()
clf = CIFAR10Model_Optimized()

# First epoch index (1-based); train() iterates epochs start_epoch..NUM_EPOCHS.
start_epoch = 1

# NOTE(review): assumes a CUDA device is available — .cuda() raises otherwise.
clf.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(clf.parameters(), lr=0.0001, weight_decay=1e-6)

def train():
    """Train `clf` on CIFAR-10, logging loss and CUDA memory statistics.

    Reads the module-level globals `clf`, `criterion`, `optimizer`,
    `train_dataloader` and `start_epoch`. Peak-memory counters are reset
    after every batch so the reported maxima are per-iteration figures
    (used to compare the checkpointed vs. baseline model), not per-run.
    """
    clf.train()
    NUM_EPOCHS = 10

    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        losses = []

        for i, (X_batch, y_cls) in enumerate(train_dataloader):
            optimizer.zero_grad()

            y = y_cls.cuda()
            X_batch = X_batch.cuda()

            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss = loss.item()
            losses.append(train_loss)

            # Memory statistics after each batch.
            mem_allocated = torch.cuda.memory_allocated()
            mem_reserved = torch.cuda.memory_reserved()
            max_mem_allocated = torch.cuda.max_memory_allocated()
            max_mem_reserved = torch.cuda.max_memory_reserved()

            if i % 10 == 0:
                print(
                    # Trailing space added: adjacent f-strings concatenate
                    # verbatim, so the original printed
                    # "...loss: 0.123.Memory allocated:..." with no separator.
                    f'Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. loss: {train_loss:.3f}. '
                    f'Memory allocated: {mem_allocated / (1024 ** 2):.2f} MB, '
                    f'Memory reserved: {mem_reserved / (1024 ** 2):.2f} MB, '
                    f'Max memory allocated: {max_mem_allocated / (1024 ** 2):.2f} MB, '
                    f'Max memory reserved: {max_mem_reserved / (1024 ** 2):.2f} MB.'
                )

            # Reset peak stats so the maxima reflect only the next iteration.
            torch.cuda.reset_peak_memory_stats()

        print(
            f'Finished epoch {epoch}. '
            f'avg loss: {np.mean(losses)}; median loss: {np.median(losses)}'
        )

train()

使用 checkpoint 优化前:

  • Max memory allocated: 69.58 MB
  • Max memory reserved: 96.00 MB

使用 checkpoint 优化后:

  • Max memory allocated: 40.80 MB
  • Max memory reserved: 64.00 MB

使用例子3

见项目。

2、Hugging Face

  • gradient_checkpointing_enable官方文档及用法
  • Methods and tools for efficient training on a single GPU 中关于 Gradient Checkpointing 的内容
  • Performance and Scalability: How To Fit a Bigger Model and Train It Faster中关于Gradient Checkpointing的内容
  • 参考博客1
  • 参考博客2

使用例子

见项目。

3、DeepSpeed

  • https://eanyang7.github.io/transformers_docs/main_classes/deepspeed/#activation-checkpointing-gradient-checkpointing
  • https://zhuanlan.zhihu.com/p/644656141
  • https://blog.csdn.net/Scc_hy/article/details/138728380
  • https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one#deepspeed-zero

http://www.kler.cn/a/328101.html

相关文章:

  • LINUX-线程
  • MySql基础34题写题记录(3-10)
  • 【tbNick专享】虚拟机域控、成员服务器、降级等管理
  • pip install kaggle-environments ISSUE:Failed to build vec-noise
  • MicoZone-Git
  • 深度剖析IT技术前沿:编织数字世界的未来篇章
  • 怎么通过AI大模型开发一个网站?
  • SQL第11课——使用子查询
  • 1.1.5 计算机网络的性能指标(下)
  • 作文网源码 范文论文网模板 带会员系统+支付接口+整站数据
  • docker_阿里云镜像仓库
  • 代码随想录算法训练营第56天 | 1、冗余连接,2、冗余连接II
  • 【数学分析笔记】第4章第2节 导数的意义和性质(1)
  • 计算机前沿技术-人工智能算法-大语言模型-最新研究进展-2024-09-29
  • 谷歌发布Imagen 3,超过SD3、DALL・E-3,谷歌发布新RL方法,性能提升巨大,o1模型已证明
  • Python 封装 socket 为 [TCP/UDP/MULTICAST] 客户端
  • powerbi计算销售额同比增长率
  • MySql Explain优化命令使用
  • Vue实战教程:如何用JS封装一个可复用的Loading组件
  • 基于php的律所管理系统