NVLink 训练笔记
目录
还没测试出效果
还没测试出效果
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision.transforms import ToTensor
# Define the large fully connected model mentioned above.
class LargeFullyConnectedModel(nn.Module):
    """Four stacked linear layers with a ReLU after each of the first three.

    The default widths are the values the model was originally hard-coded
    with; pass smaller ones to build a lighter network with the same layout.

    Args:
        input_size: Number of features in each input row.
        hidden_size1: Output width of the first linear layer.
        hidden_size2: Output width of the second linear layer.
        hidden_size3: Output width of the third linear layer.
        output_size: Output width of the final linear layer.
    """

    def __init__(
        self,
        input_size=10000,
        hidden_size1=20000,
        hidden_size2=15000,
        hidden_size3=12000,
        output_size=5000,
    ):
        super().__init__()
        # Attribute names are kept stable so existing state_dict keys still load.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_size3, output_size)

    def forward(self, x):
        """Apply the four layers in order; the last layer has no activation.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor whose last dimension equals ``output_size``.
        """
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.fc4(x)
        return x
# Initialise the model and prepare the multi-GPU environment.
devices = [0, 1]  # indices of the GPUs to use
# Resolve the target device once so the model, the inputs and the targets are
# always placed together; this falls back to the CPU when CUDA is unavailable
# instead of failing while moving the sample tensors to a missing GPU.
device = torch.device(f"cuda:{devices[0]}" if torch.cuda.is_available() else "cpu")
model = LargeFullyConnectedModel()
if torch.cuda.device_count() > 1 and len(devices) > 1:
    print(f"使用 {len(devices)} 个 GPU 进行推理")
    model = nn.DataParallel(model, device_ids=devices)
else:
    # NOTE: this branch is also taken (and this message printed) on the CPU
    # fallback path, not only when exactly one GPU is used.
    print("仅使用单个 GPU 进行推理")
model.to(device)

# Simulated input data (example only; adapt this to the real data).
batch_size = 32
input_size = 10000
# The tensors are created first and then moved, so the random values are drawn
# the same way regardless of which device they end up on.
data = torch.randn(batch_size, input_size).to(device)
targets = torch.randint(0, 5000, (batch_size,)).to(device)
# Inference helper.
def inference(inputs=None, net=None):
    """Run one forward pass in evaluation mode with gradient tracking disabled.

    Args:
        inputs: Batch to feed to the network. When omitted, the module-level
            ``data`` tensor is used.
        net: Network to evaluate. When omitted, the module-level ``model``
            is used.

    Returns:
        Whatever ``net(inputs)`` returns.
    """
    # Fall back to the module-level objects so a bare ``inference()`` call
    # keeps working exactly as before.
    if net is None:
        net = model
    if inputs is None:
        inputs = data
    net.eval()
    with torch.no_grad():
        outputs = net(inputs)
    # Further processing (e.g. computing a loss or an accuracy) can be added
    # here as needed.
    return outputs


if __name__ == "__main__":
    inference()