【CUDA】mnist_cuda
Handwritten digit recognition with an MLP, implemented step by step in PyTorch, NumPy, C, and CUDA to expose the concrete implementation details at each level.
PyTorch version
The everyday PyTorch implementation: the framework wraps the low-level details cleanly, which makes it easy to program and quick to assemble a model.
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
TRAIN_SIZE = 10000
epochs = 3
learning_rate = 1e-3
batch_size = 4
data_dir = "./data"
# Set the precision used for float32 matrix multiplications
torch.set_float32_matmul_precision("high")
# MNIST Dataset
transform = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,)), # Mean and std of MNIST
]
)
train_dataset = datasets.MNIST(
root=data_dir, train=True, transform=transform, download=True
)
test_dataset = datasets.MNIST(
root=data_dir, train=False, transform=transform, download=True
)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Pre-allocate tensors of the appropriate size
train_data = torch.zeros(len(train_dataset), 1, 28, 28)
train_labels = torch.zeros(len(train_dataset), dtype=torch.long)
test_data = torch.zeros(len(test_dataset), 1, 28, 28)
test_labels = torch.zeros(len(test_dataset), dtype=torch.long)
# Load all training data into RAM
for idx, (data, label) in enumerate(train_loader):
start_idx = idx * batch_size
end_idx = start_idx + data.size(0)
train_data[start_idx:end_idx] = data
train_labels[start_idx:end_idx] = label
print("Train Data Shape:", train_data.shape)
print("Train Data Type:", train_data.dtype)
# Load all test data into RAM
for idx, (data, label) in enumerate(test_loader):
start_idx = idx * batch_size
end_idx = start_idx + data.size(0)
test_data[start_idx:end_idx] = data
test_labels[start_idx:end_idx] = label
print("Test Data Shape:", test_data.shape)
print("Test Data Type:", test_data.dtype)
iters_per_epoch = TRAIN_SIZE // batch_size
print("Iters per epoch:", iters_per_epoch)
class MLP(nn.Module):
def __init__(self, in_features, hidden_features, num_classes):
super(MLP, self).__init__()
self.fc1 = nn.Linear(in_features, hidden_features)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(hidden_features, num_classes)
def forward(self, x):
x = x.reshape(x.shape[0], 28 * 28)
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
return x
model = MLP(in_features=784, hidden_features=256, num_classes=10).to("cuda")
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# Training the model
def train(model, criterion, optimizer, epoch):
model.train()
running_loss = 0.0
for i in range(iters_per_epoch):
optimizer.zero_grad()
data = train_data[i * batch_size : (i + 1) * batch_size].to("cuda")
target = train_labels[i * batch_size : (i + 1) * batch_size].to("cuda")
start = time.time()
outputs = model(data)
loss = criterion(outputs, target)
loss.backward()
optimizer.step()
torch.cuda.synchronize()  # wait for queued GPU work so the timing below is meaningful
end = time.time()
running_loss += loss.item()
if i % 100 == 99 or i == 0:
print(f"Epoch: {epoch+1}, Iter: {i+1}, Loss: {loss}")
print(f"Iteration Time: {(end - start) * 1e3:.4f} sec")
running_loss = 0.0
# Evaluation function to report average batch accuracy using the loaded test data
def evaluate(model, test_data, test_labels):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
total_batch_accuracy = torch.tensor(0.0, device=device)
num_batches = 0
with torch.no_grad():
for i in range(len(test_data) // batch_size):
data = test_data[i * batch_size : (i + 1) * batch_size].to(device)
target = test_labels[i * batch_size : (i + 1) * batch_size].to(device)
outputs = model(data)
_, predicted = torch.max(outputs.data, 1)
correct_batch = (predicted == target).sum().item()
total_batch = target.size(0)
if total_batch != 0: # Check to avoid division by zero
batch_accuracy = correct_batch / total_batch
total_batch_accuracy += batch_accuracy
num_batches += 1
avg_batch_accuracy = total_batch_accuracy / num_batches
print(f"Average Batch Accuracy: {avg_batch_accuracy * 100:.2f}%")
# Main
if __name__ == "__main__":
for epoch in range(epochs):
train(model, criterion, optimizer, epoch)
evaluate(model, test_data, test_labels)
print("Finished Training")
From the formula behind nn.CrossEntropyLoss(), the first printed loss (Epoch: 1, Iter: 1, Loss: 2.2725696563720703) corresponds to a predicted probability of exp(-2.2726) ≈ 0.103 for the correct class. With randomly initialized parameters the model assigns roughly 1/10 to each of the ten classes; the expected loss of uniform guessing is ln(10) ≈ 2.303, so the model starts out close to random guessing.
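A quick check of this arithmetic in plain Python (not part of the training script; the loss value is the one printed above):

import math
loss = 2.2725696563720703   # first printed training loss
print(math.exp(-loss))      # ~0.103, (geometric-mean) probability assigned to the correct class
print(math.log(10))         # ~2.3026, expected loss when guessing uniformly over 10 classes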
NumPy version
The NumPy version peels back one more layer of the model's internals: apart from basic array operations, the model structure, the forward pass, and the backward pass all have to be implemented by hand.
import numpy as np
from torchvision import datasets, transforms
# Load and preprocess the data
transform = transforms.Compose([transforms.ToTensor()])
mnist_train = datasets.MNIST(root='data', train=True, download=True, transform=transform)
mnist_test = datasets.MNIST(root='data', train=False, download=True, transform=transform)
X_train = mnist_train.data.numpy().reshape(-1, 1, 28, 28)[:10000] / 255.0
y_train = mnist_train.targets.numpy()[:10000]
X_test = mnist_test.data.numpy().reshape(-1, 1, 28, 28) / 255.0
y_test = mnist_test.targets.numpy()
# print the shapes of the data
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# Activation functions
def relu(x):
return np.maximum(0, x)
# Derivative of ReLU
def relu_derivative(x):
return (x > 0).astype(float)
# Linear layer
# He (Kaiming) initialization is designed for ReLU-family activations (ReLU, Leaky ReLU, PReLU, etc.)
# Weights are drawn from N(0, σ), i.e. mean 0 and standard deviation σ = sqrt(2.0 / input_size)
def initialize_weights(input_size, output_size):
return np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
def initialize_bias(output_size):
return np.zeros((1, output_size))
def linear_forward(x, weights, bias):
return x @ weights + bias
def linear_backward(grad_output, x, weights):
grad_weights = x.T @ grad_output
grad_bias = np.sum(grad_output, axis=0, keepdims=True)
grad_input = grad_output @ weights.T
return grad_input, grad_weights, grad_bias
# Softmax and Cross-Entropy Loss
def softmax(x):
exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
return exp_x / np.sum(exp_x, axis=1, keepdims=True)
def cross_entropy_loss(y_pred, y_true):
batch_size = y_pred.shape[0]
probabilities = softmax(y_pred)
correct_log_probs = np.log(probabilities[np.arange(batch_size), y_true])
loss = -np.sum(correct_log_probs) / batch_size
return loss
class NeuralNetwork:
def __init__(self, input_size, hidden_size, output_size):
self.weights1 = initialize_weights(input_size, hidden_size)
self.bias1 = initialize_bias(hidden_size)
self.weights2 = initialize_weights(hidden_size, output_size)
self.bias2 = initialize_bias(output_size)
def forward(self, x):
batch_size = x.shape[0]
fc1_input = x.reshape(batch_size, -1)
fc1_output = linear_forward(fc1_input, self.weights1, self.bias1)
relu_output = relu(fc1_output)
fc2_output = linear_forward(relu_output, self.weights2, self.bias2)
return fc2_output, (fc1_input, fc1_output, relu_output)
def backward(self, grad_output, cache):
x, fc1_output, relu_output = cache
grad_fc2, grad_weights2, grad_bias2 = linear_backward(grad_output, relu_output, self.weights2)
grad_relu = grad_fc2 * relu_derivative(fc1_output)
grad_fc1, grad_weights1, grad_bias1 = linear_backward(grad_relu, x, self.weights1)
return grad_weights1, grad_bias1, grad_weights2, grad_bias2
def update_weights(self, grad_weights1, grad_bias1, grad_weights2, grad_bias2, learning_rate):
self.weights1 -= learning_rate * grad_weights1
self.bias1 -= learning_rate * grad_bias1
self.weights2 -= learning_rate * grad_weights2
self.bias2 -= learning_rate * grad_bias2
def train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate):
for epoch in range(epochs):
print(f"Epoch {epoch+1}/{epochs}")
for i in range(0, len(X_train), batch_size):
batch_X = X_train[i:i+batch_size]
batch_y = y_train[i:i+batch_size]
y_pred, cache = model.forward(batch_X)
loss = cross_entropy_loss(y_pred, batch_y)
softmax_probs = softmax(y_pred)
y_true_one_hot = np.zeros_like(y_pred)
y_true_one_hot[np.arange(len(batch_y)), batch_y] = 1
grad_output = softmax_probs - y_true_one_hot
grad_weights1, grad_bias1, grad_weights2, grad_bias2 = model.backward(grad_output, cache)
model.update_weights(grad_weights1, grad_bias1, grad_weights2, grad_bias2, learning_rate)
if (i//batch_size) % 100 == 0:
print(f"Iteration: {i//batch_size} Loss: {loss:.4f}")
y_pred, _ = model.forward(X_test)
test_loss = cross_entropy_loss(y_pred, y_test)
accuracy = np.mean(np.argmax(y_pred, axis=1) == y_test)
print(f"Epoch {epoch+1} - Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}")
print("Training completed!")
if __name__ == "__main__":
input_size = 784 # 28x28 pixels
hidden_size = 256
output_size = 10 # 10 digits
model = NeuralNetwork(input_size, hidden_size, output_size)
batch_size = 4
epochs = 3
learning_rate = 0.001
train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate)
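The hand-written backward pass can be sanity-checked against a centered finite difference. The sketch below assumes the NeuralNetwork, softmax, and cross_entropy_loss definitions above are in scope; the helper name grad_check_single_weight, the chosen indices, and eps are illustrative only.

def grad_check_single_weight(model, x, y, i=0, j=0, eps=1e-5):
    # analytic gradient of the mean cross-entropy loss w.r.t. weights1[i, j]
    y_pred, cache = model.forward(x)
    probs = softmax(y_pred)
    one_hot = np.zeros_like(y_pred)
    one_hot[np.arange(len(y)), y] = 1
    # divide by the batch size so the gradient matches the *mean* loss returned by
    # cross_entropy_loss (the training loop above uses the unscaled difference,
    # which is the gradient of the summed loss)
    grads = model.backward((probs - one_hot) / len(y), cache)
    analytic = grads[0][i, j]
    # centered finite difference on the same weight
    original = model.weights1[i, j]
    model.weights1[i, j] = original + eps
    loss_plus = cross_entropy_loss(model.forward(x)[0], y)
    model.weights1[i, j] = original - eps
    loss_minus = cross_entropy_loss(model.forward(x)[0], y)
    model.weights1[i, j] = original
    numeric = (loss_plus - loss_minus) / (2 * eps)
    print(f"analytic: {analytic:.6e}  numeric: {numeric:.6e}")

# example: grad_check_single_weight(model, X_train[:8], y_train[:8])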
CPU version
This version builds the whole training pipeline in plain C. Every part of the training flow (constructing the model, the computations, the forward pass, the backward pass, and the loss) has to be written from scratch:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <string.h>
#define INPUT_SIZE 784
#define HIDDEN_SIZE 256
#define OUTPUT_SIZE 10
#define TRAIN_SIZE 10000
#define TEST_SIZE 1000
#define BATCH_SIZE 4
#define EPOCHS 10
#define LEARNING_RATE 0.001
typedef struct {
float *weights1;
float *weights2;
float *bias1;
float *bias2;
float *grad_weights1;
float *grad_weights2;
float *grad_bias1;
float *grad_bias2;
} NeuralNetwork;
// load batched img data
void load_data(const char *filename, float *data, int size) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
fprintf(stderr, "Error opening file: %s\n", filename);
exit(1);
}
size_t read_size = fread(data, sizeof(float), size, file);
if (read_size != size) {
fprintf(stderr, "Error reading data: expected %d elements, got %zu\n", size, read_size);
exit(1);
}
fclose(file);
}
// load batch labels
void load_labels(const char *filename, int *labels, int size) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
fprintf(stderr, "Error opening file: %s\n", filename);
exit(1);
}
size_t read_size = fread(labels, sizeof(int), size, file);
if (read_size != size) {
fprintf(stderr, "Error reading labels: expected %d elements, got %zu\n", size, read_size);
exit(1);
}
fclose(file);
}
// Weight init: uniform values scaled by sqrt(2 / size) (a rough Kaiming-style scheme; note that size here is the full array length, not the layer fan-in)
void initialize_weights(float *weights, int size) {
float scale = sqrtf(2.0f / size);
for (int i = 0; i < size; i++) {
weights[i] = ((float)rand() / RAND_MAX) * scale - (scale / 2.0f);
}
}
// basic init for biases
void initialize_bias(float *bias, int size) {
for (int i = 0; i < size; i++) {
bias[i] = 0.0f;
}
}
// Modify softmax to work with batches
void softmax(float *x, int batch_size, int size) {
for (int b = 0; b < batch_size; b++) {
float max = x[b * size];
for (int i = 1; i < size; i++) {
if (x[b * size + i] > max) max = x[b * size + i];
}
float sum = 0.0f;
for (int i = 0; i < size; i++) {
x[b * size + i] = expf(x[b * size + i] - max);
sum += x[b * size + i];
}
for (int i = 0; i < size; i++) {
x[b * size + i] = fmaxf(x[b * size + i] / sum, 1e-7f);
}
}
}
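// Matrix multiplication A @ B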
void matmul_a_b(float *A, float *B, float *C, int m, int n, int k) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++) {
C[i * k + j] = 0.0f;
for (int l = 0; l < n; l++) {
C[i * k + j] += A[i * n + l] * B[l * k + j];
}
}
}
}
// Matrix multiplication A @ B.T
void matmul_a_bt(float *A, float *B, float *C, int m, int n, int k) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++) {
C[i * k + j] = 0.0f;
for (int l = 0; l < n; l++) {
C[i * k + j] += A[i * n + l] * B[j * n + l];
}
}
}
}
// Matrix multiplication A.T @ B
void matmul_at_b(float *A, float *B, float *C, int m, int n, int k) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < k; j++) {
C[i * k + j] = 0.0f;
for (int l = 0; l < m; l++) {
C[i * k + j] += A[l * n + i] * B[l * k + j];
}
}
}
}
// ReLU forward
void relu_forward(float *x, int size) {
for (int i = 0; i < size; i++) {
x[i] = fmaxf(0.0f, x[i]);
}
}
// Add bias
void bias_forward(float *x, float *bias, int batch_size, int size) {
for (int b = 0; b < batch_size; b++) {
for (int i = 0; i < size; i++) {
x[b * size + i] += bias[i];
}
}
}
// Modified forward function
void forward(NeuralNetwork *nn, float *input, float *hidden, float *output, int batch_size) {
// Input to Hidden (X @ W1)
matmul_a_b(input, nn->weights1, hidden, batch_size, INPUT_SIZE, HIDDEN_SIZE);
// Add bias1
bias_forward(hidden, nn->bias1, batch_size, HIDDEN_SIZE);
// Apply ReLU
relu_forward(hidden, batch_size * HIDDEN_SIZE);
// Hidden to Output (Hidden @ W2)
matmul_a_b(hidden, nn->weights2, output, batch_size, HIDDEN_SIZE, OUTPUT_SIZE);
// Add bias2
bias_forward(output, nn->bias2, batch_size, OUTPUT_SIZE);
// Apply softmax
softmax(output, batch_size, OUTPUT_SIZE);
}
// Modify cross_entropy_loss to work with batches
float cross_entropy_loss(float *output, int *labels, int batch_size) {
float total_loss = 0.0f;
for (int b = 0; b < batch_size; b++) {
total_loss -= logf(fmaxf(output[b * OUTPUT_SIZE + labels[b]], 1e-7f));
}
return total_loss / batch_size;
}
// Zero out gradients
void zero_grad(float *grad, int size) {
memset(grad, 0, size * sizeof(float));
}
// ReLU backward
void relu_backward(float *grad, float *x, int size) {
for (int i = 0; i < size; i++) {
grad[i] *= (x[i] > 0);
}
}
// Bias backward
void bias_backward(float *grad_bias, float *grad, int batch_size, int size) {
for (int i = 0; i < size; i++) {
grad_bias[i] = 0.0f;
// The bias gradient is the sum of the upstream gradient over the batch
for (int b = 0; b < batch_size; b++) {
grad_bias[i] += grad[b * size + i];
}
}
}
// Compute gradients for output layer
void compute_output_gradients(float *grad_output, float *output, int *labels, int batch_size) {
for (int b = 0; b < batch_size; b++) {
for (int i = 0; i < OUTPUT_SIZE; i++) {
grad_output[b * OUTPUT_SIZE + i] = output[b * OUTPUT_SIZE + i];
}
// With softmax + cross-entropy, the output-layer gradient is output - target
grad_output[b * OUTPUT_SIZE + labels[b]] -= 1.0f;
}
}
// Backward pass function
void backward(NeuralNetwork *nn, float *input, float *hidden, float *output, int *labels, int batch_size) {
// Initialize gradients to zero
zero_grad(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE);
zero_grad(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE);
zero_grad(nn->grad_bias1, HIDDEN_SIZE);
zero_grad(nn->grad_bias2, OUTPUT_SIZE);
// Compute gradients for output layer
float *grad_output = malloc(batch_size * OUTPUT_SIZE * sizeof(float));
compute_output_gradients(grad_output, output, labels, batch_size);
// Update gradients for weights2 (W2.grad = hidden.T @ grad_output)
matmul_at_b(hidden, grad_output, nn->grad_weights2, batch_size, HIDDEN_SIZE, OUTPUT_SIZE);
// Update gradients for bias2
bias_backward(nn->grad_bias2, grad_output, batch_size, OUTPUT_SIZE);
// Compute dX2 (gradient of loss w.r.t. input of second layer)
float *dX2 = malloc(batch_size * HIDDEN_SIZE * sizeof(float));
// grad_output @ W2.T = dX2 -> (B, 10) @ (10, 256) = (B, 256)
matmul_a_bt(grad_output, nn->weights2, dX2, batch_size, OUTPUT_SIZE, HIDDEN_SIZE);
// Compute d_ReLU_out (element-wise multiplication with ReLU derivative)
float *d_ReLU_out = malloc(batch_size * HIDDEN_SIZE * sizeof(float));
for (int i = 0; i < batch_size * HIDDEN_SIZE; i++) {
d_ReLU_out[i] = dX2[i] * (hidden[i] > 0);
}
// retains its shape since it's just a point-wise operation
// Update gradients for weights1 (W1.grad = input.T @ d_ReLU_out)
matmul_at_b(input, d_ReLU_out, nn->grad_weights1, batch_size, INPUT_SIZE, HIDDEN_SIZE);
// Update gradients for bias1
bias_backward(nn->grad_bias1, d_ReLU_out, batch_size, HIDDEN_SIZE);
// Free allocated memory
free(grad_output);
free(dX2);
free(d_ReLU_out);
}
// gradient descent step
void update_weights(NeuralNetwork *nn) {
for (int i = 0; i < HIDDEN_SIZE * INPUT_SIZE; i++) {
nn->weights1[i] -= LEARNING_RATE * nn->grad_weights1[i];
}
for (int i = 0; i < OUTPUT_SIZE * HIDDEN_SIZE; i++) {
nn->weights2[i] -= LEARNING_RATE * nn->grad_weights2[i];
}
for (int i = 0; i < HIDDEN_SIZE; i++) {
nn->bias1[i] -= LEARNING_RATE * nn->grad_bias1[i];
}
for (int i = 0; i < OUTPUT_SIZE; i++) {
nn->bias2[i] -= LEARNING_RATE * nn->grad_bias2[i];
}
}
// Modify train function to work with batches
void train(NeuralNetwork *nn, float *X_train, int *y_train) {
float *hidden = malloc(BATCH_SIZE * HIDDEN_SIZE * sizeof(float));
float *output = malloc(BATCH_SIZE * OUTPUT_SIZE * sizeof(float));
int num_batches = TRAIN_SIZE / BATCH_SIZE;
for (int epoch = 0; epoch < EPOCHS; epoch++) {
float total_loss = 0.0f;
int correct = 0;
for (int batch = 0; batch < num_batches; batch++) {
int start_idx = batch * BATCH_SIZE;
forward(nn, &X_train[start_idx * INPUT_SIZE], hidden, output, BATCH_SIZE);
float loss = cross_entropy_loss(output, &y_train[start_idx], BATCH_SIZE);
total_loss += loss;
for (int i = 0; i < BATCH_SIZE; i++) {
int predicted = 0;
for (int j = 1; j < OUTPUT_SIZE; j++) {
if (output[i * OUTPUT_SIZE + j] > output[i * OUTPUT_SIZE + predicted]) {
predicted = j;
}
}
if (predicted == y_train[start_idx + i]) {
correct++;
}
}
backward(nn, &X_train[start_idx * INPUT_SIZE], hidden, output, &y_train[start_idx], BATCH_SIZE);
update_weights(nn);
if ((batch + 1) % 100 == 0 || (epoch == 0 && batch == 0)) {
printf("Epoch %d/%d, Iter %d/%d, Loss: %.4f, Accuracy: %.2f%%\n",
epoch + 1, EPOCHS, batch + 1, num_batches, total_loss / (batch + 1),
100.0f * correct / ((batch + 1) * BATCH_SIZE));
}
}
printf("Epoch %d/%d completed, Loss: %.4f, Accuracy: %.2f%%\n",
epoch + 1, EPOCHS, total_loss / num_batches, 100.0f * correct / TRAIN_SIZE);
}
free(hidden);
free(output);
}
// Modify the initialize function to allocate memory for gradients
void initialize_neural_network(NeuralNetwork *nn) {
nn->weights1 = malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float));
nn->weights2 = malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float));
nn->bias1 = malloc(HIDDEN_SIZE * sizeof(float));
nn->bias2 = malloc(OUTPUT_SIZE * sizeof(float));
nn->grad_weights1 = malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float));
nn->grad_weights2 = malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float));
nn->grad_bias1 = malloc(HIDDEN_SIZE * sizeof(float));
nn->grad_bias2 = malloc(OUTPUT_SIZE * sizeof(float));
initialize_weights(nn->weights1, HIDDEN_SIZE * INPUT_SIZE);
initialize_weights(nn->weights2, OUTPUT_SIZE * HIDDEN_SIZE);
initialize_bias(nn->bias1, HIDDEN_SIZE);
initialize_bias(nn->bias2, OUTPUT_SIZE);
}
int main() {
srand(time(NULL));
NeuralNetwork nn;
initialize_neural_network(&nn);
float *X_train = malloc(TRAIN_SIZE * INPUT_SIZE * sizeof(float));
int *y_train = malloc(TRAIN_SIZE * sizeof(int));
float *X_test = malloc(TEST_SIZE * INPUT_SIZE * sizeof(float));
int *y_test = malloc(TEST_SIZE * sizeof(int));
load_data("./mnist_data/X_train.bin", X_train, TRAIN_SIZE * INPUT_SIZE);
load_labels("./mnist_data/y_train.bin", y_train, TRAIN_SIZE);
load_data("./mnist_data/X_test.bin", X_test, TEST_SIZE * INPUT_SIZE);
load_labels("./mnist_data/y_test.bin", y_test, TEST_SIZE);
// print first image in the terminal
for (int i = 0; i < 28; i++) {
for (int j = 0; j < 28; j++) {
if (X_train[0 * INPUT_SIZE + i * 28 + j] > 0.0f) {
printf("X");
} else {
printf(" ");
}
}
printf("\n");
}
printf("First 10 training labels: ");
for (int i = 0; i < 10; i++) {
printf("%d ", y_train[i]);
}
printf("\n");
train(&nn, X_train, y_train);
free(nn.weights1);
free(nn.weights2);
free(nn.bias1);
free(nn.bias2);
free(nn.grad_weights1);
free(nn.grad_weights2);
free(nn.grad_bias1);
free(nn.grad_bias2);
free(X_train);
free(y_train);
free(X_test);
free(y_test);
return 0;
}
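The C version above (and the CUDA version below) reads raw binary dumps from ./mnist_data/: float32 pixels and int-sized labels, matching the fread calls in load_data and load_labels. The preprocessing that produces these files is not shown in the source; a minimal sketch using torchvision might look like the following, where scaling the pixels to [0, 1] and using int32 labels are assumptions rather than something the source specifies.

import os
import numpy as np
from torchvision import datasets

os.makedirs("mnist_data", exist_ok=True)
for split, is_train in (("train", True), ("test", False)):
    ds = datasets.MNIST(root="data", train=is_train, download=True)
    images = ds.data.numpy().astype(np.float32).reshape(-1, 784) / 255.0  # assumed [0, 1] scaling
    labels = ds.targets.numpy().astype(np.int32)                          # 4-byte ints, as fread expects
    images.tofile(f"mnist_data/X_{split}.bin")
    labels.tofile(f"mnist_data/y_{split}.bin")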
GPU version (CUDA)
Naive version
The naive version replaces every loop in the C implementation with a straightforward CUDA kernel (roughly one thread per output element). Compared with the CPU version, the hidden size grows to 4096, the batch size to 32, and training runs for 20 epochs with a learning rate of 0.05.
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define INPUT_SIZE 784
#define HIDDEN_SIZE 4096
#define OUTPUT_SIZE 10
#define TRAIN_SIZE 10000
#define TEST_SIZE 1000
#define BATCH_SIZE 32
#define EPOCHS 20
#define LEARNING_RATE 0.05
typedef struct {
float *weights1;
float *weights2;
float *bias1;
float *bias2;
float *grad_weights1;
float *grad_weights2;
float *grad_bias1;
float *grad_bias2;
} NeuralNetwork;
// Modify the CUDA_CHECK macro to print more information
#define CUDA_CHECK(call) \
do { \
cudaError_t error = call; \
if (error != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} while (0)
// load batched img data
void load_data(const char *filename, float *data, int size) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
fprintf(stderr, "Error opening file: %s\n", filename);
exit(1);
}
size_t read_size = fread(data, sizeof(float), size, file);
if (read_size != size) {
fprintf(stderr, "Error reading data: expected %d elements, got %zu\n", size, read_size);
exit(1);
}
fclose(file);
}
// load batch labels
void load_labels(const char *filename, int *labels, int size) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
fprintf(stderr, "Error opening file: %s\n", filename);
exit(1);
}
size_t read_size = fread(labels, sizeof(int), size, file);
if (read_size != size) {
fprintf(stderr, "Error reading labels: expected %d elements, got %zu\n", size, read_size);
exit(1);
}
fclose(file);
}
// Weight init: uniform values scaled by sqrt(2 / size) (a rough Kaiming-style scheme; note that size here is the full array length, not the layer fan-in)
void initialize_weights(float *weights, int size) {
float scale = sqrtf(2.0f / size);
for (int i = 0; i < size; i++) {
weights[i] = ((float)rand() / RAND_MAX) * scale - (scale / 2.0f);
}
}
// basic init for biases
void initialize_bias(float *bias, int size) {
for (int i = 0; i < size; i++) {
bias[i] = 0.0f;
}
}
// CUDA kernel for matrix multiplication (A @ B)
__global__ void matmul_a_b_kernel(float *A, float *B, float *C, int m, int n, int k) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < m && col < k) {
float sum = 0.0f;
for (int i = 0; i < n; ++i) {
sum += A[row * n + i] * B[i * k + col];
}
C[row * k + col] = sum;
}
}
// CUDA kernel for matrix multiplication (A @ B.T)
__global__ void matmul_a_bt_kernel(float *A, float *B, float *C, int m, int n, int k) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < m && col < k) {
float sum = 0.0f;
for (int i = 0; i < n; ++i) {
sum += A[row * n + i] * B[col * n + i];
}
C[row * k + col] = sum;
}
}
// CUDA kernel for matrix multiplication (A.T @ B)
__global__ void matmul_at_b_kernel(float *A, float *B, float *C, int m, int n, int k) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < n && col < k) {
float sum = 0.0f;
for (int i = 0; i < m; ++i) {
sum += A[i * n + row] * B[i * k + col];
}
C[row * k + col] = sum;
}
}
// CUDA kernel for ReLU activation
__global__ void relu_kernel(float *x, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
x[idx] = fmaxf(0.0f, x[idx]);
}
}
// CUDA kernel for bias addition
__global__ void bias_add_kernel(float *x, float *bias, int batch_size, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int b = idx / size;
int i = idx % size;
if (b < batch_size && i < size) {
x[idx] += bias[i];
}
}
// CUDA kernel for softmax
__global__ void softmax_kernel(float *x, int batch_size, int size) {
int b = blockIdx.x;
if (b < batch_size) {
float max_val = x[b * size];
for (int i = 1; i < size; ++i) {
max_val = fmaxf(max_val, x[b * size + i]);
}
float sum = 0.0f;
for (int i = 0; i < size; ++i) {
x[b * size + i] = expf(x[b * size + i] - max_val);
sum += x[b * size + i];
}
for (int i = 0; i < size; ++i) {
x[b * size + i] = fmaxf(x[b * size + i] / sum, 1e-7f);
}
}
}
__global__ void clip_gradients_kernel(float *gradients, int size, float max_norm) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
float grad = gradients[idx];
if (grad > max_norm) {
gradients[idx] = max_norm;
} else if (grad < -max_norm) {
gradients[idx] = -max_norm;
}
}
}
// Modified forward function using CUDA kernels
void forward(NeuralNetwork *nn, float *d_input, float *d_hidden, float *d_output, int batch_size) {
// 32 x 32 = 1024 threads per block
dim3 block_size(32, 32);
// just enough blocks + threads for our naive matmul kernel
dim3 grid_size((HIDDEN_SIZE + block_size.x - 1) / block_size.x, (batch_size + block_size.y - 1) / block_size.y);
// Input to Hidden (X @ W1)
matmul_a_b_kernel<<<grid_size, block_size>>>(d_input, nn->weights1, d_hidden, batch_size, INPUT_SIZE, HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Add bias1 (one bias term for each neuron (multiple weights))
bias_add_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, nn->bias1, batch_size, HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Apply ReLU
relu_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, batch_size * HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Hidden to Output (Hidden @ W2)
grid_size.x = (OUTPUT_SIZE + block_size.x - 1) / block_size.x;
grid_size.y = (batch_size + block_size.y - 1) / block_size.y;
matmul_a_b_kernel<<<grid_size, block_size>>>(d_hidden, nn->weights2, d_output, batch_size, HIDDEN_SIZE,
OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
// Add bias2 (also one bias term per neuron)
bias_add_kernel<<<(batch_size * OUTPUT_SIZE + 255) / 256, 256>>>(d_output, nn->bias2, batch_size, OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
// Apply softmax
softmax_kernel<<<batch_size, 1>>>(d_output, batch_size, OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
// Modify cross_entropy_loss to work with batches (w/out softmax because we already do this in the forward pass)
float cross_entropy_loss(float *output, int *labels, int batch_size) {
float total_loss = 0.0f;
for (int b = 0; b < batch_size; b++) {
total_loss -= logf(fmaxf(output[b * OUTPUT_SIZE + labels[b]], 1e-7f));
}
return total_loss / batch_size;
}
// Add this CUDA kernel to zero out gradients
__global__ void zero_grad_kernel(float *grad, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
grad[idx] = 0.0f;
}
}
// CUDA kernel for computing output gradients
__global__ void compute_output_gradients_kernel(float *grad_output, float *output, int *labels, int batch_size) {
int b = blockIdx.x * blockDim.x + threadIdx.x;
if (b < batch_size) {
for (int i = 0; i < OUTPUT_SIZE; ++i) {
grad_output[b * OUTPUT_SIZE + i] = output[b * OUTPUT_SIZE + i];
}
grad_output[b * OUTPUT_SIZE + labels[b]] -= 1.0f;
}
}
// CUDA kernel for bias gradients: one thread per neuron, summing the upstream gradient over the batch
// (the weight gradients are already produced by the matmul_at_b_kernel calls in backward())
__global__ void bias_backward_kernel(float *grad_bias, float *grad_layer, int batch_size, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        float sum = 0.0f;
        for (int b = 0; b < batch_size; ++b) {
            sum += grad_layer[b * size + i];
        }
        grad_bias[i] = sum;
    }
}
__global__ void drelu_kernel(float *x, float *d_ReLU_out, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
d_ReLU_out[idx] = x[idx] > 0.0f ? 1.0f : 0.0f;
}
}
// Element-wise multiplication of d_dX2 and d_grad_hidden
__global__ void multiply_gradients_kernel(float *grad1, float *grad2, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
grad1[idx] *= grad2[idx];
}
}
// Modified backward function using CUDA kernels
// shape rotating is on par with the visual example (excalidraw diagram) in the mnist-cuda git repo (also found in
// "assets")
void backward(NeuralNetwork *nn, float *d_input, float *d_hidden, float *d_output, int *d_labels, int batch_size) {
// Initialize gradients to zero using CUDA kernel
zero_grad_kernel<<<(HIDDEN_SIZE * INPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
zero_grad_kernel<<<(OUTPUT_SIZE * HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights2,
OUTPUT_SIZE * HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
zero_grad_kernel<<<(HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias1, HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
zero_grad_kernel<<<(OUTPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias2, OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
// Compute gradients for output layer
float *d_grad_output;
CUDA_CHECK(cudaMalloc(&d_grad_output, batch_size * OUTPUT_SIZE * sizeof(float)));
compute_output_gradients_kernel<<<(batch_size + 255) / 256, 256>>>(d_grad_output, d_output, d_labels, batch_size);
CUDA_CHECK(cudaGetLastError());
// Update gradients for weights2 (W2.grad = hidden.T @ grad_output)
dim3 block_size(32, 32);
dim3 grid_size((HIDDEN_SIZE + block_size.x - 1) / block_size.x, (OUTPUT_SIZE + block_size.y - 1) / block_size.y);
matmul_at_b_kernel<<<grid_size, block_size>>>(d_hidden, d_grad_output, nn->grad_weights2, batch_size, HIDDEN_SIZE,
OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
// Update gradients for bias2 (sum the output-layer gradient over the batch)
bias_backward_kernel<<<(OUTPUT_SIZE + 255) / 256, 256>>>(nn->grad_bias2, d_grad_output, batch_size, OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
// Compute dX2 (gradient of loss w.r.t. input of second layer)
float *d_dX2;
CUDA_CHECK(cudaMalloc(&d_dX2, batch_size * HIDDEN_SIZE * sizeof(float)));
grid_size.x = (HIDDEN_SIZE + block_size.x - 1) / block_size.x;
grid_size.y = (batch_size + block_size.y - 1) / block_size.y;
matmul_a_bt_kernel<<<grid_size, block_size>>>(d_grad_output, nn->weights2, d_dX2, batch_size, OUTPUT_SIZE,
HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Compute d_ReLU_out (element-wise multiplication with ReLU derivative)
float *d_grad_hidden;
CUDA_CHECK(cudaMalloc(&d_grad_hidden, batch_size * HIDDEN_SIZE * sizeof(float)));
drelu_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, d_grad_hidden, batch_size * HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
multiply_gradients_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_dX2, d_grad_hidden,
batch_size * HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Update gradients for weights1 (W1.grad = input.T @ d_ReLU_out)
grid_size.x = (INPUT_SIZE + block_size.x - 1) / block_size.x;
grid_size.y = (HIDDEN_SIZE + block_size.y - 1) / block_size.y;
matmul_at_b_kernel<<<grid_size, block_size>>>(d_input, d_dX2, nn->grad_weights1, batch_size, INPUT_SIZE,
HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Update gradients for bias1 (d_dX2 now holds d_ReLU_out)
bias_backward_kernel<<<(HIDDEN_SIZE + 255) / 256, 256>>>(nn->grad_bias1, d_dX2, batch_size, HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Free allocated memory
CUDA_CHECK(cudaFree(d_grad_output));
CUDA_CHECK(cudaFree(d_dX2));
CUDA_CHECK(cudaFree(d_grad_hidden));
CUDA_CHECK(cudaDeviceSynchronize());
}
// gradient descent step
__global__ void update_weights_kernel(float *weights, float *grad_weights, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
weights[idx] -= LEARNING_RATE * grad_weights[idx];
}
}
void update_weights(NeuralNetwork *nn) {
int block_size = 256;
int grid_size;
// Update weights1
grid_size = (HIDDEN_SIZE * INPUT_SIZE + block_size - 1) / block_size;
update_weights_kernel<<<grid_size, block_size>>>(nn->weights1, nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
// Update weights2
grid_size = (OUTPUT_SIZE * HIDDEN_SIZE + block_size - 1) / block_size;
update_weights_kernel<<<grid_size, block_size>>>(nn->weights2, nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Update bias1
grid_size = (HIDDEN_SIZE + block_size - 1) / block_size;
update_weights_kernel<<<grid_size, block_size>>>(nn->bias1, nn->grad_bias1, HIDDEN_SIZE);
CUDA_CHECK(cudaGetLastError());
// Update bias2
grid_size = (OUTPUT_SIZE + block_size - 1) / block_size;
update_weights_kernel<<<grid_size, block_size>>>(nn->bias2, nn->grad_bias2, OUTPUT_SIZE);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
// Modify evaluate_accuracy to handle larger datasets by processing in batches
float evaluate_accuracy(NeuralNetwork *nn, float *d_X_test, int *d_y_test, float *d_hidden, float *d_output,
int total_size) {
int num_batches = (total_size + BATCH_SIZE - 1) / BATCH_SIZE;
int total_correct = 0;
int total_processed = 0;
for (int batch = 0; batch < num_batches; batch++) {
int current_batch_size = (batch == num_batches - 1) ? (total_size - batch * BATCH_SIZE) : BATCH_SIZE;
if (current_batch_size <= 0) break;
forward(nn, &d_X_test[batch * BATCH_SIZE * INPUT_SIZE], d_hidden, d_output, current_batch_size);
float *h_output = (float *)malloc(current_batch_size * OUTPUT_SIZE * sizeof(float));
int *h_y_test = (int *)malloc(current_batch_size * sizeof(int));
CUDA_CHECK(
cudaMemcpy(h_output, d_output, current_batch_size * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(h_y_test, &d_y_test[batch * BATCH_SIZE], current_batch_size * sizeof(int),
cudaMemcpyDeviceToHost));
for (int i = 0; i < current_batch_size; i++) {
int predicted = 0;
for (int j = 1; j < OUTPUT_SIZE; j++) {
if (h_output[i * OUTPUT_SIZE + j] > h_output[i * OUTPUT_SIZE + predicted]) {
predicted = j;
}
}
if (predicted == h_y_test[i]) {
total_correct++;
}
}
total_processed += current_batch_size;
free(h_output);
free(h_y_test);
}
return 100.0f * total_correct / total_processed;
}
// Modify train function
void train(NeuralNetwork *nn, float *X_train, int *y_train, float *X_test, int *y_test) {
float *d_X_train, *d_X_test, *d_hidden, *d_output;
int *d_y_train, *d_y_test;
// Allocate memory for training and test data
CUDA_CHECK(cudaMalloc(&d_X_train, TRAIN_SIZE * INPUT_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_X_test, TEST_SIZE * INPUT_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&d_y_train, TRAIN_SIZE * sizeof(int)));
CUDA_CHECK(cudaMalloc(&d_y_test, TEST_SIZE * sizeof(int)));
// Copy data to GPU
CUDA_CHECK(cudaMemcpy(d_X_train, X_train, TRAIN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_X_test, X_test, TEST_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_y_train, y_train, TRAIN_SIZE * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_y_test, y_test, TEST_SIZE * sizeof(int), cudaMemcpyHostToDevice));
int num_batches = TRAIN_SIZE / BATCH_SIZE;
for (int epoch = 0; epoch < EPOCHS; epoch++) {
float total_loss = 0.0f;
// Zero out gradients at the beginning of each epoch
zero_grad_kernel<<<(HIDDEN_SIZE * INPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights1,
HIDDEN_SIZE * INPUT_SIZE);
zero_grad_kernel<<<(OUTPUT_SIZE * HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights2,
OUTPUT_SIZE * HIDDEN_SIZE);
zero_grad_kernel<<<(HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias1, HIDDEN_SIZE);
zero_grad_kernel<<<(OUTPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias2, OUTPUT_SIZE);
CUDA_CHECK(cudaDeviceSynchronize());
for (int batch = 0; batch < num_batches; batch++) {
int start_idx = batch * BATCH_SIZE;
forward(nn, &d_X_train[start_idx * INPUT_SIZE], d_hidden, d_output, BATCH_SIZE);
float *h_output = (float *)malloc(BATCH_SIZE * OUTPUT_SIZE * sizeof(float));
CUDA_CHECK(
cudaMemcpy(h_output, d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost));
float loss = cross_entropy_loss(h_output, &y_train[start_idx], BATCH_SIZE);
total_loss += loss;
free(h_output);
backward(nn, &d_X_train[start_idx * INPUT_SIZE], d_hidden, d_output, &d_y_train[start_idx], BATCH_SIZE);
update_weights(nn);
if ((batch + 1) % 100 == 0 || (epoch == 0 && batch == 0)) {
// Use random batch from test set for accuracy reporting
int test_start_idx = rand() % (TEST_SIZE - BATCH_SIZE);
float test_accuracy = evaluate_accuracy(nn, &d_X_test[test_start_idx * INPUT_SIZE],
&d_y_test[test_start_idx], d_hidden, d_output, BATCH_SIZE);
printf("Epoch %d/%d, Iter %d/%d, Loss: %.4f, Test Accuracy: %.2f%%\n", epoch + 1, EPOCHS, batch + 1,
num_batches, total_loss / (batch + 1), test_accuracy);
}
}
// Evaluate on entire test set at end of epoch
float test_accuracy = evaluate_accuracy(nn, d_X_test, d_y_test, d_hidden, d_output, TEST_SIZE);
printf("Epoch %d/%d completed, Loss: %.4f, Test Accuracy: %.2f%%\n", epoch + 1, EPOCHS,
total_loss / num_batches, test_accuracy);
}
// Free GPU memory
CUDA_CHECK(cudaFree(d_X_train));
CUDA_CHECK(cudaFree(d_X_test));
CUDA_CHECK(cudaFree(d_hidden));
CUDA_CHECK(cudaFree(d_output));
CUDA_CHECK(cudaFree(d_y_train));
CUDA_CHECK(cudaFree(d_y_test));
}
// Modified initialize function to allocate memory for gradients
void initialize_neural_network(NeuralNetwork *nn) {
CUDA_CHECK(cudaMalloc(&nn->weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->bias1, HIDDEN_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->bias2, OUTPUT_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->grad_bias1, HIDDEN_SIZE * sizeof(float)));
CUDA_CHECK(cudaMalloc(&nn->grad_bias2, OUTPUT_SIZE * sizeof(float)));
// Allocate temporary host memory
float *h_weights1 = (float *)malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float));
float *h_weights2 = (float *)malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float));
float *h_bias1 = (float *)malloc(HIDDEN_SIZE * sizeof(float));
float *h_bias2 = (float *)malloc(OUTPUT_SIZE * sizeof(float));
// Initialize weights and biases on the host
initialize_weights(h_weights1, HIDDEN_SIZE * INPUT_SIZE);
initialize_weights(h_weights2, OUTPUT_SIZE * HIDDEN_SIZE);
initialize_bias(h_bias1, HIDDEN_SIZE);
initialize_bias(h_bias2, OUTPUT_SIZE);
// Copy initialized values to device
CUDA_CHECK(cudaMemcpy(nn->weights1, h_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(nn->weights2, h_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(nn->bias1, h_bias1, HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(nn->bias2, h_bias2, OUTPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice));
// Free temporary host memory
free(h_weights1);
free(h_weights2);
free(h_bias1);
free(h_bias2);
}
int main() {
srand(time(NULL));
NeuralNetwork nn;
initialize_neural_network(&nn);
float *X_train = (float *)malloc(TRAIN_SIZE * INPUT_SIZE * sizeof(float));
int *y_train = (int *)malloc(TRAIN_SIZE * sizeof(int));
float *X_test = (float *)malloc(TEST_SIZE * INPUT_SIZE * sizeof(float));
int *y_test = (int *)malloc(TEST_SIZE * sizeof(int));
load_data("./mnist_data/X_train.bin", X_train, TRAIN_SIZE * INPUT_SIZE);
load_labels("./mnist_data/y_train.bin", y_train, TRAIN_SIZE);
load_data("./mnist_data/X_test.bin", X_test, TEST_SIZE * INPUT_SIZE);
load_labels("./mnist_data/y_test.bin", y_test, TEST_SIZE);
// print first image in the terminal
for (int i = 0; i < 28; i++) {
for (int j = 0; j < 28; j++) {
if (X_train[0 * INPUT_SIZE + i * 28 + j] > 0.0f) {
printf("X");
} else {
printf(" ");
}
}
printf("\n");
}
printf("First 10 training labels: ");
for (int i = 0; i < 10; i++) {
printf("%d ", y_train[i]);
}
printf("\n");
// Start timing
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
train(&nn, X_train, y_train, X_test, y_test);
// End timing
clock_gettime(CLOCK_MONOTONIC, &end);
// Calculate duration in seconds with milliseconds
double training_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
printf("\nTotal training time: %.2f sec\n", training_time);
CUDA_CHECK(cudaFree(nn.weights1));
CUDA_CHECK(cudaFree(nn.weights2));
CUDA_CHECK(cudaFree(nn.bias1));
CUDA_CHECK(cudaFree(nn.bias2));
CUDA_CHECK(cudaFree(nn.grad_weights1));
CUDA_CHECK(cudaFree(nn.grad_weights2));
CUDA_CHECK(cudaFree(nn.grad_bias1));
CUDA_CHECK(cudaFree(nn.grad_bias2));
free(X_train);
free(y_train);
free(X_test);
free(y_test);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
return 1;
}
return 0;
}
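For a concrete sense of the launch geometry in forward(): each block of block_size = (32, 32) holds 1024 threads, and for the first matmul the output is BATCH_SIZE = 32 rows by HIDDEN_SIZE = 4096 columns. The short Python check below simply mirrors the integer arithmetic in the CUDA code above:

BATCH_SIZE, HIDDEN_SIZE = 32, 4096
bx, by = 32, 32                                   # block_size(32, 32) -> 1024 threads per block
grid_x = (HIDDEN_SIZE + bx - 1) // bx             # 128 blocks along the output columns
grid_y = (BATCH_SIZE + by - 1) // by              # 1 block along the batch rows
print(grid_x, grid_y, grid_x * grid_y * bx * by)  # 128 1 131072 threads, one per element of the 32 x 4096 hidden matrix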
Reference: https://github.com/Infatoshi/cuda-course