当前位置：首页 > article >正文

AlgoC++第八课：手写BP

article 2025/2/22 2:17:35

手写BP

前言

手写AI推出的全新面向AI算法的C++课程 Algo C++，链接。记录下个人学习笔记，仅供自己参考。

本次课程主要是手写 BP 代码

课程大纲可看下面的思维导图

在这里插入图片描述

1. 数据加载

我们首先来实现下MNIST数据集的加载工作

#include <iostream>
#include <tuple>
#include <string>
#include <fstream>
#include "matrix.hpp"

using namespace std;

namespace Application{

    namespace io{

        // 不要内存对齐
        struct __attribute__((packed)) mnist_labels_header_t{
            unsigned int magic_number;
            unsigned int number_of_items;
        }

        struct __attribute__((packed)) mnist_images_header_t{
            unsigned int magic_number;
            unsigned int number_of_images;
            unsigned int number_of_rows;
            unsigned int number_of_cols;
        }

        unsigned int inverse_byte(unsigned int v){
            unsigned char* p = (unsigned char*)&v;
            std::swap(p[0], p[3]);
            std::swap(p[1], p[2]);
            return v;
        }

        /* 加载mnist数据集 */
        tuple<Matrix, Matrix> load_data(const string& image_file, const string& label_file){

            // file, mode = rb/wb/a+
            Matrix images, labels;
            fstream fimage(image_file, ios::binary | ios::in);
            fstream flabel(label_file, ios::binary | ios::in)

            mnist_images_header_t images_header;
            mnist_labels_header_t labels_header;
            fimage.read((char *)&images_header, sizeof(images_header));
            flabel.read((char *)&labels_header, sizeof(labels_header));
            
            // 大小端转换
            images_header.number_of_images = inverse_byte(images_header.number_of_images);
            labels_header.number_of_items  = inverse_byte(labels_header.number_of_items);

            images.resize(images_header.number_of_images, 28 * 28);
            // one hot, float
            labels.resize(labels_header.number_of_items, 10);

            // 中间存储作用, 它的大小等于文件中图像数据的大小
            std::vector<unsigned char> buffer(images.rows() * images.cols());
            fimage.read((char*)buffer.data(), buffer.size());

            // buffer 是unsigned char类型的图像数据，pixel
            // 现在需要转化到images矩阵中
            // 顺便把标准化给做了
            for(int i = 0; i < buffer.size() ++i)
                images.ptr()[i] = (buffer[i] / 255.0f - 0.1307f) / 0.3081f;

            // 开始处理label，ont-hot过程
            // labels是全零的矩阵
            buffer.resize(labels.rows());
            flabel.read((char*)buffer.data(), buffer.size());
            for(int i = 0; i < buffer.size(); ++i)
                labels.ptr(i)[buffer[i]] = 1;	// one-hot
            return make_tuple(images, labels);
        }
    }

    int run(){

        // 验证
        Matrix images, labels;
        tie(images, labels) = io::load_data(
            "mnist/train-images.idx3-ubyte",
            "mnist/train-labels.idx1-ubyte"
        );
        cout << labels;

        return 0;
    }
}

int main(){

    return Application::run();
}

上面示例代码演示了 mnist 数据集加载的示例，其中

定义了两个用于 mnist 数据集的结构体，分别是 mnist_labels_header_t 和 mnist_images_header_t。这两个结构体表示 mnist 标签和图像的头信息
__attribute__((packed)) 是一个 GCC/Clang 的扩展，用于告诉编译器不要进行内存对齐，以便我们自己手动控制内存的布局和对齐。
inverse_byte 用于大小端转换。在 mnist 数据集中，头文件中存储的是大端模式，而读取数据时需要将其转换为小端模式。inverse_byte 函数实现的方法是将 unsigned int 类型的变量按照字节顺序重新排列。
load_data 函数用于加载 mnist 数据集，需要传入图像和标签的文件名。在函数内部，使用 fstream 打开文件并读取头信息，对头信息进行大小端转换，然后根据图像和标签的大小创建矩阵。之后，读取图像和标签数据并进行归一化，最后返回归一化后的矩阵。

标签处理的代码有点看不懂，来分析下

buffer.resize(labels.rows());
flabel.read((char*)buffer.data(), buffer.size());
for(int i = 0; i < buffer.size(); ++i)
    labels.ptr(i)[buffer[i]] = 1;

在 MNIST 数据集中，标签是一个整数值，代表着手写数字的真实值。但是，在神经网络训练中，我们通常需要将标签转化为 one-hot 编码，以便于计算误差。例如，对于数字 5，它的 ont-hot 编码为[0,0,0,0,0,1,0,0,0,0]，也就是在第 6 个位置上为 1，其余位置都为0

这段代码中，首先通过 resize 函数将 buffer 的大小设置为标签行数，也就是样本数。然后从 flabel 中读取 buffer.size() 个字节到 buffer 中。由于标签是一个字节大小的整数，所以 buffer 中存储的是一连串的整数值。接下来的 for 循环中，遍历 buffer 中的所有元素，将标签对应的位置设置为 1，其余位置设置为 0

labels.ptr(i) 返回 labels 中第i行的地址，然后用 [buffer[i]] 访问该行中第 buffer[i] 列的元素，将该元素的值设置为1。这样就完成了标签数据的转化，使其变为了适合神经网络训练的形式

2. 前向传播

来看下整个前向传播过程，包括初始化超参数和权重、偏置矩阵，矩阵相乘进行前传并求取 Loss

#include <iostream>
#include <tuple>
#include <string>
#include <fstream>
#include <random>
#include <cmath>
#include <string.h>
#include <algorithm>    // shuffle
#include "matrix.hpp"

using namespace std;

namespace Application{

    namespace tools{
        
        vector<int> range(int end){
            vector<int> output(end);
            for(int i = 0; i < end; ++i)
                output[i] = i;
            return  output;
        }

        // 通过指定索引数组，并且指定图像、索引的起点和终点。获取对应索引的对应图像，返回matrix
        Matrix slice(const Matrix& images, const vector<int>& indexs, int start, int size){
            
            Matrix output(size, images.cols());
            for(int i = 0; i < size; ++i){

                int image_row_index = indexs[start + i];
                // 把images[image_row_index]对应的一行，复制到output[i]行
                memcpy(output.ptr(i), images.ptr(image_row_index), sizeof(float) * images.cols());
            }
            return output;
        }
    };

    namespace random{

        // 对应的就是seed
        static default_random_engine global_random_engine;

        Matrix normal_distribution_matrix(int rows, int cols, float mean = 0.0f, float stddev = 0.0f){

            // seed
            normal_distribution<float> norm(mean, stddev);
            Matrix output(rows, cols);
            for(int i = 0; i < rows * cols; ++i)
                output.ptr()[i] = norm(global_random_engine);
            
            return output;
        }

        // inplace -> 现场修改
        void shuffle_array(vector<int>& indexs){
            
            std::shuffle(indexs.begin(), indexs.end(), global_random_engine);
        }
    };

    namespace nn{
        
        Matrix relu(cosnt Matrix& x){

            Matrix output = x;
            auto p = output.ptr();
            for(int i = 0; i < x.numel(); ++i, ++p){
                // max(float, int)
                *p = std::max(*p, 0.0f);
            }
            return output;            
        }

        float sigmoid_cross_entropy_loss(const Matrix& output, const Matrix& y){

            static auto sigmoid = [](float x){return 1.0f / (1.0f + exp(-x));}

            // output => n x 10
            // y      => n x 10
            auto po = output.ptr();
            auto py = y.ptr();
            float loss = 0;
            float eps = 1e-5;
            for(int i = 0; i < output.numnel(); ++i, ++po, ++py){

                float prob = sigmoid(*po);
                prob = max(min(prob, 1-eps), eps);
                // loss += -(y * log(p) + (1-y) * log(1-p));
                loss += (*py) * log(prob) + (1 - *py) * log(1 - prob);
            }
            return -loss / output.rows();
        }
    };

    int run(){

        Matrix trainimages, trainlabels;
        Matrix valimages, vallabels;
        tie(trainimages, trainlabels) = io::load_data("mnist/train-images.idx3-ubyte", "mnist/train-labels.idx1-ubyte");
        tie(valimage, vallabels)      = io::load_data("mnist/t10k-images.idx3-ubyte",  "mnist/t10k-labels.idx1-ubyte");

        // hidden = images @ w1 + b1
        // output = hidden @ w2 + b2
        // loss   = lossfn(output, onehot_label)
        // loss.backward()
        int num_images = trainimages.rows();  // 60000
        int num_input  = trainimages.cols();  // 784
        int num_hidden = 1024;
        int num_output = 10;
        int num_epoch  = 10;
        float lr       = 1e-1;
        int batch_size = 256;
        float momentum = 0.9f;

        // drop last -> pytorch dataloader
        int num_batch_per_epoch = num_images / batch_size;

        // 训练流程
        // 1. 随机抓取一个batch，shuffle=True
        //      怎么实现随机呢？
        //      重点是，要求，每一个epoch，抓取的随机性不同
        //     按照索引抓取，index[0,1,2,3,4,5,6,7]
        //     shuffle(indexs) ->[6,5,4,0,1,3,2,7] 
        auto image_indexs = tools::range(num_images);

        // 确定矩阵的大小，并且对矩阵做初始化(凯明初始化fan_in,fan_out)
        float w1_gain =  2.0f / std::sqrt((float)num_input + num_hidden); // 对应激活函数        
        Matrix w1 = random::normal_distribution_matrix(num_input, num_hidden, 0, w1_gain);
        Matrix b1 = random::normal_distribution_matrix(1, num_hidden);        
        
        float w2_gain =  1.0f / std::sqrt((float)num_hidden + num_output); // 对应激活函数        
        Matrix w2 = random::normal_distribution_matrix(num_hidden, num_output, 0, w1_gain);
        Matrix b2 = random::normal_distribution_matrix(1, num_output);

        for(int epoch = 0; epoch < num_epoch; ++epoch){

            random::shuffle_array(image_indexs);

            for(int ibatch = 0; ibatch < num_batch_per_epoch; ++ibatch){

                // image_indexs[ibatch * batch_size : (ibatch + 1) * batch_size]
                auto x = tools::slice(trainimages, image_indexs, ibatch * batch_size, batch_size);
                cout << x.rows() << ", " << x.cols() << endl;
                auto y = tools::slice(trainlabels, image_indexs, ibatch * batch_size, batch_size);
                
                // n x m + 1 x m
                auto hidden = x.gemm(w1) + b1;
                auto hidden_act = nn::relu(hidden);
                auto output     = hidden_act.gemm(w2) + b2;
                float loss      = nn::sigmoid_cross_entropy_loss(output, y);
                
                cout << "Loss: " << loss << endl;
            }

        }
        // 验证
        // Matrix t = random::normal_distribution_matrix(3, 3);
        // Matrix s = tools::slice(t, {0, 1, 2}, 1, 2);
        // cout << t;
        // cout << s;

        return 0;
    }
}

int main(){

    return Application::run();
}

上面示例代码演示了前向传播的过程，包括初始化神经网络参数、进行前向传播计算、计算损失函数

首先，对神经网络的参数进行了初始化，包括输入层到隐藏层的权重矩阵 w1 和偏置向量 b1，隐藏层到输出层的权重矩阵 w2 和偏置向量 b2。这些参数都是通过调用 random::normal_distribution_matrix() 函数生成的，该函数使用了高斯分布来初始化，而 w1 和 w2 的初始化还使用了凯明初始化
然后，代码进入了训练流程，首先进行了数据的随机抽取。随机抽取的方法是使用了 tools::range() 函数生成一个索引数组，然后使用 random::shuffle_array() 函数对该索引数组进行打乱。打乱的目的是保证每个 epoch 中数据的随机性不同。
对于每个 batch，代码首先使用 tools::slice() 函数从训练集抽取出 batch_size 个样本，然后进行前向传播计算。
- hidden = images @ w1 + b1
- output = hidden @ w2 + b2
在得到输出层的数据后，代码计算了该 batch 的损失函数。其具体实现在 nn::sigmoid_cross_entropy_loss() 函数中，采用交叉熵损失，它首先将 output 中的每个元素通过 sigmoid 函数，然后根据公式计算 loss 并返回。

3. 反向传播

我们先来回忆下反向传播的公式

在这里插入图片描述

根据上图来计算 $L oss$ 分别对 $W_1$ $B_1$ $W_2$ $B_2$ 的梯度

1.隐藏层的输出为： $H = relu(XW_1 + B_1)$
2.输出层的预测概率为： $P = sigmoid(HW_2 + B_2)$
3.损失为： $L = B ina ry C ross E n t ro p y L oss (P, Y)$
4.计算 $L$ 对 $W_2$ 和 $B_2$ 的梯度： $\frac{\partial L}{\partial W_2}=H^{T}(P-Y)$ $\frac{\partial L}{\partial B_{2}}= reduce\_sum(P-Y)$
5.计算 $L$ 对 $W_1$ 和 $B_1$ 的梯度： $\frac{\partial L}{\partial W_{1}}=X^{T}\frac{\partial L}{\partial(X W_{1}+B_{1})}$ $\frac{\partial L}{\partial B_{1}}=reduce\_sum\frac{\partial L}{\partial(X W_{1}+B_{1})}$
6.拿到梯度后，对每一个参数应用优化器进行更新迭代

来看下整个反向传播的过程，根据 Loss 分别对 w1、b1、w2、b2 求取梯度，并更新参数

#include <iostream>
#include <tuple>
#include <string>
#include <fstream>
#include <random>
#include <cmath>
#include <string.h>
#include <algorithm>    // shuffle
#include "matrix.hpp"

using namespace std;

namespace Application{

    namespace nn{
        
        Matrix drelu(const Matrix& g, cosnt Matrix& x){
            
            Matrix output = g;
            auto px = x.ptr();
            auto pg = output.ptr();
            for(int i = 0; i < x.numel(); ++i, ++px, ++pg){
                if(*px < 0)
                    *pg = 0;
            }
            return output;
        }

        Matrix relu(cosnt Matrix& x){

            Matrix output = x;
            auto p = output.ptr();
            for(int i = 0; i < x.numel(); ++i, ++p){
                // max(float, int)
                *p = std::max(*p, 0.0f);
            }
            return output;            
        }

        Matrix sigmoid(const Matrix& x){
            Matrix output = x;
            auto p = output.ptr();
            for(int i = 0; i < x.numel(); ++i, ++p){
                float t = *p;
                *p = 1.0f / (1.0f + exp(-t));
            }
            return output;
        }

        float sigmoid_cross_entropy_loss(const Matrix& prob, const Matrix& y){

            // static auto sigmoid = [](float x){return 1.0f / (1.0f + exp(-x));}
            auto prob = sigmoid(output);

            // output => n x 10
            // y      => n x 10
            auto po = prob.ptr();
            auto py = y.ptr();
            float loss = 0;
            float eps = 1e-5;
            for(int i = 0; i < prob.numel(); ++i, ++po, ++py){

                float prob = *po;
                // prob = max(min(prob, 1-eps), eps);
                // loss += -(y * log(p) + (1-y) * log(1-p));
                loss += (*py) * log(p) + (1 - *py) * log(1 - p);
            }
            return -loss / prob.rows();
        }

        float eval_accuracy(const Matrix& testset, const Matrix& y,
            const Matrix& w1, const Matrix& b1, const Matrix& w2, const Matrix& b2){
                
                auto hidden = nn::relu(testset.gemm(w1) + b1);
                auto prob   = nn::sigmoid(hidden.gemm(w2) + b2);
                int correct = 0;

                for(int i = 0; i < prob.rows(); ++i){
                    
                    // 为了拿到每一行预测的标签值，使用argmax
                    int predict_label = prob.argmax(i);
                    
                    // 因为y是one-hot，所以对应预测标签如果是1，则正确
                    if(y(i, predict_label) == 1)
                        correct++;
                }

                return correct / (float)prob.rows();
            }
    };

    int run(){

        Matrix trainimages, trainlabels;
        Matrix valimages, vallabels;
        tie(trainimages, trainlabels) = io::load_data("mnist/train-images.idx3-ubyte", "mnist/train-labels.idx1-ubyte");
        tie(valimages, vallabels)      = io::load_data("mnist/t10k-images.idx3-ubyte",  "mnist/t10k-labels.idx1-ubyte");

        // hidden = images @ w1 + b1
        // output = hidden @ w2 + b2
        // loss   = lossfn(output, onehot_label)
        // loss.backward()
        int num_images = trainimages.rows();  // 60000
        int num_input  = trainimages.cols();  // 784
        int num_hidden = 1024;
        int num_output = 10;
        int num_epoch  = 10;
        float lr       = 1e-1;
        int batch_size = 256;
        float momentum = 0.9f;

        // drop last -> pytorch dataloader
        int num_batch_per_epoch = num_images / batch_size;  // 60000 / 256 = 234(iter)

        // 训练流程
        // 1. 随机抓取一个batch，shuffle=True
        //      怎么实现随机呢？
        //      重点是，要求，每一个epoch，抓取的随机性不同
        //     按照索引抓取，index[0,1,2,3,4,5,6,7]
        //     shuffle(indexs) ->[6,5,4,0,1,3,2,7] 
        auto image_indexs = tools::range(num_images);

        // 确定矩阵的大小，并且对矩阵做初始化(凯明初始化fan_in,fan_out)
        float w1_gain =  2.0f / std::sqrt((float)num_input + num_hidden); // 对应激活函数        
        Matrix w1 = random::normal_distribution_matrix(num_input, num_hidden, 0, w1_gain);
        Matrix b1 = random::normal_distribution_matrix(1, num_hidden);        
        
        float w2_gain =  1.0f / std::sqrt((float)num_hidden + num_output); // 对应激活函数        
        Matrix w2 = random::normal_distribution_matrix(num_hidden, num_output, 0, w1_gain);
        Matrix b2 = random::normal_distribution_matrix(1, num_output);

        for(int epoch = 0; epoch < num_epoch; ++epoch){

            random::shuffle_array(image_indexs);

            for(int ibatch = 0; ibatch < num_batch_per_epoch; ++ibatch){    // 一个epoch

                // image_indexs[ibatch * batch_size : (ibatch + 1) * batch_size]
                auto x = tools::slice(trainimages, image_indexs, ibatch * batch_size, batch_size);
                cout << x.rows() << ", " << x.cols() << endl;
                auto y = tools::slice(trainlabels, image_indexs, ibatch * batch_size, batch_size);
                
                // n x m + 1 x m
                auto hidden      = x.gemm(w1) + b1;
                auto hidden_act  = nn::relu(hidden);
                auto output      = hidden_act.gemm(w2) + b2;
                auto probability = nn::sigmoid(output)
                float loss       = nn::sigmoid_cross_entropy_loss(probability, y);
                
                cout << "Loss: " << loss << endl;

                // backward过程
                // dloss / doutput = (sigmoid(output) - y) / output.rows
                // C = AB
                // G = dloss / dC
                // dloss / dA = G @ B^T
                // dloss / dB = A^T @ G
                // output = hidden_act @ w2
                auto doutput     = (probability - y) / (float)output.rows();
                auto db2         = doutput.reduce_sum_by_row();
                auto dhidden_act = doutput.gemm(w2, false, true);
                auto dw2         = hidden_act.gemm(doutput, true);

                // dloss / dhidden
                auto dhidden     = nn::drelu(dhidden_act, hidden);

                // x @ w1 + b1
                auto db1         = dhidden.reduce_sum_by_row();
                auto dw1         = x.gemm(dhidden, true);

                // 更新
                w1 = w1 - lr * dw1;
                b1 = b1 - lr * dw1;

            }

            float accuracy = nn::eval_accuracy(valimages, vallabels, w1, b1, w2, b2);
            printf("%d. accuracy: %.2f %%\n", epoch, accuracy * 100);

        }
        // 验证
        // Matrix t = random::normal_distribution_matrix(3, 3);
        // Matrix s = tools::slice(t, {0, 1, 2}, 1, 2);
        // cout << t;
        // cout << s;

        return 0;
    }
}

int main(){

    return Application::run();
}

在上面的示例代码中，主要是求取梯度的过程比较繁琐，需要明确 $L$ 分别对 $W_1$ $B_1$ $W_2$ $B_2$ 的导数计算，后续就是利用 SGD 算法进行参数更新。最后在每个 epoch 结束后，计算测试集上的准确率并输出结果

总结

本次课程跟着杜老师手写了一遍 BP 反向传播算法，加深了对 BP 的认识，锻炼了自己的动手能力，提高了对 C++ 和神经网络的进一步理解。整个 BP 过程还是比较清晰的，无非是前传 => 计算Loss => 反传 => 更新，但还是要求对许多细节的把握，比如如何去优化代码提高性能，如何尽可能的减少矩阵相乘的次数等等，多想多写吧。期待下次手写 AutoGrad 课程😄

查看全文

http://www.kler.cn/a/15882.html