当前位置：首页 > article >正文

cuda手搓CNN识别手写数字

article 2025/2/8 19:15:52

英伟达提出了CUDA框架，用以实现GPU编程。CUDA C以C语言为基础，目前的CUDA编译器已经能够支持C++17的语法。但是CUDA C的基础语法还是只能使用C。
最近结合使用C++模板编程和CUDA C，手搓了一个CNN。其中矩阵的点乘依据的是之前博客提出的原理；激活函数和更新方法延续了之前模板元编程的思路，并且复用了一部分原来的代码。CUDA的矩阵加和、softmax等都是在AI生成代码的基础上修改而来（笔者使用的是GitHub Copilot，这个东西写出来的代码不能全信，因为它考虑得太过片面）。
笔者使用的显卡是Nvidia Tesla P4。这是一块玩具卡，市面上的价格在250块以下，相当实惠。
在这里插入图片描述
下面简短地贴一下主干的代码，首先是卷积层：

#ifndef __CNN_BASE_HPP__
#define __CNN_BASE_HPP__
#include "mat.hpp"
#include "bp_network.hpp"

/*
 * Convolution layer.
 *
 * Holds one tpl_row x tpl_col kernel (mt_tpl) and one bias matrix (mt_bias)
 * shaped like the layer output. forward() pads the input, convolves it with
 * the kernel using strides row_step / col_step, adds the bias and applies
 * activate_func. backward() propagates the error to the input and updates the
 * kernel and the bias through two update_method_templ instances.
 *
 * tpl_init_method is the initialiser passed to mat::reset for both the kernel
 * and the bias; val_t is the element type.
 */
template<int input_row, int input_col, int tpl_row, int tpl_col, int row_step, int col_step, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, typename val_t = double>
struct conv_layer 
{
	using tpl_type = mat<tpl_row, tpl_col, val_t>;
	using input_type = mat<input_row, input_col, val_t>;
	/* Input enlarged by the padding computed by get_pad_size for each dimension. */
	using pad_type = mat<input_row + get_pad_size(input_row, tpl_row, row_step)
		, input_col + get_pad_size(input_col, tpl_col, col_step), val_t>;
	using pad_size = pad_size_t<input_row, input_col, tpl_row, tpl_col, row_step, col_step>;
	/* Output type: whatever inner_conv yields for a padded input and a kernel. */
	using ret_type = decltype(inner_conv<row_step, col_step>(input_type().template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>(), tpl_type()));

	tpl_type mt_tpl;									/* convolution kernel */
	update_method_templ<mat<tpl_row, tpl_col, val_t>>	um_tpl;		/* optimizer state for the kernel */
	pad_type mt_input;									/* padded input cached by forward() for backward() */
	ret_type mt_bias;									/* bias, same shape as the output */
	update_method_templ<ret_type>	um_bias;			/* optimizer state for the bias */

	activate_func<ret_type>	act_func;

	/* Initialise kernel and bias with tpl_init_method. */
	conv_layer()
	{
		mt_tpl.template reset<tpl_init_method>();
		mt_bias.template reset<tpl_init_method>();
	}

	/*
	 * Forward pass: pad -> convolve with the kernel -> add bias -> activation.
	 * The padded input is stored in mt_input because backward() needs it to
	 * compute the kernel update.
	 */
	inline ret_type forward(const input_type& mt)
	{
		mt_input = mt.template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>();
		ret_type mt1 = inner_conv<row_step, col_step>(mt_input, mt_tpl);
		return act_func.forward( mt1 + mt_bias);
	}

	/*
	 * Backward pass.
	 * mt_delta is the error with respect to this layer's output. Returns the
	 * error with respect to the (unpadded) input, and updates mt_tpl and
	 * mt_bias as a side effect. Must be called after forward(), since it reads
	 * the cached mt_input.
	 */
	inline input_type backward(const ret_type& mt_delta) 
	{
		auto mt_delta_deact = act_func.backward(mt_delta);
		auto mt_delta_span = mt_delta_deact.template span<row_step - 1, col_step - 1>();			// the strided forward pass skipped some positions, so their back-propagated contribution is 0 (presumably span inserts those zeros -- confirm in mat.hpp)
		using ret_pad_type = decltype(mt_delta_span);
		/* Compute the back-propagated error */
		/* Work out how much the spanned delta has to be padded; presumably chosen so a
		   stride-1 convolution with the kernel yields a pad_type-sized result */
		constexpr int target_r = tpl_row + pad_type::row_num - 1;
		constexpr int target_c = tpl_col + pad_type::col_num - 1;
		constexpr int pad_top = (target_r - ret_pad_type::row_num) / 2;
		constexpr int pad_left = (target_c - ret_pad_type::col_num) / 2;
		constexpr int pad_right = (target_c - ret_pad_type::col_num) - pad_left;
		constexpr int pad_bottom = (target_r - ret_pad_type::row_num) - pad_top;
		auto mt_delta_span_pad = mt_delta_span.template pad<pad_top, pad_left, pad_right, pad_bottom>();
		auto mt_tpl_rot = mt_tpl.rot180();
		auto mt_ret_pad = inner_conv<1, 1, target_r, target_c, tpl_row, tpl_col, val_t>(mt_delta_span_pad, mt_tpl_rot);
		input_type mt_ret;
		mt_ret.template assign<-1 * pad_size::top, -1 * pad_size::left>(mt_ret_pad);			// crop away the outer (padding) border
		/* Compute the kernel update */
		auto mt_update = inner_conv<1, 1, pad_type::row_num, pad_type::col_num, ret_pad_type::row_num, ret_pad_type::col_num, val_t>(mt_input, mt_delta_span);
		/* Scale the update so its largest magnitude is 1 (skipped when it is all zero). */
		if (mt_update.max_abs() != 0)
			mt_update = mt_update / mt_update.max_abs();
		mt_tpl = um_tpl.update(mt_tpl, mt_update);
		/* NOTE(review): unlike the kernel update just above, the bias update passes
		   mt_delta_deact as BOTH arguments and subtracts the result from mt_bias --
		   confirm against update_method_templ::update that this asymmetry is intended. */
		mt_bias = mt_bias - um_bias.update(mt_delta_deact, mt_delta_deact);
		/* Re-centre the kernel to zero mean and rescale it so its largest magnitude is 1 */
		/* NOTE(review): d_mean is double even when val_t is not -- confirm this is intended. */
		double d_mean = mt_tpl.sum() / (tpl_row * tpl_col);
		mt_tpl = mt_tpl - d_mean;
		if (mt_tpl.max_abs() != 0)
			mt_tpl = mt_tpl / mt_tpl.max_abs();

		return mt_ret;
	}

	/* Forward update_inert() to both optimizer states (kernel and bias). */
	void update_inert() 
	{
		um_tpl.update_inert();
		um_bias.update_inert();
	}

	/* Print the kernel and the bias, each preceded by a tag line. */
	void print() 
	{
		printf("<template>\r\n");
		mt_tpl.print();
		printf("<bias>\r\n");
		mt_bias.print();
	}

	/* Print the integer template arguments of this layer followed by the input type. */
	static void print_type() 
	{
		printf("conv_layer<%d, %d, %d, %d, %d, %d> ", input_row, input_col, tpl_row, tpl_col, row_step, col_step);
		input_type::print_type();
	}
};

/*
	Multi-kernel convolution layer.

	Runs tpl_num independent conv_layer kernels over the same input and stacks
	their outputs vertically into a single matrix:

	    forward(I) = |K1(I)|
	                 |K2(I)|
	                 | ... |

	so ret_type has conv_ret_type::row_num * tpl_num rows and
	conv_ret_type::col_num columns. backward() splits the incoming delta the same
	way, back-propagates each slice through its kernel and averages the resulting
	input errors.

	Design notes from the original author. They describe a weighting layer that
	is currently disabled (see the commented-out weight_type lines):

	  The input looks like
	      C = |C1|
	          |C2|
	          |C3|
	  The kernels turn it into an array of matrices
	      A = {CK1, CK2, CK3}
	  which is stacked vertically into one large matrix that feeds the weighting layer
	      I = |CK1|
	          |CK2|
	          |CK3|
	  The weighting layer plus the activation give the final output
	      O = activate(W * I + B)
	  so O has the same dimensions as one CK.

	  Revised idea: arrange A horizontally before the weighting layer
	      I = |CK1, CK2, CK3| = |C1K1, C1K2, C1K3|
	                            |C2K1, C2K2, C2K3|
	                            |C3K1, C3K2, C3K3|
	      O = |sum(W1i * CiK1 + B), sum(W1i * CiK2 + B), sum(W1i * CiK3 + B)|
	          |sum(W2i * CiK1 + B), sum(W2i * CiK2 + B), sum(W2i * CiK3 + B)|
	          |sum(W3i * CiK1 + B), sum(W3i * CiK2 + B), sum(W3i * CiK3 + B)|
	      (columns run over kernels, rows over weights)
	  The final output is a matrix enlarged by that factor; every Wri is itself a matrix.
*/
template<
	typename val_type
	, int tpl_num
	, int input_row, int input_col
	, int tpl_row, int tpl_col
	, int row_step, int col_step
	, template<typename> class update_method_templ
	, template<typename> class activate_func
	, typename tpl_init_method
>
struct conv_with_weight
{
	/* join<0>/split<0> and backward() all touch element 0, so at least one kernel is required. */
	static_assert(tpl_num > 0, "conv_with_weight needs at least one kernel (tpl_num > 0)");

	using conv_type = conv_layer<input_row, input_col
		, tpl_row, tpl_col
		, row_step, col_step
		, update_method_templ
		, activate_func
		, tpl_init_method
		, val_type
	>;
	using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;
	using conv_ret_type = typename conv_type::ret_type;
	//using weight_type = typename bp_network<conv_ret_type::col_num, ReLu, val_type, nadam, he_mean_type, conv_ret_type::row_num * tpl_num, conv_ret_type::row_num>;
	//using ret_type = typename weight_type::ret_type;
	//weight_type weight_layer;
	using ret_type = mat<conv_ret_type::row_num*tpl_num, conv_ret_type::col_num, val_type>;

	conv_type tpls[tpl_num];		/* one conv_layer per kernel */

	/* Intentionally prints nothing while the weighting layer is disabled. */
	static void print_type()
	{
		//weight_type::print_type();
	}

	/*
		Print every kernel (template and bias) preceded by its index.
		conv_pool_layer::print() calls mt_conv.print(); without this member that
		call could not be instantiated.
	*/
	void print()
	{
		for (int i = 0; i < tpl_num; ++i)
		{
			printf("<kernel %d>\r\n", i);
			tpls[i].print();
		}
	}

	/* Copy mt_each[N], mt_each[N+1], ... into mt, each at row offset N * conv_ret_type::row_num. */
	template<int N>
	void join(ret_type& mt, const conv_ret_type* mt_each)
	{
		mt.template assign<N*conv_ret_type::row_num, 0>(mt_each[N]);
		if constexpr(N < tpl_num - 1)
			join<N + 1>(mt, mt_each);
	}

	/* Inverse of join: cut the stacked matrix mt into one slice per kernel. */
	template<int N>
	void split(const ret_type& mt, conv_ret_type* mt_each)
	{
		mt_each[N].template assign<-1 * N*conv_ret_type::row_num, 0>(mt);
		if constexpr(N < tpl_num - 1)
			split<N + 1>(mt, mt_each);
	}

	/* Run every kernel on the same input and return the vertically stacked outputs. */
	ret_type forward(const input_type& mt)
	{
		conv_ret_type ret[tpl_num];
		for (int j = 0; j < tpl_num; ++j)
		{
			ret[j] = tpls[j].forward(mt);
		}
		// Output matrix that collects the results of all kernels
		//typename weight_type::input_type mt_ret;
		ret_type mt_ret;
		join<0>(mt_ret, ret);
		return mt_ret;
		//return weight_layer.forward(mt_ret);
	}

	/*
		Back-propagate a stacked delta: each kernel receives its own slice (and
		updates itself inside conv_layer::backward); the input errors of all
		kernels are averaged.
	*/
	input_type backward(const ret_type& delta)
	{
		conv_ret_type ret_delta[tpl_num];
		split<0>(delta, ret_delta);
		// Seed the accumulator with the first kernel's result instead of a
		// default-constructed matrix, so the sum does not depend on mat's
		// default constructor zero-filling its storage.
		input_type ret = tpls[0].backward(ret_delta[0]);
		for (int j = 1; j < tpl_num; ++j)
		{
			ret = ret + tpls[j].backward(ret_delta[j]);
		}
		ret = ret / static_cast<val_type>(tpl_num);
		return ret;
	}

	/* Forward update_inert() to every kernel. */
	void update_inert()
	{
		for (int i = 0; i < tpl_num; ++i)
		{
			tpls[i].update_inert();
		}
		//weight_layer.update_inert();
	}
};

#endif

要说为什么一定要用模板编程。其实并不是因为模板编程更快，相反，有的时候模板编程的速度要比动态的要慢，因为现在的编译器（至少gcc 4.8.5）不能很好地进行优化，比如对于模板函数的递归调用，不能智能地展开，而真的就是傻傻地一个一个调用。模板编程的好处在于在编译期就能找到一些静态的问题，比如，如果你的网络结构有问题（使用模板编程这个问题在设计的时候就能解决掉），在编译的时候你就能察觉，而不需要很麻烦的跑去调试。
回到正题，下面是卷积神经网络的代码：

#ifndef __CNN_NETWORK_HPP__
#define __CNN_NETWORK_HPP__
#include "cnn_base.hpp"
#include "pool_layer.hpp"
#include "bp_network.hpp"

/*
Convolutional neural network building blocks.
  W  convolution (multi-kernel) layer
  P  pooling layer
  O  fully connected layer
I --> W1 --> P1 --> W2 --> P2 --> W3 --> P3 --> ... --> Wn --> Pn --> O
 */

/* One W -> P stage: a multi-kernel convolution followed by max pooling. */
template<
    typename val_type,
    template<typename> class update_method_templ,
    template<typename> class activate_func,
    typename tpl_init_method,
    int input_row, int input_col,               // rows / columns of the input matrix
    int tpl_num, int tpl_row, int tpl_col,      // number of kernels, kernel rows / columns
    int row_step, int col_step,                 // row stride, column stride
    int pool_row, int pool_col                  // pooling window rows / columns
    >
struct conv_pool_layer
{
    using conv_type = conv_with_weight<
        val_type, tpl_num,
        input_row, input_col,
        tpl_row, tpl_col,
        row_step, col_step,
        update_method_templ, activate_func, tpl_init_method>;
    using pool_type = pool_layer_max<
        conv_type::ret_type::row_num, conv_type::ret_type::col_num,
        pool_row, pool_col, val_type>;
    using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;
    using ret_type = typename pool_type::ret_type;

    conv_type mt_conv;      // convolution stage
    pool_type mt_pool;      // pooling stage

    /* Convolve, then pool. */
    ret_type forward(const input_type& mt)
    {
        return mt_pool.forward(
            mt_conv.forward(mt));
    }

    /* Undo the pooling on the delta, then back-propagate through the convolution. */
    input_type backward(const ret_type& mt)
    {
        return mt_conv.backward(
            mt_pool.backward(mt));
    }

    /* Only the convolution stage is asked to update its inertia. */
    void update_inert()
    {
        mt_conv.update_inert();
    }

    /* Print both stages; relies on conv_type and pool_type each offering print(). */
    void print()
    {
        mt_conv.print();
        mt_pool.print();
    }

    /* Dump the compile-time configuration of this stage, one "name: value" line each. */
    static void print_type()
    {
        const auto show = [](const char* label, int value)
        {
            std::cout << label << value << std::endl;
        };

        std::cout << "---------- conv-pool layer" << std::endl;
        show("tpl_num: ", tpl_num);
        show("input_row: ", input_row);
        show("input_col: ", input_col);
        show("tpl_row: ", tpl_row);
        show("tpl_col: ", tpl_col);
        show("row_step: ", row_step);
        show("col_step: ", col_step);
        show("pool_row: ", pool_row);
        show("pool_col: ", pool_col);
    }
};


// 堆叠卷积池化层
template<
    typename val_type
    , template<typename> class update_method_templ
    , template<typename> class activate_func
    , typename tpl_init_method
    , int input_row, int input_col
    /* 循环部分 */
    , int tpl_num , int tpl_row, int tpl_col
    , int row_step, int col_step
    , int pool_row, int pool_col
    , int... remain_layer>
struct stack_conv_pool_layer
{
    using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;
    using next_node_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, conv_pool_type::ret_type::row_num, conv_pool_type::ret_type::col_num, remain_layer...>;
    using input_type = typename conv_pool_type::input_type;
    using ret_type = typename next_node_type::ret_type;

    conv_pool_type mt_conv_pool;
    next_node_type next_node;

    ret_type forward(const typename conv_pool_type::input_type& mt)
    {
        return next_node.forward(mt_conv_pool.forward(mt));
    }

    typename conv_pool_type::input_type backward(const ret_type& delta)
    {
        return mt_conv_pool.backward(next_node.backward(delta));
    }

    void update_inert()
    {
        mt_conv_pool.update_inert();
        next_node.update_inert();
    }

    void print()
    {
        mt_conv_pool.print();
        next_node.print();
    }

    static void print_type()
    {
        std::cout << "---------- stack conv-pool layer" << std::endl;
        std::cout << "tpl_num: " << tpl_num << std::endl;
        std::cout << "input_row: " << input_row << std::endl;
        std::cout << "input_col: " << input_col << std::endl;
        std::cout << "tpl_row: " << tpl_row << std::endl;
        std::cout << "tpl_col: " << tpl_col << std::endl;
        std::cout << "row_step: " << row_step << std::endl;
        std::cout << "col_step: " << col_step << std::endl;
        std::cout << "pool_row: " << pool_row << std::endl;
        std::cout << "pool_col: " << pool_col << std::endl;
        std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;

        next_node_type::print_type();
    }
};

// 堆叠卷积池化层
template<
    typename val_type
    , template<typename> class update_method_templ
    , template<typename> class activate_func
    , typename tpl_init_method
    , int input_row, int input_col
    , int tpl_num , int tpl_row, int tpl_col
    , int row_step, int col_step
    , int pool_row, int pool_col
    >
struct stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>
{
    using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;
    using input_type = typename conv_pool_type::input_type;
    using ret_type = typename conv_pool_type::ret_type;

    conv_pool_type mt_conv_pool;

    ret_type forward(const typename conv_pool_type::input_type& mt)
    {
        return mt_conv_pool.forward(mt);
    }

    typename conv_pool_type::input_type backward(const ret_type& delta)
    {
        return mt_conv_pool.backward(delta);
    }

    void update_inert()
    {
        mt_conv_pool.update_inert();
    }

    void print()
    {
        mt_conv_pool.print();
    }

    static void print_type()
    {
        std::cout << "---------- stack conv-pool layer" << std::endl;
        std::cout << "tpl_num: " << tpl_num << std::endl;
        std::cout << "input_row: " << input_row << std::endl;
        std::cout << "input_col: " << input_col << std::endl;
        std::cout << "tpl_row: " << tpl_row << std::endl;
        std::cout << "tpl_col: " << tpl_col << std::endl;
        std::cout << "row_step: " << row_step << std::endl;
        std::cout << "col_step: " << col_step << std::endl;
        std::cout << "pool_row: " << pool_row << std::endl;
        std::cout << "pool_col: " << pool_col << std::endl;
        std::cout << "remain_layer: 0" << std::endl;
    }
};


/*
 * Complete CNN: a stack of conv-pool stages followed by one fully connected
 * bp_network.
 *
 * The output of the last conv-pool stage is flattened with to_vector() before
 * it enters the fully connected part; on the way back the fully connected
 * delta is reshaped with to_matrix() into the conv-pool output shape.
 *
 * Note the parameter order: the fully connected group is
 * (activation, update method, init method, output count) and the conv-pool
 * group is (activation, update method, init method).
 */
template<
    typename val_type
    /* fully connected layer parameters */
    , template<typename> class fullcon_activate_type        // activation of the fully connected layer
    , template<typename> class fullcon_update_method
    , typename fullcon_init_method
    , int fullcon_output_num
    /* conv-pool layer parameters */
    , template<typename> class activate_func
    , template<typename> class update_method_templ
    , typename tpl_init_method
    /* network input size */
    , int input_row, int input_col
    /* repeated part: one group of seven ints per conv-pool stage */
    , int tpl_num , int tpl_row, int tpl_col
    , int row_step, int col_step
    , int pool_row, int pool_col
    , int... remain_layer>
struct cnn_network
{
    using conv_pool_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col, remain_layer...>;
    using fullcon_type = bp_network<1, fullcon_activate_type, val_type, fullcon_update_method, fullcon_init_method, conv_pool_type::ret_type::size, fullcon_output_num>;
    using input_type = typename conv_pool_type::input_type;
    using output_type = typename fullcon_type::output_type;
    // output_type is already a plain (unqualified) type name here; writing
    // "typename output_type" is ill-formed and is rejected by GCC and Clang.
    using ret_type = output_type;

    conv_pool_type mt_conv_pool;    // convolution / pooling stack
    fullcon_type mt_fullcon;        // fully connected classifier

    /* Run the conv-pool stack, flatten its output and feed the fully connected part. */
    output_type forward(const input_type& mt)
    {
        return mt_fullcon.forward(mt_conv_pool.forward(mt).to_vector());
    }

    /*
     * Back-propagate delta through the fully connected part, reshape the result
     * to the conv-pool output shape and continue through the conv-pool stack.
     * Returns the error with respect to the network input.
     */
    input_type backward(const output_type& delta)
    {
        typename conv_pool_type::ret_type mt_conv_pool_delta;
        mt_fullcon.backward(delta).to_matrix(mt_conv_pool_delta);
        return mt_conv_pool.backward(mt_conv_pool_delta);
    }

    /* Forward update_inert() to both parts of the network. */
    void update_inert()
    {
        mt_conv_pool.update_inert();
        mt_fullcon.update_inert();
    }

    /* Print both parts of the network. */
    void print()
    {
        mt_conv_pool.print();
        mt_fullcon.print();
    }

    /* Dump the compile-time configuration of the network and of every sub-layer. */
    static void print_type()
    {
        std::cout << "---------- cnn network" << std::endl;
        std::cout << "fullcon_output_num: " << fullcon_output_num << std::endl;
        std::cout << "tpl_num: " << tpl_num << std::endl;
        std::cout << "input_row: " << input_row << std::endl;
        std::cout << "input_col: " << input_col << std::endl;
        std::cout << "tpl_row: " << tpl_row << std::endl;
        std::cout << "tpl_col: " << tpl_col << std::endl;
        std::cout << "row_step: " << row_step << std::endl;
        std::cout << "col_step: " << col_step << std::endl;
        std::cout << "pool_row: " << pool_row << std::endl;
        std::cout << "pool_col: " << pool_col << std::endl;
        // seven ints per additional conv-pool stage
        std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;

        conv_pool_type::print_type();
        fullcon_type::print_type();
    }
};

#endif

接下来是测试的代码，用的是MNIST手写数字数据集。

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "ht_memory.h"
#include "matrix.hpp"
#include "cnn_network.hpp"

/*
 * One labelled MNIST sample: the 28x28 image, its one-hot 10x1 label and the
 * label as a plain digit.
 *
 * NOTE(review): every special member below performs a plain member-wise
 * copy/move. If matrix_host has conventional copy/move semantics they could
 * all be dropped in favour of the compiler-generated ones -- confirm against
 * matrix.hpp before doing so.
 */
struct train_data
{
	matrix_host<28, 28, double> mt_image;	// pixel data
	matrix_host<10, 1, double> mt_label;	// one-hot label
	int i_num;								// label as a digit

    /* Empty sample with digit 0. */
    train_data()
        : mt_image(), mt_label(), i_num(0)
    {
    }

    /* Copy: the matrices are default-constructed first and then assigned. */
    train_data(const train_data &other)
        : i_num(other.i_num)
    {
        mt_image = other.mt_image;
        mt_label = other.mt_label;
    }

    train_data &operator=(const train_data &other)
    {
        mt_image = other.mt_image;
        mt_label = other.mt_label;
        i_num = other.i_num;
        return *this;
    }

    /* Move: the matrices are default-constructed first and then move-assigned. */
    train_data(train_data &&other)
        : i_num(other.i_num)
    {
        mt_image = std::move(other.mt_image);
        mt_label = std::move(other.mt_label);
    }

    train_data &operator=(train_data &&other)
    {
        mt_image = std::move(other.mt_image);
        mt_label = std::move(other.mt_label);
        i_num = other.i_num;
        return *this;
    }
};


int main()
{
    unsigned char sz_image_buf[28 * 28];

    std::vector<train_data> vec_train_data;

    ht_memory mry_train_images(ht_memory::big_endian);
    mry_train_images.read_file("./data/train-images.idx3-ubyte");
    int32_t i_image_magic_num = 0, i_image_num = 0, i_image_col_num = 0, i_image_row_num = 0;
    mry_train_images >> i_image_magic_num >> i_image_num >> i_image_row_num >> i_image_col_num;
    printf("magic num:%d | image num:%d | image_row:%d | image_col:%d\r\n", i_image_magic_num, i_image_num, i_image_row_num, i_image_col_num);

    ht_memory mry_train_labels(ht_memory::big_endian);
    mry_train_labels.read_file("./data/train-labels.idx1-ubyte");
    int32_t i_label_magic_num = 0, i_label_num = 0;
    mry_train_labels >> i_label_magic_num >> i_label_num;
    printf("magic num:%d | label num:%d\r\n", i_label_magic_num, i_label_num);

    for (int i = 0; i < i_image_num; ++i)
    {
        memset(sz_image_buf, 0, sizeof(sz_image_buf));
        train_data td;
        unsigned char uc_label = 0;
        mry_train_images.read((char *)sz_image_buf, sizeof(sz_image_buf));
        td.mt_image.set_data(sz_image_buf);
        mry_train_labels >> uc_label;
        td.i_num = uc_label;
        td.mt_label.get((const int)uc_label, 0) = 1;
        vec_train_data.push_back(td);
    }
    // 训练参数数量输入
    std::string str_train_times;
    std::cout << "train times:";
    std::getline(std::cin, str_train_times);

    int i_train_times = std::stol(str_train_times);

    std::cout << "train data set size:";
    std::string str_train_data_set_size;
    std::getline(std::cin, str_train_data_set_size);
    int i_train_data_set_size = std::stol(str_train_data_set_size);

    std::cout << "how many times should we update inert? ";
    std::string str_repeat_times;
    std::getline(std::cin, str_repeat_times);
    int i_repeat_times = std::stol(str_repeat_times);

    std::cout << "how many times should we show the result? ";
    std::string str_show_times;
    std::getline(std::cin, str_show_times);
    int i_show_times = std::stol(str_show_times);

    std::cout << "when correct rate reach threshold to stop?:";
    std::string str_repeat_threshold;
    std::getline(std::cin, str_repeat_threshold);
    double dthreshold = std::stod(str_repeat_threshold);

    // 打乱训练数据
    std::random_device rd;
    std::mt19937 rng(rd());
    std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);

    // 取出训练数据集
    std::vector<train_data> vec_train_data_set;
    for (int i = 0; i < i_train_data_set_size; ++i)
    {
        vec_train_data_set.push_back(vec_train_data[i]);
    }

    using cnn_type = cnn_network<
        double
        , softmax, nadam, xavier_gaussian_type, 10
        , ReLu, nadam, he_gaussian_type
        , 28, 28
        , 16, 5, 5, 1, 1, 2, 2
        , 32, 5, 5, 1, 1, 2, 2
    >;
    cnn_type cnn;
    printf("****** cnn network ******\r\n");
    cnn.print_type();
    printf("###### cnn network ######\r\n");

    mat<28, 28, double>** sz_images = new mat<28,28,double>*[i_train_data_set_size];
    mat<10, 1, double>** sz_labels = new mat<10, 1, double>*[i_train_data_set_size];
    memset(sz_images, 0, sizeof(mat<28, 28, double>*) * i_train_data_set_size);
    memset(sz_labels, 0, sizeof(mat<10, 1, double>*) * i_train_data_set_size);

    int i = 0;
    int i_correct = 0;
    for (; i < i_train_times; ++i)
    {
        i_correct = 0;
        for (int j = 0; j < i_train_data_set_size; ++j)
        {
            auto &td = vec_train_data_set[j];
            if (sz_images[j] == nullptr)
            {
                mat<28, 28, double> mt_image(td.mt_image);
                sz_images[j] = new mat<28, 28, double>(mt_image/256.0);
            }
            auto output = cnn.forward(*sz_images[j]);
            if (sz_labels[j] == nullptr)
            {
                sz_labels[j] = new mat<10, 1, double>(td.mt_label);
            }
            auto delta = (output - *sz_labels[j]);
            cnn.backward(delta);
            int r = 0, c = 0;
            output.max_idx(r, c);
            if (td.i_num == r)
            {
                ++i_correct;
            }
        }
        if ((double)i_correct / i_train_data_set_size > dthreshold)
        {
            break;
        }
        if (i % i_repeat_times == (i_repeat_times - 1))
        {
            cnn.update_inert();
        }
        if (i % i_show_times == (i_show_times - 1))
        {
            std::cout << "train times:" << i << " correct rate:" << (double)i_correct / i_train_data_set_size << std::endl;
        }
    }
    printf("train times:%d correct rate:%f\r\n", i, (double)i_correct / i_train_data_set_size);
    // 使用训练数据测试
    printf("---------- test with train data set ----------\r\n");
    i_correct = 0;
    for (int j = 0; j < i_train_data_set_size; ++j)
    {
        auto &td = vec_train_data_set[j];
        auto output = cnn.forward(*sz_images[j]);
        int r = 0, c = 0;
        output.max_idx(r, c);
        if (td.i_num == r)
        {
            ++i_correct;
        }
        if (j < 10)
        {
            std::cout << "label:" << td.i_num << " output:" << r << std::endl;
        }
    }
    // 随机找10个数据测试
    printf("---------- test with random data ----------\r\n");
    std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);
    for (int i = 0; i < 10; ++i)
    {
        auto &td = vec_train_data[i];
        mat<28, 28, double> mt_image(td.mt_image);
        auto output = cnn.forward(mt_image/256.0);
        int r = 0, c = 0;
        output.max_idx(r, c);
        std::cout << "label:" << td.i_num << " output:" << r << std::endl;
        //output.print();
    }

    cudaDeviceReset();

    return 0;
}

这个程序用的是2个卷积池化层加上1个全连接层，使用nadam进行训练加速。卷积池化层使用ReLu作为激活函数，全连接层使用的是softmax激活函数。全连接层使用xavier高斯初始化方法，卷积池化层使用的是He高斯方法初始化权值。
下面看看试验结果：

magic num:2049 | label num:60000
train times:200
train data set size:500
how many times should we update inert? 20
how many times should we show the result? 10
when correct rate reach threshold to stop?:0.99
****** cnn network ******
---------- cnn network
fullcon_output_num: 10
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 32
input_row: 192
input_col: 12
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 0
---------- bp_network ----------
batch_num:1, input_num:12032, output_num:10
###### cnn network ######
train times:9 correct rate:0.986
train times:10 correct rate:0.996000
---------- test with train data set ----------
label:7 output:7
label:3 output:3
label:1 output:1
label:1 output:1
label:2 output:2
label:9 output:9
label:7 output:7
label:4 output:4
label:3 output:3
label:7 output:7
---------- test with random data ----------
label:8 output:8
label:0 output:0
label:1 output:1
label:5 output:5
label:0 output:0
label:0 output:0
label:5 output:5
label:8 output:8
label:0 output:0
label:9 output:9
CUDA error in J:\03_workspace\00_cuda\02_matrix\matrix.hpp at line 279: invalid argument

以上我们输入最大训练200次，训练集大小为500，每20轮训练更新一下nadam的惯性量，每10轮训练打印一下结果，起码达到99%的正确率才能退出。最终我们看到训练了10轮这500个数据的数据集正确率就达到了99.6%。然后分别使用训练集中的数据和随机抽取的数据进行验证，可以看到，随机抽取的10个数据正确率也是100%！（需要注意，这10个“随机”样本同样取自60000张训练图片，可能与训练集重叠，并不是独立的测试集。）
手搓CNN完成。

查看全文

http://www.kler.cn/a/536833.html