cuda手搓CNN识别手写数字
英伟达提出了cuda框架,用以实现GPU编程。cuda c以c语言为基础,目前的cuda编译器已经能够支持c++17的语法。但是cuda c的基础语法还是只能使用C。
最近结合使用C++模板编程和cuda c,手搓了一个CNN。其中矩阵的点乘依据的是之前博客提出的原理,激活函数和更新方法延续了之前模板元编程的思路,并且复用了一部分原来的代码。cuda的矩阵加和、softmax等都是在AI生成代码的基础上修改而来(笔者使用的是GitHub Copilot,这个东西写出来的代码不能全信,因为它考虑得太过片面)。
笔者使用的显卡是Nvidia Tesla P4。这是一块玩具卡,市面上的价格在250块以下,相当实惠。
下面简短地贴一下主干的代码,首先是卷积层:
#ifndef __CNN_BASE_HPP__
#define __CNN_BASE_HPP__
#include "mat.hpp"
#include "bp_network.hpp"
/* 卷积层 */
/**
 * Single-kernel 2-D convolution layer with a trainable kernel ("template")
 * and a trainable bias matrix.
 *
 * forward() pads the input, convolves it with the kernel using the given
 * stride, adds the bias and applies the activation. backward() returns the
 * error for the previous layer and updates kernel and bias through the
 * supplied update policy.
 *
 * Template parameters:
 *   input_row, input_col   size of the input matrix
 *   tpl_row, tpl_col       size of the kernel
 *   row_step, col_step     stride in rows / columns
 *   update_method_templ    update policy; used via update() and update_inert()
 *   activate_func          activation; used via forward() and backward()
 *   tpl_init_method        initializer handed to mat::reset<>()
 *   val_t                  element type (defaults to double)
 */
template<int input_row, int input_col, int tpl_row, int tpl_col, int row_step, int col_step, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, typename val_t = double>
struct conv_layer
{
    using tpl_type = mat<tpl_row, tpl_col, val_t>;
    using input_type = mat<input_row, input_col, val_t>;
    // Input size after padding; the amount comes from get_pad_size().
    using pad_type = mat<input_row + get_pad_size(input_row, tpl_row, row_step)
                        , input_col + get_pad_size(input_col, tpl_col, col_step), val_t>;
    using pad_size = pad_size_t<input_row, input_col, tpl_row, tpl_col, row_step, col_step>;
    // Output type is whatever inner_conv yields for a padded input and one kernel.
    using ret_type = decltype(inner_conv<row_step, col_step>(input_type().template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>(), tpl_type()));

    tpl_type mt_tpl;                                          // convolution kernel
    update_method_templ<mat<tpl_row, tpl_col, val_t>> um_tpl; // update state for the kernel
    pad_type mt_input;                                        // padded input cached by forward() for backward()
    ret_type mt_bias;                                         // bias, same shape as the output
    update_method_templ<ret_type> um_bias;                    // update state for the bias
    activate_func<ret_type> act_func;

    /** Initializes kernel and bias with tpl_init_method. */
    conv_layer()
    {
        mt_tpl.template reset<tpl_init_method>();
        mt_bias.template reset<tpl_init_method>();
    }

    /**
     * Forward pass: pad -> convolve -> add bias -> activate.
     * The padded input is stored in mt_input because backward() needs it.
     */
    inline ret_type forward(const input_type& mt)
    {
        mt_input = mt.template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>();
        ret_type mt1 = inner_conv<row_step, col_step>(mt_input, mt_tpl);
        return act_func.forward( mt1 + mt_bias);
    }

    /**
     * Backward pass.
     *
     * @param mt_delta  error with respect to this layer's output
     * @return          error with respect to this layer's (unpadded) input
     *
     * Side effects: updates mt_tpl and mt_bias. Must be called after
     * forward(), since it reads the cached mt_input.
     */
    inline input_type backward(const ret_type& mt_delta)
    {
        auto mt_delta_deact = act_func.backward(mt_delta);
        // With a stride some positions were never computed in forward(), so
        // their contribution to the back-propagated error is zero.
        auto mt_delta_span = mt_delta_deact.template span<row_step - 1, col_step - 1>();
        using ret_pad_type = decltype(mt_delta_span);

        /* Error for the previous layer. */
        /* Size the delta has to be padded to before convolving it with the rotated kernel. */
        constexpr int target_r = tpl_row + pad_type::row_num - 1;
        constexpr int target_c = tpl_col + pad_type::col_num - 1;
        constexpr int pad_top = (target_r - ret_pad_type::row_num) / 2;
        constexpr int pad_left = (target_c - ret_pad_type::col_num) / 2;
        // The remainder goes to the right / bottom so odd differences are still fully covered.
        constexpr int pad_right = (target_c - ret_pad_type::col_num) - pad_left;
        constexpr int pad_bottom = (target_r - ret_pad_type::row_num) - pad_top;
        auto mt_delta_span_pad = mt_delta_span.template pad<pad_top, pad_left, pad_right, pad_bottom>();
        auto mt_tpl_rot = mt_tpl.rot180();
        auto mt_ret_pad = inner_conv<1, 1, target_r, target_c, tpl_row, tpl_col, val_t>(mt_delta_span_pad, mt_tpl_rot);
        input_type mt_ret;
        // Negative offsets: trim the padding border added in forward().
        mt_ret.template assign<-1 * pad_size::top, -1 * pad_size::left>(mt_ret_pad);

        /* Kernel update. */
        auto mt_update = inner_conv<1, 1, pad_type::row_num, pad_type::col_num, ret_pad_type::row_num, ret_pad_type::col_num, val_t>(mt_input, mt_delta_span);
        // Evaluate max_abs() once; normalize the update to a peak magnitude of 1.
        const auto update_peak = mt_update.max_abs();
        if (update_peak != 0)
            mt_update = mt_update / update_peak;
        mt_tpl = um_tpl.update(mt_tpl, mt_update);
        // NOTE(review): unlike the kernel update above, the bias passes the
        // delta as both arguments and subtracts the result — confirm that
        // this asymmetry is intended.
        mt_bias = mt_bias - um_bias.update(mt_delta_deact, mt_delta_deact);

        /* Re-center the kernel to zero mean and rescale it to a peak magnitude of 1. */
        const val_t tpl_mean = static_cast<val_t>(mt_tpl.sum() / (tpl_row * tpl_col));
        mt_tpl = mt_tpl - tpl_mean;
        const auto tpl_peak = mt_tpl.max_abs();
        if (tpl_peak != 0)
            mt_tpl = mt_tpl / tpl_peak;
        return mt_ret;
    }

    /** Forwards update_inert() to both update policies (kernel and bias). */
    void update_inert()
    {
        um_tpl.update_inert();
        um_bias.update_inert();
    }

    /** Prints kernel and bias. */
    void print()
    {
        printf("<template>\r\n");
        mt_tpl.print();
        printf("<bias>\r\n");
        mt_bias.print();
    }

    /** Prints the layer's compile-time configuration followed by the input type. */
    static void print_type()
    {
        printf("conv_layer<%d, %d, %d, %d, %d, %d> ", input_row, input_col, tpl_row, tpl_col, row_step, col_step);
        input_type::print_type();
    }
};
/*
多通道、多核的卷积层
输入是这个样子的:
C = |C1|
|C2|
|C3|
经过卷积核之后生成一个矩阵数组
A = {CK1,CK2,CK3}
将数组中的矩阵纵向排列形成一个大的矩阵,作为加权层的输入
I = |CK1|
|CK2|
|CK3|
通过加权层得到输出,并且经过激活函数得到最终的输出
O = activate(W * I + B)
所以最终O是CK相同维度的一个矩阵
fix:--------------------------------------------------------------------------
将A矩阵横向排列,然后通过加权层得到输出
I=|CK1,CK2,CK3|=|C1K1, C1K2, C1K3|
|C2K1, C2K2, C2K3|
|C3K1, C3K2, C3K3|
然后经过加权层输出得到O
O = |sum(Wri * CiK1 + B), sum(Wri * CiK2 + B), sum(Wri * CiK3 + B)|
---> Kernel
= |sum(W1i * C1K1 + B), sum(W1i * C1K2 + B), sum(W1i * C1K3 + B)| |
|sum(W2i * CiK1 + B), sum(W2i * CiK2 + B), sum(W2i * CiK3 + B)| | Weight
|sum(W3i * CiK1 + B), sum(W3i * CiK2 + B), sum(W3i * CiK3 + B)| V
最终输出的是一个增倍的矩阵。其中每个Wri都是一个矩阵。
*/
/**
 * Bank of tpl_num independent conv_layer kernels applied to the same input.
 *
 * forward() runs every kernel and stacks the tpl_num results vertically into
 * one matrix of conv_ret_type::row_num * tpl_num rows. backward() splits the
 * incoming delta the same way, back-propagates each slice through its kernel
 * and returns the average of the resulting input errors.
 *
 * The additional weighting layer hinted at by the name is currently disabled
 * (see the commented-out weight_type lines).
 */
template<
    typename val_type
    , int tpl_num
    , int input_row, int input_col
    , int tpl_row, int tpl_col
    , int row_step, int col_step
    , template<typename> class update_method_templ
    , template<typename> class activate_func
    , typename tpl_init_method
>
struct conv_with_weight
{
    // A zero-length (or negative) kernel array is ill-formed; fail with a readable message.
    static_assert(tpl_num > 0, "conv_with_weight needs at least one kernel (tpl_num > 0)");

    using conv_type = conv_layer<input_row, input_col
                                , tpl_row, tpl_col
                                , row_step, col_step
                                , update_method_templ
                                , activate_func
                                , tpl_init_method
                                , val_type
                                >;
    using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;
    using conv_ret_type = typename conv_type::ret_type;
    //using weight_type = typename bp_network<conv_ret_type::col_num, ReLu, val_type, nadam, he_mean_type, conv_ret_type::row_num * tpl_num, conv_ret_type::row_num>;
    //using ret_type = typename weight_type::ret_type;
    //weight_type weight_layer;
    // All kernel outputs stacked on top of each other.
    using ret_type = mat<conv_ret_type::row_num*tpl_num, conv_ret_type::col_num, val_type>;

    conv_type tpls[tpl_num];   // one convolution layer per kernel

    /** Intentionally prints nothing while the weighting layer is disabled. */
    static void print_type()
    {
        //weight_type::print_type();
    }

    /**
     * Prints every kernel layer in order. conv_pool_layer::print() calls
     * print() on this type, so the member has to exist.
     */
    void print()
    {
        for (int i = 0; i < tpl_num; ++i)
        {
            tpls[i].print();
        }
    }

    /**
     * Copies mt_each[N..tpl_num-1] into mt, slice N starting at row
     * N * conv_ret_type::row_num. Recursion is needed because the row offset
     * is a template argument of assign<>().
     */
    template<int N>
    void join(ret_type& mt, const conv_ret_type* mt_each)
    {
        mt.template assign<N*conv_ret_type::row_num, 0>(mt_each[N]);
        if constexpr(N < tpl_num - 1)
            join<N + 1>(mt, mt_each);
    }

    /**
     * Inverse of join(): fills mt_each[N..tpl_num-1] from the stacked matrix.
     * The negative row offset selects the slice that starts at row
     * N * conv_ret_type::row_num of mt.
     */
    template<int N>
    void split(const ret_type& mt, conv_ret_type* mt_each)
    {
        mt_each[N].template assign<-1 * N*conv_ret_type::row_num, 0>(mt);
        if constexpr(N < tpl_num - 1)
            split<N + 1>(mt, mt_each);
    }

    /** Runs every kernel on mt and returns the vertically stacked outputs. */
    ret_type forward(const input_type& mt)
    {
        typename conv_type::ret_type ret[tpl_num];
        for (int j = 0; j < tpl_num; ++j)
        {
            ret[j] = tpls[j].forward(mt);
        }
        // Output matrix that receives the stacked convolution results.
        //typename weight_type::input_type mt_ret;
        ret_type mt_ret;
        join<0>(mt_ret, ret);
        return mt_ret;
        //return weight_layer.forward(mt_ret);
    }

    /**
     * Back-propagates delta through every kernel (updating each of them) and
     * returns the mean of the per-kernel input errors.
     */
    input_type backward(const ret_type& delta)
    {
        // NOTE(review): the accumulation below assumes a default-constructed
        // mat is zero-filled — confirm in mat.hpp.
        input_type ret;
        typename conv_type::ret_type ret_delta[tpl_num];
        split<0>(delta, ret_delta);
        for (int j = 0; j < tpl_num; ++j)
        {
            ret = ret + tpls[j].backward(ret_delta[j]);
        }
        ret = ret / static_cast<val_type>(tpl_num);
        return ret;
    }

    /** Forwards update_inert() to every kernel layer. */
    void update_inert()
    {
        for (int i = 0; i < tpl_num; ++i)
        {
            tpls[i].update_inert();
        }
        //weight_layer.update_inert();
    }
};
#endif
要说为什么一定要用模板编程。其实并不是因为模板编程更快,相反,有的时候模板编程的速度要比动态的要慢,因为现在的编译器(至少gcc 4.8.5)不能很好地进行优化,比如对于模板函数的递归调用,不能智能地展开,而真的就是傻傻地一个一个调用。模板编程的好处在于在编译期就能找到一些静态的问题,比如,如果你的网络结构有问题(使用模板编程这个问题在设计的时候就能解决掉),在编译的时候你就能察觉,而不需要很麻烦的跑去调试。
回到正题,下面是卷积神经网络的代码:
#ifndef __CNN_NETWORK_HPP__
#define __CNN_NETWORK_HPP__
#include "cnn_base.hpp"
#include "pool_layer.hpp"
#include "bp_network.hpp"
/*
卷积神经网络
卷积加权层 W
池化层 P
全连接层 O
I --> W1 --> P1 --> W2 --> P2 --> W3 --> P3 --> ... --> Wn --> Pn --> O
*/
/**
 * One convolution stage followed by one max-pooling stage.
 *
 * forward():  input -> conv_with_weight -> pool_layer_max -> output
 * backward(): the same chain traversed in reverse order.
 */
template<
    typename val_type
    , template<typename> class update_method_templ
    , template<typename> class activate_func
    , typename tpl_init_method
    , int input_row, int input_col              // input rows / columns
    , int tpl_num, int tpl_row, int tpl_col     // kernel count, kernel rows / columns
    , int row_step, int col_step                // stride in rows / columns
    , int pool_row, int pool_col                // pooling window rows / columns
>
struct conv_pool_layer
{
    using conv_type = conv_with_weight<val_type, tpl_num, input_row, input_col, tpl_row, tpl_col, row_step, col_step, update_method_templ, activate_func, tpl_init_method>;
    using pool_type = pool_layer_max<conv_type::ret_type::row_num, conv_type::ret_type::col_num, pool_row, pool_col, val_type>;
    using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;
    using ret_type = typename pool_type::ret_type;

    conv_type mt_conv;   // convolution stage
    pool_type mt_pool;   // pooling stage

    /** Convolves mt, then pools the result. */
    ret_type forward(const input_type& mt)
    {
        return mt_pool.forward(mt_conv.forward(mt));
    }

    /** Pushes the error back through the pooling stage, then the convolution stage. */
    input_type backward(const ret_type& mt)
    {
        return mt_conv.backward(mt_pool.backward(mt));
    }

    /** Only the convolution stage receives update_inert(). */
    void update_inert()
    {
        mt_conv.update_inert();
    }

    /** Prints the convolution stage followed by the pooling stage. */
    void print()
    {
        mt_conv.print();
        mt_pool.print();
    }

    /** Writes the compile-time configuration of this stage to std::cout. */
    static void print_type()
    {
        // One "<label><value>" line per parameter.
        const auto show = [](const char* label, int value)
        {
            std::cout << label << value << std::endl;
        };

        std::cout << "---------- conv-pool layer" << std::endl;
        show("tpl_num: ", tpl_num);
        show("input_row: ", input_row);
        show("input_col: ", input_col);
        show("tpl_row: ", tpl_row);
        show("tpl_col: ", tpl_col);
        show("row_step: ", row_step);
        show("col_step: ", col_step);
        show("pool_row: ", pool_row);
        show("pool_col: ", pool_col);
    }
};
// 堆叠卷积池化层
/**
 * Compile-time chain of conv_pool_layer stages (recursive case).
 *
 * The first seven ints after input_row/input_col describe this stage; every
 * further group of seven ints in remain_layer describes one more stage whose
 * input size is this stage's output size. The chain ends in the partial
 * specialization that takes no remain_layer.
 */
template<
    typename val_type
    , template<typename> class update_method_templ
    , template<typename> class activate_func
    , typename tpl_init_method
    , int input_row, int input_col
    /* repeated per stage */
    , int tpl_num , int tpl_row, int tpl_col
    , int row_step, int col_step
    , int pool_row, int pool_col
    , int... remain_layer>
struct stack_conv_pool_layer
{
    // Report a malformed layer list directly instead of through a
    // "wrong number of template arguments" error deep in the recursion.
    static_assert(sizeof...(remain_layer) % 7 == 0,
        "remain_layer needs 7 ints per extra stage: tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col");

    using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;
    // The next stage consumes this stage's output dimensions.
    using next_node_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, conv_pool_type::ret_type::row_num, conv_pool_type::ret_type::col_num, remain_layer...>;
    using input_type = typename conv_pool_type::input_type;
    using ret_type = typename next_node_type::ret_type;   // output of the last stage in the chain

    conv_pool_type mt_conv_pool;   // this stage
    next_node_type next_node;      // all following stages

    /** Runs this stage, then the rest of the chain. */
    ret_type forward(const typename conv_pool_type::input_type& mt)
    {
        return next_node.forward(mt_conv_pool.forward(mt));
    }

    /** Back-propagates through the rest of the chain first, then through this stage. */
    typename conv_pool_type::input_type backward(const ret_type& delta)
    {
        return mt_conv_pool.backward(next_node.backward(delta));
    }

    /** Forwards update_inert() to this stage and to the rest of the chain. */
    void update_inert()
    {
        mt_conv_pool.update_inert();
        next_node.update_inert();
    }

    /** Prints this stage, then the rest of the chain. */
    void print()
    {
        mt_conv_pool.print();
        next_node.print();
    }

    /** Writes this stage's configuration to std::cout, then that of the following stages. */
    static void print_type()
    {
        // One "<label><value>" line per parameter.
        const auto show = [](const char* label, int value)
        {
            std::cout << label << value << std::endl;
        };

        std::cout << "---------- stack conv-pool layer" << std::endl;
        show("tpl_num: ", tpl_num);
        show("input_row: ", input_row);
        show("input_col: ", input_col);
        show("tpl_row: ", tpl_row);
        show("tpl_col: ", tpl_col);
        show("row_step: ", row_step);
        show("col_step: ", col_step);
        show("pool_row: ", pool_row);
        show("pool_col: ", pool_col);
        // Seven ints describe one stage, so this is the number of stages still to come.
        std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;
        next_node_type::print_type();
    }
};
// 堆叠卷积池化层
/**
 * Terminal case of stack_conv_pool_layer: exactly one conv_pool_layer stage
 * and no remain_layer. Every call is forwarded to that single stage.
 */
template<
    typename val_type
    , template<typename> class update_method_templ
    , template<typename> class activate_func
    , typename tpl_init_method
    , int input_row, int input_col
    , int tpl_num , int tpl_row, int tpl_col
    , int row_step, int col_step
    , int pool_row, int pool_col
>
struct stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>
{
    using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;
    using input_type = typename conv_pool_type::input_type;
    using ret_type = typename conv_pool_type::ret_type;

    conv_pool_type mt_conv_pool;   // the only stage of this chain

    /** Forward pass of the last stage. */
    ret_type forward(const typename conv_pool_type::input_type& mt)
    {
        return mt_conv_pool.forward(mt);
    }

    /** Backward pass of the last stage. */
    typename conv_pool_type::input_type backward(const ret_type& delta)
    {
        return mt_conv_pool.backward(delta);
    }

    /** Forwards update_inert() to the stage. */
    void update_inert()
    {
        mt_conv_pool.update_inert();
    }

    /** Prints the stage. */
    void print()
    {
        mt_conv_pool.print();
    }

    /** Writes the stage configuration to std::cout; no stages remain after this one. */
    static void print_type()
    {
        // One "<label><value>" line per parameter.
        const auto show = [](const char* label, int value)
        {
            std::cout << label << value << std::endl;
        };

        std::cout << "---------- stack conv-pool layer" << std::endl;
        show("tpl_num: ", tpl_num);
        show("input_row: ", input_row);
        show("input_col: ", input_col);
        show("tpl_row: ", tpl_row);
        show("tpl_col: ", tpl_col);
        show("row_step: ", row_step);
        show("col_step: ", col_step);
        show("pool_row: ", pool_row);
        show("pool_col: ", pool_col);
        std::cout << "remain_layer: 0" << std::endl;
    }
};
/**
 * Complete network: a stack of conv-pool stages followed by one fully
 * connected bp_network.
 *
 * forward() flattens the output of the last conv-pool stage with to_vector()
 * and feeds it to the fully connected part; backward() reverses that path,
 * reshaping the fully connected error with to_matrix().
 */
template<
    typename val_type
    /* fully connected part */
    , template<typename> class fullcon_activate_type   // activation of the fully connected layer
    , template<typename> class fullcon_update_method
    , typename fullcon_init_method
    , int fullcon_output_num
    /* conv-pool part */
    , template<typename> class activate_func
    , template<typename> class update_method_templ
    , typename tpl_init_method
    /* size of the network input */
    , int input_row, int input_col
    /* repeated per conv-pool stage */
    , int tpl_num , int tpl_row, int tpl_col
    , int row_step, int col_step
    , int pool_row, int pool_col
    , int... remain_layer>
struct cnn_network
{
    // Note the argument order: stack_conv_pool_layer takes the update policy before the activation.
    using conv_pool_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col, remain_layer...>;
    // The fully connected input width is the element count of the last conv-pool output.
    using fullcon_type = bp_network<1, fullcon_activate_type, val_type, fullcon_update_method, fullcon_init_method, conv_pool_type::ret_type::size, fullcon_output_num>;
    using input_type = typename conv_pool_type::input_type;
    using output_type = typename fullcon_type::output_type;
    // `typename` is only valid in front of a qualified name, so a plain alias is used here.
    using ret_type = output_type;

    conv_pool_type mt_conv_pool;   // convolution / pooling stages
    fullcon_type mt_fullcon;       // fully connected part

    /** Runs the conv-pool stack, flattens its output and runs the fully connected part. */
    output_type forward(const input_type& mt)
    {
        return mt_fullcon.forward(mt_conv_pool.forward(mt).to_vector());
    }

    /**
     * Back-propagates delta through the fully connected part, reshapes the
     * result to the conv-pool output shape and continues through the stack.
     *
     * @return error with respect to the network input
     */
    input_type backward(const output_type& delta)
    {
        typename conv_pool_type::ret_type mt_conv_pool_delta;
        mt_fullcon.backward(delta).to_matrix(mt_conv_pool_delta);
        return mt_conv_pool.backward(mt_conv_pool_delta);
    }

    /** Forwards update_inert() to both parts of the network. */
    void update_inert()
    {
        mt_conv_pool.update_inert();
        mt_fullcon.update_inert();
    }

    /** Prints the conv-pool stack followed by the fully connected part. */
    void print()
    {
        mt_conv_pool.print();
        mt_fullcon.print();
    }

    /** Writes the network configuration to std::cout, then that of both parts. */
    static void print_type()
    {
        // One "<label><value>" line per parameter.
        const auto show = [](const char* label, int value)
        {
            std::cout << label << value << std::endl;
        };

        std::cout << "---------- cnn network" << std::endl;
        show("fullcon_output_num: ", fullcon_output_num);
        show("tpl_num: ", tpl_num);
        show("input_row: ", input_row);
        show("input_col: ", input_col);
        show("tpl_row: ", tpl_row);
        show("tpl_col: ", tpl_col);
        show("row_step: ", row_step);
        show("col_step: ", col_step);
        show("pool_row: ", pool_row);
        show("pool_col: ", pool_col);
        // Seven ints describe one conv-pool stage.
        std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;
        conv_pool_type::print_type();
        fullcon_type::print_type();
    }
};
#endif
接下来是测试的代码,用的是MNIST手写数字数据集。
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>
#include "ht_memory.h"
#include "matrix.hpp"
#include "cnn_network.hpp"
/**
 * One labelled sample: a 28x28 image, its one-hot 10x1 label and the label
 * as a plain integer.
 *
 * NOTE(review): every special member only copies/moves the three fields, so
 * the class could probably rely on the compiler-generated ones (Rule of
 * Zero) — verify matrix_host's copy/move constructors before doing so.
 */
struct train_data
{
    matrix_host<28, 28, double> mt_image;   // pixel data
    matrix_host<10, 1, double> mt_label;    // one-hot label
    int i_num;                              // label as a number

    /** Empty sample with label 0. */
    train_data() : mt_image(), mt_label(), i_num(0)
    {
    }

    /** Copy: default-constructs both matrices, then copy-assigns them. */
    train_data(const train_data &rhs) : i_num(rhs.i_num)
    {
        mt_image = rhs.mt_image;
        mt_label = rhs.mt_label;
    }

    /** Copy assignment, field by field. */
    train_data &operator=(const train_data &rhs)
    {
        mt_image = rhs.mt_image;
        mt_label = rhs.mt_label;
        i_num = rhs.i_num;
        return *this;
    }

    /** Move: default-constructs both matrices, then move-assigns them. */
    train_data(train_data &&rhs) : i_num(rhs.i_num)
    {
        mt_image = std::move(rhs.mt_image);
        mt_label = std::move(rhs.mt_label);
    }

    /** Move assignment, field by field. */
    train_data &operator=(train_data &&rhs)
    {
        mt_image = std::move(rhs.mt_image);
        mt_label = std::move(rhs.mt_label);
        i_num = rhs.i_num;
        return *this;
    }
};
int main()
{
unsigned char sz_image_buf[28 * 28];
std::vector<train_data> vec_train_data;
ht_memory mry_train_images(ht_memory::big_endian);
mry_train_images.read_file("./data/train-images.idx3-ubyte");
int32_t i_image_magic_num = 0, i_image_num = 0, i_image_col_num = 0, i_image_row_num = 0;
mry_train_images >> i_image_magic_num >> i_image_num >> i_image_row_num >> i_image_col_num;
printf("magic num:%d | image num:%d | image_row:%d | image_col:%d\r\n", i_image_magic_num, i_image_num, i_image_row_num, i_image_col_num);
ht_memory mry_train_labels(ht_memory::big_endian);
mry_train_labels.read_file("./data/train-labels.idx1-ubyte");
int32_t i_label_magic_num = 0, i_label_num = 0;
mry_train_labels >> i_label_magic_num >> i_label_num;
printf("magic num:%d | label num:%d\r\n", i_label_magic_num, i_label_num);
for (int i = 0; i < i_image_num; ++i)
{
memset(sz_image_buf, 0, sizeof(sz_image_buf));
train_data td;
unsigned char uc_label = 0;
mry_train_images.read((char *)sz_image_buf, sizeof(sz_image_buf));
td.mt_image.set_data(sz_image_buf);
mry_train_labels >> uc_label;
td.i_num = uc_label;
td.mt_label.get((const int)uc_label, 0) = 1;
vec_train_data.push_back(td);
}
// 训练参数数量输入
std::string str_train_times;
std::cout << "train times:";
std::getline(std::cin, str_train_times);
int i_train_times = std::stol(str_train_times);
std::cout << "train data set size:";
std::string str_train_data_set_size;
std::getline(std::cin, str_train_data_set_size);
int i_train_data_set_size = std::stol(str_train_data_set_size);
std::cout << "how many times should we update inert? ";
std::string str_repeat_times;
std::getline(std::cin, str_repeat_times);
int i_repeat_times = std::stol(str_repeat_times);
std::cout << "how many times should we show the result? ";
std::string str_show_times;
std::getline(std::cin, str_show_times);
int i_show_times = std::stol(str_show_times);
std::cout << "when correct rate reach threshold to stop?:";
std::string str_repeat_threshold;
std::getline(std::cin, str_repeat_threshold);
double dthreshold = std::stod(str_repeat_threshold);
// 打乱训练数据
std::random_device rd;
std::mt19937 rng(rd());
std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);
// 取出训练数据集
std::vector<train_data> vec_train_data_set;
for (int i = 0; i < i_train_data_set_size; ++i)
{
vec_train_data_set.push_back(vec_train_data[i]);
}
using cnn_type = cnn_network<
double
, softmax, nadam, xavier_gaussian_type, 10
, ReLu, nadam, he_gaussian_type
, 28, 28
, 16, 5, 5, 1, 1, 2, 2
, 32, 5, 5, 1, 1, 2, 2
>;
cnn_type cnn;
printf("****** cnn network ******\r\n");
cnn.print_type();
printf("###### cnn network ######\r\n");
mat<28, 28, double>** sz_images = new mat<28,28,double>*[i_train_data_set_size];
mat<10, 1, double>** sz_labels = new mat<10, 1, double>*[i_train_data_set_size];
memset(sz_images, 0, sizeof(mat<28, 28, double>*) * i_train_data_set_size);
memset(sz_labels, 0, sizeof(mat<10, 1, double>*) * i_train_data_set_size);
int i = 0;
int i_correct = 0;
for (; i < i_train_times; ++i)
{
i_correct = 0;
for (int j = 0; j < i_train_data_set_size; ++j)
{
auto &td = vec_train_data_set[j];
if (sz_images[j] == nullptr)
{
mat<28, 28, double> mt_image(td.mt_image);
sz_images[j] = new mat<28, 28, double>(mt_image/256.0);
}
auto output = cnn.forward(*sz_images[j]);
if (sz_labels[j] == nullptr)
{
sz_labels[j] = new mat<10, 1, double>(td.mt_label);
}
auto delta = (output - *sz_labels[j]);
cnn.backward(delta);
int r = 0, c = 0;
output.max_idx(r, c);
if (td.i_num == r)
{
++i_correct;
}
}
if ((double)i_correct / i_train_data_set_size > dthreshold)
{
break;
}
if (i % i_repeat_times == (i_repeat_times - 1))
{
cnn.update_inert();
}
if (i % i_show_times == (i_show_times - 1))
{
std::cout << "train times:" << i << " correct rate:" << (double)i_correct / i_train_data_set_size << std::endl;
}
}
printf("train times:%d correct rate:%f\r\n", i, (double)i_correct / i_train_data_set_size);
// 使用训练数据测试
printf("---------- test with train data set ----------\r\n");
i_correct = 0;
for (int j = 0; j < i_train_data_set_size; ++j)
{
auto &td = vec_train_data_set[j];
auto output = cnn.forward(*sz_images[j]);
int r = 0, c = 0;
output.max_idx(r, c);
if (td.i_num == r)
{
++i_correct;
}
if (j < 10)
{
std::cout << "label:" << td.i_num << " output:" << r << std::endl;
}
}
// 随机找10个数据测试
printf("---------- test with random data ----------\r\n");
std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);
for (int i = 0; i < 10; ++i)
{
auto &td = vec_train_data[i];
mat<28, 28, double> mt_image(td.mt_image);
auto output = cnn.forward(mt_image/256.0);
int r = 0, c = 0;
output.max_idx(r, c);
std::cout << "label:" << td.i_num << " output:" << r << std::endl;
//output.print();
}
cudaDeviceReset();
return 0;
}
这个程序用的是2个卷积池化层加上1个全连接层,使用nadam进行训练加速。卷积池化层使用ReLu作为激活函数,全连接层使用的是softmax激活函数。全连接层使用xavier高斯初始化方法,卷积池化层使用的是he高斯方法进行权值初始化。
下面看看试验结果:
magic num:2049 | label num:60000
train times:200
train data set size:500
how many times should we update inert? 20
how many times should we show the result? 10
when correct rate reach threshold to stop?:0.99
****** cnn network ******
---------- cnn network
fullcon_output_num: 10
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 32
input_row: 192
input_col: 12
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 0
---------- bp_network ----------
batch_num:1, input_num:12032, output_num:10
###### cnn network ######
train times:9 correct rate:0.986
train times:10 correct rate:0.996000
---------- test with train data set ----------
label:7 output:7
label:3 output:3
label:1 output:1
label:1 output:1
label:2 output:2
label:9 output:9
label:7 output:7
label:4 output:4
label:3 output:3
label:7 output:7
---------- test with random data ----------
label:8 output:8
label:0 output:0
label:1 output:1
label:5 output:5
label:0 output:0
label:0 output:0
label:5 output:5
label:8 output:8
label:0 output:0
label:9 output:9
CUDA error in J:\03_workspace\00_cuda\02_matrix\matrix.hpp at line 279: invalid argument
以上我们输入最大训练200次,训练集大小为500,每20轮训练更新一下nadam的惯性量,每10轮训练打印一下结果,起码达到99%的正确率才能退出。最终我们看到训练了10轮这500个数据的数据集正确率就达到了99.6%。然后分别使用训练集中的数据和随机抽取的数据进行验证,可以看到,随机抽取的10个数据正确率也是100%!
手搓CNN完成。