英伟达提出了CUDA框架,用以实现GPU编程。CUDA C以C语言为基础,目前的CUDA编译器已经能够支持C++17的语法。但是CUDA C的基础语法还是只能使用C。
最近结合使用C++模板编程和CUDA C,手搓了一个CNN。其中矩阵的点乘依据之前博客提出的原理实现,激活函数和更新方法延续了之前模板元编程的思路,并且复用了一部分原来的代码。CUDA的矩阵加和、softmax等都是在AI生成代码的基础上修改而来(笔者使用的是GitHub Copilot,这个东西写出来的代码不能全信,因为它考虑得太过片面)。
笔者使用的显卡是Nvidia Tesla P4。这是一块玩具卡,市面上的价格在250块以下,相当实惠。
#ifndef __CNN_BASE_HPP__
#define __CNN_BASE_HPP__
#include "mat.hpp"
#include "bp_network.hpp"
/**
 * Convolution layer.
 *
 * forward() pads the input, convolves it with the single kernel mt_tpl via
 * inner_conv, adds the bias matrix mt_bias and applies act_func.
 * backward() propagates a delta back to the input and updates kernel and bias.
 *
 * Template parameters:
 *   input_row, input_col - dimensions of input_type
 *   tpl_row, tpl_col     - dimensions of the kernel ("template") tpl_type
 *   row_step, col_step   - step sizes handed to inner_conv / get_pad_size
 *   update_method_templ  - update policy, instantiated once for the kernel and once for the bias
 *   activate_func        - activation object exposing forward()/backward()
 *   tpl_init_method      - initializer used by reset<>() on kernel and bias
 *   val_t                - element type (defaults to double)
 *
 * NOTE(review): this listing appears to have lost its brace-only lines and
 * some statements (e.g. the constructor header in front of the reset<> calls
 * and the bodies of update_inert()/print()) — confirm against the real header.
 */
template<int input_row, int input_col, int tpl_row, int tpl_col, int row_step, int col_step, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, typename val_t = double>
struct conv_layer
using tpl_type = mat<tpl_row, tpl_col, val_t>;
using input_type = mat<input_row, input_col, val_t>;
// Input enlarged by the padding that get_pad_size reports for each axis.
using pad_type = mat<input_row + get_pad_size(input_row, tpl_row, row_step)
, input_col + get_pad_size(input_col, tpl_col, col_step), val_t>;
using pad_size = pad_size_t<input_row, input_col, tpl_row, tpl_col, row_step, col_step>;
// Result type is deduced from what inner_conv returns for a padded input and a kernel.
using ret_type = decltype(inner_conv<row_step, col_step>(input_type().template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>(), tpl_type()));
tpl_type mt_tpl;
update_method_templ<mat<tpl_row, tpl_col, val_t>> um_tpl;
// Padded input cached by forward() and reused by backward().
pad_type mt_input;
ret_type mt_bias;
update_method_templ<ret_type> um_bias;
activate_func<ret_type> act_func;
// Initialise kernel and bias with tpl_init_method.
mt_tpl.template reset<tpl_init_method>();
mt_bias.template reset<tpl_init_method>();
// Forward pass: pad -> convolve -> add bias -> activate.
inline ret_type forward(const input_type& mt)
mt_input = mt.template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>();
ret_type mt1 = inner_conv<row_step, col_step>(mt_input, mt_tpl);
return act_func.forward( mt1 + mt_bias);
// Backward pass: returns the delta with respect to the (unpadded) input and
// updates mt_tpl / mt_bias as a side effect.
inline input_type backward(const ret_type& mt_delta)
auto mt_delta_deact = act_func.backward(mt_delta);
auto mt_delta_span = mt_delta_deact.template span<row_step - 1, col_step - 1>(); // a stride was used, so some positions were never computed; their contribution during back-propagation is 0
using ret_pad_type = decltype(mt_delta_span);
/* Compute the back-propagated error */
/* Compute how much padding the returned matrix needs */
// NOTE(review): target size is presumably chosen so that a stride-1 convolution
// with the kernel yields a pad_type-sized result — confirm against inner_conv.
constexpr int target_r = tpl_row + pad_type::row_num - 1;
constexpr int target_c = tpl_col + pad_type::col_num - 1;
// Split the required padding between the two sides; the odd remainder goes to right/bottom.
constexpr int pad_top = (target_r - ret_pad_type::row_num) / 2;
constexpr int pad_left = (target_c - ret_pad_type::col_num) / 2;
constexpr int pad_right = (target_c - ret_pad_type::col_num) - pad_left;
constexpr int pad_bottom = (target_r - ret_pad_type::row_num) - pad_top;
auto mt_delta_span_pad = mt_delta_span.template pad<pad_top, pad_left, pad_right, pad_bottom>();
auto mt_tpl_rot = mt_tpl.rot180();
auto mt_ret_pad = inner_conv<1, 1, target_r, target_c, tpl_row, tpl_col, val_t>(mt_delta_span_pad, mt_tpl_rot);
input_type mt_ret;
mt_ret.template assign<-1 * pad_size::top, -1 * pad_size::left>(mt_ret_pad); // crop the outer border
/* Compute the kernel update */
auto mt_update = inner_conv<1, 1, pad_type::row_num, pad_type::col_num, ret_pad_type::row_num, ret_pad_type::col_num, val_t>(mt_input, mt_delta_span);
// Normalise the update by its largest magnitude (skipped when it is all zero).
if (mt_update.max_abs() != 0)
mt_update = mt_update / mt_update.max_abs();
mt_tpl = um_tpl.update(mt_tpl, mt_update);
// NOTE(review): the kernel update passes (current value, gradient) while the bias
// update passes the delta twice and subtracts the result — confirm this is intended.
mt_bias = mt_bias - um_bias.update(mt_delta_deact, mt_delta_deact);
/* Shift the kernel to zero mean and scale it so its largest magnitude is 1 */
// NOTE(review): d_mean is declared double even when val_t is a different type.
double d_mean = mt_tpl.sum() / (tpl_row * tpl_col);
mt_tpl = mt_tpl - d_mean;
if (mt_tpl.max_abs() != 0)
mt_tpl = mt_tpl / mt_tpl.max_abs();
return mt_ret;
void update_inert()
void print()
// Prints the six integer template arguments of this layer.
static void print_type()
printf("conv_layer<%d, %d, %d, %d, %d, %d> ", input_row, input_col, tpl_row, tpl_col, row_step, col_step);
C = |C1|
A = {CK1,CK2,CK3}
I = |CK1|
O = activate(W * I + B)
I=|CK1,CK2,CK3|=|C1K1, C1K2, C1K3|
|C2K1, C2K2, C2K3|
|C3K1, C3K2, C3K3|
O = |sum(Wri * CiK1 + B), sum(Wri * CiK2 + B), sum(Wri * CiK3 + B)|
---> Kernel
= |sum(W1i * C1K1 + B), sum(W1i * C1K2 + B), sum(W1i * C1K3 + B)| |
|sum(W2i * CiK1 + B), sum(W2i * CiK2 + B), sum(W2i * CiK3 + B)| | Weight
|sum(W3i * CiK1 + B), sum(W3i * CiK2 + B), sum(W3i * CiK3 + B)| V
/**
 * A bank of tpl_num independent conv_layer instances that all receive the
 * same input.
 *
 * forward() runs every kernel and stacks the results along the row axis into
 * one matrix of conv_ret_type::row_num * tpl_num rows. backward() splits the
 * incoming delta the same way, sums the input deltas returned by each kernel
 * and divides the sum by tpl_num.
 *
 * NOTE(review): the `template<` opener, the closing `>` lines and all
 * brace-only lines are missing from this listing, and the body of the final
 * update_inert() loop is not visible — confirm against the real header.
 */
typename val_type
, int tpl_num
, int input_row, int input_col
, int tpl_row, int tpl_col
, int row_step, int col_step
, template<typename> class update_method_templ
, template<typename> class activate_func
, typename tpl_init_method
struct conv_with_weight
using conv_type = conv_layer<input_row, input_col
, tpl_row, tpl_col
, row_step, col_step
, update_method_templ
, activate_func
, tpl_init_method
, val_type
using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;
using conv_ret_type = typename conv_type::ret_type;
//using weight_type = typename bp_network<conv_ret_type::col_num, ReLu, val_type, nadam, he_mean_type, conv_ret_type::row_num * tpl_num, conv_ret_type::row_num>;
//using ret_type = typename weight_type::ret_type;
//weight_type weight_layer;
// All kernel outputs stacked on top of each other.
using ret_type = mat<conv_ret_type::row_num*tpl_num, conv_ret_type::col_num, val_type>;
conv_type tpls[tpl_num];
static void print_type()
// Compile-time recursion: copy mt_each[N] into mt at row offset N * row_num,
// then continue with N + 1 until the last kernel has been written.
template<int N>
void join(ret_type& mt, const conv_ret_type* mt_each)
mt.template assign<N*conv_ret_type::row_num, 0>(mt_each[N]);
if constexpr(N < tpl_num - 1)
join<N + 1>(mt, mt_each);
// Split the stacked matrix back into one matrix per kernel
// (inverse of join; uses a negative row offset into mt).
template<int N>
void split(const ret_type& mt, conv_ret_type* mt_each)
mt_each[N].template assign<-1 * N*conv_ret_type::row_num, 0>(mt);
if constexpr(N < tpl_num - 1)
split<N + 1>(mt, mt_each);
// Runs every kernel on mt and returns the stacked outputs.
ret_type forward(const input_type& mt)
typename conv_type::ret_type ret[tpl_num];
for (int j = 0; j < tpl_num; ++j)
ret[j] = tpls[j].forward(mt);
// Output matrix that stores the results of the convolutions
//typename weight_type::input_type mt_ret;
ret_type mt_ret;
join<0>(mt_ret, ret);
return mt_ret;
//return weight_layer.forward(mt_ret);
// Back-propagates delta through every kernel and returns the averaged input delta.
// NOTE(review): `ret` is accumulated into without an explicit reset; this relies
// on mat's default constructor producing zeros — confirm.
input_type backward(const ret_type& delta)
input_type ret;
typename conv_type::ret_type ret_delta[tpl_num];
split<0>(delta, ret_delta);
for (int j = 0; j < tpl_num; ++j)
ret = ret + tpls[j].backward(ret_delta[j]);
ret = ret / (val_type)tpl_num;
return ret;
void update_inert()
for (int i = 0; i < tpl_num; ++i)
要说为什么一定要用模板编程,其实并不是因为模板编程更快。相反,有的时候模板编程的速度要比动态实现慢,因为现在的编译器(至少gcc 4.8.5)不能很好地进行优化,比如对于模板函数的递归调用,不能智能地展开,而是真的傻傻地一个一个调用。模板编程的好处在于在编译期就能发现一些静态的问题,比如,如果你的网络结构有问题(使用模板编程,这个问题在设计的时候就能解决掉),在编译的时候你就能察觉,而不需要很麻烦地跑去调试。
#ifndef __CNN_NETWORK_HPP__
#define __CNN_NETWORK_HPP__
#include "cnn_base.hpp"
#include "pool_layer.hpp"
#include "bp_network.hpp"
卷积加权层 W
池化层 P
全连接层 O
I --> W1 --> P1 --> W2 --> P2 --> W3 --> P3 --> ... --> Wn --> Pn --> O
/**
 * One convolution stage followed by one max-pooling stage.
 *
 * forward() feeds the input through mt_conv and then mt_pool; backward()
 * applies the two backward() calls in the opposite order.
 *
 * NOTE(review): the `template<` opener, the closing `>` line, all brace-only
 * lines and the bodies of update_inert()/print() are missing from this
 * listing — confirm against the real header.
 */
typename val_type
, template<typename> class update_method_templ
, template<typename> class activate_func
, typename tpl_init_method
, int input_row, int input_col // input matrix rows, columns
, int tpl_num, int tpl_row, int tpl_col // kernel (template) count, rows, columns
, int row_step, int col_step // row step, column step
, int pool_row, int pool_col // pooling rows, columns
struct conv_pool_layer
using conv_type = conv_with_weight<val_type, tpl_num, input_row, input_col, tpl_row, tpl_col, row_step, col_step, update_method_templ, activate_func, tpl_init_method>;
// The pooling layer is sized from the convolution stage's output dimensions.
using pool_type = pool_layer_max<conv_type::ret_type::row_num, conv_type::ret_type::col_num, pool_row, pool_col, val_type>;
using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;
using ret_type = typename pool_type::ret_type;
conv_type mt_conv;
pool_type mt_pool;
// Convolution first, pooling second.
ret_type forward(const input_type& mt)
return mt_pool.forward(mt_conv.forward(mt));
// Pooling backward first, convolution backward second.
input_type backward(const ret_type& mt)
return mt_conv.backward(mt_pool.backward(mt));
void update_inert()
void print()
// Dumps the integer template arguments of this stage to std::cout.
static void print_type()
std::cout << "---------- conv-pool layer" << std::endl;
std::cout << "tpl_num: " << tpl_num << std::endl;
std::cout << "input_row: " << input_row << std::endl;
std::cout << "input_col: " << input_col << std::endl;
std::cout << "tpl_row: " << tpl_row << std::endl;
std::cout << "tpl_col: " << tpl_col << std::endl;
std::cout << "row_step: " << row_step << std::endl;
std::cout << "col_step: " << col_step << std::endl;
std::cout << "pool_row: " << pool_row << std::endl;
std::cout << "pool_col: " << pool_col << std::endl;
/**
 * Stacked conv-pool layers (recursive case).
 *
 * Holds one conv_pool_layer built from the first seven integers after the
 * input size and a next_node built from remain_layer..., whose input size is
 * this stage's output size. forward() chains head -> tail; backward() chains
 * tail -> head.
 *
 * NOTE(review): the `template<` opener, all brace-only lines and the bodies
 * of update_inert()/print() are missing from this listing — confirm against
 * the real header.
 */
typename val_type
, template<typename> class update_method_templ
, template<typename> class activate_func
, typename tpl_init_method
, int input_row, int input_col
/* repeating part */
, int tpl_num , int tpl_row, int tpl_col
, int row_step, int col_step
, int pool_row, int pool_col
, int... remain_layer>
struct stack_conv_pool_layer
using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;
// The remaining stages take this stage's output dimensions as their input dimensions.
using next_node_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, conv_pool_type::ret_type::row_num, conv_pool_type::ret_type::col_num, remain_layer...>;
using input_type = typename conv_pool_type::input_type;
// The overall result type is that of the last stage in the chain.
using ret_type = typename next_node_type::ret_type;
conv_pool_type mt_conv_pool;
next_node_type next_node;
ret_type forward(const typename conv_pool_type::input_type& mt)
return next_node.forward(mt_conv_pool.forward(mt));
typename conv_pool_type::input_type backward(const ret_type& delta)
return mt_conv_pool.backward(next_node.backward(delta));
void update_inert()
void print()
// Dumps this stage's integer template arguments and the number of remaining stages.
static void print_type()
std::cout << "---------- stack conv-pool layer" << std::endl;
std::cout << "tpl_num: " << tpl_num << std::endl;
std::cout << "input_row: " << input_row << std::endl;
std::cout << "input_col: " << input_col << std::endl;
std::cout << "tpl_row: " << tpl_row << std::endl;
std::cout << "tpl_col: " << tpl_col << std::endl;
std::cout << "row_step: " << row_step << std::endl;
std::cout << "col_step: " << col_step << std::endl;
std::cout << "pool_row: " << pool_row << std::endl;
std::cout << "pool_col: " << pool_col << std::endl;
// Each stage consumes seven integers of the pack, hence the division by 7.
std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;
/**
 * Stacked conv-pool layers (terminal case).
 *
 * Partial specialization for an empty remain_layer pack: it wraps a single
 * conv_pool_layer and forwards forward()/backward() straight to it.
 *
 * NOTE(review): the `template<` opener, the closing `>` line, all brace-only
 * lines and the bodies of update_inert()/print() are missing from this
 * listing — confirm against the real header.
 */
typename val_type
, template<typename> class update_method_templ
, template<typename> class activate_func
, typename tpl_init_method
, int input_row, int input_col
, int tpl_num , int tpl_row, int tpl_col
, int row_step, int col_step
, int pool_row, int pool_col
struct stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>
using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;
using input_type = typename conv_pool_type::input_type;
using ret_type = typename conv_pool_type::ret_type;
conv_pool_type mt_conv_pool;
ret_type forward(const typename conv_pool_type::input_type& mt)
return mt_conv_pool.forward(mt);
typename conv_pool_type::input_type backward(const ret_type& delta)
return mt_conv_pool.backward(delta);
void update_inert()
void print()
// Dumps this stage's integer template arguments; no stages remain after it.
static void print_type()
std::cout << "---------- stack conv-pool layer" << std::endl;
std::cout << "tpl_num: " << tpl_num << std::endl;
std::cout << "input_row: " << input_row << std::endl;
std::cout << "input_col: " << input_col << std::endl;
std::cout << "tpl_row: " << tpl_row << std::endl;
std::cout << "tpl_col: " << tpl_col << std::endl;
std::cout << "row_step: " << row_step << std::endl;
std::cout << "col_step: " << col_step << std::endl;
std::cout << "pool_row: " << pool_row << std::endl;
std::cout << "pool_col: " << pool_col << std::endl;
std::cout << "remain_layer: 0" << std::endl;
/**
 * Complete CNN: a stack of conv-pool stages followed by one fully connected
 * bp_network.
 *
 * forward() runs the conv-pool stack, flattens its result with to_vector()
 * and feeds it to the fully connected network.
 *
 * NOTE(review): the `template<` opener, all brace-only lines and the bodies
 * of update_inert()/print() are missing from this listing — confirm against
 * the real header.
 */
typename val_type
/* fully connected layer parameters */
, template<typename> class fullcon_activate_type // activation function of the fully connected layer
, template<typename> class fullcon_update_method
, typename fullcon_init_method
, int fullcon_output_num
/* conv-pool layer parameters */
, template<typename> class activate_func
, template<typename> class update_method_templ
, typename tpl_init_method
/* network structure (the original comment said "fully connected layer"; the parameters below are the ones handed to the conv-pool stack) */
, int input_row, int input_col
/* repeating part */
, int tpl_num , int tpl_row, int tpl_col
, int row_step, int col_step
, int pool_row, int pool_col
, int... remain_layer>
struct cnn_network
using conv_pool_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col, remain_layer...>;
// The fully connected network takes the flattened conv-pool output (ret_type::size values).
using fullcon_type = bp_network<1, fullcon_activate_type, val_type, fullcon_update_method, fullcon_init_method, conv_pool_type::ret_type::size, fullcon_output_num>;
using input_type = typename conv_pool_type::input_type;
using output_type = typename fullcon_type::output_type;
// `typename` may only prefix a qualified name; output_type is an unqualified
// alias declared just above, so the keyword must not be used here.
using ret_type = output_type;
conv_pool_type mt_conv_pool;
fullcon_type mt_fullcon;
// Conv-pool stack -> flatten -> fully connected network.
output_type forward(const input_type& mt)
return mt_fullcon.forward(mt_conv_pool.forward(mt).to_vector());
// NOTE(review): in the visible lines `delta` is never used and
// mt_conv_pool_delta is passed on as default-constructed; the statements that
// back-propagate through mt_fullcon and fill mt_conv_pool_delta appear to be
// missing from this listing — confirm against the real header.
input_type backward(const output_type& delta)
typename conv_pool_type::ret_type mt_conv_pool_delta;
return mt_conv_pool.backward(mt_conv_pool_delta);
void update_inert()
void print()
// Dumps the network's integer template arguments and the number of further stages.
static void print_type()
std::cout << "---------- cnn network" << std::endl;
std::cout << "fullcon_output_num: " << fullcon_output_num << std::endl;
std::cout << "tpl_num: " << tpl_num << std::endl;
std::cout << "input_row: " << input_row << std::endl;
std::cout << "input_col: " << input_col << std::endl;
std::cout << "tpl_row: " << tpl_row << std::endl;
std::cout << "tpl_col: " << tpl_col << std::endl;
std::cout << "row_step: " << row_step << std::endl;
std::cout << "col_step: " << col_step << std::endl;
std::cout << "pool_row: " << pool_row << std::endl;
std::cout << "pool_col: " << pool_col << std::endl;
// Each stage consumes seven integers of the pack, hence the division by 7.
std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;
#include <vector>
#include <iostream>
#include <string>
#include "ht_memory.h"
#include "matrix.hpp"
#include "cnn_network.hpp"
/**
 * One training sample: a 28x28 image matrix, a 10x1 label matrix and the
 * label as a plain integer (i_num).
 *
 * NOTE(review): the four hand-written copy/move members only copy or move the
 * three fields one by one, which is what compiler-generated members would do;
 * if matrix_host is itself copyable and movable they could be defaulted
 * (Rule of Zero). The move members are also not marked noexcept, which makes
 * std::vector copy instead of move on reallocation — confirm matrix_host's
 * guarantees before changing this.
 * NOTE(review): brace-only lines are missing from this listing.
 */
struct train_data
matrix_host<28, 28, double> mt_image;
matrix_host<10, 1, double> mt_label;
int i_num;
// Default constructor: default-constructs both matrices and sets the label to 0.
train_data():mt_image(), mt_label()
i_num = 0;
// Copy constructor: members are default-constructed first, then copy-assigned.
train_data(const train_data &td)
mt_image = td.mt_image;
mt_label = td.mt_label;
i_num = td.i_num;
// Copy assignment.
train_data &operator=(const train_data &td)
mt_image = td.mt_image;
mt_label = td.mt_label;
i_num = td.i_num;
return *this;
// Move constructor: members are default-constructed first, then move-assigned.
train_data(train_data &&td)
mt_image = std::move(td.mt_image);
mt_label = std::move(td.mt_label);
i_num = td.i_num;
// Move assignment.
train_data &operator=(train_data &&td)
mt_image = std::move(td.mt_image);
mt_label = std::move(td.mt_label);
i_num = td.i_num;
return *this;
/**
 * Training driver: reads image and label streams, asks for the training
 * parameters on stdin, trains a two-stage cnn_network on a subset of the data
 * and prints predictions for the training subset and for 10 shuffled samples.
 *
 * NOTE(review): this listing has lost its brace-only lines and a number of
 * statements (file loading, several loop and if bodies, the template
 * argument list terminators); the line declaring uc_label is garbled.
 * Confirm every step against the real source file.
 * NOTE(review): std::shuffle, std::mt19937/std::random_device and memset need
 * <algorithm>, <random> and <cstring>; none of them is in the visible include
 * list, so they presumably arrive through the project headers — confirm.
 */
int main()
unsigned char sz_image_buf[28 * 28];
std::vector<train_data> vec_train_data;
// Image stream header: magic number, image count, row count, column count.
ht_memory mry_train_images(ht_memory::big_endian);
int32_t i_image_magic_num = 0, i_image_num = 0, i_image_col_num = 0, i_image_row_num = 0;
mry_train_images >> i_image_magic_num >> i_image_num >> i_image_row_num >> i_image_col_num;
printf("magic num:%d | image num:%d | image_row:%d | image_col:%d\r\n", i_image_magic_num, i_image_num, i_image_row_num, i_image_col_num);
// Label stream header: magic number, label count.
ht_memory mry_train_labels(ht_memory::big_endian);
int32_t i_label_magic_num = 0, i_label_num = 0;
mry_train_labels >> i_label_magic_num >> i_label_num;
printf("magic num:%d | label num:%d\r\n", i_label_magic_num, i_label_num);
// Build one train_data per image.
for (int i = 0; i < i_image_num; ++i)
memset(sz_image_buf, 0, sizeof(sz_image_buf));
train_data td;
// NOTE(review): the next line looks like two statements fused together by the
// extraction (a read into sz_image_buf and the uc_label declaration).
unsigned char uc_label = 0; *)sz_image_buf, sizeof(sz_image_buf));
mry_train_labels >> uc_label;
td.i_num = uc_label;
// One-hot label: set the row that matches the digit to 1.
td.mt_label.get((const int)uc_label, 0) = 1;
// Read the training parameters from stdin
// NOTE(review): std::stol returns long and is narrowed to int here; it also
// throws on non-numeric input, which is not handled in the visible lines.
std::string str_train_times;
std::cout << "train times:";
std::getline(std::cin, str_train_times);
int i_train_times = std::stol(str_train_times);
std::cout << "train data set size:";
std::string str_train_data_set_size;
std::getline(std::cin, str_train_data_set_size);
int i_train_data_set_size = std::stol(str_train_data_set_size);
std::cout << "how many times should we update inert? ";
std::string str_repeat_times;
std::getline(std::cin, str_repeat_times);
int i_repeat_times = std::stol(str_repeat_times);
std::cout << "how many times should we show the result? ";
std::string str_show_times;
std::getline(std::cin, str_show_times);
int i_show_times = std::stol(str_show_times);
std::cout << "when correct rate reach threshold to stop?:";
std::string str_repeat_threshold;
std::getline(std::cin, str_repeat_threshold);
double dthreshold = std::stod(str_repeat_threshold);
// Shuffle the training data
std::random_device rd;
std::mt19937 rng(rd());
std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);
// Take the training subset
std::vector<train_data> vec_train_data_set;
for (int i = 0; i < i_train_data_set_size; ++i)
// Network: softmax/nadam fully connected layer with 10 outputs, ReLu/nadam
// conv-pool stages on 28x28 input; two stages of 5x5 kernels (16, then 32),
// step 1 and 2x2 pooling.
using cnn_type = cnn_network<
, softmax, nadam, xavier_gaussian_type, 10
, ReLu, nadam, he_gaussian_type
, 28, 28
, 16, 5, 5, 1, 1, 2, 2
, 32, 5, 5, 1, 1, 2, 2
cnn_type cnn;
printf("****** cnn network ******\r\n");
printf("###### cnn network ######\r\n");
// Lazily filled per-sample caches of the converted image and label matrices.
// NOTE(review): both arrays and their elements are allocated with new and no
// matching delete is visible in this listing.
mat<28, 28, double>** sz_images = new mat<28,28,double>*[i_train_data_set_size];
mat<10, 1, double>** sz_labels = new mat<10, 1, double>*[i_train_data_set_size];
memset(sz_images, 0, sizeof(mat<28, 28, double>*) * i_train_data_set_size);
memset(sz_labels, 0, sizeof(mat<10, 1, double>*) * i_train_data_set_size);
int i = 0;
int i_correct = 0;
// Training loop: one pass over the subset per iteration.
for (; i < i_train_times; ++i)
i_correct = 0;
for (int j = 0; j < i_train_data_set_size; ++j)
auto &td = vec_train_data_set[j];
// Convert and scale the image once (pixel values divided by 256.0), then reuse it.
if (sz_images[j] == nullptr)
mat<28, 28, double> mt_image(td.mt_image);
sz_images[j] = new mat<28, 28, double>(mt_image/256.0);
auto output = cnn.forward(*sz_images[j]);
if (sz_labels[j] == nullptr)
sz_labels[j] = new mat<10, 1, double>(td.mt_label);
// Error signal: network output minus the one-hot label.
auto delta = (output - *sz_labels[j]);
// The predicted digit is the row index of the largest output.
int r = 0, c = 0;
output.max_idx(r, c);
if (td.i_num == r)
if ((double)i_correct / i_train_data_set_size > dthreshold)
// NOTE(review): an input of 0 for i_repeat_times or i_show_times makes the
// modulo below undefined.
if (i % i_repeat_times == (i_repeat_times - 1))
if (i % i_show_times == (i_show_times - 1))
std::cout << "train times:" << i << " correct rate:" << (double)i_correct / i_train_data_set_size << std::endl;
printf("train times:%d correct rate:%f\r\n", i, (double)i_correct / i_train_data_set_size);
// Test on the training data
printf("---------- test with train data set ----------\r\n");
i_correct = 0;
for (int j = 0; j < i_train_data_set_size; ++j)
auto &td = vec_train_data_set[j];
auto output = cnn.forward(*sz_images[j]);
int r = 0, c = 0;
output.max_idx(r, c);
if (td.i_num == r)
// Only the first 10 samples are printed.
if (j < 10)
std::cout << "label:" << td.i_num << " output:" << r << std::endl;
// Test on 10 randomly chosen samples
// NOTE(review): these are drawn from the same vec_train_data that the training
// subset was taken from, so they may overlap with it.
printf("---------- test with random data ----------\r\n");
std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);
for (int i = 0; i < 10; ++i)
auto &td = vec_train_data[i];
mat<28, 28, double> mt_image(td.mt_image);
auto output = cnn.forward(mt_image/256.0);
int r = 0, c = 0;
output.max_idx(r, c);
std::cout << "label:" << td.i_num << " output:" << r << std::endl;
return 0;
magic num:2049 | label num:60000
train times:200
train data set size:500
how many times should we update inert? 20
how many times should we show the result? 10
when correct rate reach threshold to stop?:0.99
****** cnn network ******
---------- cnn network
fullcon_output_num: 10
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 32
input_row: 192
input_col: 12
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 0
---------- bp_network ----------
batch_num:1, input_num:12032, output_num:10
###### cnn network ######
train times:9 correct rate:0.986
train times:10 correct rate:0.996000
---------- test with train data set ----------
label:7 output:7
label:3 output:3
label:1 output:1
label:1 output:1
label:2 output:2
label:9 output:9
label:7 output:7
label:4 output:4
label:3 output:3
label:7 output:7
---------- test with random data ----------
label:8 output:8
label:0 output:0
label:1 output:1
label:5 output:5
label:0 output:0
label:0 output:0
label:5 output:5
label:8 output:8
label:0 output:0
label:9 output:9
CUDA error in J:\03_workspace\00_cuda\02_matrix\matrix.hpp at line 279: invalid argument