【cuda学习日记】8.2 GPU加速库 --cuBLAS
8.2.1 cuBLAS
cuBLAS是一个线性代数子程式,基于BLAS basic linear algebra subprograms),即基本线性代数子程序。cuBLAS不支持多种稀疏数据格式,它仅支持并善于优化稠密向量和稠密矩阵的操作。
由于最初的BLAS库是用FORTRAN语言编写的,因此它使用的是以列优先的数组存储和以1为基准的索引。
8.2.1 先计算C = AB
#include "../common/common.h"
#include <stdio.h>
#include <cublas_v2.h>
#define M 3
#define N 2
#define K 2
void printMatrix(int R, int C, float *A, const char *name) {
printf("%s = \n", name);
for (int r = 0; r < R; ++r) {
for (int c = 0; c < C; ++c) {
printf("%10.6f", A[r * C + c]);
}
printf("\n");
}
printf("\n");
}
void initialData_ez(float *ip, int size)
{
//time_t t;
//srand((unsigned int) time(&t));
for (int i = 0; i < size; i++) {
ip[i] = i;
}
}
/*
cublasSgemm(
handle,
CUBLAS_OP_T, //矩阵A的属性参数,转置,按行优先
CUBLAS_OP_T, //矩阵B的属性参数,转置,按行优先
A_ROW, //矩阵A、C的行数
B_COL, //矩阵B、C的列数
A_COL, //A的列数,B的行数,此处也可为B_ROW,一样的
&a, //alpha的值
d_A, //左矩阵,为A
A_COL, //A的leading dimension,此时选择转置,按行优先,则leading dimension为A的列数
d_B, //右矩阵,为B
B_COL, //B的leading dimension,此时选择转置,按行优先,则leading dimension为B的列数
&b, //beta的值
d_C, //结果矩阵C
A_ROW //C的leading dimension,C矩阵一定按列优先,则leading dimension为C的行数
);
*/
int main( int argc, char** argv ){
int dev = 0;
cudaSetDevice(dev);
cudaDeviceProp deviceprop;
CHECK(cudaGetDeviceProperties(&deviceprop,dev));
printf("device %d: %s \n", dev, deviceprop.name);
int m = M;
int n = N;
int k = K;
if (argc > 1) m = atoi(argv[1]);
if (argc > 2) n = atoi(argv[2]);
if (argc > 3) k = atoi(argv[3]);
printf("m n k value: %d %d %d\n", m,n,k);
//matrix A size m,K, martix B size K,N martirx C size m,N
// sgemm a * A * B + beta * C
int mn = n * m;
int nk = n * k;
int mk = m * k;
float *h_A, *h_B, *h_C, *h_ret;
int nBytes_A = mk * sizeof(float);
int nBytes_B = nk * sizeof(float);
int nBytes_C = mn * sizeof(float);
h_A = (float *)malloc(nBytes_A);
h_B = (float *)malloc(nBytes_B);
h_C = (float *)malloc(nBytes_C);
h_ret = (float *)malloc(nBytes_C);
initialData_ez(h_A, mk);
initialData_ez(h_B, nk);
printf("A:");
for(int i = 0; i< m*k; i++){
printf("%f\t", h_A[i]);
}
printf("\n");
printf("B:");
for(int i = 0; i< n*k; i++){
printf("%f\t", h_B[i]);
}
printf("\n");
printf("c:");
for(int i = 0; i< n*m; i++){
h_C[i] = 0;
printf("%f\t", h_C[i]);
}
printf("\n");
//device memory
float* d_A, *d_B, *d_C ;
CHECK(cudaMalloc((float**)&d_A, nBytes_A));
CHECK(cudaMalloc((float**)&d_B, nBytes_B));
CHECK(cudaMalloc((float**)&d_C, nBytes_C));
CHECK(cudaMemcpy(d_A, h_A, nBytes_A, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_B, h_A, nBytes_B, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_C, h_C, nBytes_C, cudaMemcpyHostToDevice));
float alpha = 1.0;
float beta = 0.0;
cublasHandle_t handle;
CHECK_CUBLAS(cublasCreate(&handle));
CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m));
//CHECK_CUBLAS( cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, &alpha, d_A, k, d_B, n, &beta, d_C, m));
CHECK(cudaMemcpy(h_ret, d_C, nBytes_C, cudaMemcpyDeviceToHost));
printf("h_ret:");
for(int i = 0; i< n*m; i++){
printf("%f\t", h_ret[i]);
}
printf("\n");
free(h_A);
free(h_B);
free(h_C);
free(h_ret);
CHECK(cudaFree(d_A));
CHECK(cudaFree(d_B));
CHECK(cudaFree(d_C));
CHECK_CUBLAS(cublasDestroy(handle));
//cudaDeviceReset();
return 0;
}
在common.h里添加:
#define CHECK_CUBLAS(call) \
{ \
cublasStatus_t status = (call); \
if (status != CUBLAS_STATUS_SUCCESS) { \
printf("cuBLAS error: %s at %s:%d\n", cublasGetStatusString(status), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
编译:
-lcublas用于链接cublas.lib
nvcc .\cublassgemm.cu -o .\cublassgemm -lcublas
如以上代码矩阵A的尺寸m*k 为 3x2 (3行2列) B 为k * n (2x2), 相乘矩阵C 尺寸 m * n;
完整输出:
A:0.000000 1.000000 2.000000 3.000000 4.000000 5.000000
B:0.000000 1.000000 2.000000 3.000000
C:0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
h_ret:3.000000 4.000000 5.000000 9.000000 14.000000 19.000000
分析:
CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m));
两个矩阵都没有转置 – CUBLAS_OP_N (CUBLAS_OP_N ,CUBLAS_OP_T)N 代表列优先,T则变成行优先
在内存A里的数据是0,1,2,3,4,5,根据列优先,则矩阵A =
[[0,3],
[1,4],
[2,5]]
lda 因为A列优先,切矩阵A有3行,所以lda = 3 (m)
B =
[[0,2],
[1,3]]
相乘得到的矩阵C =
[[ 3 9]
[ 4 14]
[ 5 19]]
C的数据依然是列优先,所以显示在内存里的顺序是3,4,5,9,14,19
列优先矩阵打印
void printMatrixCol(float* A, int row, int col){
for (int i = 0; i < row; i ++){
for (int j = 0; j < col; j ++){
//printf("%f offset %d row %d col %d \t", A[row * j + i], (row * j + i), i, j);
printf("%f \t", A[row * j + i]);
}
printf("\n");
}
}
printMatrixCol(h_ret, m, n);
3.000000 9.000000
4.000000 14.000000
5.000000 19.000000
CUBLAS_OP_N -> CUBLAS_OP_T
CHECK_CUBLAS( cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, &alpha, d_A, k, d_B, n, &beta, d_C, m));
矩阵A,B都转置(CUBLAS_OP_T),但是尺寸没有变 ,A列 = m, A 行 = k; 所以A矩阵变为
[[0 1]
[2 3]
[4 5]]
因为矩阵变为行优先,所以lda = k
同理矩阵B (k 行 n 列), ldb = n
[[0 1]
[2 3]]
输出C:
[[ 2 3]
[ 6 11]
[10 19]]
因为C在内存里依然是列优先顺序: 2,6,10,3,11,19
完整输出:
A:0.000000 1.000000 2.000000 3.000000 4.000000 5.000000
B:0.000000 1.000000 2.000000 3.000000
C:0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
h_ret:2.000000 6.000000 10.000000 3.000000 11.000000 19.000000