【CUDA Learning Diary】5.4 Constant Memory
Constant memory is really just global memory accessed through a special virtual address range. It has two distinguishing properties:
- It is cached (each SM has a dedicated constant cache).
- When all threads in a warp read the same address, a single value is broadcast to every thread in the warp.
A constant-memory variable is declared with the keyword:
__constant__
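The benchmark below initializes its __constant__ variables statically, but a __constant__ symbol can also be filled from the host at run time with cudaMemcpyToSymbol. A minimal sketch; coeffs and upload_coeffs are illustrative names, not part of the benchmark program:

__constant__ float coeffs[4];   // lives in the 64 KB constant-memory space

void upload_coeffs()
{
    const float host_coeffs[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    // Copy host data into the __constant__ symbol before any kernel reads it
    cudaMemcpyToSymbol(coeffs, host_coeffs, sizeof(host_coeffs));
}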
To see what constant memory buys us, the program below compares it against plain literal constants baked into the kernel:
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

#define KERNEL_LOOP 65536

// Four 32-bit patterns held in constant memory
__constant__ static const int const_data_01 = 0x55555555;
__constant__ static const int const_data_02 = 0x77777777;
__constant__ static const int const_data_03 = 0x33333333;
__constant__ static const int const_data_04 = 0x11111111;
// Warm-up kernel: identical to const_test_gpu_literal; it only absorbs
// one-time startup overhead so the timed runs below are comparable.
__global__ void warmup(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= 0x55555555;
            d |= 0x77777777;
            d &= 0x33333333;
            d |= 0x11111111;
        }
        data[tid] = d;
    }
}
// Operands are literal constants embedded directly in the code.
__global__ void const_test_gpu_literal(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= 0x55555555;
            d |= 0x77777777;
            d &= 0x33333333;
            d |= 0x11111111;
        }
        data[tid] = d;
    }
}
// Operands are read from __constant__ memory.
__global__ void const_test_gpu_const(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= const_data_01;
            d |= const_data_02;
            d &= const_data_03;
            d |= const_data_04;
        }
        data[tid] = d;
    }
}
int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);

    const int num_elements = 128 * 1024;
    const int num_threads = 256;
    const int num_blocks = (num_elements + num_threads - 1) / num_threads;
    const int num_bytes = num_elements * sizeof(int);

    int *data;
    CHECK(cudaMalloc((void**)&data, num_bytes));

    dim3 block(num_threads, 1);
    dim3 grid(num_blocks, 1);

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(data, num_elements);
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
           grid.x, grid.y, block.x, block.y, elapsedTime);

    timer.start();
    const_test_gpu_literal<<<grid, block>>>(data, num_elements);
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("const_test_gpu_literal <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
           grid.x, grid.y, block.x, block.y, elapsedTime);

    timer.start();
    const_test_gpu_const<<<grid, block>>>(data, num_elements);
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("const_test_gpu_const <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
           grid.x, grid.y, block.x, block.y, elapsedTime);

    CHECK(cudaFree(data));
    CHECK(cudaDeviceReset());
    return 0;
}
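A note on dependencies: Timer and CHECK come from the ../common/common.h helper header included at the top, which is not shown here. If you don't have that header, a CUDA-event-based stand-in is easy to write; EventTimer below is a hypothetical replacement, not the original implementation:

struct EventTimer {
    cudaEvent_t t0, t1;
    EventTimer()  { cudaEventCreate(&t0); cudaEventCreate(&t1); }
    ~EventTimer() { cudaEventDestroy(t0); cudaEventDestroy(t1); }
    void start()  { cudaEventRecord(t0); }                            // mark start on the default stream
    void stop()   { cudaEventRecord(t1); cudaEventSynchronize(t1); }  // mark stop and wait for it
    float elapsedms() { float ms = 0.0f; cudaEventElapsedTime(&ms, t0, t1); return ms; }
};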
The literal and constant-memory kernels finish in essentially the same time:
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------
    38.1             1376          1    1376.0    1376.0      1376      1376          0.0  warmup(int *, int)
    31.0             1120          1    1120.0    1120.0      1120      1120          0.0  const_test_gpu_const(int *, int)
    31.0             1120          1    1120.0    1120.0      1120      1120          0.0  const_test_gpu_literal(int *, int)
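For the record, these timings are Nsight Systems kernel-summary statistics. Assuming the executable is built as constant_mem.exe (as in the nvcc command further down), an invocation along these lines produces the table; exact report names vary between nsys versions:

nsys profile --stats=true ./constant_mem.exe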
Next, compare against global memory by adding a kernel that reads the same four values from __device__ variables:
// The same four values, but in plain __device__ global memory
__device__ static int data_01 = 0x55555555;
__device__ static int data_02 = 0x77777777;
__device__ static int data_03 = 0x33333333;
__device__ static int data_04 = 0x11111111;

// Operands are read from global memory on every iteration.
__global__ void const_test_gpu_gmem(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= data_01;
            d |= data_02;
            d &= data_03;
            d |= data_04;
        }
        data[tid] = d;
    }
}
and the corresponding launch in main:
timer.start();
const_test_gpu_gmem<<<grid, block>>>(data, num_elements);
CHECK(cudaDeviceSynchronize());
timer.stop();
elapsedTime = timer.elapsedms();
printf("const_test_gpu_gmem <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
       grid.x, grid.y, block.x, block.y, elapsedTime);
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------
    99.6           867427          1  867427.0  867427.0    867427    867427          0.0  const_test_gpu_gmem(int *, int)
     0.2             1408          1    1408.0    1408.0      1408      1408          0.0  warmup(int *, int)
     0.1             1152          1    1152.0    1152.0      1152      1152          0.0  const_test_gpu_const(int *, int)
     0.1             1152          1    1152.0    1152.0      1152      1152          0.0  const_test_gpu_literal(int *, int)
The gap is dramatic: ~867 µs for the global-memory kernel versus ~1.1 µs for the other two, roughly a 750x difference.
A factor that large is hard to explain by cache behavior alone, so let's verify what is actually executing by reading the generated PTX (NVIDIA's virtual assembly). To keep the PTX, pass the -keep option when compiling:
nvcc constant_mem.cu -keep -o constant_mem.exe
Among the intermediate files this leaves behind is constant_mem.ptx.
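If you only want the PTX and not the other intermediates (.cubin, .ii, and so on), nvcc can also emit it directly:

nvcc -ptx constant_mem.cu -o constant_mem.ptx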
Take the warmup kernel as a worked example:
// Visible CUDA kernel entry; the mangled name _Z6warmupPii corresponds to warmup(int*, int)
.visible .entry _Z6warmupPii(
    .param .u64 _Z6warmupPii_param_0,   // parameter 1: u64 pointer (the int* array)
    .param .u32 _Z6warmupPii_param_1    // parameter 2: u32 integer (the element count)
)
{
    // Register declarations
    .reg .pred %p<2>;   // predicate registers for branching: %p0, %p1
    .reg .b32 %r<7>;    // 32-bit general-purpose registers: %r0-%r6
    .reg .b64 %rd<5>;   // 64-bit general-purpose registers: %rd0-%rd4

    // --- body ---
    ld.param.u64 %rd1, [_Z6warmupPii_param_0];   // load parameter 1 (the pointer) into %rd1
    ld.param.u32 %r2, [_Z6warmupPii_param_1];    // load parameter 2 (the element count) into %r2

    // Compute the global thread index: blockIdx.x * blockDim.x + threadIdx.x
    mov.u32 %r3, %ctaid.x;           // %r3 = blockIdx.x
    mov.u32 %r4, %ntid.x;            // %r4 = blockDim.x
    mov.u32 %r5, %tid.x;             // %r5 = threadIdx.x
    mad.lo.s32 %r1, %r3, %r4, %r5;   // %r1 = %r3 * %r4 + %r5, the global thread index

    // Bounds check: skip the store if %r1 >= %r2
    setp.ge.s32 %p1, %r1, %r2;   // set predicate %p1 if %r1 >= %r2
    @%p1 bra $L__BB0_2;          // if %p1 is true, branch to $L__BB0_2 (straight to ret)

    // In bounds: store 858993459 (0x33333333) into the array
    cvta.to.global.u64 %rd2, %rd1;   // convert generic pointer %rd1 to a global-space address
    mul.wide.s32 %rd3, %r1, 4;       // byte offset: %rd3 = %r1 * 4 (sizeof(int))
    add.s64 %rd4, %rd2, %rd3;        // target address: base + offset
    mov.u32 %r6, 858993459;          // 858993459 == 0x33333333
    st.global.u32 [%rd4], %r6;       // write 0x33333333 to [%rd4]

$L__BB0_2:
    ret;   // all threads done
}
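The telling detail is what is missing: the KERNEL_LOOP loop has vanished. Every operand is known at compile time, so the compiler folds all 65536 iterations into a single store of 0x33333333, which is why the literal kernel runs in about a microsecond; the identical timing of the __constant__ kernel suggests it gets the same treatment, since its variables are static const with known initializers. The folded value is a fixed point that the loop reaches on its very first iteration, which a plain-C host program (no CUDA required) confirms:

#include <stdio.h>

int main(void)
{
    int d = 0x55555555;
    for (int i = 0; i < 65536; i++) {
        d ^= 0x55555555;   // first pass: 0x00000000; later passes: 0x66666666
        d |= 0x77777777;   // always 0x77777777
        d &= 0x33333333;   // always 0x33333333
        d |= 0x11111111;   // still 0x33333333 (0x11111111 sets no new bits)
    }
    printf("0x%08x\n", d);   // prints 0x33333333
    return 0;
}

The __device__ version enjoys no such shortcut: data_01 through data_04 live in global memory and could be modified at run time, so the compiler must actually load them and execute the loop, which accounts for the ~750x gap.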