【CUDA Learning Diary】5.4 Constant Memory
Constant memory is really just global memory accessed through a special virtual address range. It has two distinguishing properties:
- It is cached (each SM has a dedicated constant cache).
- When all threads in a warp read the same address, a single value is broadcast to every thread in the warp.
A constant-memory variable is declared with the keyword:
__constant__
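The benchmark below initializes its __constant__ variables statically, but a __constant__ symbol can also be filled from the host at run time with cudaMemcpyToSymbol. A minimal sketch; coeffs and upload_coeffs are illustrative names, not part of the benchmark program:

__constant__ float coeffs[4];   // lives in the 64 KB constant-memory space

void upload_coeffs()
{
    const float host_coeffs[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    // Copy host data into the __constant__ symbol before any kernel reads it
    cudaMemcpyToSymbol(coeffs, host_coeffs, sizeof(host_coeffs));
}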
To see what constant memory buys us, the program below compares it against plain literal constants baked into the kernel:
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <iostream>

#define KERNEL_LOOP 65536

// Four 32-bit patterns held in constant memory
__constant__ static const int const_data_01 = 0x55555555;
__constant__ static const int const_data_02 = 0x77777777;
__constant__ static const int const_data_03 = 0x33333333;
__constant__ static const int const_data_04 = 0x11111111;
// Warm-up kernel: identical to const_test_gpu_literal; it only absorbs
// one-time startup overhead so the timed runs below are comparable.
__global__ void warmup(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= 0x55555555;
            d |= 0x77777777;
            d &= 0x33333333;
            d |= 0x11111111;
        }
        data[tid] = d;
    }
}
// Operands are literal constants embedded directly in the code.
__global__ void const_test_gpu_literal(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= 0x55555555;
            d |= 0x77777777;
            d &= 0x33333333;
            d |= 0x11111111;
        }
        data[tid] = d;
    }
}
// Operands are read from __constant__ memory.
__global__ void const_test_gpu_const(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= const_data_01;
            d |= const_data_02;
            d &= const_data_03;
            d |= const_data_04;
        }
        data[tid] = d;
    }
}
int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);

    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);

    const int num_elements = 128 * 1024;
    const int num_threads = 256;
    const int num_blocks = (num_elements + num_threads - 1) / num_threads;
    const int num_bytes = num_elements * sizeof(int);

    int *data;
    CHECK(cudaMalloc((void**)&data, num_bytes));

    dim3 block(num_threads, 1);
    dim3 grid(num_blocks, 1);

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(data, num_elements);
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
           grid.x, grid.y, block.x, block.y, elapsedTime);

    timer.start();
    const_test_gpu_literal<<<grid, block>>>(data, num_elements);
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("const_test_gpu_literal <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
           grid.x, grid.y, block.x, block.y, elapsedTime);

    timer.start();
    const_test_gpu_const<<<grid, block>>>(data, num_elements);
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("const_test_gpu_const <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
           grid.x, grid.y, block.x, block.y, elapsedTime);

    CHECK(cudaFree(data));
    CHECK(cudaDeviceReset());
    return 0;
}
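A note on dependencies: Timer and CHECK come from the ../common/common.h helper header included at the top, which is not shown here. If you don't have that header, a CUDA-event-based stand-in is easy to write; EventTimer below is a hypothetical replacement, not the original implementation:

struct EventTimer {
    cudaEvent_t t0, t1;
    EventTimer()  { cudaEventCreate(&t0); cudaEventCreate(&t1); }
    ~EventTimer() { cudaEventDestroy(t0); cudaEventDestroy(t1); }
    void start()  { cudaEventRecord(t0); }                            // mark start on the default stream
    void stop()   { cudaEventRecord(t1); cudaEventSynchronize(t1); }  // mark stop and wait for it
    float elapsedms() { float ms = 0.0f; cudaEventElapsedTime(&ms, t0, t1); return ms; }
};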
The literal and constant-memory kernels finish in essentially the same time:
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------
    38.1             1376          1    1376.0    1376.0      1376      1376          0.0  warmup(int *, int)
    31.0             1120          1    1120.0    1120.0      1120      1120          0.0  const_test_gpu_const(int *, int)
    31.0             1120          1    1120.0    1120.0      1120      1120          0.0  const_test_gpu_literal(int *, int)
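For the record, these timings are Nsight Systems kernel-summary statistics. Assuming the executable is built as constant_mem.exe (as in the nvcc command further down), an invocation along these lines produces the table; exact report names vary between nsys versions:

nsys profile --stats=true ./constant_mem.exe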
Next, compare against global memory by adding a kernel that reads the same four values from __device__ variables:
// The same four values, but in plain __device__ global memory
__device__ static int data_01 = 0x55555555;
__device__ static int data_02 = 0x77777777;
__device__ static int data_03 = 0x33333333;
__device__ static int data_04 = 0x11111111;

// Operands are read from global memory on every iteration.
__global__ void const_test_gpu_gmem(int* const data, const int num_elements){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < num_elements){
        int d = 0x55555555;
        for (int i = 0; i < KERNEL_LOOP; i++){
            d ^= data_01;
            d |= data_02;
            d &= data_03;
            d |= data_04;
        }
        data[tid] = d;
    }
}
and the corresponding launch in main:
timer.start();
const_test_gpu_gmem<<<grid, block>>>(data, num_elements);
CHECK(cudaDeviceSynchronize());
timer.stop();
elapsedTime = timer.elapsedms();
printf("const_test_gpu_gmem <<<grid (%4d, %4d), block (%4d, %4d)>>> elapsed %f ms \n",
       grid.x, grid.y, block.x, block.y, elapsedTime);
Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  --------------------------------
    99.6           867427          1  867427.0  867427.0    867427    867427          0.0  const_test_gpu_gmem(int *, int)
     0.2             1408          1    1408.0    1408.0      1408      1408          0.0  warmup(int *, int)
     0.1             1152          1    1152.0    1152.0      1152      1152          0.0  const_test_gpu_const(int *, int)
     0.1             1152          1    1152.0    1152.0      1152      1152          0.0  const_test_gpu_literal(int *, int)
The gap is dramatic: ~867 µs for the global-memory kernel versus ~1.1 µs for the other two, roughly a 750x difference.
A factor that large is hard to explain by cache behavior alone, so let's verify what is actually executing by reading the generated PTX (NVIDIA's virtual assembly). To keep the PTX, pass the -keep option when compiling:
nvcc constant_mem.cu -keep -o constant_mem.exe
Among the intermediate files this leaves behind is constant_mem.ptx.
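If you only want the PTX and not the other intermediates (.cubin, .ii, and so on), nvcc can also emit it directly:

nvcc -ptx constant_mem.cu -o constant_mem.ptx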
Take the warmup kernel as a worked example:
// Visible CUDA kernel entry; the mangled name _Z6warmupPii corresponds to warmup(int*, int)
.visible .entry _Z6warmupPii(
    .param .u64 _Z6warmupPii_param_0,   // parameter 1: u64 pointer (the int* array)
    .param .u32 _Z6warmupPii_param_1    // parameter 2: u32 integer (the element count)
)
{
    // Register declarations
    .reg .pred %p<2>;   // predicate registers for branching: %p0, %p1
    .reg .b32 %r<7>;    // 32-bit general-purpose registers: %r0-%r6
    .reg .b64 %rd<5>;   // 64-bit general-purpose registers: %rd0-%rd4

    // --- body ---
    ld.param.u64 %rd1, [_Z6warmupPii_param_0];   // load parameter 1 (the pointer) into %rd1
    ld.param.u32 %r2, [_Z6warmupPii_param_1];    // load parameter 2 (the element count) into %r2

    // Compute the global thread index: blockIdx.x * blockDim.x + threadIdx.x
    mov.u32 %r3, %ctaid.x;           // %r3 = blockIdx.x
    mov.u32 %r4, %ntid.x;            // %r4 = blockDim.x
    mov.u32 %r5, %tid.x;             // %r5 = threadIdx.x
    mad.lo.s32 %r1, %r3, %r4, %r5;   // %r1 = %r3 * %r4 + %r5, the global thread index

    // Bounds check: skip the store if %r1 >= %r2
    setp.ge.s32 %p1, %r1, %r2;   // set predicate %p1 if %r1 >= %r2
    @%p1 bra $L__BB0_2;          // if %p1 is true, branch to $L__BB0_2 (straight to ret)

    // In bounds: store 858993459 (0x33333333) into the array
    cvta.to.global.u64 %rd2, %rd1;   // convert generic pointer %rd1 to a global-space address
    mul.wide.s32 %rd3, %r1, 4;       // byte offset: %rd3 = %r1 * 4 (sizeof(int))
    add.s64 %rd4, %rd2, %rd3;        // target address: base + offset
    mov.u32 %r6, 858993459;          // 858993459 == 0x33333333
    st.global.u32 [%rd4], %r6;       // write 0x33333333 to [%rd4]

$L__BB0_2:
    ret;   // all threads done
}
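The telling detail is what is missing: the KERNEL_LOOP loop has vanished. Every operand is known at compile time, so the compiler folds all 65536 iterations into a single store of 0x33333333, which is why the literal kernel runs in about a microsecond; the identical timing of the __constant__ kernel suggests it gets the same treatment, since its variables are static const with known initializers. The folded value is a fixed point that the loop reaches on its very first iteration, which a plain-C host program (no CUDA required) confirms:

#include <stdio.h>

int main(void)
{
    int d = 0x55555555;
    for (int i = 0; i < 65536; i++) {
        d ^= 0x55555555;   // first pass: 0x00000000; later passes: 0x66666666
        d |= 0x77777777;   // always 0x77777777
        d &= 0x33333333;   // always 0x33333333
        d |= 0x11111111;   // still 0x33333333 (0x11111111 sets no new bits)
    }
    printf("0x%08x\n", d);   // prints 0x33333333
    return 0;
}

The __device__ version enjoys no such shortcut: data_01 through data_04 live in global memory and could be modified at run time, so the compiler must actually load them and execute the loop, which accounts for the ~750x gap.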