Tesla T4 P2P测试
Tesla T4 P2P测试
- 一.测试环境
- 二.测试步骤
- 1.获取设备信息
- 2.查看PCIE拓扑结构
- 3.选择9B、9E这两张卡
- 4.查看逻辑设备ID
- 5.设置环境变量(需要用逻辑设备ID,通过UUID跟smi看到的物理ID关联)
- 6.不同地址的原子操作
- 2.P2P与非P2P的性能差异
- 3.GPU带宽测试
Tesla T4 P2P测试
- 通过物理ID找到逻辑ID
- NCU P2P相关的Metrics
- PCIE、DRAM相关的Metrics
一.测试环境
二.测试步骤
1.获取设备信息
nvidia-smi -L
GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-c91a8013-0877-df61-53b9-016fabcd5f82)
GPU 1: NVIDIA A30 (UUID: GPU-f67790b5-bb58-614d-4190-3598a99f925e)
GPU 2: Tesla T4 (UUID: GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63)
GPU 3: Tesla T4 (UUID: GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e)
GPU 4: Tesla T4 (UUID: GPU-d59282d2-060d-270e-1c0e-f50e936ffede)
GPU 5: NVIDIA GeForce RTX 3080 Ti (UUID: GPU-9a131b18-28de-d6a1-01e9-76a133e21680)
GPU 6: NVIDIA A30 (UUID: GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02)
GPU 7: Tesla T4 (UUID: GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1)
nvidia-smi -q | grep "Bus Id"
Bus Id : 00000000:34:00.0
Bus Id : 00000000:35:00.0
Bus Id : 00000000:36:00.0 Tesla T4
Bus Id : 00000000:37:00.0 Tesla T4
Bus Id : 00000000:9B:00.0 Tesla T4 d59282d2
Bus Id : 00000000:9C:00.0
Bus Id : 00000000:9D:00.0
Bus Id : 00000000:9E:00.0 Tesla T4 1f45f1e1
2.查看PCIE拓扑结构
lstopo --ignore Core --ignore Misc --ignore PU
3.选择9B、9E这两张卡
4.查看逻辑设备ID
tee devinfo.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>
// Evaluates a CUDA runtime call once. If the result is not cudaSuccess, the
// file name, line number and cudaGetErrorString() text are written to stderr
// and the process exits with EXIT_FAILURE.
#define CUDA_CHECK(call) \
do { \
    const cudaError_t cuda_status_ = (call); \
    if (cudaSuccess != cuda_status_) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(cuda_status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Walks over every device the CUDA runtime reports and prints its runtime
// index next to its UUID, using the same "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
// layout that `nvidia-smi -L` prints, so the two listings can be matched up.
int main() {
    int gpu_total = 0;
    CUDA_CHECK(cudaGetDeviceCount(&gpu_total));

    int gpu = 0;
    while (gpu < gpu_total) {
        CUDA_CHECK(cudaSetDevice(gpu));

        cudaDeviceProp props;
        CUDA_CHECK(cudaGetDeviceProperties(&props, gpu));

        // Read the UUID bytes as unsigned so that %02x never receives a
        // sign-extended value.
        const unsigned char *u = reinterpret_cast<const unsigned char *>(props.uuid.bytes);
        printf("Index,%d UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",
               gpu,
               u[0], u[1], u[2], u[3],
               u[4], u[5],
               u[6], u[7],
               u[8], u[9],
               u[10], u[11], u[12], u[13], u[14], u[15]);

        ++gpu;
    }
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o devinfo devinfo.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
unset CUDA_VISIBLE_DEVICES && ./devinfo
- 输出
Index,0 UUID:GPU-c91a8013-0877-df61-53b9-016fabcd5f82
Index,1 UUID:GPU-9a131b18-28de-d6a1-01e9-76a133e21680
Index,2 UUID:GPU-f67790b5-bb58-614d-4190-3598a99f925e
Index,3 UUID:GPU-49daa3a5-490b-569c-f1d2-79d98c6d3a02
Index,4 UUID:GPU-e95bfaa3-bf41-7aeb-d1e7-4f4f98ac3a63
Index,5 UUID:GPU-7b470c8f-cfe3-81d2-1dd8-2e36c2552d0e
Index,6 UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede #选中
Index,7 UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1 #选中
5.设置环境变量(需要用逻辑设备ID,通过UUID跟smi看到的物理ID关联)
export CUDA_VISIBLE_DEVICES=6,7
6.不同地址的原子操作
tee p2p.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>
// Wraps a CUDA runtime call: the call is evaluated once and, when it does not
// return cudaSuccess, the location and the runtime's error string are printed
// to stderr before the process exits with EXIT_FAILURE.
#define CUDA_CHECK(call) \
do { \
    const cudaError_t cuda_status_ = (call); \
    if (cudaSuccess != cuda_status_) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(cuda_status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Every thread issues 102400 atomicAdd operations on its own element of
// `data` (element index == global thread index), so no two threads target the
// same address. `mode` is not read in the body; the call sites in this file
// use it only to instantiate separately named kernels.
// There is no bounds check: the launch must not create more threads than
// `data` has elements.
template<int mode>
__global__ void dummyKernel(float *data) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    float *const slot = data + tid;
    int iter = 0;
    while (iter < 102400) {
        atomicAdd(slot, tid * iter);
        ++iter;
    }
}
// Times one piece of work enqueued on `stream` and prints both measurements.
//   f                  - callable invoked as f(stream); expected to enqueue
//                        its work on `stream`.
//   stream             - stream on which the start/stop events are recorded.
//   start_ev / stop_ev - events created by the caller, recorded around f.
// "E2E" is host wall-clock time from just before the start event is recorded
// until the stop event has completed; "Kernel" is the time between the two
// events as reported by cudaEventElapsedTime.
// Every CUDA call is checked, and cudaGetLastError() is queried right after f
// so that a failed launch aborts instead of printing a meaningless timing.
template <typename F>
void TIMEIT(F const &f,cudaStream_t &stream,cudaEvent_t &start_ev,cudaEvent_t&stop_ev)
{
    // Wait for earlier work on the current device so it is not measured.
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    CUDA_CHECK(cudaEventRecord(start_ev, stream));
    f(stream);
    // Kernel launches do not return a status; pick up launch errors here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop_ev, stream));
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start_ev, stop_ev));
    printf("E2E:%8.4fms Kernel:%8.4fms\n",diff.count()*1000,milliseconds);
}
// Threads in the single block launched for each measurement (4 warps of 32).
constexpr int kBlockSize = 32 * 4;

// Prints the selected properties and the UUID of device `deviceid`.
static void printDeviceInfo(int deviceid)
{
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
    std::cout << "-----------------------------------" << std::endl;
    std::cout << "Device Index: " << deviceid << std::endl;
    std::cout << "Compute Capability:"<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
    std::cout << "Device name: " << deviceProp.name << std::endl;
    std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
    std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
    std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
    std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
    std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
    std::cout << "canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;
    std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
    printf("UUID:GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x\n",
           (unsigned char)deviceProp.uuid.bytes[0], (unsigned char)deviceProp.uuid.bytes[1],
           (unsigned char)deviceProp.uuid.bytes[2], (unsigned char)deviceProp.uuid.bytes[3],
           (unsigned char)deviceProp.uuid.bytes[4], (unsigned char)deviceProp.uuid.bytes[5],
           (unsigned char)deviceProp.uuid.bytes[6], (unsigned char)deviceProp.uuid.bytes[7],
           (unsigned char)deviceProp.uuid.bytes[8], (unsigned char)deviceProp.uuid.bytes[9],
           (unsigned char)deviceProp.uuid.bytes[10],(unsigned char)deviceProp.uuid.bytes[11],
           (unsigned char)deviceProp.uuid.bytes[12],(unsigned char)deviceProp.uuid.bytes[13],
           (unsigned char)deviceProp.uuid.bytes[14],(unsigned char)deviceProp.uuid.bytes[15]);
}

// Compares atomicAdd throughput from device 1 against (a) device 0's memory
// reached through peer access and (b) device 1's own memory.
// Devices 0 and 1 are the first two CUDA-visible devices; the program exits
// with an error when fewer than two are visible.
int main() {
    const int devID0 = 0, devID1 = 1;
    int device_count=0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if (device_count < 2) {
        // The rest of the program indexes devices 0 and 1 unconditionally.
        fprintf(stderr, "This test needs 2 visible CUDA devices, found %d.\n", device_count);
        return EXIT_FAILURE;
    }
    for(int deviceid=0; deviceid<2;deviceid++)
    {
        CUDA_CHECK(cudaSetDevice(deviceid));
        printDeviceInfo(deviceid);
    }
    std::cout << "-----------------------------------" << std::endl;

    int p2p_value=0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value,cudaDevP2PAttrAccessSupported,devID0,devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;

    // One float per thread on each device; both buffers start from zero so
    // the two measurements accumulate onto the same initial contents.
    size_t dataSize = kBlockSize * sizeof(float);
    float *data0_dev, *data1_dev;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaMemset(data0_dev, 0, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMemset(data1_dev, 0, dataSize));

    // Enable P2P. The kernels run on devID1 and dereference devID0's memory,
    // so the capability must be queried in that direction (devID1 -> devID0).
    int canAccessPeer=0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID1, devID0));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));
        cudaEvent_t start_ev, stop_ev;
        CUDA_CHECK(cudaEventCreate(&start_ev));
        CUDA_CHECK(cudaEventCreate(&stop_ev));
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0)); // lets devID1 access devID0's device memory
        // devID1 performs atomics on devID0's memory through P2P.
        TIMEIT([&data0_dev](cudaStream_t &stream)-> void {dummyKernel<1><<<1, kBlockSize,0,stream>>>(data0_dev);},stream,start_ev,stop_ev);
        // devID1 performs the same atomics on its own memory.
        TIMEIT([&data1_dev](cudaStream_t &stream)-> void {dummyKernel<3><<<1, kBlockSize,0,stream>>>(data1_dev);},stream,start_ev,stop_ev);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
        // Release the timing resources created above.
        CUDA_CHECK(cudaEventDestroy(start_ev));
        CUDA_CHECK(cudaEventDestroy(stop_ev));
        CUDA_CHECK(cudaStreamDestroy(stream));
    }
    else
    {
        printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);
    }
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
./p2p
/usr/local/cuda/bin/ncu --metrics \
lts__t_requests_aperture_device.sum,\
lts__t_sectors_aperture_device.sum,\
lts__t_requests_aperture_peer.sum,\
lts__t_requests_srcnode_gpc_aperture_peer.sum,\
lts__t_requests_srcunit_l1_aperture_peer.sum,\
lts__t_requests_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_aperture_peer.sum,\
lts__t_sectors_srcnode_gpc_aperture_peer.sum,\
lts__t_sectors_srcunit_l1_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
dram__bytes_read.sum,\
dram__bytes_write.sum,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
- 输出
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-d59282d2-060d-270e-1c0e-f50e936ffede
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
canMapHostMemory: 1
Number of SMs: 40
UUID:GPU-1f45f1e1-1e10-7d2d-f25a-e79ac17ddfa1
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
E2E:132.1300ms Kernel:132.1245ms #GPU1通过P2P对GPU0的设备内存进行原子操作
E2E: 3.5552ms Kernel: 3.5444ms #GPU1对本地DRAM进行原子操作
void dummyKernel<(int)1>(float *), 2024-Sep-25 17:06:47, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum Kbyte 52.86
dram__bytes_read.sum.per_second Kbyte/second 405.11
dram__bytes_write.sum Kbyte 1.25
dram__bytes_write.sum.per_second Kbyte/second 9.56
lts__t_requests_aperture_device.sum request 17775
lts__t_requests_aperture_peer.sum request 409600 #4个warp,每个warp 102400次合并访问,地址范围为4*32*4=512字节
lts__t_requests_srcnode_gpc_aperture_peer.sum request 409600
lts__t_requests_srcunit_l1_aperture_peer.sum request 0
lts__t_requests_srcunit_tex_aperture_peer.sum request 409600
lts__t_sectors_aperture_device.sum sector 93043
lts__t_sectors_aperture_peer.sum sector 1638400
lts__t_sectors_srcnode_gpc_aperture_peer.sum sector 1638400 #合并访问时一次请求包含4个sector(每个sector 32字节),1638400*32B=50MB
lts__t_sectors_srcunit_l1_aperture_peer.sum sector 0
lts__t_sectors_srcunit_tex_aperture_peer.sum sector 1638400
pcie__read_bytes.sum Mbyte 59.10
pcie__read_bytes.sum.per_second Mbyte/second 452.93
pcie__write_bytes.sum Mbyte 72.18 #实际PCIE读写加起来超过50MB
pcie__write_bytes.sum.per_second Mbyte/second 553.17
---------------------------------------------------------------------- --------------- ------------------------------
void dummyKernel<(int)3>(float *), 2024-Sep-25 17:06:48, Context 2, Stream 34
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.sum Kbyte 6.21
dram__bytes_read.sum.per_second Mbyte/second 1.77
dram__bytes_write.sum byte 224
dram__bytes_write.sum.per_second Kbyte/second 63.93
lts__t_requests_aperture_device.sum request 414530
lts__t_requests_aperture_peer.sum request 0
lts__t_requests_srcnode_gpc_aperture_peer.sum request 0
lts__t_requests_srcunit_l1_aperture_peer.sum request 0
lts__t_requests_srcunit_tex_aperture_peer.sum request 0
lts__t_sectors_aperture_device.sum sector 1643605
lts__t_sectors_aperture_peer.sum sector 0
lts__t_sectors_srcnode_gpc_aperture_peer.sum sector 0
lts__t_sectors_srcunit_l1_aperture_peer.sum sector 0
lts__t_sectors_srcunit_tex_aperture_peer.sum sector 0
pcie__read_bytes.sum Kbyte 3.58
pcie__read_bytes.sum.per_second Mbyte/second 1.02
pcie__write_bytes.sum Kbyte 3.07
pcie__write_bytes.sum.per_second Kbyte/second 876.74
---------------------------------------------------------------------- --------------- ------------------------------
2.P2P与非P2P的性能差异
tee p2p.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>
// Runs a CUDA runtime call once and checks its status. Anything other than
// cudaSuccess is reported on stderr (file, line, cudaGetErrorString text) and
// ends the process with EXIT_FAILURE.
#define CUDA_CHECK(call) \
do { \
    const cudaError_t cuda_status_ = (call); \
    if (cudaSuccess != cuda_status_) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(cuda_status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Copies one float per thread: output_data[i] = input_data[i], where i is the
// global thread index. `mode` is not read in the body; the call sites in this
// file use it only to instantiate separately named kernels.
// There is no bounds check: the launch must create exactly as many threads as
// both buffers have elements.
// The index is computed in size_t so that blockIdx.x * blockDim.x cannot wrap
// in 32-bit arithmetic when the grid covers more than 2^32 elements.
template<int mode>
__global__ void dummyKernel(float *input_data,float *output_data) {
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    output_data[idx]=input_data[idx];
}
// Times one piece of work enqueued on `stream` and prints both measurements.
//   f                  - callable invoked as f(stream); expected to enqueue
//                        its work on `stream`.
//   stream             - stream on which the start/stop events are recorded.
//   start_ev / stop_ev - events created by the caller, recorded around f.
// "E2E" is host wall-clock time from just before the start event is recorded
// until the stop event has completed; "Kernel" is the time between the two
// events as reported by cudaEventElapsedTime.
// Every CUDA call is checked, and cudaGetLastError() is queried right after f
// so that a failed launch aborts instead of printing a meaningless timing.
template <typename F>
void TIMEIT(F const &f,cudaStream_t &stream,cudaEvent_t &start_ev,cudaEvent_t&stop_ev)
{
    // Wait for earlier work on the current device so it is not measured.
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    CUDA_CHECK(cudaEventRecord(start_ev, stream));
    f(stream);
    // Kernel launches do not return a status; pick up launch errors here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop_ev, stream));
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start_ev, stop_ev));
    printf("E2E:%7.2fms Kernel:%7.2fms\n",diff.count()*1000,milliseconds);
}
// Launch geometry for the copy kernels: kBlockCount blocks of kBlockSize
// threads, one float per thread.
constexpr int  kBlockSize  = 1024;
constexpr long kBlockCount = 1000000L;

// Prints the selected properties of device `deviceid`.
static void printDeviceInfo(int deviceid)
{
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
    std::cout << "-----------------------------------" << std::endl;
    std::cout << "Device Index: " << deviceid << std::endl;
    std::cout << "Compute Capability:"<<deviceProp.major<<"."<<deviceProp.minor<<std::endl;
    std::cout << "Device name: " << deviceProp.name << std::endl;
    std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
    std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
    std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
    std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
    std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
    std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
}

// Measures three ways of filling a buffer on device 1:
//   1. host-to-device copy from pinned host memory,
//   2. a kernel on device 1 reading device 0's memory through peer access,
//   3. a kernel on device 1 copying between two of its own buffers.
// Devices 0 and 1 are the first two CUDA-visible devices; the program exits
// with an error when fewer than two are visible.
int main() {
    const int devID0 = 0, devID1 = 1;
    int device_count=0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if (device_count < 2) {
        // The rest of the program indexes devices 0 and 1 unconditionally.
        fprintf(stderr, "This test needs 2 visible CUDA devices, found %d.\n", device_count);
        return EXIT_FAILURE;
    }
    for(int deviceid=0; deviceid<2;deviceid++)
    {
        CUDA_CHECK(cudaSetDevice(deviceid));
        printDeviceInfo(deviceid);
    }
    std::cout << "-----------------------------------" << std::endl;

    int p2p_value=0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value,cudaDevP2PAttrAccessSupported,devID0,devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;

    size_t dataSize = static_cast<size_t>(kBlockCount) * kBlockSize * sizeof(float);
    float *data0_dev, *data1_dev,*data1_dev_ex;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));
    char *host;
    CUDA_CHECK(cudaMallocHost(&host,dataSize));
    printf("Init Done(%.2f)GB..\n",dataSize/1024.0/1024.0/1024.0);

    // Enable P2P. The kernels run on devID1 and read devID0's memory, so the
    // capability must be queried in that direction (devID1 -> devID0).
    int canAccessPeer=0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID1, devID0));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));
        cudaEvent_t start_ev, stop_ev;
        CUDA_CHECK(cudaEventCreate(&start_ev));
        CUDA_CHECK(cudaEventCreate(&stop_ev));
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0)); // lets devID1 access devID0's device memory
        // Host -> device 1. cudaMemcpyAsync takes (dst, src, ...): the device
        // buffer is the destination and the pinned host buffer the source,
        // matching cudaMemcpyHostToDevice.
        TIMEIT([&](cudaStream_t &stream)-> void {CUDA_CHECK(cudaMemcpyAsync(data1_dev,host,dataSize,cudaMemcpyHostToDevice,stream));},stream,start_ev,stop_ev);
        // Device 1 reads device 0's memory through P2P and writes locally.
        TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<1><<<kBlockCount, kBlockSize,0,stream>>>(data0_dev,data1_dev);},stream,start_ev,stop_ev);
        // Device 1 copies between two of its own buffers.
        TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<2><<<kBlockCount, kBlockSize,0,stream>>>(data1_dev_ex,data1_dev);},stream,start_ev,stop_ev);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
        // Release the timing resources created above.
        CUDA_CHECK(cudaEventDestroy(start_ev));
        CUDA_CHECK(cudaEventDestroy(stop_ev));
        CUDA_CHECK(cudaStreamDestroy(stream));
    }
    else
    {
        printf("%s %d canAccessPeer=0\n",__FILE__,__LINE__);
    }
    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    CUDA_CHECK(cudaFree(data1_dev_ex));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
./p2p
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
dram__bytes_read.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_write.sum.pct_of_peak_sustained_elapsed,\
dram__bytes_read.sum.per_second,\
pcie__read_bytes.sum.per_second,\
pcie__write_bytes.sum.per_second,\
dram__bytes_write.sum.per_second ./p2p
- 输出
-----------------------------------
Device Index: 0
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
Device Index: 1
Compute Capability:7.5
Device name: Tesla T4
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
asyncEngineCount: 3
directManagedMemAccessFromHost: 0
unifiedAddressing: 1
Number of SMs: 40
-----------------------------------
cudaDevP2PAttrAccessSupported: 1
Init Done(3.81)GB..
E2E: 325.70ms Kernel: 325.71ms # GPU1 cudaMemcpyHostToDevice 11.697GB/s
E2E: 307.29ms Kernel: 307.31ms # GPU1 通过P2P从GPU0的设备内存读取数据(比H2D快) 12.39GB/s
E2E: 37.90ms Kernel: 37.89ms # GPU1 DRAM内 D2D的拷贝 2*100.55GB/s
void dummyKernel<1>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed % 0.01
dram__bytes_read.sum.per_second Mbyte/s 37.35
dram__bytes_write.sum.pct_of_peak_sustained_elapsed % 4.72
dram__bytes_write.sum.per_second Gbyte/s 15.10
pcie__read_bytes.sum.per_second Gbyte/s 15.01
pcie__write_bytes.sum.per_second Gbyte/s 3.34
--------------------------------------------------- ----------- ------------
void dummyKernel<2>(float *, float *) (1000000, 1, 1)x(1024, 1, 1), Context 2, Stream 34, Device 7, CC 7.5
Section: Command line profiler metrics
--------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
dram__bytes_read.sum.pct_of_peak_sustained_elapsed % 37.34 #同时读写时,利用率加起来75%
dram__bytes_read.sum.per_second Gbyte/s 119.41
dram__bytes_write.sum.pct_of_peak_sustained_elapsed % 37.81
dram__bytes_write.sum.per_second Gbyte/s 120.89 #读写加起来约240GB/s,与后面带宽测试的239.4GB/s基本一致
pcie__read_bytes.sum.per_second Mbyte/s 22.03
pcie__write_bytes.sum.per_second Mbyte/s 7.42
--------------------------------------------------- ----------- ------------
3.GPU带宽测试
git clone https://www.github.com/nvidia/cuda-samples
cd cuda-samples/Samples/1_Utilities/deviceQuery
make clean && make
./deviceQuery
cd ../bandwidthTest/
make clean && make
./bandwidthTest --device=0
- 输出
Running on...
Device 0: Tesla T4
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 12.8
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 13.1
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 239.4
Result = PASS