Setting up cuTENSOR 1.6.2 on Ubuntu 22.04 with CUDA 12.x
Running the cuTENSOR 1.6.2 samples on Ubuntu 22.04 with CUDA 12.x
cuTENSOR 1.6.2 is a fairly old release, but NVIDIA kept shipping builds of it for newer CUDA platforms, so it can still be used on top of the CUDA 12 SDK.
1. Download libcutensor 1.6.2
Download cuTENSOR 1.6.2; a single archive covers all supported Linux distributions and CUDA versions:
https://developer.nvidia.com/cutensor/1.6.2/downloads
wget https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-1.6.2.3-archive.tar.xz
tar xf libcutensor-linux-x86_64-1.6.2.3-archive.tar.xz
ls lib..../lib/
10.2/  11/  11.0/  12/
Each subdirectory contains the library built against the matching CUDA version; on CUDA 12.x you link against lib/12.
2. Run the samples
The cuTENSOR 2.x API is a rewrite; for example, cutensorInit(&handle) was renamed (2.x creates an opaque handle with cutensorCreate instead). The samples therefore have to be built from the older cuTENSOR 1.x code in NVIDIA's CUDALibrarySamples repository. A sketch of the difference follows, then the Makefile used here:
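For orientation only, this is roughly how handle setup differs between the two major versions (the 2.x lines are shown purely for comparison and are not used anywhere below):

// cuTENSOR 1.x -- what the samples in this post use:
cutensorHandle_t handle;
cutensorInit(&handle);          // handle is a plain struct, initialized in place

// cuTENSOR 2.x -- for comparison only:
// cutensorHandle_t handle;     // now an opaque pointer type
// cutensorCreate(&handle);     // allocates and initializes the handle
// ...
// cutensorDestroy(handle);     // must be released explicitly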
Makefile:
CUTENSOR_ROOT := /home/hipper/cutensor_ex/libcutensor-linux-x86_64-1.6.2.3
CXX_FLAGS = -std=c++11 -I${CUTENSOR_ROOT}/include -L${CUTENSOR_ROOT}/lib/12 -lcutensor -lcudart

all:
	nvcc einsum.cu -o einsum ${CXX_FLAGS}
	nvcc contraction.cu -o contraction ${CXX_FLAGS}
	nvcc contraction_simple.cu -o contraction_simple ${CXX_FLAGS}
	nvcc contraction_autotuning.cu -o contraction_autotuning ${CXX_FLAGS}
	nvcc elementwise_binary.cu -o elementwise_binary ${CXX_FLAGS}
	nvcc elementwise_permute.cu -o elementwise_permute ${CXX_FLAGS}
	nvcc elementwise_trinary.cu -o elementwise_trinary ${CXX_FLAGS}
	nvcc reduction.cu -o reduction ${CXX_FLAGS}

clean:
	rm -f einsum contraction contraction_simple contraction_autotuning elementwise_binary elementwise_permute elementwise_trinary reduction
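Two notes on the Makefile: make requires each recipe line under all: and clean: to start with a real tab character, and CUTENSOR_ROOT must point at wherever you extracted the archive. With those in place, running make builds all eight samples against the CUDA 12 build of the library in lib/12.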
contraction_simple.cu
#include <stdlib.h>
#include <stdio.h>
#include <unordered_map>
#include <vector>
#include <cuda_runtime.h>
#include <cutensor.h>
#define HANDLE_ERROR(x)                                               \
{ const auto err = x;                                                 \
  if( err != CUTENSOR_STATUS_SUCCESS )                                \
  { printf("Error: %s\n", cutensorGetErrorString(err)); return err; } \
};

#define HANDLE_CUDA_ERROR(x)                                          \
{ const auto err = x;                                                 \
  if( err != cudaSuccess )                                            \
  { printf("Error: %s\n", cudaGetErrorString(err)); return err; }     \
};
/* This routine computes the tensor contraction \f[ D = alpha * A * B + beta * C \f] using the staged-API */
cutensorStatus_t cutensorContractionSimple(const cutensorHandle_t* handle,
        const void* alpha, const void *A, const cutensorTensorDescriptor_t* descA, const int32_t modeA[],
                           const void *B, const cutensorTensorDescriptor_t* descB, const int32_t modeB[],
        const void* beta,  const void *C, const cutensorTensorDescriptor_t* descC, const int32_t modeC[],
                                 void *D, const cutensorTensorDescriptor_t* descD, const int32_t modeD[],
        cutensorComputeType_t typeCompute, cutensorAlgo_t algo, cutensorWorksizePreference_t workPref,
        cudaStream_t stream)
{
    /**********************************************
     * Retrieve the memory alignment for each tensor
     **********************************************/
    uint32_t alignmentRequirementA;
    HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                 A, descA, &alignmentRequirementA));

    uint32_t alignmentRequirementB;
    HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                 B, descB, &alignmentRequirementB));

    uint32_t alignmentRequirementC;
    HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                 C, descC, &alignmentRequirementC));

    uint32_t alignmentRequirementD;
    HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                 D, descD, &alignmentRequirementD));

    /*******************************
     * Create Contraction Descriptor
     *******************************/
    cutensorContractionDescriptor_t desc;
    HANDLE_ERROR(cutensorInitContractionDescriptor(handle,
                 &desc,
                 descA, modeA, alignmentRequirementA,
                 descB, modeB, alignmentRequirementB,
                 descC, modeC, alignmentRequirementC,
                 descD, modeD, alignmentRequirementD,
                 typeCompute));

    /**************************
     * Set the algorithm to use
     ***************************/
    cutensorContractionFind_t find;
    HANDLE_ERROR(cutensorInitContractionFind(
                 handle, &find,
                 algo));

    /**********************
     * Query workspace
     **********************/
    size_t worksize = 0;
    HANDLE_ERROR(cutensorContractionGetWorkspaceSize(handle,
                 &desc,
                 &find,
                 workPref, &worksize));

    // If the workspace allocation fails, fall back to running without one.
    void *work = nullptr;
    if (worksize > 0)
    {
        if (cudaSuccess != cudaMalloc(&work, worksize))
        {
            work = nullptr;
            worksize = 0;
        }
    }

    /**************************
     * Create Contraction Plan
     **************************/
    cutensorContractionPlan_t plan;
    HANDLE_ERROR(cutensorInitContractionPlan(handle,
                 &plan,
                 &desc,
                 &find,
                 worksize));

    /**********************
     * Run
     **********************/
    // alpha and beta already arrive as pointers to the host scalars,
    // so they are forwarded directly.
    HANDLE_ERROR(cutensorContraction(handle,
                 &plan,
                 alpha, A, B,
                 beta,  C, D,
                 work, worksize, stream));

    if (work) cudaFree(work);

    return CUTENSOR_STATUS_SUCCESS;
}
int main()
{
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    cudaDataType_t typeA = CUDA_R_32F;
    cudaDataType_t typeB = CUDA_R_32F;
    cudaDataType_t typeC = CUDA_R_32F;
    cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32F;

    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta  = (floatTypeCompute) 0.f;

    /**********************
     * Computing: C_{m,u,n,v} = alpha * A_{m,h,k,n} B_{u,k,v,h} + beta * C_{m,u,n,v}
     **********************/
    std::vector<int> modeC{'m','u','n','v'};
    std::vector<int> modeA{'m','h','k','n'};
    std::vector<int> modeB{'u','k','v','h'};
    int nmodeA = modeA.size();
    int nmodeB = modeB.size();
    int nmodeC = modeC.size();

    std::unordered_map<int, int64_t> extent;
    extent['m'] = 96;
    extent['n'] = 96;
    extent['u'] = 96;
    extent['v'] = 64;
    extent['h'] = 64;
    extent['k'] = 64;

    // FLOP count of this contraction (useful if you add timing).
    double gflops = (2.0 * extent['m'] * extent['n'] * extent['u'] * extent['v'] * extent['k'] * extent['h']) / 1e9;

    std::vector<int64_t> extentC;
    for (auto mode : modeC)
        extentC.push_back(extent[mode]);
    std::vector<int64_t> extentA;
    for (auto mode : modeA)
        extentA.push_back(extent[mode]);
    std::vector<int64_t> extentB;
    for (auto mode : modeB)
        extentB.push_back(extent[mode]);

    /**********************
     * Allocating data
     **********************/
    size_t elementsA = 1;
    for (auto mode : modeA)
        elementsA *= extent[mode];
    size_t elementsB = 1;
    for (auto mode : modeB)
        elementsB *= extent[mode];
    size_t elementsC = 1;
    for (auto mode : modeC)
        elementsC *= extent[mode];

    size_t sizeA = sizeof(floatTypeA) * elementsA;
    size_t sizeB = sizeof(floatTypeB) * elementsB;
    size_t sizeC = sizeof(floatTypeC) * elementsC;
    printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);

    void *A_d, *B_d, *C_d;
    HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));
    HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));
    HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));

    floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
    floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
    floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: Host allocation of A, B, or C.\n");
        return -1;
    }

    /*******************
     * Initialize data
     *******************/
    for (size_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
    for (size_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
    for (size_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

    HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

    /*************************
     * cuTENSOR
     *************************/
    cutensorHandle_t handle;
    HANDLE_ERROR(cutensorInit(&handle));

    /**********************
     * Create Tensor Descriptors
     **********************/
    cutensorTensorDescriptor_t descA;
    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
                 &descA,
                 nmodeA,
                 extentA.data(),
                 NULL /* stride */,
                 typeA, CUTENSOR_OP_IDENTITY));

    cutensorTensorDescriptor_t descB;
    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
                 &descB,
                 nmodeB,
                 extentB.data(),
                 NULL /* stride */,
                 typeB, CUTENSOR_OP_IDENTITY));

    cutensorTensorDescriptor_t descC;
    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
                 &descC,
                 nmodeC,
                 extentC.data(),
                 NULL /* stride */,
                 typeC, CUTENSOR_OP_IDENTITY));

    HANDLE_ERROR(cutensorContractionSimple(&handle,
                 (void*)&alpha, A_d, &descA, modeA.data(),
                                B_d, &descB, modeB.data(),
                 (void*)&beta,  C_d, &descC, modeC.data(),
                                C_d, &descC, modeC.data(),
                 typeCompute, CUTENSOR_ALGO_DEFAULT,
                 CUTENSOR_WORKSPACE_RECOMMENDED, 0 /* stream */));

    // Release host and device buffers.
    if (A) free(A);
    if (B) free(B);
    if (C) free(C);
    if (A_d) cudaFree(A_d);
    if (B_d) cudaFree(B_d);
    if (C_d) cudaFree(C_d);

    return 0;
}
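For these extents the contraction performs 2 * 96 * 96 * 96 * 64 * 64 * 64 ≈ 4.6 x 10^11 floating-point operations (about 464 GFLOP), which is exactly what the gflops variable records. Note also that C_d and descC are passed for both the C and D arguments, so the contraction writes its result back into C in place.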
Run:
export LD_LIBRARY_PATH=/home/hipper/cutensor_ex/libcutensor-linux-x86_64-1.6.2.3/lib/12
./contraction_simple
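With the extents above, contraction_simple should print a line close to "Total memory: 0.45 GiB" and exit silently on success. To double-check that the intended library build is actually being loaded, a minimal version probe can be compiled the same way as the samples. This is a sketch, not part of CUDALibrarySamples; it assumes only cutensor.h and cutensorGetVersion(), which exists in cuTENSOR 1.x, and the usual major*10000 + minor*100 + patch version encoding (10602 for 1.6.2):

// version_check.cu -- hypothetical helper, built like the other samples:
//   nvcc version_check.cu -o version_check ${CXX_FLAGS}
#include <stdio.h>
#include <cutensor.h>

int main()
{
    // Returns the cuTENSOR version as a single integer,
    // e.g. 10602 is expected for release 1.6.2.
    size_t ver = cutensorGetVersion();
    printf("cuTENSOR version: %zu\n", ver);
    return 0;
}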