系统软件层
下面的示例代码使用 MUSA 驱动 API 执行向量加法，主要流程包括设备初始化、内存分配、模块加载、内核启动和结果处理等。主要步骤：
- 设备初始化
- 模块加载和内核函数获取
- MUSA 流创建
- 内存分配与数据初始化
- 内核参数设置与内核启动
- 结果传回主机
- 资源释放
示例代码:
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <musa.h>
int main() {
const size_t numElements = 4096;
const size_t sizeBytes = numElements * sizeof(int);
int devCnt;
MUctx_st* primaryCtx;
muInit(0);
muDeviceGetCount(&devCnt);
muDevicePrimaryCtxRetain(&primaryCtx, 0);
muCtxPushCurrent(primaryCtx);
MUmodule module;
MUfunction function;
muModuleLoad(&module, "./VectorAdd.elf");
muModuleGetFunction(&function, module, "_Z9VectorAddPiS_");
MUstream stream;
muStreamCreate(&stream, 0);
int *hA = nullptr, *hB = nullptr;
MUdeviceptr dA = 0, dB = 0;
hA = reinterpret_cast<int*>(malloc(sizeBytes));
hB = reinterpret_cast<int*>(malloc(sizeBytes));
muMemAlloc(&dA, sizeBytes);
muMemAlloc(&dB, sizeBytes);
for (int i = 0; i < numElements; ++i) {
hA[i] = i;
hB[i] = 2 * i;
}
muMemcpyHtoD(dA, hA, sizeBytes);
muMemcpyHtoD(dB, hB, sizeBytes);
int threadsPerBlock = 1024;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
struct KernArg {
void *A, *B;
};
KernArg kernArg = { reinterpret_cast<void*>(dA), reinterpret_cast<void*>(dB) };
size_t kernArgSize = sizeof(kernArg);
void* extra[] = {
MU_LAUNCH_PARAM_BUFFER_POINTER, &kernArg,
MU_LAUNCH_PARAM_BUFFER_SIZE, &kernArgSize,
MU_LAUNCH_PARAM_END
};
muLaunchKernel(function,
blocksPerGrid, 1, 1, /* grid dim */
threadsPerBlock, 1, 1, /* block dim */
0, stream, nullptr, extra);
muMemcpyDtoH(hA, dA, sizeBytes);
muModuleUnload(module);
muStreamDestroy(stream);
muMemFree(dA);
muMemFree(dB);
muDevicePrimaryCtxRelease(0);
free(hA);
free(hB);
return 0;
}