系统软件层

以下示例使用 MUSA 驱动 API 执行向量加法,主要流程包括设备初始化、内存分配、模块加载、内核启动和结果处理等。主要步骤如下:

  • 设备初始化

  • 模块加载和内核函数获取

  • MUSA 流创建

  • 内存分配与数据初始化

  • 内核参数设置与内核启动

  • 结果传回主机

  • 资源释放

    示例代码:

#include <cstdio>
#include <cstdlib>

#include <musa.h>

int main() {
    const size_t numElements = 4096;
    const size_t sizeBytes = numElements * sizeof(int);

    int devCnt;
    MUctx_st* primaryCtx;
    muInit(0);
    muDeviceGetCount(&devCnt);
    muDevicePrimaryCtxRetain(&primaryCtx, 0);
    muCtxPushCurrent(primaryCtx);

    MUmodule module;
    MUfunction function;
    muModuleLoad(&module, "./VectorAdd.elf");
    muModuleGetFunction(&function, module, "_Z9VectorAddPiS_");

    MUstream stream;
    muStreamCreate(&stream, 0);

    int *hA = nullptr, *hB = nullptr;
    MUdeviceptr dA = 0, dB = 0;
    hA = reinterpret_cast<int*>(malloc(sizeBytes));
    hB = reinterpret_cast<int*>(malloc(sizeBytes));
    muMemAlloc(&dA, sizeBytes);
    muMemAlloc(&dB, sizeBytes);

    for (int i = 0; i < numElements; ++i) {
        hA[i] = i;
        hB[i] = 2 * i;
    }

    muMemcpyHtoD(dA, hA, sizeBytes);
    muMemcpyHtoD(dB, hB, sizeBytes);

    int threadsPerBlock = 1024;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    struct KernArg {
        void *A, *B;
    };
    
    KernArg kernArg = { reinterpret_cast<void*>(dA), reinterpret_cast<void*>(dB) };
    size_t kernArgSize = sizeof(kernArg);

    void* extra[] = {
        MU_LAUNCH_PARAM_BUFFER_POINTER, &kernArg,
        MU_LAUNCH_PARAM_BUFFER_SIZE, &kernArgSize,
        MU_LAUNCH_PARAM_END
    };

    muLaunchKernel(function,
                   blocksPerGrid, 1, 1,  /* grid dim */
                   threadsPerBlock, 1, 1,  /* block dim */
                   0, stream, nullptr, extra);

    muMemcpyDtoH(hA, dA, sizeBytes);

    muModuleUnload(module);
    muStreamDestroy(stream);
    muMemFree(dA);
    muMemFree(dB);
    muDevicePrimaryCtxRelease(0);

    free(hA);
    free(hB);

    return 0;
}