系统软件层

Triton 通过使用 CUDA Driver API 与底层 GPU 进行交互。具体流程如下：

Triton 生成的代码将被编译为 PTX（Parallel Thread Execution）代码，用于 NVIDIA GPU。
通过 CUDA Driver API（例如 cuModuleLoad, cuLaunchKernel 等）来加载和执行这些 PTX 代码。

使用 CUDA Driver API 来进行简单的 GPU 内存分配和向量加法计算，示例代码如下：

PTX 文件：

__global__ void vecAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

程序：

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK_CUDA_RESULT(res, msg) \
    if (res != CUDA_SUCCESS) { \
        printf("Error: %s, CUDA result: %d\n", msg, res); \
        return -1; \
    }

int main() {
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunction;
    CUresult res;
  
    int N = 512;
    size_t size = N * sizeof(float);
  
    // Initialize the CUDA Driver API
    res = cuInit(0);
    CHECK_CUDA_RESULT(res, "cuInit failed");
  
    // Get the device and create a context
    res = cuDeviceGet(&cuDevice, 0);
    CHECK_CUDA_RESULT(res, "cuDeviceGet failed");

    res = cuCtxCreate(&cuContext, 0, cuDevice);
    CHECK_CUDA_RESULT(res, "cuCtxCreate failed");
  
    // Allocate device memory
    CUdeviceptr d_A, d_B, d_C;
    res = cuMemAlloc(&d_A, size);
    CHECK_CUDA_RESULT(res, "cuMemAlloc for A failed");

    res = cuMemAlloc(&d_B, size);
    CHECK_CUDA_RESULT(res, "cuMemAlloc for B failed");

    res = cuMemAlloc(&d_C, size);
    CHECK_CUDA_RESULT(res, "cuMemAlloc for C failed");
  
    // Initialize host arrays and copy data to device
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
  
    for (int i = 0; i < N; ++i) {
        h_A[i] = i; // Example values
        h_B[i] = i; // Example values
    }
  
    // Copy data from host to device
    res = cuMemcpyHtoD(d_A, h_A, size);
    CHECK_CUDA_RESULT(res, "cuMemcpyHtoD for A failed");
  
    res = cuMemcpyHtoD(d_B, h_B, size);
    CHECK_CUDA_RESULT(res, "cuMemcpyHtoD for B failed");
  
    // Load the compiled PTX module and get the kernel function
    const char *kernel_file = "add.ptx"; // Precompiled PTX file
    res = cuModuleLoad(&cuModule, kernel_file);
    CHECK_CUDA_RESULT(res, "cuModuleLoad failed");

    res = cuModuleGetFunction(&cuFunction, cuModule, "_Z6vecAddPfS_S_i"); // Adjust if needed
    CHECK_CUDA_RESULT(res, "cuModuleGetFunction failed");
  
    // Set kernel parameters and launch the kernel
    void *args[] = { &d_A, &d_B, &d_C, &N };
    res = cuLaunchKernel(
        cuFunction,        // Kernel to launch
        (N + 255) / 256, 1, 1,     // Grid dimensions
        256, 1, 1,         // Block dimensions
        0,                 // Shared memory size
        0,                 // Stream
        args,              // Kernel arguments
        NULL               // Extra options
    );
    CHECK_CUDA_RESULT(res, "cuLaunchKernel failed");
  
    // Synchronize to ensure kernel execution is complete
    res = cuCtxSynchronize();
    CHECK_CUDA_RESULT(res, "cuCtxSynchronize failed");
  
    // Copy the result back to the host
    res = cuMemcpyDtoH(h_C, d_C, size);
    CHECK_CUDA_RESULT(res, "cuMemcpyDtoH for C failed");
  
    // Print the results
    for (int i = 0; i < N; ++i) {
        printf("%f + %f = %f\n", h_A[i], h_B[i], h_C[i]);
    }
  
    // Free device memory and destroy the context
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);
  
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

结果：

0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 2.000000
2.000000 + 2.000000 = 4.000000
3.000000 + 3.000000 = 6.000000
4.000000 + 4.000000 = 8.000000
5.000000 + 5.000000 = 10.000000
...
510.000000 + 510.000000 = 1020.000000
511.000000 + 511.000000 = 1022.000000

AI技术栈解析及应用- 作者：张真瑜 | 山东大学智能创新研究院

系统软件层