系统软件层

Triton 通过使用 CUDA Driver API 与底层 GPU 进行交互。具体流程如下:

  • Triton 生成的代码将被编译为 PTX(Parallel Thread Execution)代码,用于 NVIDIA GPU。
  • 通过 CUDA Driver API(例如 cuModuleLoad, cuLaunchKernel 等)来加载和执行这些 PTX 代码。

使用 CUDA Driver API 来进行简单的 GPU 内存分配和向量加法计算,示例代码如下:

PTX 文件:

__global__ void vecAdd(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

程序:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK_CUDA_RESULT(res, msg) \
    if (res != CUDA_SUCCESS) { \
        printf("Error: %s, CUDA result: %d\n", msg, res); \
        return -1; \
    }

int main() {
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunction;
    CUresult res;
  
    int N = 512;
    size_t size = N * sizeof(float);
  
    // Initialize the CUDA Driver API
    res = cuInit(0);
    CHECK_CUDA_RESULT(res, "cuInit failed");
  
    // Get the device and create a context
    res = cuDeviceGet(&cuDevice, 0);
    CHECK_CUDA_RESULT(res, "cuDeviceGet failed");

    res = cuCtxCreate(&cuContext, 0, cuDevice);
    CHECK_CUDA_RESULT(res, "cuCtxCreate failed");
  
    // Allocate device memory
    CUdeviceptr d_A, d_B, d_C;
    res = cuMemAlloc(&d_A, size);
    CHECK_CUDA_RESULT(res, "cuMemAlloc for A failed");

    res = cuMemAlloc(&d_B, size);
    CHECK_CUDA_RESULT(res, "cuMemAlloc for B failed");

    res = cuMemAlloc(&d_C, size);
    CHECK_CUDA_RESULT(res, "cuMemAlloc for C failed");
  
    // Initialize host arrays and copy data to device
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
  
    for (int i = 0; i < N; ++i) {
        h_A[i] = i; // Example values
        h_B[i] = i; // Example values
    }
  
    // Copy data from host to device
    res = cuMemcpyHtoD(d_A, h_A, size);
    CHECK_CUDA_RESULT(res, "cuMemcpyHtoD for A failed");
  
    res = cuMemcpyHtoD(d_B, h_B, size);
    CHECK_CUDA_RESULT(res, "cuMemcpyHtoD for B failed");
  
    // Load the compiled PTX module and get the kernel function
    const char *kernel_file = "add.ptx"; // Precompiled PTX file
    res = cuModuleLoad(&cuModule, kernel_file);
    CHECK_CUDA_RESULT(res, "cuModuleLoad failed");

    res = cuModuleGetFunction(&cuFunction, cuModule, "_Z6vecAddPfS_S_i"); // Adjust if needed
    CHECK_CUDA_RESULT(res, "cuModuleGetFunction failed");
  
    // Set kernel parameters and launch the kernel
    void *args[] = { &d_A, &d_B, &d_C, &N };
    res = cuLaunchKernel(
        cuFunction,        // Kernel to launch
        (N + 255) / 256, 1, 1,     // Grid dimensions
        256, 1, 1,         // Block dimensions
        0,                 // Shared memory size
        0,                 // Stream
        args,              // Kernel arguments
        NULL               // Extra options
    );
    CHECK_CUDA_RESULT(res, "cuLaunchKernel failed");
  
    // Synchronize to ensure kernel execution is complete
    res = cuCtxSynchronize();
    CHECK_CUDA_RESULT(res, "cuCtxSynchronize failed");
  
    // Copy the result back to the host
    res = cuMemcpyDtoH(h_C, d_C, size);
    CHECK_CUDA_RESULT(res, "cuMemcpyDtoH for C failed");
  
    // Print the results
    for (int i = 0; i < N; ++i) {
        printf("%f + %f = %f\n", h_A[i], h_B[i], h_C[i]);
    }
  
    // Free device memory and destroy the context
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);
  
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

结果:

0.000000 + 0.000000 = 0.000000
1.000000 + 1.000000 = 2.000000
2.000000 + 2.000000 = 4.000000
3.000000 + 3.000000 = 6.000000
4.000000 + 4.000000 = 8.000000
5.000000 + 5.000000 = 10.000000
...
510.000000 + 510.000000 = 1020.000000
511.000000 + 511.000000 = 1022.000000