运行时环境层
在现代计算架构中,OpenCL Runtime 作为一个核心软件组件,承担着在不同平台和硬件设备上执行 OpenCL 程序的责任。它不仅为开发者提供了一系列用于管理计算设备的 API 和工具,还支持 OpenCL 程序的创建、编译与调度以及内存管理。本文将深入探讨 OpenCL Runtime 的各个功能模块,包括设备管理、上下文创建、内存管理、程序编译与执行、命令队列管理以及事件和同步机制。
在接下来的内容中,我们将通过一个示例程序来演示如何在 AMD 平台上使用 OpenCL Runtime 实现一个简单的向量加法操作。
代码示例如下:
#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
// Number of float elements in each host array and in each device buffer.
#define ARRAY_SIZE 1024
// OpenCL kernel code for vector addition
// The kernel source is stored as one C string literal (joined with
// backslash line continuations) and is passed to
// clCreateProgramWithSource() in main().
// Each work-item reads its global id in dimension 0 and stores
// A[id] + B[id] into C[id]. The kernel performs no bounds check on id,
// so the global work size must not exceed the length of the buffers.
const char* kernelSource = "__kernel void vec_add(__global float* A, __global float* B, __global float* C) { \
int id = get_global_id(0); \
C[id] = A[id] + B[id]; \
}";
/*
 * Vector-addition demo on the AMD OpenCL platform.
 *
 * Steps: enumerate platforms and pick the first one whose name contains
 * "AMD", take one GPU device from it, create a context, command queue,
 * program and kernel, upload A and B, run vec_add over ARRAY_SIZE
 * work-items, read C back and print the first 10 results.
 *
 * Returns 0 on success and -1 on any failure. Every exit goes through the
 * single cleanup section at the bottom, so OpenCL objects and the platform
 * list are released on error paths as well.
 *
 * All locals are declared (and initialised) before the first `goto` so the
 * jumps to `cleanup` never bypass an initialisation; this keeps the function
 * valid when the file is compiled as C++ too.
 */
int main() {
    int exit_code = -1;
    cl_int ret;
    cl_uint num_platforms = 0;
    cl_platform_id *platforms = NULL;
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue queue = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_mem buffer_A = NULL;
    cl_mem buffer_B = NULL;
    cl_mem buffer_C = NULL;
    const size_t buffer_bytes = ARRAY_SIZE * sizeof(float);
    size_t global_size = ARRAY_SIZE;
    char device_name[128] = "";
    // Arrays on the host
    float A[ARRAY_SIZE], B[ARRAY_SIZE], C[ARRAY_SIZE];

    for (int i = 0; i < ARRAY_SIZE; i++) {
        A[i] = (float)i;
        B[i] = (float)(i * 2);
    }

    // 1. Get the number of platforms
    ret = clGetPlatformIDs(0, NULL, &num_platforms);
    if (ret != CL_SUCCESS) {
        printf("Failed to get platform IDs\n");
        goto cleanup;
    }
    if (num_platforms == 0) {
        // Avoid a zero-byte allocation and an invalid second query.
        printf("No OpenCL platforms found\n");
        goto cleanup;
    }

    // The cast is kept so the statement also compiles as C++.
    platforms = (cl_platform_id *)malloc(num_platforms * sizeof(*platforms));
    if (platforms == NULL) {
        printf("Failed to allocate memory for platform list\n");
        goto cleanup;
    }

    ret = clGetPlatformIDs(num_platforms, platforms, NULL);
    if (ret != CL_SUCCESS) {
        printf("Failed to get platforms\n");
        goto cleanup;
    }

    // Try to find the AMD platform
    for (cl_uint i = 0; i < num_platforms; i++) {
        char platform_name[128] = "";

        ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME,
                                sizeof(platform_name), platform_name, NULL);
        if (ret != CL_SUCCESS) {
            // Skip platforms whose name cannot be queried instead of
            // inspecting an unfilled buffer.
            printf("Platform %u: <name unavailable, error code: %d>\n", i, ret);
            continue;
        }

        printf("Platform %u: %s\n", i, platform_name);
        if (strstr(platform_name, "AMD") != NULL) {
            platform_id = platforms[i];
            printf("Selected AMD platform: %s\n", platform_name);
            break;
        }
    }
    if (!platform_id) {
        printf("AMD platform not found\n");
        goto cleanup;
    }

    // 2. Get the GPU device from the selected AMD platform
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (ret != CL_SUCCESS) {
        printf("Failed to get GPU device ID from AMD platform, error code: %d\n", ret);
        goto cleanup;
    }

    // Print the selected device (informational only; failure is not fatal)
    ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
    if (ret == CL_SUCCESS) {
        printf("Selected device: %s\n", device_name);
    } else {
        printf("Failed to query device name, error code: %d\n", ret);
    }

    // 3. Create a context
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create context\n");
        goto cleanup;
    }
    printf("Context created successfully.\n");

    // 4. Create a command queue (default properties: in-order, no profiling)
    queue = clCreateCommandQueueWithProperties(context, device_id, NULL, &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create command queue\n");
        goto cleanup;
    }
    printf("Command queue created successfully.\n");

    // 5. Create a program from the kernel source
    program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create program\n");
        goto cleanup;
    }
    printf("Program created successfully.\n");

    // 6. Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS) {
        size_t log_size = 0;
        char *build_log = NULL;

        printf("Failed to build program\n");

        // Ask for the log length first: a fixed-size buffer makes the query
        // fail for long logs and would leave the buffer unfilled.
        if (clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                  0, NULL, &log_size) == CL_SUCCESS && log_size > 0) {
            build_log = (char *)malloc(log_size + 1);
        }
        if (build_log != NULL &&
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                  log_size, build_log, NULL) == CL_SUCCESS) {
            build_log[log_size] = '\0';
            printf("Build log:\n%s\n", build_log);
        } else {
            printf("Build log unavailable\n");
        }
        free(build_log);
        goto cleanup;
    }
    printf("Program built successfully.\n");

    // 7. Create the kernel
    kernel = clCreateKernel(program, "vec_add", &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create kernel\n");
        goto cleanup;
    }
    printf("Kernel created successfully.\n");

    // 8. Create buffers for the input and output arrays.
    // Each creation is checked on its own; a later success must not hide
    // an earlier failure.
    buffer_A = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_bytes, NULL, &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create buffers\n");
        goto cleanup;
    }
    buffer_B = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_bytes, NULL, &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create buffers\n");
        goto cleanup;
    }
    buffer_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_bytes, NULL, &ret);
    if (ret != CL_SUCCESS) {
        printf("Failed to create buffers\n");
        goto cleanup;
    }
    printf("Buffers created successfully.\n");

    // 9. Copy the input data to the respective memory buffers (blocking writes)
    ret = clEnqueueWriteBuffer(queue, buffer_A, CL_TRUE, 0, buffer_bytes, A, 0, NULL, NULL);
    if (ret == CL_SUCCESS) {
        ret = clEnqueueWriteBuffer(queue, buffer_B, CL_TRUE, 0, buffer_bytes, B, 0, NULL, NULL);
    }
    if (ret != CL_SUCCESS) {
        printf("Failed to write to buffers\n");
        goto cleanup;
    }
    printf("Data written to buffers successfully.\n");

    // 10. Set the kernel arguments (stop at the first failure)
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buffer_A);
    if (ret == CL_SUCCESS) {
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&buffer_B);
    }
    if (ret == CL_SUCCESS) {
        ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&buffer_C);
    }
    if (ret != CL_SUCCESS) {
        printf("Failed to set kernel arguments\n");
        goto cleanup;
    }
    printf("Kernel arguments set successfully.\n");

    // 11. Execute the kernel: one work-item per array element
    ret = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        printf("Failed to enqueue kernel\n");
        goto cleanup;
    }
    printf("Kernel enqueued successfully.\n");

    // 12. Read the output buffer back to the host.
    // The read is blocking (CL_TRUE), so C is filled when the call returns.
    ret = clEnqueueReadBuffer(queue, buffer_C, CL_TRUE, 0, buffer_bytes, C, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        printf("Failed to read from buffer\n");
        goto cleanup;
    }
    printf("Data read from buffer successfully.\n");

    // Output the results
    printf("Result:\n");
    for (int i = 0; i < 10; i++) {
        printf("C[%d] = %f\n", i, C[i]);
    }

    exit_code = 0;

cleanup:
    // 13. Clean up: release only what was actually created.
    if (buffer_A != NULL) {
        clReleaseMemObject(buffer_A);
    }
    if (buffer_B != NULL) {
        clReleaseMemObject(buffer_B);
    }
    if (buffer_C != NULL) {
        clReleaseMemObject(buffer_C);
    }
    if (kernel != NULL) {
        clReleaseKernel(kernel);
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
    }
    if (context != NULL) {
        clReleaseContext(context);
    }
    free(platforms);

    if (exit_code == 0) {
        printf("Resources released successfully.\n");
    }
    return exit_code;
}
结果:
Platform 0: Intel(R) OpenCL
Platform 1: AMD Accelerated Parallel Processing
Selected AMD platform: AMD Accelerated Parallel Processing
Selected device: AMD Radeon RX 7900 XTX
Context created successfully.
Command queue created successfully.
Program created successfully.
Program built successfully.
Kernel created successfully.
Buffers created successfully.
Data written to buffers successfully.
Kernel arguments set successfully.
Kernel enqueued successfully.
Data read from buffer successfully.
Result:
C[0] = 0.000000
C[1] = 3.000000
C[2] = 6.000000
C[3] = 9.000000
C[4] = 12.000000
C[5] = 15.000000
C[6] = 18.000000
C[7] = 21.000000
C[8] = 24.000000