Compute Library Layer
cuBLAS is NVIDIA's high-performance linear algebra library, optimized for the CUDA platform. It supports the standard set of basic linear algebra operations, such as matrix multiplication, vector operations, and matrix factorizations. By exploiting the GPU's parallel compute capability through efficient memory-access patterns and automatically tuned kernels, cuBLAS can substantially accelerate matrix computations.
Reference repository: cuda-samples
For example, a matrix multiplication (GEMM) can be carried out through cuBLAS's straightforward interface.
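The single-precision entry point used below, cublasSgemm, implements the standard BLAS GEMM operation

C = \alpha \,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C

on column-major matrices, where op(X) is X or X^T depending on the transpose flags. Because the host buffers in this sample are filled in row-major order, the code passes B before A: in the column-major view this computes (AB)^T, so the bytes written to d_C are exactly the row-major A·B.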
The example code is as follows:
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
// cuBLAS library
#include <cublas_v2.h>
void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}
/**
 * Run a simple test of matrix multiplication using cuBLAS
 */
int MatrixMultiply(int argc, char **argv,
                   const dim3 &dimsA,
                   const dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
  cudaStream_t stream;

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C;
  checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));

  // Allocate CUDA events that we'll use for timing
  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));
  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

  // Copy host memory to device
  checkCudaErrors(
      cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
  checkCudaErrors(
      cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

  // Create the cuBLAS handle outside the timed region and bind it to the
  // same stream the events are recorded on, so the events actually bracket
  // the GEMM calls rather than work on the default stream
  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasSetStream(handle, stream);
  const float alpha = 1.0f;
  const float beta = 0.0f;

  // Record the start event
  checkCudaErrors(cudaEventRecord(start, stream));

  // Execute the cuBLAS matrix multiplication nIter times.
  // cuBLAS expects column-major storage, so the operands are swapped:
  // in the column-major view this computes (A*B)^T, which is the
  // row-major A*B in d_C.
  int nIter = 300;
  for (int j = 0; j < nIter; j++) {
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                dimsB.x, dimsA.y, dimsA.x,
                &alpha,
                d_B, dimsB.x,
                d_A, dimsA.x,
                &beta,
                d_C, dimsB.x);
  }

  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, stream));
  checkCudaErrors(cudaEventSynchronize(stop));

  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  printf("cuBLAS Performance= %.2f GFlop/s, Time= %.3f msec\n",
         gigaFlops, msecPerMatrixMul);

  // Copy result from device to host
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  cublasDestroy(handle);

  // Clean up memory
  checkCudaErrors(cudaFreeHost(h_A));
  checkCudaErrors(cudaFreeHost(h_B));
  checkCudaErrors(cudaFreeHost(h_C));
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  return EXIT_SUCCESS;
}
int main(int argc, char **argv) {
  printf("[Matrix Multiply Using cuBLAS] - Starting...\n");

  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
      checkCmdLineFlag(argc, (const char **)argv, "?")) {
    printf("Usage -device=n (n >= 0 for deviceID)\n");
    printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
    printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
    printf("  Note: Inner matrix dimensions of A & B matrices" \
           " must be equal.\n");
    exit(EXIT_SUCCESS);
  }

  dim3 dimsA(320, 320, 1);
  dim3 dimsB(320, 320, 1);

  // Width of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
    dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
  }
  // Height of Matrix A
  if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
    dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
  }
  // Width of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
    dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
  }
  // Height of Matrix B
  if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
    dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
  }

  // The width (columns) of A must match the height (rows) of B for A * B
  if (dimsA.x != dimsB.y) {
    printf("Error: inner matrix dimensions must be equal. (%d != %d)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
         dimsB.x, dimsB.y);

  checkCudaErrors(cudaProfilerStart());
  int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB);
  checkCudaErrors(cudaProfilerStop());

  exit(matrix_result);
}
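One way to build and run the sample (a sketch only: the include path to the cuda-samples Common/ headers and the source file name matrixMulCUBLAS.cu are assumptions, adjust them to your checkout):

nvcc -I<path-to-cuda-samples>/Common -o matrixMulCUBLAS matrixMulCUBLAS.cu -lcublas
./matrixMulCUBLAS -wA=320 -hA=320 -wB=320 -hB=320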
Result:
[Matrix Multiply Using cuBLAS] - Starting...
MatrixA(320,320), MatrixB(320,320)
cuBLAS Performance= 1752.85 GFlop/s, Time= 0.037 msec
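As a sanity check on the printed figures: each 320x320x320 GEMM performs 2 * 320^3 ≈ 6.55 x 10^7 floating-point operations, and 6.55 x 10^7 FLOPs / 0.037 ms ≈ 1.77 x 10^12 FLOP/s, consistent (up to rounding of the time) with the reported ~1752.85 GFlop/s. Absolute numbers will of course vary with the GPU.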
Reference repository: sample2
Matrix addition code example
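The addition is performed with cublasDgeam, which computes the scaled sum

C = \alpha \,\mathrm{op}(A) + \beta \,\mathrm{op}(B)

With alpha = 1.0 and beta = 2.0 as set below, the result is C = A + 2B.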
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "cublas_utils.h"
using data_type = double;
int main(int argc, char *argv[]) {
  cublasHandle_t cublasH = NULL;
  cudaStream_t stream = NULL;

  const int m = 2;
  const int n = 2;
  const int k = 2;
  const int lda = 2;
  const int ldb = 2;
  const int ldc = 2;
  /*
   *   A = | 1.0 | 2.0 |
   *       | 3.0 | 4.0 |
   *
   *   B = | 5.0 | 6.0 |
   *       | 7.0 | 8.0 |
   */
  const std::vector<data_type> A = {1.0, 3.0, 2.0, 4.0};
  const std::vector<data_type> B = {5.0, 7.0, 6.0, 8.0};
  std::vector<data_type> C(m * n);
  const data_type alpha = 1.0;
  const data_type beta = 2.0;

  data_type *d_A = nullptr;
  data_type *d_B = nullptr;
  data_type *d_C = nullptr;

  cublasOperation_t transa = CUBLAS_OP_N;
  cublasOperation_t transb = CUBLAS_OP_N;

  printf("A\n");
  print_matrix(m, k, A.data(), lda);
  printf("=====\n");
  printf("B\n");
  print_matrix(k, n, B.data(), ldb);
  printf("=====\n");

  /* step 1: create cublas handle, bind a stream */
  CUBLAS_CHECK(cublasCreate(&cublasH));
  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUBLAS_CHECK(cublasSetStream(cublasH, stream));

  /* step 2: copy data to device */
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(data_type) * A.size()));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(data_type) * B.size()));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_C), sizeof(data_type) * C.size()));
  CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(), cudaMemcpyHostToDevice,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(d_B, B.data(), sizeof(data_type) * B.size(), cudaMemcpyHostToDevice,
                             stream));

  /* step 3: compute */
  CUBLAS_CHECK(
      cublasDgeam(cublasH, transa, transb, m, n, &alpha, d_A, lda, &beta, d_B, ldb, d_C, ldc));

  /* step 4: copy data to host */
  CUDA_CHECK(cudaMemcpyAsync(C.data(), d_C, sizeof(data_type) * C.size(), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  /*
   *   C = | 11.0 | 14.0 |
   *       | 17.0 | 20.0 |
   */
  printf("C\n");
  print_matrix(m, n, C.data(), ldc);
  printf("=====\n");

  /* free resources */
  CUDA_CHECK(cudaFree(d_A));
  CUDA_CHECK(cudaFree(d_B));
  CUDA_CHECK(cudaFree(d_C));

  CUBLAS_CHECK(cublasDestroy(cublasH));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaDeviceReset());

  return EXIT_SUCCESS;
}
Compile command
nvcc [include path] -o <output binary> <source file> [libraries to link]
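For instance (a sketch; the utils include path and file names are assumptions based on the CUDALibrarySamples layout, which provides cublas_utils.h):

nvcc -I/path/to/CUDALibrarySamples/cuBLAS/utils -o cublas_geam_example cublas_geam_example.cu -lcublas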
Result
A
1.00 2.00
3.00 4.00
=====
B
5.00 6.00
7.00 8.00
=====
C
11.00 14.00
17.00 20.00
=====
Reference repository: sample3
Vector Euclidean norm (nrm2) code example
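cublasNrm2Ex computes the Euclidean norm of a vector,

\|x\|_2 = \sqrt{\textstyle\sum_i x_i^2}

For x = (1, 2, 3, 4) this is sqrt(1 + 4 + 9 + 16) = sqrt(30) ≈ 5.48, which is the value printed below.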
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "cublas_utils.h"
using data_type = double;
int main(int argc, char *argv[]) {
  cublasHandle_t cublasH = NULL;
  cudaStream_t stream = NULL;

  /*
   *   A = | 1.0 2.0 3.0 4.0 |
   */
  std::vector<data_type> A = {1.0, 2.0, 3.0, 4.0};
  const int incx = 1;

  data_type result = 0.0;

  data_type *d_A = nullptr;

  printf("A\n");
  print_vector(A.size(), A.data());
  printf("=====\n");

  /* step 1: create cublas handle, bind a stream */
  CUBLAS_CHECK(cublasCreate(&cublasH));
  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUBLAS_CHECK(cublasSetStream(cublasH, stream));

  /* step 2: copy data to device */
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(data_type) * A.size()));
  CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(), cudaMemcpyHostToDevice,
                             stream));

  /* step 3: compute */
  CUBLAS_CHECK(cublasNrm2Ex(cublasH, A.size(), d_A, traits<data_type>::cuda_data_type, incx,
                            &result, traits<data_type>::cuda_data_type,
                            traits<data_type>::cuda_data_type));

  /* step 4: copy data to host */
  CUDA_CHECK(cudaMemcpyAsync(A.data(), d_A, sizeof(data_type) * A.size(), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  /*
   *   Result = 5.48
   */
  printf("Result\n");
  printf("%0.2f\n", result);
  printf("=====\n");

  /* free resources */
  CUDA_CHECK(cudaFree(d_A));

  CUBLAS_CHECK(cublasDestroy(cublasH));
  CUDA_CHECK(cudaStreamDestroy(stream));
  CUDA_CHECK(cudaDeviceReset());

  return EXIT_SUCCESS;
}
Compile command
nvcc [include path] -o <output binary> <source file> [libraries to link]
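For instance (again a sketch, with assumed paths and file names):

nvcc -I/path/to/CUDALibrarySamples/cuBLAS/utils -o cublas_nrm2_example cublas_nrm2_example.cu -lcublas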
Result
A
1.00 2.00 3.00 4.00
=====
Result
5.48
=====