Compute Library Layer
muBLAS is a basic linear algebra library built on top of MUSA. It has been deeply optimized for MTGPU and is widely used in AI and HPC workloads. Following the standard BLAS convention, muBLAS functions are grouped by computational complexity into three levels: Level 1 handles scalar, vector, and vector-vector operations; Level 2 handles vector-matrix operations; Level 3 handles matrix-matrix operations. The complete program below demonstrates the Level 1 routine mublasSaxpy, which computes B = alpha * A + B:
#include <cstdio>
#include <cstdlib>
#include <mublas.h>
#include <musa_runtime.h>
#include <vector>

int main(int argc, char* argv[])
{
    mublasHandle_t mublasH = NULL;
    musaStream_t stream = NULL;
    /*
     * A = | 1.0 2.0 3.0 4.0 |
     * B = | 5.0 6.0 7.0 8.0 |
     */
    const std::vector<float> A = {1.0, 2.0, 3.0, 4.0};
    std::vector<float> B = {5.0, 6.0, 7.0, 8.0};
    const float alpha = 2.1;
    const int incx = 1;
    const int incy = 1;
    float* d_A = nullptr;
    float* d_B = nullptr;

    /* step 1: create mublas handle, bind a stream */
    mublasCreate(&mublasH);
    musaStreamCreateWithFlags(&stream, musaStreamNonBlocking);
    mublasSetStream(mublasH, stream);

    /* step 2: copy data to device */
    musaMalloc(reinterpret_cast<void**>(&d_A), sizeof(float) * A.size());
    musaMalloc(reinterpret_cast<void**>(&d_B), sizeof(float) * B.size());
    musaMemcpyAsync(d_A, A.data(), sizeof(float) * A.size(), musaMemcpyHostToDevice, stream);
    musaMemcpyAsync(d_B, B.data(), sizeof(float) * B.size(), musaMemcpyHostToDevice, stream);

    /* step 3: compute B = alpha * A + B */
    mublasSaxpy(mublasH, static_cast<int>(A.size()), &alpha, d_A, incx, d_B, incy);

    /* step 4: copy the result back to host */
    musaMemcpyAsync(B.data(), d_B, sizeof(float) * B.size(), musaMemcpyDeviceToHost, stream);
    musaStreamSynchronize(stream);

    /*
     * B = | 7.10 10.20 13.30 16.40 |   (e.g. B[0] = 2.1 * 1.0 + 5.0 = 7.1)
     */
    printf("B\n");
    for (size_t i = 0; i < B.size(); i++) {
        printf("%f ", B[i]);
    }
    printf("\n");

    /* free resources */
    musaFree(d_A);
    musaFree(d_B);
    mublasDestroy(mublasH);
    musaStreamDestroy(stream);
    musaDeviceReset();
    return EXIT_SUCCESS;
}
Output:
B
7.100000 10.200000 13.300000 16.400000
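Note that both examples in this section call the runtime and muBLAS APIs without checking return values, to keep the listings short. In real code every call returns a status that should be inspected. Below is a minimal sketch of two checking macros; the names musaError_t, musaSuccess, musaGetErrorString, mublasStatus_t, and MUBLAS_STATUS_SUCCESS are assumed to mirror their CUDA-runtime/cuBLAS counterparts and should be verified against your toolkit headers:
#include <cstdio>
#include <cstdlib>
#include <mublas.h>
#include <musa_runtime.h>

/* Assumption: the MUSA runtime exposes musaError_t / musaSuccess /
 * musaGetErrorString, and muBLAS exposes mublasStatus_t /
 * MUBLAS_STATUS_SUCCESS, mirroring their CUDA counterparts. */
#define MUSA_CHECK(call)                                             \
    do {                                                             \
        musaError_t err_ = (call);                                   \
        if (err_ != musaSuccess) {                                   \
            fprintf(stderr, "MUSA error %s at %s:%d\n",              \
                    musaGetErrorString(err_), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

#define MUBLAS_CHECK(call)                                           \
    do {                                                             \
        mublasStatus_t status_ = (call);                             \
        if (status_ != MUBLAS_STATUS_SUCCESS) {                      \
            fprintf(stderr, "muBLAS error %d at %s:%d\n",            \
                    static_cast<int>(status_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)
With these in place, calls in the examples would be wrapped as MUSA_CHECK(musaMalloc(...)) and MUBLAS_CHECK(mublasCreate(&mublasH)).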
To make it easy to compare against other platforms and technology stacks, the following code runs a matrix-multiplication benchmark:
// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // strcmp, used by the command-line helpers below
#include <assert.h>

// MUSA / muBLAS includes
#include <musa_runtime.h>
#include <mublas.h>

void ConstantInit(float *data, int size, float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

bool checkCmdLineFlag(int argc, char **argv, const char *flag) {
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], flag) == 0) {
            return true;
        }
    }
    return false;
}

int getCmdLineArgumentInt(int argc, char **argv, const char *flag) {
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], flag) == 0 && (i + 1) < argc) {
            return atoi(argv[i + 1]);
        }
    }
    return -1;  // Return -1 if the flag is not found or no value is provided
}

/**
 * Run a simple test of matrix multiplication using muBLAS
 */
int MatrixMultiply(int argc, char **argv,
                   const dim3 &dimsA,
                   const dim3 &dimsB) {
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = new float[size_A];
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = new float[size_B];

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = new float[dimsC.x * dimsC.y];
    if (h_C == nullptr) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    musaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A);
    musaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B);
    musaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C);

    // Allocate MUSA events that we'll use for timing
    musaEvent_t start, stop;
    musaEventCreate(&start);
    musaEventCreate(&stop);
    musaStream_t stream;
    musaStreamCreateWithFlags(&stream, musaStreamNonBlocking);

    // Create the muBLAS handle and bind it to the stream, so the GEMM calls
    // below run on the same stream the timing events are recorded on
    mublasHandle_t handle;
    mublasCreate(&handle);
    mublasSetStream(handle, stream);

    // Copy host memory to device
    musaMemcpyAsync(d_A, h_A, mem_size_A, musaMemcpyHostToDevice, stream);
    musaMemcpyAsync(d_B, h_B, mem_size_B, musaMemcpyHostToDevice, stream);

    // Record the start event
    musaEventRecord(start, stream);

    // Execute the muBLAS matrix multiplication
    int nIter = 300;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    for (int j = 0; j < nIter; j++) {
        // mublasSgemm follows the BLAS column-major convention. A row-major
        // array reinterpreted as column-major is the transpose of the same
        // data, so computing C^T = B^T * A^T here yields row-major C = A * B.
        mublasSgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N,
                    dimsB.x, dimsA.y, dimsA.x,
                    &alpha,
                    d_B, dimsB.x,
                    d_A, dimsA.x,
                    &beta,
                    d_C, dimsB.x);
    }

    // Record the stop event
    musaEventRecord(stop, stream);
    musaStreamSynchronize(stream);

    float msecTotal = 0.0f;
    musaEventElapsedTime(&msecTotal, start, stop);

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                               static_cast<double>(dimsA.y) *
                               static_cast<double>(dimsB.x);
    double gigaFlops =
        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("muBLAS Performance= %.2f GFlop/s, Time= %.3f msec\n",
           gigaFlops, msecPerMatrixMul);

    // Copy result from device to host
    musaMemcpyAsync(h_C, d_C, mem_size_C, musaMemcpyDeviceToHost, stream);
    musaStreamSynchronize(stream);
    mublasDestroy(handle);

    // Clean up memory
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    musaFree(d_A);
    musaFree(d_B);
    musaFree(d_C);
    musaEventDestroy(start);
    musaEventDestroy(stop);
    musaStreamDestroy(stream);
    return EXIT_SUCCESS;
}

int main(int argc, char **argv) {
    printf("[Matrix Multiply Using muBLAS] - Starting...\n");

    dim3 dimsA(320, 320, 1);
    dim3 dimsB(320, 320, 1);

    // Width of Matrix A
    if (checkCmdLineFlag(argc, argv, "-wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, argv, "-wA");
    }
    // Height of Matrix A
    if (checkCmdLineFlag(argc, argv, "-hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, argv, "-hA");
    }
    // Width of Matrix B
    if (checkCmdLineFlag(argc, argv, "-wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, argv, "-wB");
    }
    // Height of Matrix B
    if (checkCmdLineFlag(argc, argv, "-hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, argv, "-hB");
    }

    // For C = A * B, A's width must match B's height (the inner dimensions)
    if (dimsA.x != dimsB.y) {
        printf("Error: inner matrix dimensions must be equal. (%d != %d)\n",
               dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
dimsB.x, dimsB.y);
int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB);
exit(matrix_result);
}
Output:
[Matrix Multiply Using muBLAS] - Starting...
MatrixA(320,320), MatrixB(320,320)
muBLAS Performance= 39.73 GFlop/s, Time= 1.650 msec
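As a sanity check on the reported figure: each 320×320×320 SGEMM performs 2 × 320³ = 65,536,000 floating-point operations, so at 1.650 ms per multiplication the throughput is 6.5536 × 10⁷ FLOP / 1.650 × 10⁻³ s ≈ 3.97 × 10¹⁰ FLOP/s, consistent with the printed 39.73 GFlop/s (averaging over the 300 iterations smooths out per-launch jitter).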