计算库层

muBLAS是基于MUSA开发的基础线性代数库,在MTGPU上经过深度优化,在AI和HPC场景下被广泛使用。按照计算复杂性,muBLAS函数可分为三类,第一类用来处理标量、向量和向量与向量间的运算,第二类用来处理向量与矩阵之间的运算, 第三类用来进行矩阵与矩阵间的运算.

#include <cstdio>
#include <cstdlib>
#include <mublas.h>
#include <musa_runtime.h>
#include <vector>

int main(int argc, char* argv[])
{
    mublasHandle_t mublasH = NULL;
    musaStream_t   stream  = NULL;

    /*
     *   A = | 1.0 2.0 3.0 4.0 |
     *   B = | 5.0 6.0 7.0 8.0 |
     */

    const std::vector<float> A     = {1.0, 2.0, 3.0, 4.0};
    std::vector<float>       B     = {5.0, 6.0, 7.0, 8.0};
    const float              alpha = 2.1;
    const int                incx  = 1;
    const int                incy  = 1;

    float* d_A = nullptr;
    float* d_B = nullptr;

    /* step 1: create mublas handle, bind a stream */
    mublasCreate(&mublasH);

    musaStreamCreateWithFlags(&stream, musaStreamNonBlocking);
    mublasSetStream(mublasH, stream);

    /* step 2: copy data to device */
    musaMalloc(reinterpret_cast<void**>(&d_A), sizeof(float) * A.size());
    musaMalloc(reinterpret_cast<void**>(&d_B), sizeof(float) * B.size());

    musaMemcpyAsync(d_A, A.data(), sizeof(float) * A.size(), musaMemcpyHostToDevice, stream);
    musaMemcpyAsync(d_B, B.data(), sizeof(float) * B.size(), musaMemcpyHostToDevice, stream);
    /* step 3: compute */
    mublasSaxpy(mublasH, A.size(), &alpha, d_A, incx, d_B, incy);

    /* step 4: copy data to host */
    musaMemcpyAsync(B.data(), d_B, sizeof(float) * B.size(), musaMemcpyDeviceToHost, stream);
    musaStreamSynchronize(stream);

    /*
     *   B = | 7.10 10.20 13.30 16.40 |
     */
    printf("B\n");
    for(int i = 0; i < B.size(); i++)
        printf("%f ", B[i]);
    /* free resources */
    musaFree(d_A);
    musaFree(d_B);

    mublasDestroy(mublasH);

    musaStreamDestroy(stream);

    musaDeviceReset();

    return EXIT_SUCCESS;
}

输出结果:

B
7.100000 10.200000 13.300000 16.400000 

为了方便与其他平台其他技术栈对比,使用下面代码进行矩阵运算:

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <musa_runtime.h>
#include <mublas.h>
#include <vector>
#include <chrono>

void ConstantInit(float *data, int size, float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

bool checkCmdLineFlag(int argc, char **argv, const char *flag) {
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], flag) == 0) {
            return true;
        }
    }
    return false;
}

int getCmdLineArgumentInt(int argc, char **argv, const char *flag) {
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], flag) == 0 && (i + 1) < argc) {
            return atoi(argv[i + 1]);
        }
    }
    return -1; // Return -1 if the flag is not found or no value is provided
}

/**
 * Run a simple test of matrix multiplication using muBLAS
 */
int MatrixMultiply(int argc, char **argv,
                   const dim3 &dimsA,
                   const dim3 &dimsB) {
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = new float[size_A];
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = new float[size_B];

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = new float[dimsC.x * dimsC.y];

    if (h_C == nullptr) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    musaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A);
    musaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B);
    musaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C);

    // Allocate CUDA events that we'll use for timing
    musaEvent_t start, stop;
    musaEventCreate(&start);
    musaEventCreate(&stop);

    musaStream_t stream;
    musaStreamCreateWithFlags(&stream, musaStreamNonBlocking);

    // Copy host memory to device
    musaMemcpyAsync(d_A, h_A, mem_size_A, musaMemcpyHostToDevice, stream);
    musaMemcpyAsync(d_B, h_B, mem_size_B, musaMemcpyHostToDevice, stream);

    // Record the start event
    musaEventRecord(start, stream);

    // Execute the muBLAS matrix multiplication
    int nIter = 300;
    mublasHandle_t handle;
    mublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta = 0.0f;

    for (int j = 0; j < nIter; j++) {
        mublasSgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N,
                    dimsB.x, dimsA.y, dimsA.x,
                    &alpha,
                    d_B, dimsB.x,
                    d_A, dimsA.x,
                    &beta,
                    d_C, dimsB.x);
    }

    // Record the stop event
    musaEventRecord(stop, stream);
    musaStreamSynchronize(stream);

    float msecTotal = 0.0f;
    musaEventElapsedTime(&msecTotal, start, stop);

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                               static_cast<double>(dimsA.y) *
                               static_cast<double>(dimsB.x);
    double gigaFlops =
        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("muBLAS Performance= %.2f GFlop/s, Time= %.3f msec\n",
           gigaFlops, msecPerMatrixMul);

    // Copy result from device to host
    musaMemcpyAsync(h_C, d_C, mem_size_C, musaMemcpyDeviceToHost, stream);
    musaStreamSynchronize(stream);

    mublasDestroy(handle);
    // Clean up memory
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    musaFree(d_A);
    musaFree(d_B);
    musaFree(d_C);
    musaEventDestroy(start);
    musaEventDestroy(stop);
    musaStreamDestroy(stream);
    return EXIT_SUCCESS;
}


int main(int argc, char **argv) {
    printf("[Matrix Multiply Using muBLAS] - Starting...\n");

    dim3 dimsA(320, 320, 1);
    dim3 dimsB(320, 320, 1);

    // Width of Matrix A
    if (checkCmdLineFlag(argc, argv, "-wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, argv, "-wA");
    }

    // Height of Matrix A
    if (checkCmdLineFlag(argc, argv, "-hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, argv, "-hA");
    }

    // Width of Matrix B
    if (checkCmdLineFlag(argc, argv, "-wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, argv, "-wB");
    }

    // Height of Matrix B
    if (checkCmdLineFlag(argc, argv, "-hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, argv, "-hB");
    }

    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
               dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
           dimsB.x, dimsB.y);

    int matrix_result = MatrixMultiply(argc, argv, dimsA, dimsB);
    exit(matrix_result);
}

输出结果:

[Matrix Multiply Using muBLAS] - Starting...
MatrixA(320,320), MatrixB(320,320)
muBLAS Performance= 39.73 GFlop/s, Time= 1.650 msec