计算库层

portBLAS 使用 SYCL 实现 BLAS（基本线性代数例程），适用于现代异构计算平台。

参考仓库地址：portBLAS（https://github.com/codeplaysoftware/portBLAS）

blas::_gemm 是 portBLAS 提供的 gemm 接口，对应 BLAS 库中的 gemm 例程，用于执行矩阵-矩阵乘法。gemm 是 BLAS Level 3 例程中最核心的函数之一，它的全称是 "GEneral Matrix-Matrix multiplication"。

gemm 函数的基本操作是计算两个矩阵的乘积，并将结果加到第三个矩阵上，可以用以下数学公式表示：

C=α×A×B+β×C

其中：

A 和 B 是输入矩阵。
C 既是输入矩阵也是输出矩阵：调用前的内容会乘以 β 后参与计算，最终结果写回 C（仅当 β = 0 时，调用前的内容才不影响结果）。
α 和 β 是标量倍数。

示例代码如下：

#include "portblas.hpp"
#include <sycl/sycl.hpp>

#include "util.hpp"

int main(int argc, char** argv) {
  /* Create a SYCL queue with the default device selector */
  sycl::queue q = sycl::queue(sycl::default_selector_v);

  /* Create a portBLAS sb_handle and get the policy handler */
  blas::SB_Handle sb_handle(q);

  /* Arguments of the Gemm operation.
   * Note: these matrix dimensions are too small to get a performance gain by
   * using portBLAS, but they are convenient for this sample */
  const size_t m = 7;
  const size_t k = 9;
  const size_t n = 5;
  const size_t lda = 12;
  const size_t ldb = 17;
  const size_t ldc = 10;
  const float alpha = 1.5;
  const float beta = 0.5;

  /* Create the matrices */
  std::vector<float> A = std::vector<float>(lda * k);
  std::vector<float> B = std::vector<float>(ldb * n);
  std::vector<float> C = std::vector<float>(ldc * n);

  /* Fill the matrices with random values */
  fill_matrix(A, m, k, lda);
  fill_matrix(B, k, n, ldb);
  fill_matrix(C, m, n, ldc);

  /* Print the matrices before the GEMM operation */
  std::cout << "A:\n";
  print_matrix(A, m, k, lda);
  std::cout << "---\nB:\n";
  print_matrix(B, k, n, ldb);
  std::cout << "---\nC (before):\n";
  print_matrix(C, m, n, ldc);

  /* Create the buffers */
  auto a_gpu = blas::make_sycl_iterator_buffer<float>(lda * k);
  auto b_gpu = blas::make_sycl_iterator_buffer<float>(ldb * n);
  auto c_gpu = blas::make_sycl_iterator_buffer<float>(ldc * n);

  /* Copy the matrices to the device
   * Note: this sample uses explicit copy operations, see the GEMV sample for
   * an alternative way
   */
  std::cout << "---\nCopying A, B and C to device\n";
  blas::helper::copy_to_device(sb_handle.get_queue(), A.data(), a_gpu, lda * k);
  blas::helper::copy_to_device(sb_handle.get_queue(), B.data(), b_gpu, ldb * n);
  blas::helper::copy_to_device(sb_handle.get_queue(), C.data(), c_gpu, ldc * n);

  /* Execute the GEMM operation */
  std::cout << "Executing C = " << alpha << "*A*B + " << beta << "*C\n";
  blas::_gemm(sb_handle, 'n', 'n', m, n, k, alpha, a_gpu, lda, b_gpu, ldb, beta,
              c_gpu, ldc);

  /* Copy the result to the host */
  std::cout << "Copying C to host\n";
  auto event = blas::helper::copy_to_host(sb_handle.get_queue(), c_gpu,
                                          C.data(), ldc * n);
  sb_handle.wait(event);

  /* Print the result after the GEMM operation */
  std::cout << "---\nC (after):" << std::endl;
  print_matrix(C, m, n, ldc);

  return 0;
}

结果：

A:
-1.438 9.5166 9.2763 9.7061 4.7123 -7.849 -5.247 -0.935 4.6360
-3.838 -9.125 -8.024 3.5507 9.6027 2.9939 8.8246 -3.819 1.4996
7.2394 3.6334 -6.483 5.1184 -0.098 -3.100 -3.855 2.2647 -2.312
-9.887 -4.862 -2.542 4.7151 -1.834 -6.717 6.3718 6.2206 -0.902
4.5156 -5.968 -1.436 4.9522 -2.603 9.3485 0.9153 8.6185 3.1563
-2.132 -7.835 -2.304 -3.719 -3.643 -7.861 4.0819 3.1938 -4.767
6.2794 9.6350 -1.894 -3.764 3.6267 -9.607 -3.615 -5.863 9.1741
---
B:
-5.155 8.9958 8.3243 -0.440 5.9613
0.6664 -6.633 -5.409 -9.602 6.2469
-4.575 4.5948 -7.293 1.4219 5.0536
-1.557 9.6612 2.7314 2.8409 9.9053
-5.624 1.0714 0.1577 -9.203 8.2468
-6.736 -8.187 -9.487 -1.367 2.2212
0.3280 8.0061 -7.627 -8.519 3.3489
8.2866 -6.657 -1.385 1.2509 0.0343
-1.143 8.3706 3.1169 -9.348 8.6974
---
C (before):
3.2586 0.6738 -0.649 -9.132 7.7019
-8.786 -5.771 0.5866 6.0604 -9.920
5.7516 -4.882 -1.464 -7.489 -1.981
-3.144 4.9594 3.2095 7.9944 8.8018
0.4556 9.3061 -6.114 -0.811 8.0797
5.1307 -4.322 7.5669 8.6118 6.9407
8.2092 -8.333 -4.818 2.2205 9.4932
---
Copying A, B and C to device
Executing C = 1.5*A*B + 0.5*C
Copying C to host
---
C (after):
-46.67 199.34 39.273 -128.2 360.91
-94.02 173.88 2.3888 -144.6 59.593
45.442 28.556 224.77 40.769 63.731
241.84 50.848 -30.16 77.259 -103.3
-12.70 32.391 -9.527 77.588 96.350
195.79 25.208 86.087 184.93 -274.5
-36.95 171.24 237.15 -285.7 194.97

blas::_gemv 是用于执行矩阵-向量乘法的例程。这里的 "gemv" 代表 "GEneral Matrix-Vector multiplication"。这个函数是 BLAS Level 2 例程的一部分，它计算 y=α×A×x+β×y，其中 A 是矩阵，x 和 y 是向量，α 和 β 是标量倍数。

示例代码如下：

#include "portblas.hpp"
#include <sycl/sycl.hpp>

#include "util.hpp"

int main(int argc, char** argv) {
  /* Create a SYCL queue with the default device selector */
  sycl::queue q = sycl::queue(sycl::default_selector_v);

  /* Create a portBLAS sb_handle and get the policy handler */
  blas::SB_Handle sb_handle(q);

  /* Arguments of the Gemm operation.
   * Note: these matrix dimensions are too small to get a performance gain by
   * using portBLAS, but they are convenient for this sample */
  const size_t m = 7;
  const size_t n = 7;
  const size_t lda = 12;
  const size_t incx = 2;
  const size_t incy = 3;
  const float alpha = 1.5;
  const float beta = 0.5;

  /* Create the matrix and vectors */
  const size_t lx = (n - 1) * incx + 1;
  const size_t ly = (m - 1) * incy + 1;
  std::vector<float> A = std::vector<float>(lda * n);
  std::vector<float> X = std::vector<float>(lx);
  std::vector<float> Y = std::vector<float>(ly);

  /* Fill the matrices with random values */
  fill_matrix(A, m, n, lda);
  fill_vector(X, n, incx);
  fill_vector(Y, m, incy);

  /* Print the matrices before the GEMV operation */
  std::cout << "A:\n";
  print_matrix(A, m, n, lda);
  std::cout << "---\nX:\n";
  print_vector(X, n, incx);
  std::cout << "---\nY (before):\n";
  print_vector(Y, m, incy);

  /* Execute the GEMV operation
   * Note: you can also use explicit copies, see the GEMM sample
   */
  std::cout << "---\nExecuting Y = " << alpha << "*A*X + " << beta << "*Y\n";
  {
    auto a_gpu = blas::make_sycl_iterator_buffer<float>(A, lda * n);
    auto x_gpu = blas::make_sycl_iterator_buffer<float>(X, lx);
    auto y_gpu = blas::make_sycl_iterator_buffer<float>(Y, ly);
    auto event = blas::_gemv(sb_handle, 'n', m, n, alpha, a_gpu, lda, x_gpu,
                             incx, beta, y_gpu, incy);
  }

  /* Print the result after the GEMM operation */
  std::cout << "---\nY (after):" << std::endl;
  print_vector(Y, m, incy);

  return 0;
}

结果：

A:
3.6917 7.9525 0.9778 1.6808 -4.840 -3.500 -5.867
1.6618 -4.460 5.0066 8.7876 1.7519 5.5783 -7.085
-5.231 0.6601 0.5053 -1.237 -5.133 -7.862 -4.636
7.9965 -2.646 5.6696 1.9292 -1.571 3.1671 -2.653
-5.239 3.9521 -1.618 -4.856 4.8694 5.0238 3.2869
-0.183 0.5850 -1.778 0.6833 -7.747 0.8794 9.6463
0.8742 -9.219 -8.838 -4.268 2.2112 3.4617 -8.035
---
X:
5.5933
-2.949
8.8669
6.9684
-1.255
-7.447
-2.448
---
Y (before):
4.0870
-9.189
-8.224
-1.195
-7.723
-6.854
-1.389
---
Executing Y = 1.5*A*X + 0.5*Y
---
Y (after):
98.187
147.93
57.397
151.09
-214.9
-54.74
-128.0

AI技术栈解析及应用- 作者：张真瑜 | 山东大学智能创新研究院

计算库层