计算库层
portBLAS 使用 SYCL 实现 BLAS(基本线性代数例程),适用于现代异构计算平台。
参考仓库地址:portBLAS
blas::_gemm 对应 BLAS 库中的 gemm 函数,用于执行矩阵-矩阵乘法。gemm 是 BLAS Level 3 例程中最核心的函数之一,全称为 "GEneral Matrix-Matrix multiplication"。
gemm 函数的基本操作是计算两个矩阵的乘积并按 α 缩放,再加上按 β 缩放的第三个矩阵,可以用以下数学公式表示:
C=α×A×B+β×C
其中:
- A 和 B 是输入矩阵。
- C 既是输入矩阵也是输出矩阵:调用前的内容会乘以 β 后累加到结果中,调用后被计算结果覆盖(仅当 β = 0 时,其初始内容才不影响结果)。
- α 和 β 是标量倍数。
示例代码如下:
#include "portblas.hpp"
#include <sycl/sycl.hpp>
#include "util.hpp"
/* GEMM sample: runs C = alpha * A * B + beta * C through portBLAS on the
 * device chosen by the default SYCL selector, printing A, B and C before the
 * call and C again afterwards. Always returns 0. */
int main(int argc, char** argv) {
  /* Queue on the default device, wrapped in the portBLAS handle */
  sycl::queue q(sycl::default_selector_v);
  blas::SB_Handle sb_handle(q);

  /* Problem sizes and scalars.
   * Note: matrices this small will not show any speed-up from portBLAS; they
   * are chosen so the printed output stays readable. */
  const size_t m = 7;
  const size_t k = 9;
  const size_t n = 5;
  const size_t lda = 12;
  const size_t ldb = 17;
  const size_t ldc = 10;
  const float alpha = 1.5f;
  const float beta = 0.5f;

  /* Element counts of the three allocations (leading dimension times the
   * second extent handed to fill_matrix / print_matrix) */
  const size_t a_elems = lda * k;
  const size_t b_elems = ldb * n;
  const size_t c_elems = ldc * n;

  /* Host-side storage */
  std::vector<float> A(a_elems);
  std::vector<float> B(b_elems);
  std::vector<float> C(c_elems);

  /* Populate the operands (random values, per the util.hpp helper) */
  fill_matrix(A, m, k, lda);
  fill_matrix(B, k, n, ldb);
  fill_matrix(C, m, n, ldc);

  /* Show the inputs */
  std::cout << "A:\n";
  print_matrix(A, m, k, lda);
  std::cout << "---\nB:\n";
  print_matrix(B, k, n, ldb);
  std::cout << "---\nC (before):\n";
  print_matrix(C, m, n, ldc);

  /* Device-side buffers, one per matrix, same element counts as the host */
  auto dev_a = blas::make_sycl_iterator_buffer<float>(a_elems);
  auto dev_b = blas::make_sycl_iterator_buffer<float>(b_elems);
  auto dev_c = blas::make_sycl_iterator_buffer<float>(c_elems);

  /* Upload the operands.
   * Note: this sample copies explicitly; the GEMV sample shows the
   * alternative of building the buffers directly from the host vectors. */
  std::cout << "---\nCopying A, B and C to device\n";
  blas::helper::copy_to_device(sb_handle.get_queue(), A.data(), dev_a,
                               a_elems);
  blas::helper::copy_to_device(sb_handle.get_queue(), B.data(), dev_b,
                               b_elems);
  blas::helper::copy_to_device(sb_handle.get_queue(), C.data(), dev_c,
                               c_elems);

  /* Launch the GEMM; 'n', 'n' requests no transposition of A or B */
  std::cout << "Executing C = " << alpha << "*A*B + " << beta << "*C\n";
  blas::_gemm(sb_handle, 'n', 'n', m, n, k, alpha, dev_a, lda, dev_b, ldb,
              beta, dev_c, ldc);

  /* Download C and block until that copy has finished, so the host vector
   * is safe to read below */
  std::cout << "Copying C to host\n";
  auto download_done = blas::helper::copy_to_host(sb_handle.get_queue(), dev_c,
                                                  C.data(), c_elems);
  sb_handle.wait(download_done);

  /* Show the result */
  std::cout << "---\nC (after):" << std::endl;
  print_matrix(C, m, n, ldc);

  return 0;
}
结果:
A:
-1.438 9.5166 9.2763 9.7061 4.7123 -7.849 -5.247 -0.935 4.6360
-3.838 -9.125 -8.024 3.5507 9.6027 2.9939 8.8246 -3.819 1.4996
7.2394 3.6334 -6.483 5.1184 -0.098 -3.100 -3.855 2.2647 -2.312
-9.887 -4.862 -2.542 4.7151 -1.834 -6.717 6.3718 6.2206 -0.902
4.5156 -5.968 -1.436 4.9522 -2.603 9.3485 0.9153 8.6185 3.1563
-2.132 -7.835 -2.304 -3.719 -3.643 -7.861 4.0819 3.1938 -4.767
6.2794 9.6350 -1.894 -3.764 3.6267 -9.607 -3.615 -5.863 9.1741
---
B:
-5.155 8.9958 8.3243 -0.440 5.9613
0.6664 -6.633 -5.409 -9.602 6.2469
-4.575 4.5948 -7.293 1.4219 5.0536
-1.557 9.6612 2.7314 2.8409 9.9053
-5.624 1.0714 0.1577 -9.203 8.2468
-6.736 -8.187 -9.487 -1.367 2.2212
0.3280 8.0061 -7.627 -8.519 3.3489
8.2866 -6.657 -1.385 1.2509 0.0343
-1.143 8.3706 3.1169 -9.348 8.6974
---
C (before):
3.2586 0.6738 -0.649 -9.132 7.7019
-8.786 -5.771 0.5866 6.0604 -9.920
5.7516 -4.882 -1.464 -7.489 -1.981
-3.144 4.9594 3.2095 7.9944 8.8018
0.4556 9.3061 -6.114 -0.811 8.0797
5.1307 -4.322 7.5669 8.6118 6.9407
8.2092 -8.333 -4.818 2.2205 9.4932
---
Copying A, B and C to device
Executing C = 1.5*A*B + 0.5*C
Copying C to host
---
C (after):
-46.67 199.34 39.273 -128.2 360.91
-94.02 173.88 2.3888 -144.6 59.593
45.442 28.556 224.77 40.769 63.731
241.84 50.848 -30.16 77.259 -103.3
-12.70 32.391 -9.527 77.588 96.350
195.79 25.208 86.087 184.93 -274.5
-36.95 171.24 237.15 -285.7 194.97
blas::_gemv 是用于执行矩阵-向量乘法的例程。这里的 "gemv" 代表 "GEneral Matrix-Vector multiplication"。这个函数是 BLAS Level 2 例程的一部分,它提供了矩阵和向量之间的乘法操作,其基本运算为 y = α×A×x + β×y,其中 A 是输入矩阵,x 是输入向量,y 既是输入向量也是输出向量,α 和 β 是标量倍数。
示例代码如下:
#include "portblas.hpp"
#include <sycl/sycl.hpp>
#include "util.hpp"
/* GEMV sample: runs Y = alpha * A * X + beta * Y through portBLAS on the
 * device chosen by the default SYCL selector, printing A, X and Y before the
 * call and Y again afterwards. Always returns 0. */
int main(int argc, char** argv) {
  /* Create a SYCL queue with the default device selector */
  sycl::queue q = sycl::queue(sycl::default_selector_v);

  /* Create a portBLAS sb_handle and get the policy handler */
  blas::SB_Handle sb_handle(q);

  /* Arguments of the Gemv operation.
   * Note: these matrix dimensions are too small to get a performance gain by
   * using portBLAS, but they are convenient for this sample */
  const size_t m = 7;
  const size_t n = 7;
  const size_t lda = 12;
  const size_t incx = 2;
  const size_t incy = 3;
  const float alpha = 1.5;
  const float beta = 0.5;

  /* Create the matrix and vectors.
   * A strided vector with `count` logical elements and increment `inc`
   * spans (count - 1) * inc + 1 storage slots. */
  const size_t lx = (n - 1) * incx + 1;
  const size_t ly = (m - 1) * incy + 1;
  std::vector<float> A = std::vector<float>(lda * n);
  std::vector<float> X = std::vector<float>(lx);
  std::vector<float> Y = std::vector<float>(ly);

  /* Fill the matrix and vectors with random values */
  fill_matrix(A, m, n, lda);
  fill_vector(X, n, incx);
  fill_vector(Y, m, incy);

  /* Print the matrix and vectors before the GEMV operation */
  std::cout << "A:\n";
  print_matrix(A, m, n, lda);
  std::cout << "---\nX:\n";
  print_vector(X, n, incx);
  std::cout << "---\nY (before):\n";
  print_vector(Y, m, incy);

  /* Execute the GEMV operation
   * Note: you can also use explicit copies, see the GEMM sample
   */
  std::cout << "---\nExecuting Y = " << alpha << "*A*X + " << beta << "*Y\n";
  {
    /* The buffers are created from the host vectors and live only inside
     * this scope, so they are destroyed before Y is printed below.
     * NOTE(review): presumably buffer destruction is what waits for the
     * kernel and writes the result back into Y (standard SYCL behaviour for
     * buffers built on host memory) - confirm against
     * make_sycl_iterator_buffer. */
    auto a_gpu = blas::make_sycl_iterator_buffer<float>(A, lda * n);
    auto x_gpu = blas::make_sycl_iterator_buffer<float>(X, lx);
    auto y_gpu = blas::make_sycl_iterator_buffer<float>(Y, ly);
    /* 'n' requests no transposition of A. The returned event is not needed
     * here, so it is discarded just like the _gemm result in the GEMM
     * sample. */
    blas::_gemv(sb_handle, 'n', m, n, alpha, a_gpu, lda, x_gpu, incx, beta,
                y_gpu, incy);
  }

  /* Print the result after the GEMV operation */
  std::cout << "---\nY (after):" << std::endl;
  print_vector(Y, m, incy);

  return 0;
}
结果:
A:
3.6917 7.9525 0.9778 1.6808 -4.840 -3.500 -5.867
1.6618 -4.460 5.0066 8.7876 1.7519 5.5783 -7.085
-5.231 0.6601 0.5053 -1.237 -5.133 -7.862 -4.636
7.9965 -2.646 5.6696 1.9292 -1.571 3.1671 -2.653
-5.239 3.9521 -1.618 -4.856 4.8694 5.0238 3.2869
-0.183 0.5850 -1.778 0.6833 -7.747 0.8794 9.6463
0.8742 -9.219 -8.838 -4.268 2.2112 3.4617 -8.035
---
X:
5.5933
-2.949
8.8669
6.9684
-1.255
-7.447
-2.448
---
Y (before):
4.0870
-9.189
-8.224
-1.195
-7.723
-6.854
-1.389
---
Executing Y = 1.5*A*X + 0.5*Y
---
Y (after):
98.187
147.93
57.397
151.09
-214.9
-54.74
-128.0