计算库层

使用 DPC++ 和 Intel oneAPI MKL 库进行通用矩阵乘法(GEMM)的示例。

  • 引入了必要的头文件,包括 STL、SYCL 和 oneAPI MKL,设置了异常处理器用于捕获和处理异步异常。
  • 定义了矩阵 A、B 和 C 的维度,以及标量 alpha 和 beta 的值。使用 set_fp_value 函数设置浮点数值。
  • 使用 SYCL 设备(CPU 或 GPU)创建执行队列,并分配矩阵内存。矩阵数据从主机复制到设备内存。
  • 调用 oneapi::mkl::blas::column_major::gemm 进行矩阵乘法计算。该函数根据给定的矩阵 A 和 B 以及标量 alpha 和 beta 计算 C = alpha * op(A) * op(B) + beta * C;本例中 transA 为 trans、transB 为 nontrans,即 op(A) = Aᵀ、op(B) = B。
  • 计算完成后,将结果从设备内存复制回主机内存,并输出参数及部分矩阵的值。
  • main 函数初始化设备、打印设备信息并调用 run_gemm_example 执行 GEMM 操作;同时捕获并输出 SYCL 和其他异常信息。

// stl includes
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

#if __has_include(<sycl/sycl.hpp>)
#include <sycl/sycl.hpp>
#else
#include <CL/sycl.hpp>
#endif
#include "oneapi/mkl.hpp"

#include "example_helper.hpp"

/// Runs one single-precision GEMM on `dev` through the oneMKL USM API and prints a summary.
///
/// Host matrices A, B and C are created, copied into USM device allocations, combined with
/// oneapi::mkl::blas::column_major::gemm (transA = trans, transB = nontrans), and C is
/// copied back to the host. The GEMM parameters and the leading 2x2 block of each host
/// matrix are then written to std::cout.
///
/// Asynchronous SYCL errors are reported by the queue's handler, which terminates the
/// process with exit code 2.
///
/// @param dev  Device on which the execution queue is created.
/// @throws std::runtime_error if any device allocation returns null.
/// @note Any exception raised while device buffers are live is rethrown after the buffers
///       have been released.
void run_gemm_example(const sycl::device& dev) {
    oneapi::mkl::transpose transA = oneapi::mkl::transpose::trans;
    oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;

    // matrix data sizes
    int m = 45;
    int n = 98;
    int k = 67;

    // leading dimensions of data
    int ldA = 103;
    int ldB = 105;
    int ldC = 106;

    // Element counts of each matrix: leading dimension times the number of stored columns,
    // which for A and B depends on whether the operand is transposed.
    int sizea = (transA == oneapi::mkl::transpose::nontrans) ? ldA * k : ldA * m;
    int sizeb = (transB == oneapi::mkl::transpose::nontrans) ? ldB * n : ldB * k;
    int sizec = ldC * n;

    // set scalar fp values (set_fp_value is provided by example_helper.hpp)
    float alpha = set_fp_value(float(2.0), float(-0.5));
    float beta = set_fp_value(float(3.0), float(-1.5));

    // Catch asynchronous exceptions: print every SYCL exception, then terminate.
    auto exception_handler = [](sycl::exception_list exceptions) {
        for (std::exception_ptr const& pending : exceptions) {
            try {
                std::rethrow_exception(pending);
            }
            catch (sycl::exception const& e) {
                std::cerr << "Caught asynchronous SYCL exception during GEMM:" << std::endl;
                std::cerr << "\t" << e.what() << std::endl;
            }
        }
        std::exit(2);
    };

    // create execution queue
    sycl::queue main_queue(dev, exception_handler);

    // allocate matrix on host; zero first so that every element of the padded storage
    // has a defined value before rand_matrix (example_helper.hpp) populates it
    std::vector<float> A(sizea);
    std::vector<float> B(sizeb);
    std::vector<float> C(sizec);
    std::fill(A.begin(), A.end(), 0);
    std::fill(B.begin(), B.end(), 0);
    std::fill(C.begin(), C.end(), 0);

    rand_matrix(A, transA, m, k, ldA);
    rand_matrix(B, transB, k, n, ldB);
    rand_matrix(C, oneapi::mkl::transpose::nontrans, m, n, ldC);

    // allocate memory on device; malloc_device<T> takes a number of elements, not bytes
    float* dev_A = sycl::malloc_device<float>(sizea, main_queue);
    float* dev_B = sycl::malloc_device<float>(sizeb, main_queue);
    float* dev_C = sycl::malloc_device<float>(sizec, main_queue);

    // Releases every device buffer that was successfully allocated.
    auto release_device_buffers = [&]() {
        if (dev_C) {
            sycl::free(dev_C, main_queue);
        }
        if (dev_B) {
            sycl::free(dev_B, main_queue);
        }
        if (dev_A) {
            sycl::free(dev_A, main_queue);
        }
    };

    try {
        if (!dev_A || !dev_B || !dev_C) {
            throw std::runtime_error("Failed to allocate USM memory.");
        }

        // copy data from host to device
        main_queue.memcpy(dev_A, A.data(), sizea * sizeof(float)).wait();
        main_queue.memcpy(dev_B, B.data(), sizeb * sizeof(float)).wait();
        main_queue.memcpy(dev_C, C.data(), sizec * sizeof(float)).wait();

        // add oneapi::mkl::blas::gemm to execution queue; the returned event is not kept
        // because the whole queue is drained immediately afterwards
        oneapi::mkl::blas::column_major::gemm(main_queue, transA, transB, m, n, k, alpha, dev_A,
                                              ldA, dev_B, ldB, beta, dev_C, ldC);

        // Wait until calculations are done
        main_queue.wait_and_throw();

        // copy data from device back to host
        main_queue.memcpy(C.data(), dev_C, sizec * sizeof(float)).wait_and_throw();
    }
    catch (...) {
        // Do not leak device memory when allocation, a copy or the GEMM itself fails.
        release_device_buffers();
        throw;
    }

    // The result is on the host now, so the device buffers are no longer needed.
    release_device_buffers();

    std::cout << "\n\t\tGEMM parameters:" << std::endl;
    std::cout << "\t\t\ttransA = "
              << (transA == oneapi::mkl::transpose::nontrans
                      ? "nontrans"
                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
              << ", transB = "
              << (transB == oneapi::mkl::transpose::nontrans
                      ? "nontrans"
                      : (transB == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
              << std::endl;
    std::cout << "\t\t\tm = " << m << ", n = " << n << ", k = " << k << std::endl;
    std::cout << "\t\t\tlda = " << ldA << ", ldB = " << ldB << ", ldC = " << ldC << std::endl;
    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;

    std::cout << "\n\t\tOutputting 2x2 block of A,B,C matrices:" << std::endl;

    // output the top 2x2 block of A matrix
    print_2x2_matrix_values(A.data(), ldA, "A");

    // output the top 2x2 block of B matrix
    print_2x2_matrix_values(B.data(), ldB, "B");

    // output the top 2x2 block of C matrix
    print_2x2_matrix_values(C.data(), ldC, "C");
}

/// Writes the GEMM example's introductory banner to std::cout.
///
/// The banner starts and ends with an empty line; every line is terminated with
/// std::endl, so the stream is flushed after each one.
void print_example_banner() {
    static const char* const banner_lines[] = {
        "",
        "########################################################################",
        "# General Matrix-Matrix Multiplication using Unified Shared Memory Example: ",
        "# ",
        "# C = alpha * A * B + beta * C",
        "# ",
        "# where A, B and C are general dense matrices and alpha, beta are",
        "# floating point type precision scalars.",
        "# ",
        "# Using apis:",
        "#   gemm",
        "# ",
        "# Using single precision (float) data type",
        "# ",
        "# Device will be selected during runtime.",
        "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify",
        "# available devices",
        "# ",
        "########################################################################",
        "",
    };

    for (const char* text : banner_lines) {
        std::cout << text << std::endl;
    }
}

/// Entry point of the GEMM example.
///
/// Prints the banner, default-constructs a SYCL device, reports which device was picked
/// and runs run_gemm_example on it.
///
/// @return 0 on success, 1 if a SYCL or standard exception was caught.
int main(int /*argc*/, char** /*argv*/) {
    print_example_banner();

    try {
        sycl::device dev;

        // Every device that is not a GPU is reported as a CPU device.
        const char* const device_kind = dev.is_gpu() ? "GPU" : "CPU";
        std::cout << "Running BLAS GEMM USM example on " << device_kind << " device."
                  << std::endl;
        std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>() << std::endl;
        std::cout << "Running with single precision real data type:" << std::endl;

        run_gemm_example(dev);
        std::cout << "BLAS GEMM USM example ran OK." << std::endl;
        return 0;
    }
    catch (sycl::exception const& sycl_error) {
        std::cerr << "Caught synchronous SYCL exception during GEMM:" << std::endl;
        std::cerr << "\t" << sycl_error.what() << std::endl;
        std::cerr << "\tSYCL error code: " << sycl_error.code().value() << std::endl;
    }
    catch (std::exception const& std_error) {
        std::cerr << "Caught std::exception during GEMM:" << std::endl;
        std::cerr << "\t" << std_error.what() << std::endl;
    }

    // Only reached when one of the handlers above ran.
    return 1;
}

一个使用 Intel oneAPI 和 SYCL 的离散傅里叶变换 (DFT) 示例,主要功能包括在支持的设备(GPU 或 CPU)上进行原位前向变换。注意:示例横幅中称其为复数变换,但代码中描述符的数据域为实数(domain::REAL)。

  • 定义 run_example 函数,接收一个 SYCL 设备参数。N 是傅里叶变换的大小,这里设定为 16。
  • 定义异步异常处理函数,用于捕获和处理可能出现的 SYCL 异常。创建一个 SYCL 队列并分配共享内存。这里分配了大小为 N * 2 的浮点数组(用于存储复数数据)。
  • 创建一个 DFT 描述符,指定数据精度为单精度(float),数据域为实数,变换大小为 N。
  • 调用前向 DFT 计算,传入描述符和输入数据的指针,返回计算事件。等待 DFT 计算完成,确保后续操作在变换完成后进行。
  • 调用 print_example_banner() 函数以显示示例信息。
  • 判断设备类型(GPU 或 CPU),并输出设备名称。捕获不同类型的异常并输出错误信息,确保程序能够正确处理错误。
// stl includes
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <stdexcept>

// oneMKL/SYCL includes
#if __has_include(<sycl/sycl.hpp>)
#include <sycl/sycl.hpp>
#else
#include <CL/sycl.hpp>
#endif

#include "oneapi/mkl.hpp"

/// Runs one in-place forward DFT of length 16 on `dev` using a shared USM buffer.
///
/// The descriptor is single precision with domain REAL; it is committed to a queue on
/// `dev` and the transform is computed through the run-time dispatch interface.
///
/// Asynchronous SYCL errors are reported by the queue's handler, which terminates the
/// process with exit code 2.
///
/// @param dev  Device on which the queue is created.
/// @throws std::runtime_error if the USM allocation returns null.
/// @note Any exception thrown while the buffer is live (for example from commit or
///       compute_forward) is rethrown after the buffer has been freed.
void run_example(const sycl::device& dev) {
    constexpr std::size_t N = 16;
    // NOTE(review): a real in-place forward transform of length N presumably needs N + 2
    // floats, so N * 2 is ample - confirm against the oneMKL DFT storage documentation.
    constexpr std::size_t buffer_len = N * 2;

    // Catch asynchronous exceptions: print every SYCL exception, then terminate.
    auto exception_handler = [](sycl::exception_list exceptions) {
        for (std::exception_ptr const& pending : exceptions) {
            try {
                std::rethrow_exception(pending);
            }
            catch (sycl::exception const& e) {
                std::cerr << "Caught asynchronous SYCL exception:" << std::endl;
                std::cerr << "\t" << e.what() << std::endl;
            }
        }
        std::exit(2);
    };

    std::cout << "DFT example run_time dispatch" << std::endl;

    sycl::queue sycl_queue(dev, exception_handler);
    float* x_usm = sycl::malloc_shared<float>(buffer_len, sycl_queue);
    if (!x_usm) {
        throw std::runtime_error("Failed to allocate USM memory.");
    }

    try {
        // Initialise the whole allocation so the transform never reads indeterminate
        // values: a simple ramp in the first N entries, zeros in the remainder.
        for (std::size_t i = 0; i < buffer_len; ++i) {
            x_usm[i] = (i < N) ? static_cast<float>(i) : 0.0f;
        }

        // 1. create descriptors
        oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
                                     oneapi::mkl::dft::domain::REAL>
            desc(static_cast<std::int64_t>(N));

        // 2. variadic set_value
        desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
                       static_cast<std::int64_t>(1));
        desc.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
                       oneapi::mkl::dft::config_value::INPLACE);

        // 3. commit_descriptor (runtime dispatch)
        desc.commit(sycl_queue);

        // 4. compute_forward / compute_backward (runtime dispatch)
        auto compute_event = oneapi::mkl::dft::compute_forward(desc, x_usm);

        // Do something with transformed data.
        compute_event.wait();
    }
    catch (...) {
        // Do not leak the USM buffer when configuration or the transform fails.
        sycl::free(x_usm, sycl_queue);
        throw;
    }

    // 5. Free USM allocation.
    sycl::free(x_usm, sycl_queue);
}

/// Writes the DFT example's introductory banner to std::cout.
///
/// Each banner line is followed by a newline; one extra newline and a single flush
/// (std::endl) close the banner.
void print_example_banner() {
    static const char* const banner_lines[] = {
        "########################################################################",
        "# DFT complex in-place forward transform with USM API example:",
        "#",
        "# Using APIs:",
        "#   USM forward complex in-place",
        "#   Run-time dispatch",
        "#",
        "# Using single precision (float) data type",
        "#",
        "# Device will be selected during runtime.",
        "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify",
        "# available devices",
        "#",
        "########################################################################",
    };

    for (const char* text : banner_lines) {
        std::cout << text << '\n';
    }
    std::cout << std::endl;
}

/// Entry point of the DFT example.
///
/// Prints the banner, selects a device with the default selector, reports it and runs
/// run_example on it.
///
/// @return 0 on success or when oneMKL reports the configuration as unimplemented,
///         1 if any other SYCL or standard exception was caught.
int main(int /*argc*/, char** /*argv*/) {
    print_example_banner();

    try {
        sycl::device my_dev{ sycl::default_selector_v };

        // Every device that is not a GPU is reported as a CPU device.
        const char* const device_kind = my_dev.is_gpu() ? "GPU" : "CPU";
        std::cout << "Running DFT complex forward example on " << device_kind << " device"
                  << std::endl;
        std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
                  << std::endl;
        std::cout << "Running with single precision real data type:" << std::endl;

        run_example(my_dev);
        std::cout << "DFT example ran OK" << std::endl;
    }
    catch (oneapi::mkl::unimplemented const& unsupported) {
        // An unsupported configuration is reported but does not count as a failure.
        std::cerr << "Unsupported Configuration:" << std::endl;
        std::cerr << "\t" << unsupported.what() << std::endl;
        return 0;
    }
    catch (sycl::exception const& sycl_error) {
        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
        std::cerr << "\t" << sycl_error.what() << std::endl;
        std::cerr << "\tSYCL error code: " << sycl_error.code().value() << std::endl;
        return 1;
    }
    catch (std::exception const& std_error) {
        std::cerr << "Caught std::exception:" << std::endl;
        std::cerr << "\t" << std_error.what() << std::endl;
        return 1;
    }

    return 0;
}

使用 Intel oneAPI MKL 在 SYCL 设备上进行 LU 分解和求解线性方程组

  • 定义 run_getrs_example 函数,接收一个 SYCL 设备参数。
  • 定义矩阵的尺寸和主维度。m、n 是矩阵 A 的行和列,nrhs 是右侧矩阵 B 的列数。
  • 在主机上初始化矩阵 A 和 B,并填充为零。用随机数据填充矩阵 A 和 B。为矩阵 A、B 和 IPIV(pivot index)分配设备内存。获取 LU 分解和求解所需的 scratchpad 大小。将主机上的数据复制到设备。
  • 在设备上执行 LU 分解和求解操作。将结果从设备复制回主机,并打印矩阵 A 和解矩阵 X 的前 2x2 块。释放在设备上分配的所有内存。
  • 调用 print_example_banner(),选择设备并执行 LU 分解示例。检查所选设备是 CPU 还是 GPU,并输出设备名称。
// STL includes
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

// oneMKL/SYCL includes
#if __has_include(<sycl/sycl.hpp>)
#include <sycl/sycl.hpp>
#else
#include <CL/sycl.hpp>
#endif
#include "oneapi/mkl.hpp"

// local includes
#include "example_helper.hpp"

/// Factorises a 23x23 single-precision matrix with getrf and solves A * X = B with getrs
/// on `device`, then prints the parameters and the leading 2x2 blocks of A and X.
///
/// A and B are created on the host, copied into USM device allocations, processed by
/// oneapi::mkl::lapack::getrf followed by oneapi::mkl::lapack::getrs (which depends on
/// the getrf event), and only B - now holding X - is copied back. The printed A is
/// therefore the host copy, which is never overwritten with the LU factors.
///
/// Asynchronous LAPACK/SYCL errors are reported by the queue's handler, which terminates
/// the process with exit code 2.
///
/// @param device  Device on which the queue is created.
/// @throws std::runtime_error if a required USM allocation returns null.
/// @note Any exception raised while USM allocations are live is rethrown after they have
///       been released.
void run_getrs_example(const sycl::device& device) {
    // Matrix sizes and leading dimensions
    std::int64_t m = 23;
    std::int64_t n = 23;
    std::int64_t nrhs = 23;
    std::int64_t lda = 32;
    std::int64_t ldb = 32;
    // Element counts: leading dimension times number of columns.
    std::int64_t A_size = n * lda;
    std::int64_t B_size = nrhs * ldb;
    std::int64_t ipiv_size = n;
    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;

    // Asynchronous error handler: print every LAPACK/SYCL exception, then terminate.
    auto error_handler = [](sycl::exception_list exceptions) {
        for (auto const& pending : exceptions) {
            try {
                std::rethrow_exception(pending);
            }
            catch (oneapi::mkl::lapack::exception const& e) {
                // Handle LAPACK related exceptions that happened during asynchronous call
                std::cerr << "Caught asynchronous LAPACK exception during GETRF or GETRS:"
                          << std::endl;
                std::cerr << "\t" << e.what() << std::endl;
                std::cerr << "\tinfo: " << e.info() << std::endl;
            }
            catch (sycl::exception const& e) {
                // Handle not LAPACK related exceptions that happened during asynchronous call
                std::cerr << "Caught asynchronous SYCL exception during GETRF or GETRS:"
                          << std::endl;
                std::cerr << "\t" << e.what() << std::endl;
            }
        }
        std::exit(2);
    };

    // Data preparation on host
    std::vector<float> A(A_size);
    std::vector<float> B(B_size);
    std::fill(A.begin(), A.end(), 0);
    std::fill(B.begin(), B.end(), 0);

    rand_matrix(A, trans, m, n, lda);
    rand_matrix(B, trans, n, nrhs, ldb);

    // Data preparation on selected device
    sycl::queue queue(device, error_handler);
    sycl::context context = queue.get_context();

    float* dev_A = nullptr;
    float* dev_B = nullptr;
    std::int64_t* dev_ipiv = nullptr;
    float* getrf_scratchpad = nullptr;
    float* getrs_scratchpad = nullptr;

    // Releases every USM allocation made so far; null pointers are skipped.
    auto release_usm = [&]() {
        void* const allocations[] = { getrs_scratchpad, getrf_scratchpad, dev_ipiv, dev_B,
                                      dev_A };
        for (void* allocation : allocations) {
            if (allocation) {
                sycl::free(allocation, queue);
            }
        }
    };

    try {
        // USM allocation functions templated on T take a number of elements, not bytes.
        dev_A = sycl::malloc_device<float>(A_size, queue);
        dev_B = sycl::malloc_device<float>(B_size, queue);
        dev_ipiv = sycl::malloc_device<std::int64_t>(ipiv_size, queue);

        // Scratchpad sizes are element counts of type float as well.
        std::int64_t getrf_scratchpad_size =
            oneapi::mkl::lapack::getrf_scratchpad_size<float>(queue, m, n, lda);
        std::int64_t getrs_scratchpad_size =
            oneapi::mkl::lapack::getrs_scratchpad_size<float>(queue, trans, n, nrhs, lda, ldb);
        getrf_scratchpad = sycl::malloc_shared<float>(getrf_scratchpad_size, device, context);
        getrs_scratchpad = sycl::malloc_shared<float>(getrs_scratchpad_size, device, context);

        if (!dev_A || !dev_B || !dev_ipiv) {
            throw std::runtime_error("Failed to allocate USM memory.");
        }
        // Skip checking getrf scratchpad memory allocation on rocsolver because with rocsolver
        // backend getrf does not use scrachpad memory
        if (device.is_cpu() || device.get_info<sycl::info::device::vendor_id>() != AMD_ID) {
            if (!getrf_scratchpad) {
                throw std::runtime_error("Failed to allocate USM memory.");
            }
        }
        // Skip checking getrs scratchpad memory allocation on cusolver/rocsolver because with
        // cusolver/rocsolver backend getrs does not use scrachpad memory
        if (device.is_cpu() || (device.get_info<sycl::info::device::vendor_id>() != NVIDIA_ID &&
                                device.get_info<sycl::info::device::vendor_id>() != AMD_ID)) {
            if (!getrs_scratchpad) {
                throw std::runtime_error("Failed to allocate USM memory.");
            }
        }

        // copy data from host to device
        queue.memcpy(dev_A, A.data(), A_size * sizeof(float)).wait();
        queue.memcpy(dev_B, B.data(), B_size * sizeof(float)).wait();

        // Execute on device; getrs is ordered after getrf through the event dependency
        sycl::event getrf_done = oneapi::mkl::lapack::getrf(
            queue, m, n, dev_A, lda, dev_ipiv, getrf_scratchpad, getrf_scratchpad_size);
        oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, dev_A, lda, dev_ipiv, dev_B, ldb,
                                   getrs_scratchpad, getrs_scratchpad_size, { getrf_done });

        // Wait until calculations are done
        queue.wait_and_throw();

        // Copy data from device back to host
        queue.memcpy(B.data(), dev_B, B_size * sizeof(float)).wait_and_throw();
    }
    catch (...) {
        // Do not leak USM memory when an allocation, a copy or a LAPACK call fails.
        release_usm();
        throw;
    }

    // The solution is on the host now, so the USM allocations are no longer needed.
    release_usm();

    // Print results
    std::cout << "\n\t\tGETRF and GETRS parameters:" << std::endl;
    std::cout << "\t\t\ttrans = "
              << (trans == oneapi::mkl::transpose::nontrans
                      ? "nontrans"
                      : (trans == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
              << std::endl;
    std::cout << "\t\t\tm = " << m << ", n = " << n << ", nrhs = " << nrhs << std::endl;
    std::cout << "\t\t\tlda = " << lda << ", ldb = " << ldb << std::endl;

    std::cout << "\n\t\tOutputting 2x2 block of A and X matrices:" << std::endl;
    // output the top 2x2 block of A matrix
    print_2x2_matrix_values(A.data(), lda, "A");

    // output the top 2x2 block of X matrix
    print_2x2_matrix_values(B.data(), ldb, "X");
}

/// Writes the LU factorization/solve example's introductory banner to std::cout.
///
/// The banner starts and ends with an empty line; every line is terminated with
/// std::endl, so the stream is flushed after each one.
void print_example_banner() {
    static const char* const banner_lines[] = {
        "",
        "########################################################################",
        "# LU Factorization and Solve Example: ",
        "# ",
        "# Computes LU Factorization A = P * L * U",
        "# and uses it to solve for X in a system of linear equations:",
        "#   AX = B",
        "# where A is a general dense matrix and B is a matrix whose columns",
        "# are the right-hand sides for the systems of equations.",
        "# ",
        "# Using apis:",
        "#   getrf and getrs",
        "# ",
        "# Using single precision (float) data type",
        "# ",
        "# Device will be selected during runtime.",
        "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify",
        "# available devices",
        "# ",
        "########################################################################",
        "",
    };

    for (const char* text : banner_lines) {
        std::cout << text << std::endl;
    }
}

/// Entry point of the LAPACK getrf/getrs example.
///
/// Prints the banner, default-constructs a SYCL device, reports which device was picked
/// and runs run_getrs_example on it.
///
/// @return 0 on success, 1 if a LAPACK, SYCL or standard exception was caught.
int main(int /*argc*/, char** /*argv*/) {
    print_example_banner();

    try {
        sycl::device dev;

        // Every device that is not a GPU is reported as a CPU device.
        const char* const device_kind = dev.is_gpu() ? "GPU" : "CPU";
        std::cout << "Running LAPACK getrs example on " << device_kind << " device."
                  << std::endl;
        std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>() << std::endl;

        std::cout << "Running with single precision real data type:" << std::endl;
        run_getrs_example(dev);
        std::cout << "LAPACK GETRS USM example ran OK" << std::endl;
        return 0;
    }
    catch (oneapi::mkl::lapack::exception const& lapack_error) {
        // LAPACK-specific failure raised by a synchronous call
        std::cerr << "Caught synchronous LAPACK exception:" << std::endl;
        std::cerr << "\t" << lapack_error.what() << std::endl;
        std::cerr << "\tinfo: " << lapack_error.info() << std::endl;
    }
    catch (sycl::exception const& sycl_error) {
        // SYCL failure that is not LAPACK related, raised by a synchronous call
        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
        std::cerr << "\t" << sycl_error.what() << std::endl;
        std::cerr << "\tSYCL error code: " << sycl_error.code().value() << std::endl;
    }
    catch (std::exception const& std_error) {
        // Any other standard exception raised by a synchronous call
        std::cerr << "Caught synchronous std::exception:" << std::endl;
        std::cerr << "\t" << std_error.what() << std::endl;
    }

    // Only reached when one of the handlers above ran.
    return 1;
}

使用DPC++ (Data Parallel C++) 和 oneAPI 数学核心库(oneMKL)生成均匀分布随机数的示例。使用SYCL (Standard C++ for Heterogeneous Computing) 编程模型,能够在不同硬件设备(如CPU、GPU)上生成随机数,具体使用了Philox4x32x10随机数生成器和Unified Shared Memory (USM) API来进行内存管理。

  • seed:随机数生成器的种子值,确保生成相同的随机序列。n:要生成的随机数的数量,这里为1000个。a 和 b:随机数的范围,从0.0到10.0。定义了一个异步异常处理器,当在SYCL的队列执行过程中发生异常时,能够捕捉并处理这些异常。
  • sycl::queue queue(dev, exception_handler); 创建了一个SYCL队列,该队列会根据传入的设备对象(dev)选择执行平台(如CPU或GPU),并在出现异常时调用exception_handler。
  • oneapi::mkl::rng::default_engine engine(queue, seed); 使用默认的RNG引擎,并根据给定的种子进行初始化。oneapi::mkl::rng::uniform<float> distribution(a, b); 使用均匀分布生成器,生成范围在a到b之间的单精度浮点数。
  • 使用sycl::malloc_device为设备端分配USM内存,用来存放随机数。oneapi::mkl::rng::generate生成随机数并将其存放在设备端的USM内存中。随后将生成的随机数从设备端内存复制回主机端进行处理。
  • 生成完毕后,代码将前10个随机数输出到控制台。调用run_uniform_example来生成随机数并处理异常。如果运行成功,则输出随机数生成正常结束的信息。
// stl includes
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>

// oneMKL/SYCL includes
#if __has_include(<sycl/sycl.hpp>)
#include <sycl/sycl.hpp>
#else
#include <CL/sycl.hpp>
#endif
#include "oneapi/mkl.hpp"

// local includes
#include "example_helper.hpp"

/// Generates 1000 uniformly distributed single-precision numbers in the range given by
/// a = 0.0 and b = 10.0 on `dev` with oneMKL's default RNG engine (seed 777), copies them
/// to the host and prints the generation parameters followed by the first 10 values.
///
/// Asynchronous SYCL errors are reported by the queue's handler, which terminates the
/// process with exit code 2.
///
/// @param dev  Device on which the queue is created.
/// @throws std::runtime_error if the device allocation returns null.
/// @note Any exception raised while the device buffer is live is rethrown after the
///       buffer has been released.
void run_uniform_example(const sycl::device& dev) {
    constexpr std::uint64_t seed = 777;
    constexpr std::size_t n = 1000;
    constexpr std::size_t n_print = 10;

    // Catch asynchronous exceptions: print every SYCL exception, then terminate.
    auto exception_handler = [](sycl::exception_list exceptions) {
        for (std::exception_ptr const& pending : exceptions) {
            try {
                std::rethrow_exception(pending);
            }
            catch (sycl::exception const& e) {
                std::cerr << "Caught asynchronous SYCL exception during generation:" << std::endl;
                std::cerr << "\t" << e.what() << std::endl;
            }
        }
        std::exit(2);
    };

    sycl::queue queue(dev, exception_handler);

    // set scalar Type values
    float a(0.0);
    float b(10.0);

    oneapi::mkl::rng::default_engine engine(queue, seed);
    oneapi::mkl::rng::uniform<float> distribution(a, b);
    std::vector<float> r(n);

    // Data preparation on selected device; malloc_device<T> takes a number of elements,
    // not bytes
    float* dev_r = sycl::malloc_device<float>(n, queue);
    if (!dev_r) {
        throw std::runtime_error("Failed to allocate USM memory.");
    }

    try {
        sycl::event event_out = oneapi::mkl::rng::generate(distribution, engine, n, dev_r);
        event_out.wait_and_throw();
        queue.memcpy(r.data(), dev_r, n * sizeof(float)).wait_and_throw();
    }
    catch (...) {
        // Do not leak the device buffer when generation or the copy fails.
        sycl::free(dev_r, queue);
        throw;
    }

    // The numbers are on the host now, so the device buffer is no longer needed.
    sycl::free(dev_r, queue);

    std::cout << "\t\tgeneration parameters:" << std::endl;
    std::cout << "\t\t\tseed = " << seed << ", a = " << a << ", b = " << b << std::endl;

    std::cout << "\t\tOutput of generator:" << std::endl;
    std::cout << "\t\t\tfirst " << n_print << " numbers of " << n << ": " << std::endl;
    // Index with std::size_t to match n_print and avoid a signed/unsigned comparison.
    for (std::size_t i = 0; i < n_print; ++i) {
        std::cout << r.at(i) << " ";
    }
    std::cout << std::endl;
}

/// Writes the RNG uniform example's introductory banner to std::cout.
///
/// The banner starts and ends with an empty line; every entry is terminated with
/// std::endl, so the stream is flushed after each one. The title entry spans two
/// output lines because it contains an embedded newline.
void print_example_banner() {
    static const char* const banner_lines[] = {
        "",
        "########################################################################",
        "# Generate uniformly distributed random numbers with philox4x32x10\n# generator example: ",
        "# ",
        "# Using APIs:",
        "#   default_engine uniform",
        "# ",
        "# Using single precision (float) data type",
        "# ",
        "# Device will be selected during runtime.",
        "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify",
        "# available devices",
        "# ",
        "########################################################################",
        "",
    };

    for (const char* text : banner_lines) {
        std::cout << text << std::endl;
    }
}

/// Entry point of the RNG uniform example.
///
/// Prints the banner, default-constructs a SYCL device, reports which device was picked
/// and runs run_uniform_example on it.
///
/// @return 0 on success, 1 if a SYCL or standard exception was caught.
int main(int /*argc*/, char** /*argv*/) {
    print_example_banner();

    try {
        sycl::device my_dev;

        // Every device that is not a GPU is reported as a CPU device.
        const char* const device_kind = my_dev.is_gpu() ? "GPU" : "CPU";
        std::cout << "Running RNG uniform usm example on " << device_kind << " device"
                  << std::endl;
        std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
                  << std::endl;
        std::cout << "Running with single precision real data type:" << std::endl;

        run_uniform_example(my_dev);
        std::cout << "Random number generator with uniform distribution ran OK" << std::endl;
        return 0;
    }
    catch (sycl::exception const& sycl_error) {
        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
        std::cerr << "\t" << sycl_error.what() << std::endl;
        std::cerr << "\tSYCL error code: " << sycl_error.code().value() << std::endl;
    }
    catch (std::exception const& std_error) {
        std::cerr << "Caught std::exception during generation:" << std::endl;
        std::cerr << "\t" << std_error.what() << std::endl;
    }

    // Only reached when one of the handlers above ran.
    return 1;
}