diff --git a/CMakeLists.txt b/CMakeLists.txt index b23e18975..0559484e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,6 @@ option(ENABLE_MKLCPU_BACKEND "Enable the Intel oneMKL CPU backend for supported option(ENABLE_MKLGPU_BACKEND "Enable the Intel oneMKL GPU backend for supported interfaces" ON) if(ENABLE_MKLCPU_BACKEND) option(ENABLE_MKLCPU_THREAD_TBB "Enable the use of Intel TBB with the oneMath CPU backend" ON) - option(ENABLE_MKLCPU_THREAD_OMP "Enable the use of Intel OpenMP with the oneMath CPU backend" OFF) endif() option(ENABLE_ARMPL_BACKEND "Enable the ArmPl backend for BLAS/LAPACK interface" OFF) @@ -51,6 +50,7 @@ endif() option(ENABLE_CUBLAS_BACKEND "Enable the cuBLAS backend for the BLAS interface" OFF) option(ENABLE_ROCBLAS_BACKEND "Enable the rocBLAS backend for the BLAS interface" OFF) option(ENABLE_NETLIB_BACKEND "Enable the Netlib backend for the BLAS interface" OFF) +option(ENABLE_OPENBLAS_BACKEND "Enable the OpenBLAS backend for the BLAS interface" OFF) option(ENABLE_GENERIC_BLAS_BACKEND "Enable the generic BLAS backend for the BLAS interface. Cannot be used with other BLAS backends." 
OFF) # rand @@ -95,6 +95,7 @@ if(ENABLE_MKLCPU_BACKEND OR ENABLE_CUBLAS_BACKEND OR ENABLE_ROCBLAS_BACKEND OR ENABLE_NETLIB_BACKEND + OR ENABLE_OPENBLAS_BACKEND OR ENABLE_GENERIC_BLAS_BACKEND OR ENABLE_ARMPL_BACKEND) list(APPEND DOMAINS_LIST "blas") @@ -133,6 +134,7 @@ if(ENABLE_GENERIC_BLAS_BACKEND OR ENABLE_MKLGPU_BACKEND OR ENABLE_CUBLAS_BACKEND OR ENABLE_ROCBLAS_BACKEND + OR ENABLE_OPENBLAS_BACKEND OR ENABLE_NETLIB_BACKEND)) message(FATAL_ERROR "ENABLE_GENERIC_BLAS_BACKEND cannot be enabled at the same time as other BLAS backends.") endif() @@ -257,12 +259,14 @@ if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND) set(MKL_INTERFACE ilp64) if(ENABLE_MKLCPU_THREAD_TBB) set(MKL_THREADING tbb_thread) - elseif(ENABLE_MKLCPU_THREAD_OMP) - set(MKL_THREADING intel_thread) else() set(MKL_THREADING sequential) endif() - set(MKL_LINK dynamic) + if(BUILD_SHARED_LIBS AND NOT WIN32) + set(MKL_LINK dynamic) + else() + set(MKL_LINK static) + endif() # Enable SYCL API set(DPCPP_COMPILER ON) set(SYCL_COMPILER ON) @@ -319,8 +323,6 @@ endif() if(DEFINED REF_BLAS_ROOT OR DEFINED REF_LAPACK_ROOT) find_file(ONEMATH_REF_BLAS_LIBNAME NAMES blas.dll libblas.so HINTS ${REF_BLAS_ROOT} ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64) find_file(ONEMATH_REF_CBLAS_LIBNAME NAMES cblas.dll libcblas.so HINTS ${REF_BLAS_ROOT} ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64) - find_file(ONEMATH_REF_LAPACKE_LIBNAME NAMES lapacke64.dll liblapacke64.so HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64) - find_file(ONEMATH_REF_LAPACK_LIBNAME NAMES lapack64.dll liblapack64.so HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64) endif() # Add source directory and output to bin/ diff --git a/cmake/FindOpenBLAS.cmake b/cmake/FindOpenBLAS.cmake new file mode 100644 index 000000000..f68586e97 --- /dev/null +++ b/cmake/FindOpenBLAS.cmake @@ -0,0 +1,74 @@ +#=============================================================================== +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 
2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+# FindOpenBLAS: locates the OpenBLAS library and cblas.h, sets OpenBLAS_FOUND and
+# OPENBLAS_LIB_DIR, and defines the imported target ONEMATH::OPENBLAS::OPENBLAS.
+# Search roots, first match wins: the OPENBLAS_DIR CMake variable, the
+# OPENBLAS_DIR environment variable, then CMAKE_PREFIX_PATH.
+
+include_guard()
+include(FindPackageHandleStandardArgs)
+
+set(_OPENBLAS_HINTS "")
+if(DEFINED OPENBLAS_DIR)
+  set(_OPENBLAS_HINTS "${OPENBLAS_DIR}")
+elseif(DEFINED ENV{OPENBLAS_DIR})
+  set(_OPENBLAS_HINTS "$ENV{OPENBLAS_DIR}")
+elseif(CMAKE_PREFIX_PATH)
+  set(_OPENBLAS_HINTS "${CMAKE_PREFIX_PATH}")
+endif()
+
+find_library(OPENBLAS_LIBRARY
+  NAMES openblas libopenblas
+  HINTS ${_OPENBLAS_HINTS}
+  PATH_SUFFIXES lib lib64
+)
+
+find_path(OPENBLAS_INCLUDE
+  NAMES cblas.h
+  HINTS ${_OPENBLAS_HINTS}
+  PATH_SUFFIXES include include/openblas
+)
+
+find_package_handle_standard_args(OpenBLAS
+  REQUIRED_VARS OPENBLAS_LIBRARY OPENBLAS_INCLUDE
+)
+
+if(OpenBLAS_FOUND)
+  get_filename_component(OPENBLAS_LIB_DIR "${OPENBLAS_LIBRARY}" DIRECTORY)
+  # Define the imported target only once so a repeated find_package() cannot fail on a duplicate.
+  if(NOT TARGET ONEMATH::OPENBLAS::OPENBLAS)
+    add_library(ONEMATH::OPENBLAS::OPENBLAS UNKNOWN IMPORTED)
+    set_target_properties(ONEMATH::OPENBLAS::OPENBLAS PROPERTIES
+      IMPORTED_LOCATION "${OPENBLAS_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES "${OPENBLAS_INCLUDE}"
+    )
+    # Pass the library directory to consumers as an rpath link option (UNIX only).
+    if(UNIX AND OPENBLAS_LIB_DIR)
+      set_target_properties(ONEMATH::OPENBLAS::OPENBLAS PROPERTIES
+        INTERFACE_LINK_OPTIONS "-Wl,-rpath,${OPENBLAS_LIB_DIR}"
+      )
+    endif()
+  endif()
+endif()
+
+mark_as_advanced(
+  OPENBLAS_LIBRARY
+  OPENBLAS_INCLUDE
+)
diff --git a/cmake/mkl/MKLConfig.cmake b/cmake/mkl/MKLConfig.cmake
index 9fd0693c7..219202995 100644
--- a/cmake/mkl/MKLConfig.cmake
+++ b/cmake/mkl/MKLConfig.cmake
@@ -980,6 
+980,11 @@ if(NOT MKL_THREADING STREQUAL "tbb_thread" AND MKL_THREADING MATCHES "_thread") "windows/compiler/lib/${MKL_ARCH}_win" "../compiler/lib/${MKL_ARCH}_lin" "../compiler/lib/${MKL_ARCH}_win" "../compiler/lib/${MKL_ARCH}" "../compiler/lib" "compiler/lib" + "../../compiler/latest/lib" + "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}" + "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}_lin" + "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}" + "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}_win" NO_DEFAULT_PATH) if(WIN32) set(OMP_DLLNAME ${LIB_PREFIX}${MKL_OMP_LIB}.dll) @@ -990,6 +995,11 @@ if(NOT MKL_THREADING STREQUAL "tbb_thread" AND MKL_THREADING MATCHES "_thread") "redist/${MKL_ARCH}" "redist/${MKL_ARCH}_win" "redist/${MKL_ARCH}_win/compiler" "../redist/${MKL_ARCH}/compiler" "../compiler/lib" + "../../compiler/latest/bin" + "../../compiler/latest/windows/redist/${MKL_ARCH}_win" + "../../compiler/latest/windows/redist/${MKL_ARCH}_win/compiler" + "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win" + "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win/compiler" NO_DEFAULT_PATH) if(MKL_LINK STREQUAL "sdl" AND NOT OMP_DLL_DIR) mkl_message(WARNING "${OMP_DLLNAME} not found. 
MKL_ENV will not contain paths for ${OMP_DLLNAME}.") diff --git a/docs/building_the_project_with_adaptivecpp.rst b/docs/building_the_project_with_adaptivecpp.rst index 1dc1927b9..99a10a585 100644 --- a/docs/building_the_project_with_adaptivecpp.rst +++ b/docs/building_the_project_with_adaptivecpp.rst @@ -104,10 +104,7 @@ The most important supported build options are: - False * - ENABLE_MKLCPU_THREAD_TBB - True, False - - True - * - ENABLE_MKLCPU_THREAD_OMP - - True, False - - False + - True * - BUILD_FUNCTIONAL_TESTS - True, False - True diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst index 5f98958b6..2b428dd86 100644 --- a/docs/building_the_project_with_dpcpp.rst +++ b/docs/building_the_project_with_dpcpp.rst @@ -135,10 +135,7 @@ The most important supported build options are: - False * - ENABLE_MKLCPU_THREAD_TBB - True, False - - True - * - ENABLE_MKLCPU_THREAD_OMP - - True, False - - False + - True * - ENABLE_GENERIC_BLAS_BACKEND - True, False - False diff --git a/docs/building_the_project_with_openblas.rst b/docs/building_the_project_with_openblas.rst new file mode 100644 index 000000000..402bfafec --- /dev/null +++ b/docs/building_the_project_with_openblas.rst @@ -0,0 +1,175 @@ +.. _building_the_project_with_openblas: + +Building the Project with OpenBLAS +================================== + +This page describes building oneMath with the OpenBLAS backend using +different SYCL implementations: + +- DPC++ (Intel oneAPI) +- AdaptiveCpp + +Environment Setup +################# + +#. + Build and install OpenBLAS. The installation prefix will be referred to as + ``<path to openblas>``. + +#. + Clone this project. The root directory will be referred to as + ``<path to onemath>``. + +#. + (Optional) Install a reference BLAS/LAPACK implementation for functional + testing. The installation prefix will be referred to as + ``<path to reference>``. + +#. + Ensure required shared libraries are visible at runtime: + +.. 
code-block:: bash + + export LD_LIBRARY_PATH=<path to SYCL compiler runtime libraries>:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=<path to openblas>/lib:$LD_LIBRARY_PATH + + # Optional (only if functional tests are enabled) + export LD_LIBRARY_PATH=<path to reference>/lib:$LD_LIBRARY_PATH + + +Building with DPC++ +################### + +If using Intel oneAPI DPC++, load the compiler environment: + +.. code-block:: bash + + source <path to oneAPI setvars.sh> + +Build commands: + +.. code-block:: bash + + cd <path to onemath> + mkdir build && cd build + + cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=<C compiler> \ + -DCMAKE_CXX_COMPILER=<C++ compiler> \ + -DONEMATH_SYCL_IMPLEMENTATION=dpc++ \ + -DENABLE_MKLCPU_BACKEND=False \ + -DENABLE_MKLGPU_BACKEND=False \ + -DENABLE_NETLIB_BACKEND=False \ + -DENABLE_OPENBLAS_BACKEND=True \ + -DOPENBLAS_DIR=<path to openblas> \ + -DBUILD_FUNCTIONAL_TESTS=True \ + -DBUILD_EXAMPLES=True \ + -DCMAKE_INSTALL_PREFIX=<path to install directory> \ + -DREF_BLAS_ROOT=<path to reference> \ + -DREF_LAPACK_ROOT=<path to reference> + + make -j && make install + + +Building with AdaptiveCpp +######################### + +If using AdaptiveCpp, load the compiler environment: + +.. code-block:: bash + + source <path to AdaptiveCpp environment script> + +Build commands: + +.. code-block:: bash + + cd <path to onemath> + mkdir build && cd build + + cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=<C compiler> \ + -DCMAKE_CXX_COMPILER=<C++ compiler> \ + -DONEMATH_SYCL_IMPLEMENTATION=adaptivecpp \ + -DACPP_TARGETS=omp \ + -DENABLE_MKLCPU_BACKEND=False \ + -DENABLE_MKLGPU_BACKEND=False \ + -DENABLE_NETLIB_BACKEND=False \ + -DENABLE_OPENBLAS_BACKEND=True \ + -DOPENBLAS_DIR=<path to openblas> \ + -DBUILD_FUNCTIONAL_TESTS=False \ + -DBUILD_EXAMPLES=True \ + -DCMAKE_INSTALL_PREFIX=<path to install directory> + + make -j && make install + + +Common Build Options +#################### + +.. 
list-table:: + :header-rows: 1 + + * - CMake Option + - Supported Values + - Description + * - ONEMATH_SYCL_IMPLEMENTATION + - dpc++, adaptivecpp + - Selects the SYCL implementation + * - ENABLE_OPENBLAS_BACKEND + - True, False + - Enables the OpenBLAS backend + * - OPENBLAS_DIR + - Path + - Path to the OpenBLAS installation + * - ENABLE_MKLCPU_BACKEND + - True, False + - Enables/disables MKL CPU backend + * - ENABLE_MKLGPU_BACKEND + - True, False + - Enables/disables MKL GPU backend + * - ENABLE_NETLIB_BACKEND + - True, False + - Enables/disables Netlib backend + * - BUILD_FUNCTIONAL_TESTS + - True, False + - Enables functional tests + * - BUILD_EXAMPLES + - True, False + - Builds example programs + * - CMAKE_INSTALL_PREFIX + - Path + - Installation directory + + +Running Tests +############# + +Running Test Binaries Directly +------------------------------ + +The BLAS test executables are located in the ``bin`` directory inside the +build folder. + +.. code-block:: bash + + cd <path to onemath>/build + + # Run BLAS runtime tests + ./bin/test_main_blas_rt + + # Run BLAS compile-time tests + ./bin/test_main_blas_ct + + +Notes +##### + +* OpenBLAS is used as the CPU BLAS backend in this configuration. +* Ensure that OpenBLAS shared libraries are available via + ``LD_LIBRARY_PATH`` or system linker configuration. +* AdaptiveCpp requires proper target configuration via ``ACPP_TARGETS``. +* Functional tests require a reference BLAS/LAPACK installation. +* Test binaries are generated only if functional tests are enabled. 
\ No newline at end of file diff --git a/include/oneapi/math/blas.hpp b/include/oneapi/math/blas.hpp index 23d198ec5..269117d69 100644 --- a/include/oneapi/math/blas.hpp +++ b/include/oneapi/math/blas.hpp @@ -49,6 +49,9 @@ #ifdef ONEMATH_ENABLE_NETLIB_BACKEND #include "oneapi/math/blas/detail/netlib/blas_ct.hpp" #endif +#ifdef ONEMATH_ENABLE_OPENBLAS_BACKEND +#include "oneapi/math/blas/detail/openblas/blas_ct.hpp" +#endif #ifdef ONEMATH_ENABLE_ARMPL_BACKEND #include "oneapi/math/blas/detail/armpl/blas_ct.hpp" #endif diff --git a/include/oneapi/math/blas/detail/blas_ct_backends.hpp b/include/oneapi/math/blas/detail/blas_ct_backends.hpp index 189978edd..a5efb1cdd 100644 --- a/include/oneapi/math/blas/detail/blas_ct_backends.hpp +++ b/include/oneapi/math/blas/detail/blas_ct_backends.hpp @@ -51,6 +51,10 @@ namespace column_major { #define BACKEND netlib #include "blas_ct_backends.hxx" #undef BACKEND +#define BACKEND openblas +#include "blas_ct_backends.hxx" +#undef BACKEND + #define BACKEND armpl #include "blas_ct_backends.hxx" #undef BACKEND @@ -76,6 +80,9 @@ namespace row_major { #define BACKEND netlib #include "blas_ct_backends.hxx" #undef BACKEND +#define BACKEND openblas +#include "blas_ct_backends.hxx" +#undef BACKEND #define BACKEND armpl #include "blas_ct_backends.hxx" #undef BACKEND diff --git a/include/oneapi/math/blas/detail/blas_loader.hxx b/include/oneapi/math/blas/detail/blas_loader.hxx index 931e25ed2..1eee0067a 100644 --- a/include/oneapi/math/blas/detail/blas_loader.hxx +++ b/include/oneapi/math/blas/detail/blas_loader.hxx @@ -19,6 +19,7 @@ // Buffer APIs + ONEMATH_EXPORT void herk(oneapi::math::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, std::int64_t lda, float beta, diff --git a/include/oneapi/math/blas/detail/onemath_blas_backends.hxx b/include/oneapi/math/blas/detail/onemath_blas_backends.hxx index 5f5f6881e..592013e1f 100644 --- 
a/include/oneapi/math/blas/detail/onemath_blas_backends.hxx +++ b/include/oneapi/math/blas/detail/onemath_blas_backends.hxx @@ -19,6 +19,7 @@ // Buffer APIs + ONEMATH_EXPORT void gemm(sycl::queue& queue, oneapi::math::transpose transa, oneapi::math::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, diff --git a/include/oneapi/math/blas/detail/openblas/blas_ct.hpp b/include/oneapi/math/blas/detail/openblas/blas_ct.hpp new file mode 100644 index 000000000..5a5f02849 --- /dev/null +++ b/include/oneapi/math/blas/detail/openblas/blas_ct.hpp @@ -0,0 +1,57 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#ifndef _DETAIL_OPENBLAS_BLAS_CT_HPP_
+#define _DETAIL_OPENBLAS_BLAS_CT_HPP_
+
+#if __has_include(<sycl/sycl.hpp>) // prefer the SYCL 2020 header when the toolchain ships it
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+#include <complex> // std::complex in the wrapper signatures of blas_ct.hxx
+#include <cstdint> // std::int64_t in the wrapper signatures of blas_ct.hxx
+
+#include "oneapi/math/types.hpp"
+#include "oneapi/math/detail/backends.hpp"
+#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp"
+#include "oneapi/math/blas/detail/blas_ct_backends.hpp"
+
+namespace oneapi {
+namespace math {
+namespace blas {
+namespace column_major {
+
+#define MAJOR column_major // blas_ct.hxx forwards to oneapi::math::blas::openblas::MAJOR::<routine>
+#include "blas_ct.hxx"
+#undef MAJOR
+
+} //namespace column_major
+namespace row_major {
+
+#define MAJOR row_major // same wrappers, expanded a second time for the row-major layout
+#include "blas_ct.hxx"
+#undef MAJOR
+
+} //namespace row_major
+} //namespace blas
+} //namespace math
+} //namespace oneapi
+
+#endif //_DETAIL_OPENBLAS_BLAS_CT_HPP_
diff --git a/include/oneapi/math/blas/detail/openblas/blas_ct.hxx b/include/oneapi/math/blas/detail/openblas/blas_ct.hxx
new file mode 100644
index 000000000..ed0e7a890
--- /dev/null
+++ b/include/oneapi/math/blas/detail/openblas/blas_ct.hxx
@@ -0,0 +1,4292 @@
+/*******************************************************************************
+* Copyright 2020-2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +// Buffer APIs + +void herk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, beta, c, ldc); +} + +void herk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, + std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, beta, c, ldc); +} + +void scal(backend_selector selector, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); +} + +void scal(backend_selector selector, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); +} + +void scal(backend_selector selector, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); +} + +void scal(backend_selector selector, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); +} + +void scal(backend_selector selector, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); +} + +void scal(backend_selector selector, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx) { + 
oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); +} + +void trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, 
std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void spr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { + oneapi::math::blas::openblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); +} + +void spr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { + oneapi::math::blas::openblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, 
k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t 
stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void gemm_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, stride_a, b, ldb, stride_b, beta, + c, ldc, stride_c, batch_size); +} + +void syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, beta, c, ldc); +} + +void syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { 
+ oneapi::math::blas::openblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, beta, c, ldc); +} + +void syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, beta, c, ldc); +} + +void syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, beta, c, ldc); +} + +void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, stride_a, beta, c, ldc, stride_c, + batch_size); +} + +void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, stride_a, beta, c, ldc, stride_c, + batch_size); +} + +void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, 
std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, stride_a, beta, c, ldc, stride_c, + batch_size); +} + +void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, stride_a, beta, c, ldc, stride_c, + batch_size); +} + +void her2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a, lda); +} + +void her2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a, lda); +} + +void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, 
sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +void rot(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { + oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); +} + +void rot(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { + oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); +} + +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { + oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); +} + +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { + oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); +} + +void axpy(backend_selector selector, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); +} + +void axpy(backend_selector selector, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); +} + +void axpy(backend_selector selector, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); +} + +void axpy(backend_selector selector, std::int64_t n, std::complex 
alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); +} + +void axpy_batch(backend_selector selector, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, + y, incy, stridey, batch_size); +} + +void axpy_batch(backend_selector selector, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, + y, incy, stridey, batch_size); +} + +void axpy_batch(backend_selector selector, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, + y, incy, stridey, batch_size); +} + +void axpy_batch(backend_selector selector, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, + y, incy, stridey, batch_size); +} + +void axpby(backend_selector selector, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, + incy); +} + +void axpby(backend_selector selector, std::int64_t n, double alpha, 
+ sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, + incy); +} + +void axpby(backend_selector selector, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, + incy); +} + +void axpby(backend_selector selector, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, + incy); +} + +void sdsdot(backend_selector selector, std::int64_t n, float sb, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, + result); +} + +void gerc(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, + lda); +} + +void gerc(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, + lda); +} + +void syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { + 
oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +void syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +void syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +void syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +void gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, 
n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, x, incx, stridex, beta, y, incy, stridey, + batch_size); +} + +void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, x, incx, stridex, beta, y, incy, stridey, + batch_size); +} + +void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 
1>& x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, x, incx, stridex, beta, y, incy, stridey, + batch_size); +} + +void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, x, incx, stridex, beta, y, incy, stridey, + batch_size); +} + +void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, + stridea, x, incx, stridex, c, ldc, stridec, + batch_size); +} + +void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, + stridea, x, incx, stridex, c, ldc, stridec, + batch_size); +} + +void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + 
std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, + stridea, x, incx, stridex, c, ldc, stridec, + batch_size); +} + +void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, + stridea, x, incx, stridex, c, ldc, stridec, + batch_size); +} + +void her(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, + lda); +} + +void her(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, + lda); +} + +void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { + oneapi::math::blas::openblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); +} + +void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { + oneapi::math::blas::openblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); +} + +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result, + oneapi::math::index_base base) { + 
oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, base); +} + +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result, + oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, base); +} + +void iamin(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, base); +} + +void iamin(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, base); +} + +void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, + beta, y, incy); +} + +void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, + beta, y, incy); +} + +void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, + beta, y, incy); +} + +void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + 
double beta, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, + beta, y, incy); +} + +void gemm_bias(backend_selector selector, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { + oneapi::math::blas::openblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, + n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, + co); +} + +void gemm_bias(backend_selector selector, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { + oneapi::math::blas::openblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, + n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, + co); +} + +void gemm_bias(backend_selector selector, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { + oneapi::math::blas::openblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, + n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, + co); +} + +void gemm_bias(backend_selector selector, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { + 
oneapi::math::blas::openblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, + n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, + co); +} + +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); +} + +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); +} + +void swap(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); +} + +void swap(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); +} + +void geru(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, + lda); +} + +void geru(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, + lda); +} + +void nrm2(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); +} + +void nrm2(backend_selector selector, std::int64_t 
n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); +} + +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); +} + +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 
1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +} + +void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a, lda); +} + +void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { + 
oneapi::math::blas::openblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a, lda); +} + +void ger(backend_selector selector, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, + lda); +} + +void ger(backend_selector selector, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, + lda); +} + +void trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t 
m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void dotu(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { + oneapi::math::blas::openblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); +} + +void dotu(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { + oneapi::math::blas::openblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); +} + +void hemm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +void hemm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { + oneapi::math::blas::openblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a); +} + +void hpr2(backend_selector selector, uplo 
upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { + oneapi::math::blas::openblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a); +} + +void gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy); +} + +void gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy); +} + +void gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy); +} + +void gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy); +} + +void tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag 
unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +void symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, 
c, ldc); +} + +void symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +void symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc); +} + +void dotc(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { + oneapi::math::blas::openblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); +} + +void dotc(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { + oneapi::math::blas::openblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); +} + +void syr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, + std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, + lda); +} + +void syr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, + std::int64_t lda) { + oneapi::math::blas::openblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, + lda); +} + +void trmm(backend_selector selector, side left_right, uplo upper_lower, 
+ transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb); +} + +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { + oneapi::math::blas::openblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); +} + +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { + oneapi::math::blas::openblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); +} + +void tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag 
unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, x, incx); +} + +void trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} 
+ +void trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + a, lda, x, incx); +} + +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); +} + +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); +} + +void copy(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); +} + +void copy(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); +} + +void copy_batch(backend_selector selector, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, + incy, stridey, batch_size); +} + +void copy_batch(backend_selector selector, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, + incy, stridey, batch_size); +} + +void copy_batch(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, 
std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, + incy, stridey, batch_size); +} + +void copy_batch(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, + incy, stridey, batch_size); +} + +void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, + k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +void gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& 
c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, + k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +void gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, + k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +void gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, + k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +void asum(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); +} + +void asum(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); +} + +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); +} + +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); +} + +void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, 
sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, + x, incx, beta, y, incy); +} + +void tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { + oneapi::math::blas::openblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, + k, a, lda, x, incx); +} + +void spr2(backend_selector selector, uplo upper_lower, 
std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { + oneapi::math::blas::openblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a); +} + +void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { + oneapi::math::blas::openblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, + incy, a); +} + +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result, + oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, base); +} + +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result, + oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, base); +} + +void iamax(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, base); +} + +void iamax(backend_selector selector, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, base); +} + +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { + oneapi::math::blas::openblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); +} + +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { + 
oneapi::math::blas::openblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); +} + +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); +} + +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); +} + +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { + oneapi::math::blas::openblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); +} + +void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, + trans, unit_diag, m, n, alpha, a, lda, stride_a, + b, ldb, stride_b, batch_size); +} + +void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, + trans, unit_diag, m, n, alpha, a, lda, stride_a, + b, ldb, stride_b, batch_size); +} + +void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + 
std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, + trans, unit_diag, m, n, alpha, a, lda, stride_a, + b, ldb, stride_b, batch_size); +} + +void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, + trans, unit_diag, m, n, alpha, a, lda, stride_a, + b, ldb, stride_b, batch_size); +} + +void her2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +void her2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); +} + +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); +} + +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { + 
oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); +} + +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); +} + +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); +} + +void symv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void symv(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { + oneapi::math::blas::openblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, + incx, beta, y, incy); +} + +void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, + lda, stride_a, b, ldb, stride_b, batch_size); +} + +void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, + lda, stride_a, b, ldb, stride_b, batch_size); +} 
+ +void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, + lda, stride_a, b, ldb, stride_b, batch_size); +} + +void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, + lda, stride_a, b, ldb, stride_b, batch_size); +} + +void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, + std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, + lda, ldb, stride, batch_size); +} + +void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, + std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, + lda, ldb, stride, batch_size); +} + +void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, + lda, ldb, stride, batch_size); +} + +void imatcopy_batch(backend_selector selector, transpose trans, 
std::int64_t m, + std::int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, + lda, ldb, stride, batch_size); +} + +void omatadd_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, + alpha, a, lda, stride_a, beta, b, ldb, + stride_b, c, ldc, stride_c, batch_size); +} + +void omatadd_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, + alpha, a, lda, stride_a, beta, b, ldb, + stride_b, c, ldc, stride_c, batch_size); +} + +void omatadd_batch(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, + alpha, a, lda, stride_a, beta, b, ldb, + stride_b, c, ldc, stride_c, batch_size); +} + +void omatadd_batch(backend_selector selector, transpose 
transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { + oneapi::math::blas::openblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, + alpha, a, lda, stride_a, beta, b, ldb, + stride_b, c, ldc, stride_c, batch_size); +} + +void omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, + ldb); +} + +void omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, + ldb); +} + +void omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, + ldb); +} + +void omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, + ldb); +} + +void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, + std::int64_t strideb) { + 
oneapi::math::blas::openblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, b, ldb, strideb); +} + +void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, + std::int64_t strideb) { + oneapi::math::blas::openblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, b, ldb, strideb); +} + +void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t strideb) { + oneapi::math::blas::openblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, b, ldb, strideb); +} + +void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t strideb) { + oneapi::math::blas::openblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, + stridea, b, ldb, strideb); +} + +void imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, + std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, + ldb); +} + +void imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, + std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, + ldb); +} + +void imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, + std::int64_t lda, std::int64_t ldb) { + 
oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, + ldb); +} + +void imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, + std::int64_t lda, std::int64_t ldb) { + oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, + ldb); +} + +void omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, + lda, beta, b, ldb, c, ldc); +} + +void omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, + lda, beta, b, ldb, c, ldc); +} + +void omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, + lda, beta, b, ldb, c, ldc); +} + +void omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { + oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, + lda, beta, b, ldb, c, ldc); +} + +// USM APIs 
+ +sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, lda, dependencies); + return done; +} + +sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, lda, dependencies); + return done; +} + +sycl::event scal(backend_selector selector, std::int64_t n, float alpha, float* x, + std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, + dependencies); + return done; +} + +sycl::event scal(backend_selector selector, std::int64_t n, double alpha, + double* x, std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, + dependencies); + return done; +} + +sycl::event scal(backend_selector selector, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, + dependencies); + return done; +} + +sycl::event scal(backend_selector selector, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, + dependencies); + return done; +} + +sycl::event scal(backend_selector selector, std::int64_t n, float alpha, + 
std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, + dependencies); + return done; +} + +sycl::event scal(backend_selector selector, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, + dependencies); + return done; +} + +sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, 
lda, x, incx, dependencies); + return done; +} + +sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, dependencies); + return done; +} + +sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, + const std::vector& 
dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, dependencies); + return done; +} + +sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, + a, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, + a, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const 
std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, + group_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, + group_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** 
c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, + group_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, + group_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, + stride_c, batch_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, + stride_c, batch_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + 
transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, + stride_c, batch_size, dependencies); + return done; +} + +sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syrk_batch( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, + stride_c, batch_size, dependencies); + return done; +} + +sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, lda, dependencies); + return done; +} + +sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, lda, dependencies); + return done; +} + +sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, 
std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, + s, dependencies); + return done; +} + +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, + s, dependencies); + return done; +} + +sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, + s, dependencies); + return done; +} + +sycl::event rot(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) 
{ + auto done = oneapi::math::blas::openblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, + s, dependencies); + return done; +} + +sycl::event axpy(backend_selector selector, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, + incy, dependencies); + return done; +} + +sycl::event axpy(backend_selector selector, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, + incy, dependencies); + return done; +} + +sycl::event axpy(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, + incy, dependencies); + return done; +} + +sycl::event axpy(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, + incy, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch( + selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, double* alpha, + const double** x, 
std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch( + selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch( + selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch( + selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t n, float alpha, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, + incx, stridex, y, incy, stridey, + batch_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t n, double alpha, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, + incx, 
stridex, y, incy, stridey, + batch_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, + incx, stridex, y, incy, stridey, + batch_size, dependencies); + return done; +} + +sycl::event axpy_batch(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, + incx, stridex, y, incy, stridey, + batch_size, dependencies); + return done; +} + +sycl::event axpby(backend_selector selector, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, + beta, y, incy, dependencies); + return done; +} + +sycl::event axpby(backend_selector selector, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, + beta, y, incy, dependencies); + return done; +} + +sycl::event axpby(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, 
alpha, x, incx, + beta, y, incy, dependencies); + return done; +} + +sycl::event axpby(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, + beta, y, incy, dependencies); + return done; +} + +sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, + y, incy, a, lda, dependencies); + return done; +} + +sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, + y, incy, a, lda, dependencies); + return done; +} + +sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) 
{ + auto done = + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, + lda, x, incx, beta, y, incy, 
dependencies); + return done; +} + +sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, + stridey, batch_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, 
alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, + stridey, batch_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, + stridey, batch_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, + stridey, batch_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, + group_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** 
a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, + group_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, + group_size, dependencies); + return done; +} + +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemv_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, + group_size, dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, 
stridex, c, ldc, stridec, + batch_size, dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, + batch_size, dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, + batch_size, dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, + batch_size, dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, 
+ const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, + dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, + dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, + dependencies); + return done; +} + +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dgmm_batch( + selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, + dependencies); + return done; +} + +sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { + auto done = 
oneapi::math::blas::openblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, lda, dependencies); + return done; +} + +sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, lda, dependencies); + return done; +} + +sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, dependencies); + return done; +} + +sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, dependencies); + return done; +} + +sycl::event iamin(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event iamin(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event iamin(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + oneapi::math::index_base 
base, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event iamin(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, 
std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, 
n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose 
transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, 
std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const std::int8_t* a, std::int64_t lda, 
std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_batch( + selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, + a, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, + a, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event swap(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event swap(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event swap(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto 
done = oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event swap(backend_selector selector, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, + y, incy, a, lda, dependencies); + return done; +} + +sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, + y, incy, a, lda, dependencies); + return done; +} + +sycl::event nrm2(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event nrm2(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event nrm2(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { + auto 
done = oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event nrm2(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, 
std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, 
const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_bias( + selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, + c, ldc, co, dependencies); + return done; +} + +sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_bias( + selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, + c, ldc, co, dependencies); + return done; +} + +sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_bias( + selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, + c, ldc, co, dependencies); + return done; +} + +sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const 
std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemm_bias( + selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, + c, ldc, co, dependencies); + return done; +} + +sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::herk( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::herk( + selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, + y, incy, a, lda, dependencies); + return done; +} + +sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, + y, incy, a, lda, dependencies); + return done; +} + +sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, 
float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t 
batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, + stride_a, b, ldb, stride_b, batch_size, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, + stride_a, b, ldb, stride_b, batch_size, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, + stride_a, b, ldb, stride_b, batch_size, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, + stride_a, b, ldb, stride_b, batch_size, 
dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, group_count, group_size, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, group_count, group_size, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, group_count, group_size, dependencies); + return done; +} + +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t 
group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsm_batch( + selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, group_count, group_size, dependencies); + return done; +} + +sycl::event dotu(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event dotu(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + 
+sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, dependencies); + return done; +} + +sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, dependencies); + return done; +} + +sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, 
std::complex* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event 
tbmv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbmv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, 
const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event dotc(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event dotc(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, lda, dependencies); + return done; +} + +sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, + x, incx, a, lda, dependencies); + return done; +} + +sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = 
oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trmm(selector.get_queue(), left_right, + upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + return done; +} + +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, + param, dependencies); + return done; +} + +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { + 
auto done = oneapi::math::blas::openblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, + param, dependencies); + return done; +} + +sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, + unit_diag, n, a, x, incx, dependencies); + return done; +} + +sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event 
trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::trsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +sycl::event copy(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event copy(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event copy(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = 
oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event copy(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, + dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, y, incy, 
group_count, group_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); + return done; +} + +sycl::event copy_batch(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::copy_batch( + selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); + return done; +} + +sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const 
std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hemv( + selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::hemv( + selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, + transb, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + return done; +} + +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, + transb, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + return done; +} + +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = 
oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, + transb, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + return done; +} + +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, + transb, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + return done; +} + +sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, + lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event asum(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event asum(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const 
std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event asum(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event asum(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, + dependencies); + return done; +} + +sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event tbsv(backend_selector selector, uplo 
upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::tbsv( + selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, dependencies); + return done; +} + +sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, + x, incx, y, incy, a, dependencies); + return done; +} + +sycl::event iamax(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event iamax(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event iamax(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = 
oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event iamax(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, + base, dependencies); + return done; +} + +sycl::event rotm(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, + param, dependencies); + return done; +} + +sycl::event rotm(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, + param, dependencies); + return done; +} + +sycl::event rotg(backend_selector selector, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); + return done; +} + +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); + return done; +} + +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); + return done; +} + +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const 
std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); + return done; +} + +sycl::event sdsdot(backend_selector selector, std::int64_t n, float sb, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, + incy, result, dependencies); + return done; +} + +sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = + oneapi::math::blas::openblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + return done; +} + +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event dot(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { + auto 
done = oneapi::math::blas::openblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, + result, dependencies); + return done; +} + +sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::symv( + selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::symv( + selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, + 
std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, + dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, + std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch( + selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, + std::int64_t lda, std::int64_t ldb, std::int64_t stride, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = 
oneapi::math::blas::openblas::MAJOR::imatcopy_batch( + selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + std::complex* ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch( + selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + std::complex* ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch( + selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); + return done; +} + +sycl::event omatadd_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd_batch( + selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, + c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event omatadd_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, + std::int64_t ldc, std::int64_t stride_c, 
std::int64_t batch_size, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd_batch( + selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, + c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event omatadd_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd_batch( + selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, + c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event omatadd_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd_batch( + selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, + c, ldc, stride_c, batch_size, dependencies); + return done; +} + +sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, + alpha, a, lda, b, ldb, dependencies); + return done; +} + +sycl::event omatcopy(backend_selector selector, 
transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, + alpha, a, lda, b, ldb, dependencies); + return done; +} + +sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, + alpha, a, lda, b, ldb, dependencies); + return done; +} + +sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, + alpha, a, lda, b, ldb, dependencies); + return done; +} + +sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy2( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); + return done; +} + +sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy2( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); + return done; +} + +sycl::event 
omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, + std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy2( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); + return done; +} + +sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, + std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy2( + selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); + return done; +} + +sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, dependencies); + return done; +} + +sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, dependencies); + return done; +} + +sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, std::complex* ab, + std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, dependencies); + return done; +} + +sycl::event 
imatcopy(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, std::complex* ab, + std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, dependencies); + return done; +} + +sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float beta, const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, + n, alpha, a, lda, beta, b, ldb, c, ldc, + dependencies); + return done; +} + +sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + double beta, const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, + n, alpha, a, lda, beta, b, ldb, c, ldc, + dependencies); + return done; +} + +sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, + n, alpha, a, lda, beta, b, ldb, c, ldc, + dependencies); + return done; +} + +sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, 
std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, + n, alpha, a, lda, beta, b, ldb, c, ldc, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, + dependencies); + return done; +} + +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) 
{ + auto done = oneapi::math::blas::openblas::MAJOR::omatcopy_batch( + selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, + dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, float** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, group_count, + groupsize, dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, double** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, group_count, + groupsize, dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, group_count, + groupsize, dependencies); + return done; +} + +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { + auto done = oneapi::math::blas::openblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, + alpha, ab, lda, ldb, group_count, + groupsize, dependencies); + 
return done; +} diff --git a/include/oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp b/include/oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp new file mode 100644 index 000000000..11ae942fe --- /dev/null +++ b/include/oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp @@ -0,0 +1,62 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#ifndef _ONEMATH_BLAS_OPENBLAS_HPP_ +#define _ONEMATH_BLAS_OPENBLAS_HPP_ + +#if __has_include() +#include +#else +#include +#endif + +#include +#include + +#include "oneapi/math/types.hpp" + +#include "oneapi/math/detail/export.hpp" + +namespace oneapi { +namespace math { + +using oneapi::math::transpose; +using oneapi::math::uplo; +using oneapi::math::side; +using oneapi::math::diag; +using oneapi::math::offset; + +namespace blas { +namespace openblas { +namespace column_major { + +#include "oneapi/math/blas/detail/onemath_blas_backends.hxx" + +} //namespace column_major +namespace row_major { + +#include "oneapi/math/blas/detail/onemath_blas_backends.hxx" + +} //namespace row_major +} //namespace openblas +} //namespace blas +} //namespace math +} //namespace oneapi + +#endif //_ONEMATH_BLAS_OPENBLAS_HPP_ diff --git 
a/include/oneapi/math/detail/backend_selector_predicates.hpp b/include/oneapi/math/detail/backend_selector_predicates.hpp index 6cbfedd18..b12df9e60 100644 --- a/include/oneapi/math/detail/backend_selector_predicates.hpp +++ b/include/oneapi/math/detail/backend_selector_predicates.hpp @@ -52,6 +52,24 @@ inline void backend_selector_precondition(sycl::queue& queue) { #endif } +template <> +inline void backend_selector_precondition(sycl::queue& queue) { +#ifndef ONEMATH_DISABLE_PREDICATES +#ifdef __ADAPTIVECPP__ + if (!(queue.is_host() || queue.get_device().is_cpu())) { +#else + if (!queue.get_device().is_cpu()) { +#endif + throw unsupported_device("", + "backend_selector", + queue.get_device()); + } +#endif +} + + + + template <> inline void backend_selector_precondition(sycl::queue& queue) { #ifndef ONEMATH_DISABLE_PREDICATES diff --git a/include/oneapi/math/detail/backends.hpp b/include/oneapi/math/detail/backends.hpp index db95e415f..eb37008cb 100644 --- a/include/oneapi/math/detail/backends.hpp +++ b/include/oneapi/math/detail/backends.hpp @@ -34,6 +34,7 @@ enum class backend { cusolver, curand, netlib, + openblas, armpl, rocblas, rocrand, @@ -57,6 +58,7 @@ static backendmap backend_map = { { backend::mklcpu, "mklcpu" }, { backend::cusolver, "cusolver" }, { backend::curand, "curand" }, { backend::netlib, "netlib" }, + { backend::openblas, "openblas" }, { backend::armpl, "armpl" }, { backend::rocblas, "rocblas" }, { backend::rocrand, "rocrand" }, diff --git a/include/oneapi/math/detail/backends_table.hpp b/include/oneapi/math/detail/backends_table.hpp index 50848e409..dad04c609 100644 --- a/include/oneapi/math/detail/backends_table.hpp +++ b/include/oneapi/math/detail/backends_table.hpp @@ -53,6 +53,10 @@ static std::map>> libraries = #ifdef ONEMATH_ENABLE_NETLIB_BACKEND LIB_NAME("blas_netlib"), #endif +#ifdef ONEMATH_ENABLE_OPENBLAS_BACKEND + LIB_NAME("blas_openblas"), +#endif + #ifdef ONEMATH_ENABLE_GENERIC_BLAS_BACKEND_INTEL_CPU LIB_NAME("blas_generic"), 
#endif @@ -64,6 +68,9 @@ static std::map>> libraries = #endif #ifdef ONEMATH_ENABLE_NETLIB_BACKEND LIB_NAME("blas_netlib") +#endif +#ifdef ONEMATH_ENABLE_OPENBLAS_BACKEND + LIB_NAME("blas_openblas") #endif } }, { device::intelgpu, @@ -108,12 +115,6 @@ static std::map>> libraries = #endif #ifdef ONEMATH_ENABLE_PORTFFT_BACKEND LIB_NAME("dft_portfft") -#endif - } }, - { device::aarch64cpu, - { -#ifdef ONEMATH_ENABLE_ARMPL_BACKEND - LIB_NAME("dft_armpl"), #endif } }, { device::intelgpu, diff --git a/include/oneapi/math/detail/get_device_id.hpp b/include/oneapi/math/detail/get_device_id.hpp index b1ed69de1..bf5df2500 100644 --- a/include/oneapi/math/detail/get_device_id.hpp +++ b/include/oneapi/math/detail/get_device_id.hpp @@ -37,6 +37,7 @@ #define AMD_ID 1022 #endif + namespace oneapi { namespace math { diff --git a/include/oneapi/math/dft/detail/descriptor_impl.hpp b/include/oneapi/math/dft/detail/descriptor_impl.hpp index e9fd92eb4..6267eb86d 100644 --- a/include/oneapi/math/dft/detail/descriptor_impl.hpp +++ b/include/oneapi/math/dft/detail/descriptor_impl.hpp @@ -91,10 +91,6 @@ class descriptor { void commit(backend_selector selector); #endif -#ifdef ONEMATH_ENABLE_ARMPL_BACKEND - void commit(backend_selector selector); -#endif - const dft_values& get_values() const noexcept { return values_; } diff --git a/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hpp b/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hpp index fce52ab4b..2cb51847a 100644 --- a/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hpp +++ b/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hpp @@ -23,8 +23,7 @@ #include #include -//this may have been defined in complex.h and conflicts with enums in types.hpp -#undef I + #include "oneapi/math/types.hpp" namespace oneapi { namespace math { diff --git a/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hxx b/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hxx index 
d9b990ef6..528e78910 100644 --- a/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hxx +++ b/include/oneapi/math/lapack/detail/armpl/onemath_lapack_armpl.hxx @@ -1,5 +1,6 @@ // Buffer APIs + void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, sycl::buffer& e, sycl::buffer>& tauq, sycl::buffer>& taup, diff --git a/include/oneapi/math/lapack/detail/lapack_loader.hpp b/include/oneapi/math/lapack/detail/lapack_loader.hpp index be5015257..a7deb65df 100644 --- a/include/oneapi/math/lapack/detail/lapack_loader.hpp +++ b/include/oneapi/math/lapack/detail/lapack_loader.hpp @@ -33,6 +33,7 @@ #include "oneapi/math/detail/export.hpp" #include "oneapi/math/detail/get_device_id.hpp" + namespace oneapi { namespace math { namespace lapack { diff --git a/include/oneapi/math/rng/detail/engine_impl.hpp b/include/oneapi/math/rng/detail/engine_impl.hpp index 2d3c2b562..ce1463785 100644 --- a/include/oneapi/math/rng/detail/engine_impl.hpp +++ b/include/oneapi/math/rng/detail/engine_impl.hpp @@ -33,6 +33,7 @@ #include "oneapi/math/rng/distributions.hpp" #include "oneapi/math/types.hpp" + namespace oneapi { namespace math { namespace rng { diff --git a/include/oneapi/math/rng/device/detail/exponential_impl.hpp b/include/oneapi/math/rng/device/detail/exponential_impl.hpp index 682f788c8..6efe800d5 100644 --- a/include/oneapi/math/rng/device/detail/exponential_impl.hpp +++ b/include/oneapi/math/rng/device/detail/exponential_impl.hpp @@ -82,7 +82,7 @@ class distribution_base #ifndef __ADAPTIVECPP__ res = sycl::fmax(res, a_); #else - // a workaround for AdaptiveCpp + // a workaround for hipSYCL (AdaptiveCpp) if constexpr (EngineType::vec_size == 1) { res = std::fmax(res, a_); } @@ -105,7 +105,7 @@ class distribution_base #ifndef __ADAPTIVECPP__ res = sycl::fmax(res, a_); #else - // a workaround for AdaptiveCpp + // a workaround for hipSYCL (AdaptiveCpp) if constexpr (EngineType::vec_size == 1) { res = std::fmax(res, 
a_); } diff --git a/include/oneapi/math/rng/device/detail/mcg31m1_impl.hpp b/include/oneapi/math/rng/device/detail/mcg31m1_impl.hpp index c95f721c6..2865a1df1 100644 --- a/include/oneapi/math/rng/device/detail/mcg31m1_impl.hpp +++ b/include/oneapi/math/rng/device/detail/mcg31m1_impl.hpp @@ -56,8 +56,8 @@ constexpr sycl::vec select_vector_a_mcg31m1() { UINT64_C(650347998) }); } -// AdaptiveCpp doesn't support constexpr sycl::vec constructor -// that's why in case of AdaptiveCpp backend sycl::vec is created as a local variable +// hipSYCL (AdaptiveCpp) doesn't support constexpr sycl::vec constructor +// that's why in case of hipSYCL backend sycl::vec is created as a local variable #ifndef __ADAPTIVECPP__ template struct mcg31m1_vector_a { @@ -160,7 +160,7 @@ static inline sycl::vec generate( #ifndef __ADAPTIVECPP__ res = custom_mod(mcg31m1_vector_a::vector_a * x); #else - // a workaround for AdaptiveCpp + // a workaround for hipSYCL (AdaptiveCpp) res = custom_mod(select_vector_a_mcg31m1() * x); #endif state.s = diff --git a/include/oneapi/math/rng/device/detail/mcg59_impl.hpp b/include/oneapi/math/rng/device/detail/mcg59_impl.hpp index c39706507..befa4657d 100644 --- a/include/oneapi/math/rng/device/detail/mcg59_impl.hpp +++ b/include/oneapi/math/rng/device/detail/mcg59_impl.hpp @@ -57,8 +57,8 @@ constexpr sycl::vec select_vector_a_mcg59() { UINT64_C(0x58145D06A37D795) }); } -// AdaptiveCpp doesn't support constexpr sycl::vec constructor -// that's why in case of AdaptiveCpp backend sycl::vec is created as a local variable +// hipSYCL (AdaptiveCpp) doesn't support constexpr sycl::vec constructor +// that's why in case of hipSYCL backend sycl::vec is created as a local variable #ifndef __ADAPTIVECPP__ template struct mcg59_vector_a { @@ -129,7 +129,7 @@ static inline sycl::vec generate( #ifndef __ADAPTIVECPP__ res = custom_mod(mcg59_vector_a::vector_a * res); #else - // a workaround for AdaptiveCpp + // a workaround for hipSYCL (AdaptiveCpp) res = 
custom_mod(select_vector_a_mcg59() * res); #endif state.s = custom_mod(mcg59_param::a * res[VecSize - 1]); diff --git a/include/oneapi/math/rng/device/detail/uniform_impl.hpp b/include/oneapi/math/rng/device/detail/uniform_impl.hpp index 5ddb67010..b312ba023 100644 --- a/include/oneapi/math/rng/device/detail/uniform_impl.hpp +++ b/include/oneapi/math/rng/device/detail/uniform_impl.hpp @@ -220,7 +220,7 @@ class distribution_base> { res = sycl::fmax(res, a_); res = sycl::fmin(res, b_); #else - // a workaround for AdaptiveCpp + // a workaround for hipSYCL (AdaptiveCpp) if constexpr (EngineType::vec_size == 1) { res = sycl::fmax(res, a_); res = sycl::fmin(res, b_); @@ -298,7 +298,7 @@ class distribution_base> { res = sycl::fmax(res, a_); res = sycl::fmin(res, b_); #else - // a workaround for AdaptiveCpp + // a workaround for hipSYCL (AdaptiveCpp) if constexpr (EngineType::vec_size == 1) { res = sycl::fmax(res, a_); res = sycl::fmin(res, b_); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8b849720b..39234bf17 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -88,6 +88,7 @@ function(generate_header_file) set(ONEMATH_ENABLE_CUBLAS_BACKEND ${ENABLE_CUBLAS_BACKEND}) set(ONEMATH_ENABLE_ROCBLAS_BACKEND ${ENABLE_ROCBLAS_BACKEND}) set(ONEMATH_ENABLE_NETLIB_BACKEND ${ENABLE_NETLIB_BACKEND}) + set(ONEMATH_ENABLE_OPENBLAS_BACKEND ${ENABLE_OPENBLAS_BACKEND}) set(ONEMATH_ENABLE_ARMPL_BACKEND ${ENABLE_ARMPL_BACKEND}) set(ONEMATH_ENABLE_ARMPL_OPENRNG ${ENABLE_ARMPL_OPENRNG}) set(ONEMATH_ENABLE_GENERIC_BLAS_BACKEND ${ENABLE_GENERIC_BLAS_BACKEND}) diff --git a/src/blas/backends/CMakeLists.txt b/src/blas/backends/CMakeLists.txt index 39ea278d3..d388aad33 100644 --- a/src/blas/backends/CMakeLists.txt +++ b/src/blas/backends/CMakeLists.txt @@ -28,6 +28,10 @@ if(ENABLE_NETLIB_BACKEND) add_subdirectory(netlib) endif() +if(ENABLE_OPENBLAS_BACKEND) + add_subdirectory(openblas) +endif() + if(ENABLE_MKLGPU_BACKEND) add_subdirectory(mklgpu) endif() diff --git 
a/src/blas/backends/openblas/CMakeLists.txt b/src/blas/backends/openblas/CMakeLists.txt new file mode 100644 index 000000000..ecd97720e --- /dev/null +++ b/src/blas/backends/openblas/CMakeLists.txt @@ -0,0 +1,118 @@ +#=============================================================================== +# OpenBLAS Backend for oneMath BLAS +#=============================================================================== + +set(LIB_NAME onemath_blas_openblas) +set(LIB_OBJ ${LIB_NAME}_obj) + +# ------------------------------------------------------------------------------ +# Find OpenBLAS (uses custom FindOpenBLAS.cmake) +# ------------------------------------------------------------------------------ +find_package(OpenBLAS REQUIRED) + +set(SOURCES + openblas_common.hpp + openblas_level1.cpp + openblas_level2.cpp + openblas_level3.cpp + openblas_batch.cpp + openblas_extensions.cpp + $<$:openblas_wrappers.cpp> +) + +# ------------------------------------------------------------------------------ +# Create libraries +# ------------------------------------------------------------------------------ + +add_library(${LIB_NAME}) +add_deprecated_library(${LIB_NAME}) + +add_library(${LIB_OBJ} OBJECT ${SOURCES}) +add_dependencies(onemath_backend_libs_blas ${LIB_NAME}) + +# ------------------------------------------------------------------------------ +# SYCL integration +# ------------------------------------------------------------------------------ + +if (USE_ADD_SYCL_TO_TARGET_INTEGRATION) + add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES}) +endif() + +# ------------------------------------------------------------------------------ +# Include directories +# ------------------------------------------------------------------------------ + +target_include_directories(${LIB_OBJ} + PUBLIC + ${ONEMATH_INCLUDE_DIRS} + ${OPENBLAS_INCLUDE} + PRIVATE + ${PROJECT_SOURCE_DIR}/src/include + ${PROJECT_SOURCE_DIR}/src + ${CMAKE_BINARY_DIR}/bin + ${ONEMATH_GENERATED_INCLUDE_PATH} +) + +# 
------------------------------------------------------------------------------ +# Compile options +# ------------------------------------------------------------------------------ + +target_compile_options(${LIB_OBJ} + PRIVATE ${ONEMATH_BUILD_COPT} +) + +# ------------------------------------------------------------------------------ +# Link OpenBLAS properly (via imported target) +# ------------------------------------------------------------------------------ + +target_link_libraries(${LIB_OBJ} + PUBLIC + ONEMATH::SYCL::SYCL + ONEMATH::OPENBLAS::OPENBLAS +) + +set_target_properties(${LIB_OBJ} PROPERTIES + POSITION_INDEPENDENT_CODE ON +) + +# ------------------------------------------------------------------------------ +# Link object library into main backend lib +# ------------------------------------------------------------------------------ + +target_link_libraries(${LIB_NAME} + PRIVATE ${LIB_OBJ} +) + +target_include_directories(${LIB_NAME} + PUBLIC ${ONEMATH_INCLUDE_DIRS} +) + +if(BUILD_SHARED_LIBS) + set_target_properties(${LIB_NAME} PROPERTIES + INTERFACE_LINK_LIBRARIES ONEMATH::SYCL::SYCL + ) +endif() + +set_target_properties(${LIB_NAME} PROPERTIES + SOVERSION ${PROJECT_VERSION_MAJOR} +) + +# ------------------------------------------------------------------------------ +# RPATH handling +# ------------------------------------------------------------------------------ + +list(APPEND CMAKE_BUILD_RPATH $) + +# ------------------------------------------------------------------------------ +# Install +# ------------------------------------------------------------------------------ + +install(TARGETS ${LIB_OBJ} EXPORT oneMathTargets) + +install(TARGETS ${LIB_NAME} + EXPORT oneMathTargets + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib +) + diff --git a/src/blas/backends/openblas/openblas_batch.cpp b/src/blas/backends/openblas/openblas_batch.cpp new file mode 100644 index 000000000..2883de606 --- /dev/null +++ 
b/src/blas/backends/openblas/openblas_batch.cpp @@ -0,0 +1,51 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#if __has_include() +#include +#else +#include +#endif + +#include "openblas_common.hpp" +#include "oneapi/math/exceptions.hpp" +#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp" + +namespace oneapi { +namespace math { +namespace blas { +namespace openblas { +namespace column_major { + +#define COLUMN_MAJOR +#include "openblas_batch.cxx" +#undef COLUMN_MAJOR + +} // namespace column_major +namespace row_major { + +#define ROW_MAJOR +#include "openblas_batch.cxx" +#undef ROW_MAJOR + +} // namespace row_major +} // namespace openblas +} // namespace blas +} // namespace math +} // namespace oneapi diff --git a/src/blas/backends/openblas/openblas_batch.cxx b/src/blas/backends/openblas/openblas_batch.cxx new file mode 100644 index 000000000..fc1810fed --- /dev/null +++ b/src/blas/backends/openblas/openblas_batch.cxx @@ -0,0 +1,1712 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +// Buffer APIs + +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, + int64_t incy, int64_t stridey, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, + int64_t incy, int64_t stridey, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw 
unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +void axpy_batch(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +void axpy_batch(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, + int64_t stridey, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, float beta, sycl::buffer& y, int64_t incy, + int64_t stride_y, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for 
column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, double beta, + sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, sycl::buffer, 1>& y, + int64_t incy, int64_t stride_y, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, + sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, sycl::buffer& c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw 
unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +void gemm_batch(sycl::queue&, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { + auto a_acc = a.get_access(); + auto b_acc = b.get_access(); + auto c_acc = c.get_access(); + + const float* A0 = a_acc.get_pointer(); + const float* B0 = b_acc.get_pointer(); + float* C0 = c_acc.get_pointer(); + + CBLAS_TRANSPOSE tA = (transa == 
transpose::nontrans) + ? CblasNoTrans + : ((transa == transpose::trans) ? CblasTrans : CblasConjTrans); + CBLAS_TRANSPOSE tB = (transb == transpose::nontrans) + ? CblasNoTrans + : ((transb == transpose::trans) ? CblasTrans : CblasConjTrans); + +#ifdef COLUMN_MAJOR + constexpr CBLAS_ORDER order = CblasColMajor; +#endif +#ifdef ROW_MAJOR + constexpr CBLAS_ORDER order = CblasRowMajor; +#endif + + for (int64_t i = 0; i < batch_size; ++i) { + const float* A = A0 + i * stride_a; + const float* B = B0 + i * stride_b; + float* C = C0 + i * stride_c; + + cblas_sgemm(order, tA, tB, + (blasint)m, (blasint)n, (blasint)k, + alpha, + A, (blasint)lda, + B, (blasint)ldb, + beta, + C, (blasint)ldc); + } +} + +void gemm_batch(sycl::queue&, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, double beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { + auto a_acc = a.get_access(); + auto b_acc = b.get_access(); + auto c_acc = c.get_access(); + + const double* A0 = a_acc.get_pointer(); + const double* B0 = b_acc.get_pointer(); + double* C0 = c_acc.get_pointer(); + + CBLAS_TRANSPOSE tA = (transa == transpose::nontrans) + ? CblasNoTrans + : ((transa == transpose::trans) ? CblasTrans : CblasConjTrans); + CBLAS_TRANSPOSE tB = (transb == transpose::nontrans) + ? CblasNoTrans + : ((transb == transpose::trans) ? 
CblasTrans : CblasConjTrans); + +#ifdef COLUMN_MAJOR + constexpr CBLAS_ORDER order = CblasColMajor; +#endif +#ifdef ROW_MAJOR + constexpr CBLAS_ORDER order = CblasRowMajor; +#endif + + for (int64_t i = 0; i < batch_size; ++i) { + const double* A = A0 + i * stride_a; + const double* B = B0 + i * stride_b; + double* C = C0 + i * stride_c; + + cblas_dgemm(order, tA, tB, + (blasint)m, (blasint)n, (blasint)k, + alpha, + A, (blasint)lda, + B, (blasint)ldb, + beta, + C, (blasint)ldc); + } +} + +void gemm_batch(sycl::queue&, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, + int64_t stride_b, std::complex beta, sycl::buffer, 1>& c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { + auto a_acc = a.get_access(); + auto b_acc = b.get_access(); + auto c_acc = c.get_access(); + + const std::complex* A0 = a_acc.get_pointer(); + const std::complex* B0 = b_acc.get_pointer(); + std::complex* C0 = c_acc.get_pointer(); + + CBLAS_TRANSPOSE tA = (transa == transpose::nontrans) + ? CblasNoTrans + : ((transa == transpose::trans) ? CblasTrans : CblasConjTrans); + CBLAS_TRANSPOSE tB = (transb == transpose::nontrans) + ? CblasNoTrans + : ((transb == transpose::trans) ? 
CblasTrans : CblasConjTrans); + +#ifdef COLUMN_MAJOR + constexpr CBLAS_ORDER order = CblasColMajor; +#endif +#ifdef ROW_MAJOR + constexpr CBLAS_ORDER order = CblasRowMajor; +#endif + + for (int64_t i = 0; i < batch_size; ++i) { + const std::complex* A = A0 + i * stride_a; + const std::complex* B = B0 + i * stride_b; + std::complex* C = C0 + i * stride_c; + + cblas_cgemm(order, tA, tB, + (blasint)m, (blasint)n, (blasint)k, + static_cast(&alpha), + static_cast(A), (blasint)lda, + static_cast(B), (blasint)ldb, + static_cast(&beta), + static_cast(C), (blasint)ldc); + } +} + +void gemm_batch(sycl::queue&, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, + int64_t ldb, int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { + auto a_acc = a.get_access(); + auto b_acc = b.get_access(); + auto c_acc = c.get_access(); + + const std::complex* A0 = a_acc.get_pointer(); + const std::complex* B0 = b_acc.get_pointer(); + std::complex* C0 = c_acc.get_pointer(); + + CBLAS_TRANSPOSE tA = (transa == transpose::nontrans) + ? CblasNoTrans + : ((transa == transpose::trans) ? CblasTrans : CblasConjTrans); + CBLAS_TRANSPOSE tB = (transb == transpose::nontrans) + ? CblasNoTrans + : ((transb == transpose::trans) ? 
CblasTrans : CblasConjTrans); + +#ifdef COLUMN_MAJOR + constexpr CBLAS_ORDER order = CblasColMajor; +#endif +#ifdef ROW_MAJOR + constexpr CBLAS_ORDER order = CblasRowMajor; +#endif + + for (int64_t i = 0; i < batch_size; ++i) { + const std::complex* A = A0 + i * stride_a; + const std::complex* B = B0 + i * stride_b; + std::complex* C = C0 + i * stride_c; + + cblas_zgemm(order, tA, tB, + (blasint)m, (blasint)n, (blasint)k, + static_cast(&alpha), + static_cast(A), (blasint)lda, + static_cast(B), (blasint)ldb, + static_cast(&beta), + static_cast(C), (blasint)ldc); + } +} + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); 
+#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for 
row_major layout"); +#endif +} + +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 
1>& a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, + int64_t stride_b, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, + int64_t ldb, int64_t stride_b, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw 
unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, + int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, + int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", 
"omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, std::complex beta, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, + int64_t batch_size) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +// USM APIs + +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const float** x, int64_t* incx, float** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", 
"for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const double** x, int64_t* incx, double** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, + int64_t* incx, std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t n, const float* x, int64_t incx, + std::int64_t stridex, float* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t n, const double* x, int64_t incx, + std::int64_t stridex, double* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { +#ifdef 
COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "copy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "copy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, float* alpha, const float** x, int64_t* incx, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, double* alpha, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* 
incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for 
row_major layout"); +#endif +} + +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpy_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float beta, float* y, int64_t incy, int64_t stride_y, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, const double* x, + int64_t incx, int64_t stride_x, double beta, double* y, int64_t incy, + int64_t stride_y, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, + int64_t incy, int64_t stride_y, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", 
"gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, + int64_t incy, int64_t stride_y, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, const float** x, int64_t* incx, float* beta, + float** y, int64_t* incy, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, const double** x, int64_t* incx, + double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif 
+#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemv_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemv_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const float* a, + int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const double* a, + int64_t lda, int64_t stride_a, const double* x, int64_t incx, + int64_t stride_x, double* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef 
ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const float** a, int64_t* lda, const float** x, int64_t* incx, float** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const double** a, int64_t* lda, const double** x, int64_t* incx, double** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + 
+sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "dgmm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "dgmm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const float** a, int64_t* lda, + const float** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, double* alpha, const double** a, int64_t* lda, + const double** b, int64_t* ldb, double* beta, double** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw 
unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, sycl::half* alpha, const sycl::half** a, + int64_t* lda, const sycl::half** b, int64_t* ldb, sycl::half* beta, + sycl::half** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const sycl::half** a, int64_t* lda, + const sycl::half** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* groupsize, + const 
std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, std::int32_t** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, int64_t stride_a, + const float* b, int64_t ldb, int64_t stride_b, float beta, float* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, int64_t stride_a, + const double* b, int64_t ldb, int64_t stride_b, double beta, double* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* 
a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, const sycl::half* a, int64_t lda, + int64_t stride_a, const sycl::half* b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::half* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, int64_t stride_a, + const sycl::half* b, int64_t ldb, int64_t stride_b, float beta, float* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); 
+#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, float* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, + std::int32_t* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& 
dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, float* alpha, const float** a, + int64_t* lda, float** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, double* alpha, const double** a, + int64_t* lda, double** b, int64_t* ldb, int64_t group_count, + int64_t* 
groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "trsm_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "trsm_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, float* alpha, const float** a, int64_t* lda, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, double* alpha, const double** a, int64_t* lda, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const 
std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, int64_t stride_a, float beta, + float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, int64_t stride_a, double beta, + double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef 
COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "syrk_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "syrk_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, + int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, double* b, int64_t ldb, + int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for 
column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose 
trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, + int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const double* b, int64_t ldb, int64_t stride_b, double* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, 
std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, std::complex beta, + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, std::complex beta, + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, 
transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, float** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, double** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& 
dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} + +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); +#endif +} diff --git a/src/blas/backends/openblas/openblas_common.hpp b/src/blas/backends/openblas/openblas_common.hpp new file mode 100644 index 000000000..10c6e81d9 --- /dev/null +++ b/src/blas/backends/openblas/openblas_common.hpp @@ -0,0 +1,103 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#ifndef _NETLIB_COMMON_HPP_ +#define _NETLIB_COMMON_HPP_ + +#if __has_include() +#include +#else +#include +#endif +#include + +#include "cblas.h" + +#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp" +#include "oneapi/math/types.hpp" + +#define GET_MULTI_PTR template get_multi_ptr().get_raw() + +namespace oneapi { +namespace math { +namespace blas { +namespace openblas { + +typedef enum { CblasFixOffset = 101, CblasColOffset = 102, CblasRowOffset = 103 } CBLAS_OFFSET; + +/** + * Helper methods for converting between onemath types and their CBLAS + * equivalents. + */ + +inline CBLAS_TRANSPOSE convert_to_cblas_trans(transpose trans) { + if (trans == transpose::trans) + return CBLAS_TRANSPOSE::CblasTrans; + else if (trans == transpose::conjtrans) + return CBLAS_TRANSPOSE::CblasConjTrans; + else + return CBLAS_TRANSPOSE::CblasNoTrans; +} + +inline CBLAS_UPLO convert_to_cblas_uplo(uplo is_upper) { + return is_upper == uplo::upper ? CBLAS_UPLO::CblasUpper : CBLAS_UPLO::CblasLower; +} + +inline CBLAS_DIAG convert_to_cblas_diag(diag is_unit) { + return is_unit == diag::unit ? CBLAS_DIAG::CblasUnit : CBLAS_DIAG::CblasNonUnit; +} + +inline CBLAS_SIDE convert_to_cblas_side(side is_left) { + return is_left == side::left ? CBLAS_SIDE::CblasLeft : CBLAS_SIDE::CblasRight; +} + +inline CBLAS_OFFSET convert_to_cblas_offset(offset offsetc) { + if (offsetc == offset::fix) + return CBLAS_OFFSET::CblasFixOffset; + else if (offsetc == offset::column) + return CBLAS_OFFSET::CblasColOffset; + else + return CBLAS_OFFSET::CblasRowOffset; +} + +// host_task automatically uses run_on_host_intel if it is supported by the +// compiler. Otherwise, it falls back to single_task. 
+template +static inline auto host_task_internal(H& cgh, F f, int) -> decltype(cgh.host_task(f)) { + return cgh.host_task(f); +} + +template +static inline void host_task_internal(H& cgh, F f, long) { +#ifndef __SYCL_DEVICE_ONLY__ + cgh.template single_task(f); +#endif +} + +template +static inline void host_task(H& cgh, F f) { + (void)host_task_internal(cgh, f, 0); +} + +} // namespace openblas +} // namespace blas +} // namespace math +} // namespace oneapi + +#endif //_NETLIB_COMMON_HPP_ diff --git a/src/blas/backends/openblas/openblas_extensions.cpp b/src/blas/backends/openblas/openblas_extensions.cpp new file mode 100644 index 000000000..60fc130cc --- /dev/null +++ b/src/blas/backends/openblas/openblas_extensions.cpp @@ -0,0 +1,51 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#if __has_include() +#include +#else +#include +#endif + +#include "openblas_common.hpp" +#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp" +#include "oneapi/math/exceptions.hpp" + +namespace oneapi { +namespace math { +namespace blas { +namespace openblas { +namespace column_major { + +#define COLUMN_MAJOR +#include "openblas_extensions.cxx" +#undef COLUMN_MAJOR + +} // namespace column_major +namespace row_major { + +#define ROW_MAJOR +#include "openblas_extensions.cxx" +#undef ROW_MAJOR + +} // namespace row_major +} // namespace openblas +} // namespace blas +} // namespace math +} // namespace oneapi diff --git a/src/blas/backends/openblas/openblas_extensions.cxx b/src/blas/backends/openblas/openblas_extensions.cxx new file mode 100644 index 000000000..61044ff88 --- /dev/null +++ b/src/blas/backends/openblas/openblas_extensions.cxx @@ -0,0 +1,698 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +// Buffer APIs + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef 
ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, + int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, + int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +void omatcopy(sycl::queue&, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) +{ + auto a_acc = 
a.get_access(); + auto b_acc = b.get_access(); + + const float* A = a_acc.get_pointer(); + float* B = b_acc.get_pointer(); + + CBLAS_TRANSPOSE t = (trans == transpose::nontrans) ? CblasNoTrans : CblasTrans; + +#ifdef COLUMN_MAJOR + cblas_somatcopy(CblasColMajor, t, m, n, alpha, A, lda, B, ldb); +#endif + +#ifdef ROW_MAJOR + cblas_somatcopy(CblasRowMajor, t, m, n, alpha, A, lda, B, ldb); +#endif +} + +void omatcopy(sycl::queue&, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) +{ + auto a_acc = a.get_access(); + auto b_acc = b.get_access(); + + const double* A = a_acc.get_pointer(); + double* B = b_acc.get_pointer(); + + CBLAS_TRANSPOSE t = (trans == transpose::nontrans) ? CblasNoTrans : CblasTrans; + +#ifdef COLUMN_MAJOR + cblas_domatcopy(CblasColMajor, t, m, n, alpha, A, lda, B, ldb); +#endif + +#ifdef ROW_MAJOR + cblas_domatcopy(CblasRowMajor, t, m, n, alpha, A, lda, B, ldb); +#endif +} + + +void omatcopy(sycl::queue&, transpose trans, + int64_t m, int64_t n, + std::complex alpha, + sycl::buffer,1>& a, int64_t lda, + sycl::buffer,1>& b, int64_t ldb) +{ + auto a_acc = a.get_access(); + auto b_acc = b.get_access(); + + const float* A = reinterpret_cast(a_acc.get_pointer()); + float* B = reinterpret_cast(b_acc.get_pointer()); + const float* alpha_ptr = reinterpret_cast(&alpha); + + CBLAS_TRANSPOSE t; + if (trans == transpose::nontrans) { + t = CblasNoTrans; + } + else if (trans == transpose::trans) { + t = CblasTrans; + } + else { + t = CblasConjTrans; + } + +#ifdef COLUMN_MAJOR + cblas_comatcopy(CblasColMajor, t, (blasint)m, (blasint)n, + alpha_ptr, A, (blasint)lda, B, (blasint)ldb); +#endif + +#ifdef ROW_MAJOR + cblas_comatcopy(CblasRowMajor, t, (blasint)m, (blasint)n, + alpha_ptr, A, (blasint)lda, B, (blasint)ldb); +#endif +} + +void omatcopy(sycl::queue&, transpose trans, + int64_t m, int64_t n, + std::complex alpha, + sycl::buffer,1>& a, int64_t lda, + sycl::buffer,1>& b, int64_t ldb) +{ + auto 
a_acc = a.get_access(); + auto b_acc = b.get_access(); + + const double* A = reinterpret_cast(a_acc.get_pointer()); + double* B = reinterpret_cast(b_acc.get_pointer()); + const double* alpha_ptr = reinterpret_cast(&alpha); + + CBLAS_TRANSPOSE t; + if (trans == transpose::nontrans) { + t = CblasNoTrans; + } + else if (trans == transpose::trans) { + t = CblasTrans; + } + else { + t = CblasConjTrans; + } + +#ifdef COLUMN_MAJOR + cblas_zomatcopy(CblasColMajor, t, (blasint)m, (blasint)n, + alpha_ptr, A, (blasint)lda, B, (blasint)ldb); +#endif + +#ifdef ROW_MAJOR + cblas_zomatcopy(CblasRowMajor, t, (blasint)m, (blasint)n, + alpha_ptr, A, (blasint)lda, B, (blasint)ldb); +#endif +} + + +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, 
sycl::buffer, 1>& a, int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, int64_t ldb, + std::int64_t strideb) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + + +void imatcopy(sycl::queue&, transpose trans, + int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) +{ + auto acc = ab.get_access(); + float* A = acc.get_pointer(); + + CBLAS_TRANSPOSE t = + (trans == transpose::nontrans) ? CblasNoTrans : CblasTrans; + +#ifdef COLUMN_MAJOR + cblas_simatcopy(CblasColMajor, t, m, n, alpha, A, lda, ldb); +#endif + +#ifdef ROW_MAJOR + cblas_simatcopy(CblasRowMajor, t, m, n, alpha, A, lda, ldb); +#endif +} + + +void imatcopy(sycl::queue&, transpose trans, + int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) +{ + auto acc = ab.get_access(); + double* A = acc.get_pointer(); + + CBLAS_TRANSPOSE t = + (trans == transpose::nontrans) ? CblasNoTrans : CblasTrans; + +#ifdef COLUMN_MAJOR + cblas_dimatcopy(CblasColMajor, t, m, n, alpha, A, lda, ldb); +#endif + +#ifdef ROW_MAJOR + cblas_dimatcopy(CblasRowMajor, t, m, n, alpha, A, lda, ldb); +#endif +} + +void imatcopy(sycl::queue&, transpose trans, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) +{ + auto acc = ab.get_access(); + + float* A = reinterpret_cast(acc.get_pointer()); + const float* alpha_ptr = reinterpret_cast(&alpha); + + CBLAS_TRANSPOSE t = + (trans == transpose::nontrans) ? 
CblasNoTrans : CblasTrans; + +#ifdef COLUMN_MAJOR + cblas_cimatcopy(CblasColMajor, t, m, n, + alpha_ptr, A, lda, ldb); +#endif + +#ifdef ROW_MAJOR + cblas_cimatcopy(CblasRowMajor, t, m, n, + alpha_ptr, A, lda, ldb); +#endif +} + +void imatcopy(sycl::queue&, transpose trans, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) +{ + auto acc = ab.get_access(); + + double* A = reinterpret_cast(acc.get_pointer()); + const double* alpha_ptr = reinterpret_cast(&alpha); + + CBLAS_TRANSPOSE t = + (trans == transpose::nontrans) ? CblasNoTrans : CblasTrans; + +#ifdef COLUMN_MAJOR + cblas_zimatcopy(CblasColMajor, t, m, n, + alpha_ptr, A, lda, ldb); +#endif + +#ifdef ROW_MAJOR + cblas_zimatcopy(CblasRowMajor, t, m, n, + alpha_ptr, A, lda, ldb); +#endif +} + + +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, float beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, double beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw 
unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +// USM APIs + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef 
ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm_bias", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm_bias", "for row_major layout"); +#endif +} + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major 
layout"); +#endif +} + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemmt", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemmt", "for row_major layout"); +#endif +} + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, float* b, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy", "for row_major layout"); +#endif +} + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy", "for row_major layout"); +#endif +} + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy", "for row_major layout"); +#endif +} + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy", "for column_major layout"); +#endif +#ifdef 
ROW_MAJOR + throw unimplemented("blas", "omatcopy", "for row_major layout"); +#endif +} + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, std::int64_t stridea, float* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, std::int64_t stridea, double* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatcopy2", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatcopy2", "for row_major layout"); +#endif +} + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const 
std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy", "for row_major layout"); +#endif +} + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy", "for row_major layout"); +#endif +} + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy", "for row_major layout"); +#endif +} + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "imatcopy", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "imatcopy", "for row_major layout"); +#endif +} + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, float beta, const float* b, + int64_t ldb, float* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, double beta, const double* b, + int64_t ldb, double* c, 
int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "omatadd", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "omatadd", "for row_major layout"); +#endif +} diff --git a/src/blas/backends/openblas/openblas_level1.cpp b/src/blas/backends/openblas/openblas_level1.cpp new file mode 100644 index 000000000..2fa7c9fa2 --- /dev/null +++ b/src/blas/backends/openblas/openblas_level1.cpp @@ -0,0 +1,68 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#if __has_include()  // NOTE(review): the header name inside __has_include() and after the two #include lines below is empty in this copy - looks stripped; confirm against upstream
+#include
+#else
+#include
+#endif
+
+#include "openblas_common.hpp"
+#include "oneapi/math/exceptions.hpp"
+#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp"
+
+inline float abs_val(float val) {  // magnitude of a real single-precision scalar
+    return std::abs(val);
+}
+
+inline double abs_val(double val) {  // magnitude of a real double-precision scalar
+    return std::abs(val);
+}
+
+inline float abs_val(std::complex val) {  // NOTE(review): template argument of std::complex is missing in this copy; the float return type suggests the single-precision overload - confirm
+    return std::abs(val.real()) + std::abs(val.imag());  // |re| + |im|, not the Euclidean modulus
+}
+
+inline double abs_val(std::complex val) {  // NOTE(review): presumably the double-precision complex overload - confirm
+    return std::abs(val.real()) + std::abs(val.imag());  // |re| + |im|, not the Euclidean modulus
+}
+
+
+namespace oneapi {
+namespace math {
+namespace blas {
+namespace openblas {
+namespace column_major {  // first of two inclusions of the same .cxx body
+
+#define COLUMN_MAJOR  // defined only for the duration of this inclusion
+#include "openblas_level1.cxx"
+#undef COLUMN_MAJOR
+
+} // namespace column_major
+namespace row_major {  // second inclusion: same function definitions, different namespace
+
+#define ROW_MAJOR  // defined only for the duration of this inclusion
+#include "openblas_level1.cxx"
+#undef ROW_MAJOR
+
+} // namespace row_major
+} // namespace openblas
+} // namespace blas
+} // namespace math
+} // namespace oneapi
diff --git a/src/blas/backends/openblas/openblas_level1.cxx b/src/blas/backends/openblas/openblas_level1.cxx
new file mode 100644
index 000000000..70015e714
--- /dev/null
+++ b/src/blas/backends/openblas/openblas_level1.cxx
@@ -0,0 +1,1705 @@
+/*******************************************************************************
+* Copyright 2020-2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with
the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ +#include +// Buffer APIs + +void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_sasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_dasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void asum(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_scasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void asum(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = 
+ ::cblas_dzasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void axpy(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_saxpy((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR, + (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void axpy(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_daxpy((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR, + (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void axpy(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_caxpy((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, + (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void axpy(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zaxpy((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, + (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void axpby(sycl::queue&, int64_t n, float alpha, + sycl::buffer& x, int64_t incx, + float beta, + sycl::buffer& y, int64_t incy) +{ + auto x_acc = x.get_access(); + auto 
y_acc = y.get_access(); + + const float* X = x_acc.get_pointer(); + float* Y = y_acc.get_pointer(); + + blasint N = (blasint)n; + blasint incX = (blasint)incx; + blasint incY = (blasint)incy; + + cblas_saxpby(N, alpha, X, incX, beta, Y, incY); +} + +void axpby(sycl::queue&, int64_t n, double alpha, + sycl::buffer& x, int64_t incx, + double beta, + sycl::buffer& y, int64_t incy) +{ + auto x_acc = x.get_access(); + auto y_acc = y.get_access(); + + const double* X = x_acc.get_pointer(); + double* Y = y_acc.get_pointer(); + + blasint N = (blasint)n; + blasint incX = (blasint)incx; + blasint incY = (blasint)incy; + + cblas_daxpby(N, alpha, X, incX, beta, Y, incY); +} + +void axpby(sycl::queue&, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + std::complex beta, + sycl::buffer, 1>& y, int64_t incy) +{ + auto x_acc = x.get_access(); + auto y_acc = y.get_access(); + + const void* X = static_cast(x_acc.get_pointer()); + void* Y = static_cast(y_acc.get_pointer()); + + blasint N = (blasint)n; + blasint incX = (blasint)incx; + blasint incY = (blasint)incy; + + cblas_caxpby(N, + static_cast(&alpha), + X, incX, + static_cast(&beta), + Y, incY); +} + +void axpby(sycl::queue&, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + std::complex beta, + sycl::buffer, 1>& y, int64_t incy) +{ + auto x_acc = x.get_access(); + auto y_acc = y.get_access(); + + const void* X = static_cast(x_acc.get_pointer()); + void* Y = static_cast(y_acc.get_pointer()); + + blasint N = (blasint)n; + blasint incX = (blasint)incx; + blasint incY = (blasint)incy; + + cblas_zaxpby(N, + static_cast(&alpha), + X, incX, + static_cast(&beta), + Y, incY); +} + +void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_scopy((const int)n, accessor_x.GET_MULTI_PTR, (const 
int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dcopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void copy(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ccopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void copy(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zcopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_sdot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto 
accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_ddot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + double sum = 0.0; + int64_t ix = (incx > 0) ? 0 : (1 - n) * incx; + int64_t iy = (incy > 0) ? 0 : (1 - n) * incy; + + for (int64_t i = 0; i < n; ++i) { + sum += static_cast(accessor_x.GET_MULTI_PTR[ix]) * + static_cast(accessor_y.GET_MULTI_PTR[iy]); + ix += incx; + iy += incy; + } + + accessor_result[0] = sum; + }); + }); +} + +void dotc(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cdotc_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, + accessor_result.GET_MULTI_PTR); + }); + }); +} + +void dotc(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zdotc_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, + accessor_result.GET_MULTI_PTR); + }); + }); +} + +void dotu(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, 
int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cdotu_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, + accessor_result.GET_MULTI_PTR); + }); + }); +} + +void dotu(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zdotu_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, + accessor_result.GET_MULTI_PTR); + }); + }); +} + +void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_isamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.template get_access(cgh); + auto accessor_result = result.template get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_idamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamin(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result, 
oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_icamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamin(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_izamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_isamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_idamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamax(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + 
auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_icamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void iamax(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result, oneapi::math::index_base base) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = ::cblas_izamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + accessor_result[0]++; + }); + }); +} + +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.template get_access(cgh); + auto accessor_result = result.template get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_snrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_dnrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_scnrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer, 1>& 
x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_dznrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); + }); + }); +} + +void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, float c, float s) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_srot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, (const float)c, (const float)s); + }); + }); +} + +void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, double c, double s) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + /* Double-precision plane rotation: c and s are double and cblas_drot takes const double, so pass them at full precision instead of narrowing to float. */ ::cblas_drot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, (const double)c, (const double)s); + }); + }); +} + +void rot(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, float c, float s) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_csrot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, (const float)c, + (const float)s); + }); + }); +} + +void rot(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, double c, double s) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zdrot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + 
accessor_y.GET_MULTI_PTR, (const int)incy, (const double)c, + (const double)s); + }); + }); +} + +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + auto accessor_s = s.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_srotg(accessor_a.GET_MULTI_PTR, accessor_b.GET_MULTI_PTR, + accessor_c.GET_MULTI_PTR, accessor_s.GET_MULTI_PTR); + }); + }); +} + +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + auto accessor_s = s.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_drotg(accessor_a.GET_MULTI_PTR, accessor_b.GET_MULTI_PTR, + accessor_c.GET_MULTI_PTR, accessor_s.GET_MULTI_PTR); + }); + }); +} + +void rotg(sycl::queue& queue, + sycl::buffer, 1>& a, + sycl::buffer, 1>& b, + sycl::buffer& c, + sycl::buffer, 1>& s) +{ + queue.submit([&](sycl::handler& cgh) { + + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + auto accessor_s = s.get_access(cgh); + + host_task(cgh, [=]() { + + ::cblas_crotg( + const_cast( + reinterpret_cast(accessor_a.GET_MULTI_PTR) + ), + const_cast( + reinterpret_cast(accessor_b.GET_MULTI_PTR) + ), + accessor_c.GET_MULTI_PTR, + const_cast( + reinterpret_cast(accessor_s.GET_MULTI_PTR) + ) + ); + + }); + }); +} + +void rotg(sycl::queue& queue, + sycl::buffer, 1>& a, + sycl::buffer, 1>& b, + sycl::buffer& c, + sycl::buffer, 1>& s) +{ + queue.submit([&](sycl::handler& cgh) { + + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + auto accessor_s = s.get_access(cgh); + + host_task(cgh, [=]() { + + void* 
a_ptr = const_cast( + reinterpret_cast(accessor_a.GET_MULTI_PTR) + ); + + void* b_ptr = const_cast( + reinterpret_cast(accessor_b.GET_MULTI_PTR) + ); + + double* c_ptr = accessor_c.GET_MULTI_PTR; + + void* s_ptr = const_cast( + reinterpret_cast(accessor_s.GET_MULTI_PTR) + ); + + ::cblas_zrotg(a_ptr, b_ptr, c_ptr, s_ptr); + }); + }); +} + + +void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_param = param.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_srotm((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_param.GET_MULTI_PTR); + }); + }); +} + +void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_param = param.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_drotm((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_param.GET_MULTI_PTR); + }); + }); +} + +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_d1 = d1.get_access(cgh); + auto accessor_d2 = d2.get_access(cgh); + auto accessor_x1 = x1.get_access(cgh); + auto accessor_param = param.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_srotmg(accessor_d1.GET_MULTI_PTR, accessor_d2.GET_MULTI_PTR, + accessor_x1.GET_MULTI_PTR, (float)y1, accessor_param.GET_MULTI_PTR); + }); + }); +} + +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { + auto 
accessor_d1 = d1.get_access(cgh); + auto accessor_d2 = d2.get_access(cgh); + auto accessor_x1 = x1.get_access(cgh); + auto accessor_param = param.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_drotmg(accessor_d1.GET_MULTI_PTR, accessor_d2.GET_MULTI_PTR, + accessor_x1.GET_MULTI_PTR, (double)y1, accessor_param.GET_MULTI_PTR); + }); + }); +} + +void scal(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sscal((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR, + (const int)std::abs(incx)); + }); + }); +} + +void scal(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dscal((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR, + (const int)std::abs(incx)); + }); + }); +} + +void scal(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cscal((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, + (const int)std::abs(incx)); + }); + }); +} + +void scal(sycl::queue& queue, int64_t n, float alpha, sycl::buffer, 1>& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_csscal((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR, + (const int)std::abs(incx)); + }); + }); +} + +void scal(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zscal((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, + (const int)std::abs(incx)); + }); + }); +} + +void scal(sycl::queue& queue, 
int64_t n, double alpha, sycl::buffer, 1>& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zdscal((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR, + (const int)std::abs(incx)); + }); + }); +} + +void sdsdot(sycl::queue& queue, int64_t n, float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_result = result.get_access(cgh); + host_task(cgh, [=]() { + accessor_result[0] = + ::cblas_sdsdot((const int)n, (const float)sb, accessor_x.GET_MULTI_PTR, + (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void swap(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void 
swap(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +// USM APIs + +sycl::event asum(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_sasum((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event asum(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_dasum((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event asum(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + float* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_scasum((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event asum(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + double* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < 
num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_dzasum((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event axpy(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_saxpy((const int)n, (const float)alpha, x, (const int)incx, y, (const int)incy); + }); + }); + return done; +} + +sycl::event axpy(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_daxpy((const int)n, (const double)alpha, x, (const int)incx, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event axpy(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_caxpy((const int)n, (const void*)&alpha, x, (const int)incx, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event axpy(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zaxpy((const int)n, (const void*)&alpha, x, (const int)incx, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + float beta, float* y, int64_t incy, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpby", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpby", "for row_major layout"); +#endif +} + +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpby", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpby", "for row_major layout"); +#endif +} + +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpby", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpby", "for row_major layout"); +#endif +} + +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "axpby", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "axpby", "for row_major layout"); +#endif +} + +sycl::event copy(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_scopy((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event copy(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_dcopy((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event copy(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_ccopy((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event copy(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_zcopy((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, float* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_sdot((const int)n, x, (const int)incx, y, (const int)incy); + }); + 
}); + return done; +} + +sycl::event dot(sycl::queue& queue, int64_t n, const double* x, int64_t incx, const double* y, + int64_t incy, double* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_ddot((const int)n, x, (const int)incx, y, (const int)incy); + }); + }); + return done; +} + +sycl::event dot(sycl::queue& queue, int64_t n, + const float* x, int64_t incx, + const float* y, int64_t incy, + double* result, + const std::vector& dependencies) { + return queue.submit([&](sycl::handler& cgh) { + for (const auto& dep : dependencies) { + cgh.depends_on(dep); + } + + host_task(cgh, [=]() { + double sum = 0.0; + int64_t ix = (incx > 0) ? 0 : (1 - n) * incx; + int64_t iy = (incy > 0) ? 0 : (1 - n) * incy; + + for (int64_t i = 0; i < n; ++i) { + sum += static_cast(x[ix]) * + static_cast(y[iy]); + ix += incx; + iy += incy; + } + + *result = sum; + }); + }); +} + +sycl::event dotc(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies) { + return queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + std::complex sum = { 0.0f, 0.0f }; + int64_t ix = (incx > 0) ? 0 : (1 - n) * incx; + int64_t iy = (incy > 0) ? 
0 : (1 - n) * incy; + + for (int64_t i = 0; i < n; ++i) { + sum += std::conj(x[ix]) * y[iy]; + ix += incx; + iy += incy; + } + + *result = sum; + }); + }); +} + +sycl::event dotc(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies) { + return queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + std::complex sum = { 0.0, 0.0 }; + int64_t ix = (incx > 0) ? 0 : (1 - n) * incx; + int64_t iy = (incy > 0) ? 0 : (1 - n) * incy; + + for (int64_t i = 0; i < n; ++i) { + sum += std::conj(x[ix]) * y[iy]; + ix += incx; + iy += incy; + } + + *result = sum; + }); + }); +} + +sycl::event dotu(sycl::queue& queue, int64_t n, + const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, + std::complex* result, + const std::vector& dependencies) { + return queue.submit([&](sycl::handler& cgh) { + for (const auto& dep : dependencies) { + cgh.depends_on(dep); + } + + host_task(cgh, [=]() { + std::complex sum = { 0.0f, 0.0f }; + int64_t ix = (incx > 0) ? 0 : (1 - n) * incx; + int64_t iy = (incy > 0) ? 0 : (1 - n) * incy; + + for (int64_t i = 0; i < n; ++i) { + sum += x[ix] * y[iy]; + ix += incx; + iy += incy; + } + + *result = sum; + }); + }); +} +sycl::event dotu(sycl::queue& queue, int64_t n, + const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, + std::complex* result, + const std::vector& dependencies) { + return queue.submit([&](sycl::handler& cgh) { + for (const auto& dep : dependencies) { + cgh.depends_on(dep); + } + + host_task(cgh, [=]() { + std::complex sum = { 0.0, 0.0 }; + int64_t ix = (incx > 0) ? 0 : (1 - n) * incx; + int64_t iy = (incy > 0) ? 
0 : (1 - n) * incy; + + for (int64_t i = 0; i < n; ++i) { + sum += x[ix] * y[iy]; + ix += incx; + iy += incy; + } + + *result = sum; + }); + }); +} + +sycl::event iamin(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_isamin((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event iamin(sycl::queue& queue, int64_t n, const double* x, int64_t incx, int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_idamin((const int)n, x, (const int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event iamin(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_icamin((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event iamin(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = 
queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_izamin((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event iamax(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_isamax((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event iamax(sycl::queue& queue, int64_t n, const double* x, int64_t incx, int64_t* result, + oneapi::math::index_base base, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_idamax((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event iamax(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_icamax((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event 
iamax(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, oneapi::math::index_base base, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_izamax((int)n, x, (int)incx); + if (base == oneapi::math::index_base::one && n >= 1 && incx >= 1) + result[0]++; + }); + }); + return done; +} + +sycl::event nrm2(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_snrm2((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event nrm2(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_dnrm2((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event nrm2(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + float* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_scnrm2((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event nrm2(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + double* result, const std::vector& 
dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { result[0] = ::cblas_dznrm2((const int)n, x, (const int)std::abs(incx)); }); + }); + return done; +} + +sycl::event rot(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + float c, float s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_srot((const int)n, x, (const int)incx, y, (const int)incy, (const float)c, + (const float)s); + }); + }); + return done; +} + +sycl::event rot(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + double c, double s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + /* Double-precision plane rotation: c and s are double and cblas_drot takes const double, so pass them at full precision instead of narrowing to float. */ ::cblas_drot((const int)n, x, (const int)incx, y, (const int)incy, (const double)c, + (const double)s); + }); + }); + return done; +} + +sycl::event rot(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, float c, float s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_csrot((const int)n, x, (const int)incx, y, (const int)incy, (const float)c, + (const float)s); + }); + }); + return done; +} + +sycl::event rot(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, double c, double s, + const std::vector& dependencies) { + 
auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zdrot((const int)n, x, (const int)incx, y, (const int)incy, (const double)c, + (const double)s); + }); + }); + return done; +} + +sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { ::cblas_srotg(a, b, c, s); }); + }); + return done; +} + +sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { ::cblas_drotg(a, b, c, s); }); + }); + return done; +} + +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, float* c, + std::complex* s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { ::cblas_crotg(a, b, c, s); }); + }); + return done; +} + +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, double* c, + std::complex* s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { ::cblas_zrotg(a, b, c, s); }); + }); + return done; +} + +sycl::event rotm(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, 
+ float* param, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_srotm((const int)n, x, (const int)incx, y, (const int)incy, param); + }); + }); + return done; +} + +sycl::event rotm(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + double* param, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_drotm((const int)n, x, (const int)incx, y, (const int)incy, param); + }); + }); + return done; +} + +sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, float* param, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, + [=]() { ::cblas_srotmg(d1, d2, x1, (float)y1, param); }); + }); + return done; +} + +sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, double* param, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_drotmg(d1, d2, x1, (double)y1, param); }); + }); + return done; +} + +sycl::event scal(sycl::queue& queue, int64_t n, float alpha, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, 
[=]() { + ::cblas_sscal((const int)n, (const float)alpha, x, (const int)std::abs(incx)); + }); + }); + return done; +} + +sycl::event scal(sycl::queue& queue, int64_t n, double alpha, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dscal((const int)n, (const double)alpha, x, (const int)std::abs(incx)); + }); + }); + return done; +} + +sycl::event scal(sycl::queue& queue, int64_t n, std::complex alpha, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cscal((const int)n, (const void*)&alpha, x, (const int)std::abs(incx)); + }); + }); + return done; +} + +sycl::event scal(sycl::queue& queue, int64_t n, float alpha, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_csscal((const int)n, (const float)alpha, x, (const int)std::abs(incx)); + }); + }); + return done; +} + +sycl::event scal(sycl::queue& queue, int64_t n, std::complex alpha, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zscal((const int)n, (const void*)&alpha, x, (const int)std::abs(incx)); + }); + }); + return done; +} + +sycl::event scal(sycl::queue& queue, int64_t n, double alpha, 
std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zdscal((const int)n, (const double)alpha, x, (const int)std::abs(incx)); + }); + }); + return done; +} + +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_sdsdot((const int)n, (const float)sb, x, (const int)incx, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event swap(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_sswap((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event swap(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_dswap((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event swap(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + 
int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_cswap((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} + +sycl::event swap(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task( + cgh, [=]() { ::cblas_zswap((const int)n, x, (const int)incx, y, (const int)incy); }); + }); + return done; +} diff --git a/src/blas/backends/openblas/openblas_level2.cpp b/src/blas/backends/openblas/openblas_level2.cpp new file mode 100644 index 000000000..977956782 --- /dev/null +++ b/src/blas/backends/openblas/openblas_level2.cpp @@ -0,0 +1,50 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#if __has_include() +#include +#else +#include +#endif + +#include "openblas_common.hpp" +#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp" + +namespace oneapi { +namespace math { +namespace blas { +namespace openblas { +namespace column_major { + +#define MAJOR CblasColMajor +#include "openblas_level2.cxx" +#undef MAJOR + +} // namespace column_major +namespace row_major { + +#define MAJOR CblasRowMajor +#include "openblas_level2.cxx" +#undef MAJOR + +} // namespace row_major +} // namespace openblas +} // namespace blas +} // namespace math +} // namespace oneapi diff --git a/src/blas/backends/openblas/openblas_level2.cxx b/src/blas/backends/openblas/openblas_level2.cxx new file mode 100644 index 000000000..215fa1e43 --- /dev/null +++ b/src/blas/backends/openblas/openblas_level2.cxx @@ -0,0 +1,2138 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +// Buffer APIs + +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, float beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const float)alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx, (const float)beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, double beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const double)alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx, (const double)beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + 
::cblas_cgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const float)beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = 
x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const double)beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void ger(sycl::queue& queue, int64_t m, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, + int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + 
auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sger(MAJOR, (const int)m, (const int)n, (const float)alpha, + accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, + (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void ger(sycl::queue& queue, int64_t m, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, + int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dger(MAJOR, (const int)m, (const int)n, (const double)alpha, + accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, + (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, + accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, + (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, + accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, + (const int)incy, accessor_a.GET_MULTI_PTR, 
(const int)lda); + }); + }); +} + +void geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, + accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, + (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, + accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, + (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_chbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, + 
sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zhbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_chemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zhemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void her(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + sycl::buffer, 
1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void her(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, + (const int)lda); + }); + }); +} + +void her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, 
accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, + (const int)lda); + }); + }); +} + +void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx, std::complex beta, sycl::buffer, 1>& y, + int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_chpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx, std::complex beta, sycl::buffer, 1>& y, + int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zhpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_chpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer, 1>& x, int64_t incx, + 
sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zhpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_chpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zhpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const 
int)k, + (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const float)beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const double)beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& ap, + sycl::buffer& x, int64_t incx, float beta, sycl::buffer& y, + int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, + (const int)incx, (const float)beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer& ap, sycl::buffer& x, int64_t incx, double beta, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, + (const int)incx, (const 
double)beta, accessor_y.GET_MULTI_PTR, + (const int)incy); + }); + }); +} + +void spr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void spr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_ap = ap.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, 
accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR); + }); + }); +} + +void symv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx, float beta, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const float)beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void symv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx, double beta, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const double)beta, + accessor_y.GET_MULTI_PTR, (const int)incy); + }); + }); +} + +void syr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void syr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a, 
int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_a.GET_MULTI_PTR, (const int)lda); + }); + }); +} + +void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, + int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, + (const int)lda); + }); + }); +} + +void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, + int64_t lda) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_x = x.get_access(cgh); + auto accessor_y = y.get_access(cgh); + auto accessor_a = a.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, + (const int)lda); + }); + }); +} + +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_stbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), 
(const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& 
a, int64_t lda, sycl::buffer& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_stbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztbsv(MAJOR, convert_to_cblas_uplo(upper_lower), 
convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, + (const int)incx); + }); + }); +} + +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_stpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = 
ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_stpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 
1>& ap, sycl::buffer, 1>& x, + int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_ap = ap.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR, + accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_strmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, 
accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_strsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = 
a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_x = x.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx); + }); + }); +} + +// USM APIs + +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + float alpha, const float* a, int64_t lda, const float* x, int64_t incx, float beta, + float* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const float)alpha, a, (const int)lda, x, + (const int)incx, (const float)beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + double alpha, const double* a, int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const double)alpha, a, (const int)lda, x, + (const int)incx, (const double)beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const void*)&alpha, a, (const int)lda, x, + (const int)incx, (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const int)kl, (const int)ku, (const void*)&alpha, a, (const int)lda, x, + (const int)incx, (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const 
std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const float)alpha, a, (const int)lda, x, (const int)incx, + (const float)beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const double)alpha, a, (const int)lda, x, (const int)incx, + (const double)beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + 
std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event ger(sycl::queue& queue, int64_t m, int64_t n, float alpha, const float* x, int64_t incx, + const float* y, int64_t incy, float* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sger(MAJOR, (const int)m, (const int)n, (const float)alpha, x, (const int)incx, + y, (const int)incy, a, (const int)lda); + }); + }); + return done; +} + +sycl::event ger(sycl::queue& queue, int64_t m, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dger(MAJOR, (const int)m, (const int)n, (const double)alpha, x, (const int)incx, + y, (const int)incy, a, (const int)lda); + }); + }); + return done; +} + +sycl::event gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, + (const int)incx, y, (const int)incy, a, (const int)lda); + }); + }); + return done; +} + +sycl::event gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, + (const int)incx, y, (const int)incy, a, (const int)lda); + }); + }); + return done; +} + +sycl::event geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, + (const int)incx, y, (const int)incy, a, (const int)lda); + }); + }); + return done; +} + +sycl::event geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, + (const int)incx, y, (const int)incy, a, (const int)lda); + }); + }); 
+ return done; +} + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_chbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zhbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, const std::complex* x, + int64_t incx, std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_chemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, a, (const int)lda, x, (const int)incx, 
+ (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, const std::complex* x, + int64_t incx, std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zhemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + const std::complex* x, int64_t incx, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, x, (const int)incx, a, (const int)lda); + }); + }); + return done; +} + +sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + const std::complex* x, int64_t incx, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, x, (const int)incx, a, (const int)lda); + }); + }); + return done; +} + +sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const 
std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, + (const int)lda); + }); + }); + return done; +} + +sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, + (const int)lda); + }); + }); + return done; +} + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* ap, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_chpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, ap, x, (const int)incx, (const void*)&beta, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* ap, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& 
dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zhpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, ap, x, (const int)incx, (const void*)&beta, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + const std::complex* x, int64_t incx, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_chpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, x, (const int)incx, ap); + }); + }); + return done; +} + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + const std::complex* x, int64_t incx, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zhpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, x, (const int)incx, ap); + }); + }); + return done; +} + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_chpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const 
void*)&alpha, x, (const int)incx, y, (const int)incy, ap); + }); + }); + return done; +} + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zhpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const void*)&alpha, x, (const int)incx, y, (const int)incy, ap); + }); + }); + return done; +} + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, float alpha, + const float* a, int64_t lda, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const float)alpha, a, (const int)lda, x, (const int)incx, + (const float)beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + const double* a, int64_t lda, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dsbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, + (const double)alpha, a, (const int)lda, x, (const int)incx, + (const double)beta, y, (const int)incy); + }); + }); + return done; 
+} + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* ap, + const float* x, int64_t incx, float beta, float* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, ap, x, (const int)incx, (const float)beta, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* ap, + const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, ap, x, (const int)incx, (const double)beta, y, + (const int)incy); + }); + }); + return done; +} + +sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, float* ap, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, x, (const int)incx, ap); + }); + }); + return done; +} + +sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, double* ap, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t 
i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, x, (const int)incx, ap); + }); + }); + return done; +} + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, const float* y, int64_t incy, float* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, x, (const int)incx, y, (const int)incy, ap); + }); + }); + return done; +} + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, x, (const int)incx, y, (const int)incy, ap); + }); + }); + return done; +} + +sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* a, + int64_t lda, const float* x, int64_t incx, float beta, float* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, a, (const int)lda, x, (const int)incx, + (const float)beta, y, (const int)incy); + }); + }); + 
return done; +} + +sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* a, + int64_t lda, const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dsymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, a, (const int)lda, x, (const int)incx, + (const double)beta, y, (const int)incy); + }); + }); + return done; +} + +sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, float* a, int64_t lda, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, x, (const int)incx, a, (const int)lda); + }); + }); + return done; +} + +sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, double* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dsyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, x, (const int)incx, a, (const int)lda); + }); + }); + return done; +} + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, const float* y, int64_t incy, float* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + 
int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const float)alpha, x, (const int)incx, y, (const int)incy, a, + (const int)lda); + }); + }); + return done; +} + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dsyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, + (const double)alpha, x, (const int)incx, y, (const int)incy, a, + (const int)lda); + }); + }); + return done; +} + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_stbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + 
::cblas_dtbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_stbsv(MAJOR, convert_to_cblas_uplo(upper_lower), 
convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + 
convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a, + (const int)lda, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* ap, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_stpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* ap, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t 
n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* ap, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_stpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* ap, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; 
i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx); + }); + }); + return done; +} + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const float* a, int64_t lda, float* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_strmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b, + (const int)incx); + }); + }); + return done; +} + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const double* a, int64_t lda, double* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + 
convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b, + (const int)incx); + }); + }); + return done; +} + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b, + (const int)incx); + }); + }); + return done; +} + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b, + (const int)incx); + }); + }); + return done; +} + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_strsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x, + (const int)incx); + }); + }); + return done; +} + 
+sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x, + (const int)incx); + }); + }); + return done; +} + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x, + (const int)incx); + }); + }); + return done; +} + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x, + (const int)incx); + }); + }); + return done; +} diff --git a/src/blas/backends/openblas/openblas_level3.cpp b/src/blas/backends/openblas/openblas_level3.cpp new file mode 100644 index 
000000000..82487e025 --- /dev/null +++ b/src/blas/backends/openblas/openblas_level3.cpp @@ -0,0 +1,55 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#if __has_include() +#include +#else +#include +#endif + +#include "openblas_common.hpp" +#include "oneapi/math/exceptions.hpp" +#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp" + +namespace oneapi { +namespace math { +namespace blas { +namespace openblas { +namespace column_major { + +#define MAJOR CblasColMajor +#define COLUMN_MAJOR +#include "openblas_level3.cxx" +#undef MAJOR +#undef COLUMN_MAJOR + +} // namespace column_major +namespace row_major { + +#define MAJOR CblasRowMajor +#define ROW_MAJOR +#include "openblas_level3.cxx" +#undef MAJOR +#undef ROW_MAJOR + +} // namespace row_major +} // namespace openblas +} // namespace blas +} // namespace math +} // namespace oneapi diff --git a/src/blas/backends/openblas/openblas_level3.cxx b/src/blas/backends/openblas/openblas_level3.cxx new file mode 100644 index 000000000..5a28f430c --- /dev/null +++ b/src/blas/backends/openblas/openblas_level3.cxx @@ -0,0 +1,1146 @@ +/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* 
Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +// Buffer APIs + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_sgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const float)alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const float)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const double)alpha, + 
accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + sycl::half alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, sycl::half beta, + sycl::buffer& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm", "for column_major layout"); 
+#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm", "for row_major layout"); +#endif +} + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm", "for row_major layout"); +#endif +} + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm", "for row_major layout"); +#endif +} + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_chemm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, + accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = 
b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zhemm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, + accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer, 1>& a, int64_t lda, float beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_cherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const float)alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const float)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, + sycl::buffer, 1>& a, int64_t lda, double beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const double)alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const double)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, float beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, 
[=]() { + ::cblas_cher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const float)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* her2k (buffer API, double beta): host task calls cblas_zher2k with the accessors for a, b and c; alpha is passed by address as const void*, beta by value. */ +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, double beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* symm (buffer API, float): host task calls cblas_ssymm with the accessors for a, b and c; side/uplo go through the convert_to_cblas_* helpers and the int64_t sizes are narrowed to int. */ +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + float beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const float)beta, + accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + /* symm (buffer API, double): same shape as the float overload but forwards to cblas_dsymm. */ +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + double beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler&
cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const double)beta, + accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + /* symm (buffer API, complex scalars): host task calls cblas_csymm; alpha and beta are passed by address as const void*. */ +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_csymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, + accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + /* symm (buffer API, complex scalars): same shape as the cblas_csymm overload but forwards to cblas_zsymm. */ +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zsymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, + accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + /* syrk (buffer API, float): host task calls cblas_ssyrk with the accessors for a and c; the int64_t sizes are narrowed to int. */ +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans,
int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, float beta, sycl::buffer& c, + int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const float)alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const float)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* syrk (buffer API, double): host task calls cblas_dsyrk with the accessors for a and c; the int64_t sizes are narrowed to int. */ +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, + sycl::buffer& a, int64_t lda, double beta, sycl::buffer& c, + int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const double)alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const double)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* syrk (buffer API, complex scalars): host task calls cblas_csyrk; alpha and beta are passed by address as const void*. */ +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_csyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* syrk (buffer API, complex scalars): same shape as the cblas_csyrk overload but forwards to cblas_zsyrk. */ +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a =
a.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* syr2k (buffer API, float): host task calls cblas_ssyr2k with the accessors for a, b and c; the int64_t sizes are narrowed to int. */ +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + float beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ssyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const float)alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb, + (const float)beta, accessor_c.GET_MULTI_PTR, (const int)ldc); + }); + }); +} + /* syr2k (buffer API, double): same shape as the float overload but forwards to cblas_dsyr2k. */ +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const double)alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* syr2k (buffer API, complex scalars): host task calls cblas_csyr2k; alpha and beta are passed by address as const void*. */ +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { +
queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_csyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* syr2k (buffer API, complex scalars): same shape as the cblas_csyr2k overload but forwards to cblas_zsyr2k. */ +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + auto accessor_c = c.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_zsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, + accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); + }); + }); +} + /* trmm (buffer API, float): host task calls cblas_strmm with the accessors for a and b; side, uplo, trans and diag go through the convert_to_cblas_* helpers and the int64_t sizes are narrowed to int. */ +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_strmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trmm (buffer API, double): same shape as the float overload but forwards to cblas_dtrmm. */ +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t
m, int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtrmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trmm (buffer API, complex alpha): host task calls cblas_ctrmm with the accessors for a and b; alpha is passed by address as const void*. */ +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctrmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trmm (buffer API, complex alpha): same shape as the cblas_ctrmm overload but forwards to cblas_ztrmm. */ +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztrmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trsm (buffer API, float): host task calls cblas_strsm with the accessors for a and b; side, uplo, trans and diag go through the convert_to_cblas_* helpers and the int64_t sizes are narrowed to int. */ +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa,
diag unit_diag, + int64_t m, int64_t n, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_strsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trsm (buffer API, double): same shape as the float overload but forwards to cblas_dtrsm. */ +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_dtrsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trsm (buffer API, complex alpha): host task calls cblas_ctrsm with the accessors for a and b; alpha is passed by address as const void*. */ +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ctrsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + /* trsm (buffer API, complex alpha): same shape as the cblas_ctrsm overload but forwards to cblas_ztrsm. */ +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose
transa, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { + auto accessor_a = a.get_access(cgh); + auto accessor_b = b.get_access(cgh); + host_task(cgh, [=]() { + ::cblas_ztrsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb); + }); + }); +} + +// USM APIs + /* gemm (USM, float): registers every event in dependencies with depends_on, then a host task calls cblas_sgemm on the raw a/b/c pointers; int64_t sizes are narrowed to int and the event from queue.submit is returned. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, + float beta, float* c, int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_sgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const float)alpha, a, + (const int)lda, b, (const int)ldb, (const float)beta, c, (const int)ldc); + }); + }); + return done; +} + /* gemm (USM, double): same shape as the float overload but forwards to cblas_dgemm. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const double)alpha, a, + (const int)lda, b, (const int)ldb,
(const double)beta, c, (const int)ldc); + }); + }); + return done; +} + /* gemm (USM, complex scalars): registers every event in dependencies, then a host task calls cblas_cgemm; alpha and beta are passed by address as const void*; returns the event from queue.submit. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const void*)&alpha, a, + (const int)lda, b, (const int)ldb, (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* gemm (USM, complex scalars): same shape as the cblas_cgemm overload but forwards to cblas_zgemm. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), + (const int)m, (const int)n, (const int)k, (const void*)&alpha, a, + (const int)lda, b, (const int)ldb, (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* gemm (USM, sycl::half operands and scalars): no CBLAS call is made; throws unimplemented under COLUMN_MAJOR or ROW_MAJOR. NOTE(review): there is no return statement if neither macro is defined. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, const sycl::half* a, int64_t lda, const sycl::half* b, + int64_t ldb, sycl::half beta, sycl::half* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw
unimplemented("blas", "gemm", "for row_major layout"); +#endif +} + /* gemm (USM, sycl::half inputs with float scalars and float output): no CBLAS call is made; throws unimplemented under COLUMN_MAJOR or ROW_MAJOR. NOTE(review): there is no return statement if neither macro is defined. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, const sycl::half* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm", "for row_major layout"); +#endif +} + /* gemm (USM, bfloat16 inputs with float scalars and float output): no CBLAS call is made; throws unimplemented under COLUMN_MAJOR or ROW_MAJOR. NOTE(review): there is no return statement if neither macro is defined. */ +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const bfloat16* a, int64_t lda, const bfloat16* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { +#ifdef COLUMN_MAJOR + throw unimplemented("blas", "gemm", "for column_major layout"); +#endif +#ifdef ROW_MAJOR + throw unimplemented("blas", "gemm", "for row_major layout"); +#endif +} + /* hemm (USM, complex scalars): registers every event in dependencies with depends_on, then a host task calls cblas_chemm on the raw pointers; alpha and beta are passed by address as const void*; returns the event from queue.submit. */ +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_chemm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* hemm (USM, complex scalars): same shape as the cblas_chemm overload but forwards to cblas_zhemm. */ +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto
done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zhemm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* herk (USM, float alpha/beta): registers every event in dependencies, then a host task calls cblas_cherk on the raw a/c pointers; int64_t sizes are narrowed to int and the event from queue.submit is returned. */ +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const std::complex* a, int64_t lda, float beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const float)alpha, a, (const int)lda, + (const float)beta, c, (const int)ldc); + }); + }); + return done; +} + /* herk (USM, double alpha/beta): same shape as the float overload but forwards to cblas_zherk. */ +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const std::complex* a, int64_t lda, double beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const double)alpha, a, (const int)lda, + (const double)beta, c, (const int)ldc); + }); + }); + return done; +} + /* her2k (USM, float beta): registers every event in dependencies, then a host task calls cblas_cher2k on the raw pointers; alpha is passed by address as const void*, beta by value; returns the event from queue.submit. */ +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, float beta,
std::complex* c, + int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_cher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, + (const int)ldb, (const float)beta, c, (const int)ldc); + }); + }); + return done; +} + /* her2k (USM, double beta): same shape as the float-beta overload but forwards to cblas_zher2k. */ +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, double beta, std::complex* c, + int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, + (const int)ldb, (const double)beta, c, (const int)ldc); + }); + }); + return done; +} + /* symm (USM, float): registers every event in dependencies, then a host task calls cblas_ssymm on the raw a/b/c pointers; int64_t sizes are narrowed to int and the event from queue.submit is returned. */ +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, float beta, + float* c, int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const float)alpha, a, (const int)lda, b, (const int)ldb, + (const float)beta, c, (const int)ldc); + }); + }); + return done; +} + /* symm (USM, double): same shape as the float overload but forwards to cblas_dsymm. */ +sycl::event symm(sycl::queue& queue,
side left_right, uplo upper_lower, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, const double* b, int64_t ldb, + double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dsymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const double)alpha, a, (const int)lda, b, (const int)ldb, + (const double)beta, c, (const int)ldc); + }); + }); + return done; +} + /* symm (USM, complex scalars): registers every event in dependencies, then a host task calls cblas_csymm; alpha and beta are passed by address as const void*; returns the event from queue.submit. */ +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_csymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* symm (USM, complex scalars): same shape as the cblas_csymm overload but forwards to cblas_zsymm. */ +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zsymm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower),
(const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* syrk (USM, float): registers every event in dependencies, then a host task calls cblas_ssyrk on the raw a/c pointers; int64_t sizes are narrowed to int and the event from queue.submit is returned. */ +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const float)alpha, a, (const int)lda, + (const float)beta, c, (const int)ldc); + }); + }); + return done; +} + /* syrk (USM, double): same shape as the float overload but forwards to cblas_dsyrk. */ +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const double)alpha, a, (const int)lda, + (const double)beta, c, (const int)ldc); + }); + }); + return done; +} + /* syrk (USM, complex scalars): registers every event in dependencies, then a host task calls cblas_csyrk; alpha and beta are passed by address as const void*; returns the event from queue.submit. */ +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_csyrk(MAJOR, convert_to_cblas_uplo(upper_lower),
convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, + (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* syrk (USM, complex scalars): same shape as the cblas_csyrk overload but forwards to cblas_zsyrk. */ +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, + (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* syr2k (USM, float): registers every event in dependencies, then a host task calls cblas_ssyr2k on the raw a/b/c pointers; int64_t sizes are narrowed to int and the event from queue.submit is returned. */ +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, float beta, + float* c, int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ssyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const float)alpha, a, (const int)lda, b, + (const int)ldb, (const float)beta, c, (const int)ldc); + }); + }); + return done; +} + /* syr2k (USM, double): same shape as the float overload but forwards to cblas_dsyr2k. */ +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, const double* b, int64_t ldb, + double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } +
host_task(cgh, [=]() { + ::cblas_dsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const double)alpha, a, (const int)lda, b, + (const int)ldb, (const double)beta, c, (const int)ldc); + }); + }); + return done; +} + /* syr2k (USM, complex scalars): registers every event in dependencies, then a host task calls cblas_csyr2k; alpha and beta are passed by address as const void*; returns the event from queue.submit. */ +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_csyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, + (const int)ldb, (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* syr2k (USM, complex scalars): same shape as the cblas_csyr2k overload but forwards to cblas_zsyr2k. */ +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_zsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, + (const int)ldb, (const void*)&beta, c, (const int)ldc); + }); + }); + return done; +} + /* trmm (USM, float): registers every event in dependencies, then a host task calls cblas_strmm on the raw a/b pointers; side, uplo, trans and diag go through the convert_to_cblas_* helpers; returns the event from queue.submit. */ +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, int64_t lda, + float* b, int64_t ldb, const std::vector& dependencies) { + auto
done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_strmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const float)alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + /* trmm (USM, double): same shape as the float overload but forwards to cblas_dtrmm. */ +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, int64_t lda, + double* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtrmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const double)alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + /* trmm (USM, complex alpha): registers every event in dependencies, then a host task calls cblas_ctrmm; alpha is passed by address as const void*; returns the event from queue.submit. */ +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctrmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + /* trmm (USM, complex alpha): same shape as the cblas_ctrmm overload but forwards to cblas_ztrmm. */ +sycl::event trmm(sycl::queue& queue,
side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztrmm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, int64_t lda, + float* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_strsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const float)alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, int64_t lda, + double* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_dtrsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), 
convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const double)alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ctrsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cblas_ztrsm(MAJOR, convert_to_cblas_side(left_right), + convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); + }); + }); + return done; +} diff --git a/src/blas/backends/openblas/openblas_wrappers.cpp b/src/blas/backends/openblas/openblas_wrappers.cpp new file mode 100644 index 000000000..568a54aa2 --- /dev/null +++ b/src/blas/backends/openblas/openblas_wrappers.cpp @@ -0,0 +1,35 @@ 
+/******************************************************************************* +* Copyright 2020-2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include "blas/function_table.hpp" +#include "oneapi/math/blas/detail/openblas/onemath_blas_openblas.hpp" + +#define WRAPPER_VERSION 1 + +extern "C" ONEMATH_EXPORT blas_function_table_t onemath_blas_table = { + WRAPPER_VERSION, +#define BACKEND openblas +#define MAJOR column_major +#include "../backend_wrappers.cxx" +#undef MAJOR +#define MAJOR row_major +#include "../backend_wrappers.cxx" +#undef MAJOR +#undef BACKEND +}; diff --git a/src/config.hpp.in b/src/config.hpp.in index 9b737853e..ae0dc6702 100644 --- a/src/config.hpp.in +++ b/src/config.hpp.in @@ -30,6 +30,7 @@ #cmakedefine ONEMATH_ENABLE_MKLCPU_BACKEND #cmakedefine ONEMATH_ENABLE_MKLGPU_BACKEND #cmakedefine ONEMATH_ENABLE_NETLIB_BACKEND +#cmakedefine ONEMATH_ENABLE_OPENBLAS_BACKEND #cmakedefine ONEMATH_ENABLE_GENERIC_BLAS_BACKEND #cmakedefine ONEMATH_ENABLE_GENERIC_BLAS_BACKEND_AMD_GPU #cmakedefine ONEMATH_ENABLE_GENERIC_BLAS_BACKEND_INTEL_CPU diff --git a/src/lapack/backends/mklcpu/CMakeLists.txt b/src/lapack/backends/mklcpu/CMakeLists.txt index 9c6e08aa6..b518fd8c1 100644 --- a/src/lapack/backends/mklcpu/CMakeLists.txt +++ b/src/lapack/backends/mklcpu/CMakeLists.txt @@ -39,7 +39,7 @@ 
target_include_directories(${LIB_OBJ} target_compile_options(${LIB_OBJ} PRIVATE ${ONEMATH_BUILD_COPT}) if(TARGET MKL::MKL_SYCL::LAPACK) - target_link_libraries(${LIB_OBJ} PUBLIC ONEMATH::SYCL::SYCL MKL::MKL_SYCL::LAPACK MKL::MKL_SYCL::BLAS) + target_link_libraries(${LIB_OBJ} PUBLIC ONEMATH::SYCL::SYCL MKL::MKL_SYCL::LAPACK) else() target_link_libraries(${LIB_OBJ} PUBLIC ONEMATH::SYCL::SYCL MKL::MKL_DPCPP) endif() diff --git a/src/lapack/backends/mklgpu/CMakeLists.txt b/src/lapack/backends/mklgpu/CMakeLists.txt index ce7dea0ab..087e59886 100644 --- a/src/lapack/backends/mklgpu/CMakeLists.txt +++ b/src/lapack/backends/mklgpu/CMakeLists.txt @@ -39,7 +39,7 @@ target_include_directories(${LIB_OBJ} target_compile_options(${LIB_OBJ} PRIVATE ${ONEMATH_BUILD_COPT}) if(TARGET MKL::MKL_SYCL::LAPACK) - target_link_libraries(${LIB_OBJ} PUBLIC ONEMATH::SYCL::SYCL MKL::MKL_SYCL::LAPACK MKL::MKL_SYCL::BLAS) + target_link_libraries(${LIB_OBJ} PUBLIC ONEMATH::SYCL::SYCL MKL::MKL_SYCL::LAPACK) else() target_link_libraries(${LIB_OBJ} PUBLIC ONEMATH::SYCL::SYCL MKL::MKL_DPCPP) endif() diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt index d774d5616..d4341b41d 100644 --- a/tests/unit_tests/CMakeLists.txt +++ b/tests/unit_tests/CMakeLists.txt @@ -20,20 +20,28 @@ set(TEST_TARGET_DOMAINS ${TARGET_DOMAINS}) if(("blas" IN_LIST TEST_TARGET_DOMAINS) OR ("lapack" IN_LIST TEST_TARGET_DOMAINS)) - find_package(CBLAS) - if(NOT CBLAS_FOUND) - # TODO: add list of tests without Netlib dependency - message(WARNING "Netlib CBLAS headers or libraries are not found, BLAS/LAPACK unit tests will be skipped") - list(REMOVE_ITEM TEST_TARGET_DOMAINS "blas" "lapack") + if(ENABLE_OPENBLAS_BACKEND) + message(STATUS "Using OpenBLAS for BLAS/LAPACK unit tests") + set(CBLAS_FOUND TRUE) + set(LAPACKE_FOUND TRUE) + else() + find_package(CBLAS) + if(NOT CBLAS_FOUND) + # TODO: add list of tests without Netlib dependency + message(WARNING "Netlib CBLAS headers or libraries are not found, 
BLAS/LAPACK unit tests will be skipped") + list(REMOVE_ITEM TEST_TARGET_DOMAINS "blas" "lapack") + endif() endif() endif() if("lapack" IN_LIST TEST_TARGET_DOMAINS) - find_package(LAPACKE) - if(NOT LAPACKE_FOUND) - # TODO: add list of tests without Netlib dependency - message(WARNING "Netlib LAPACKE headers or libraries are not found, LAPACK unit tests will be skipped") - list(REMOVE_ITEM TEST_TARGET_DOMAINS "lapack") + if(NOT ENABLE_OPENBLAS_BACKEND) + find_package(LAPACKE) + if(NOT LAPACKE_FOUND) + # TODO: add list of tests without Netlib dependency + message(WARNING "Netlib LAPACKE headers or libraries are not found, LAPACK unit tests will be skipped") + list(REMOVE_ITEM TEST_TARGET_DOMAINS "lapack") + endif() endif() endif() @@ -103,17 +111,25 @@ foreach(domain ${TEST_TARGET_DOMAINS}) list(TRANSFORM ${domain}_DEVICE_TEST_LIST_CT APPEND _ct) add_executable(test_main_${domain}_ct main_test.cpp) - target_include_directories(test_main_${domain}_ct PUBLIC ${GTEST_INCLUDE_DIR}) - + target_include_directories(test_main_${domain}_ct PUBLIC ${GTEST_INCLUDE_DIR} ${ONEMATH_INCLUDE_DIRS}) + if(ENABLE_OPENBLAS_BACKEND) + target_include_directories(test_main_${domain}_ct PUBLIC ${OPENBLAS_INCLUDE}) + endif() + if (USE_ADD_SYCL_TO_TARGET_INTEGRATION) add_sycl_to_target(TARGET test_main_${domain}_ct SOURCES main_test.cpp) else() - target_compile_options(test_main_${domain}_ct PRIVATE -fsycl) + if(NOT ${ONEMATH_SYCL_IMPLEMENTATION} STREQUAL "adaptivecpp") + target_compile_options(test_main_${domain}_ct PRIVATE -fsycl) + endif() endif() if(BUILD_SHARED_LIBS) add_executable(test_main_${domain}_rt main_test.cpp) - target_include_directories(test_main_${domain}_rt PUBLIC ${GTEST_INCLUDE_DIR}) + target_include_directories(test_main_${domain}_rt PUBLIC ${GTEST_INCLUDE_DIR} ${ONEMATH_INCLUDE_DIRS}) + if(ENABLE_OPENBLAS_BACKEND) + target_include_directories(test_main_${domain}_rt PUBLIC ${OPENBLAS_INCLUDE}) + endif() if(NOT ${ONEMATH_SYCL_IMPLEMENTATION} STREQUAL "adaptivecpp") 
target_compile_options(test_main_${domain}_rt PRIVATE -fsycl) endif() @@ -156,6 +172,11 @@ foreach(domain ${TEST_TARGET_DOMAINS}) list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_netlib) endif() + if(domain STREQUAL "blas" AND ENABLE_OPENBLAS_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_openblas) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_openblas) + endif() + if(domain STREQUAL "blas" AND ENABLE_ARMPL_BACKEND) add_dependencies(test_main_${domain}_ct onemath_${domain}_armpl) list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_armpl) @@ -244,9 +265,17 @@ foreach(domain ${TEST_TARGET_DOMAINS}) string(TOUPPER ${domain} DOMAIN_PREFIX) if(domain STREQUAL "blas" AND CBLAS_FOUND) - set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + if(ENABLE_OPENBLAS_BACKEND) + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${OPENBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + else() + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + endif() elseif(domain STREQUAL "lapack" AND LAPACKE_FOUND) - set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${LAPACKE64_LIB_DIR}:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + if(ENABLE_OPENBLAS_BACKEND) + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${OPENBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + else() + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${LAPACKE64_LIB_DIR}:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + endif() else() set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}) endif() diff --git a/tests/unit_tests/backup_files/CMakeLists.txt b/tests/unit_tests/backup_files/CMakeLists.txt new file mode 100644 index 000000000..d774d5616 --- /dev/null +++ b/tests/unit_tests/backup_files/CMakeLists.txt @@ -0,0 +1,278 @@ +#=============================================================================== +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# 
you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 +#=============================================================================== + +set(TEST_TARGET_DOMAINS ${TARGET_DOMAINS}) + +if(("blas" IN_LIST TEST_TARGET_DOMAINS) OR ("lapack" IN_LIST TEST_TARGET_DOMAINS)) + find_package(CBLAS) + if(NOT CBLAS_FOUND) + # TODO: add list of tests without Netlib dependency + message(WARNING "Netlib CBLAS headers or libraries are not found, BLAS/LAPACK unit tests will be skipped") + list(REMOVE_ITEM TEST_TARGET_DOMAINS "blas" "lapack") + endif() +endif() + +if("lapack" IN_LIST TEST_TARGET_DOMAINS) + find_package(LAPACKE) + if(NOT LAPACKE_FOUND) + # TODO: add list of tests without Netlib dependency + message(WARNING "Netlib LAPACKE headers or libraries are not found, LAPACK unit tests will be skipped") + list(REMOVE_ITEM TEST_TARGET_DOMAINS "lapack") + endif() +endif() + +foreach(domain ${TEST_TARGET_DOMAINS}) + # Build tests first + add_subdirectory(${domain}) +endforeach() + +include(GoogleTest) + +get_target_property(GTEST_INCLUDE_DIR gtest INTERFACE_INCLUDE_DIRECTORIES) + +# Build final test binaries: test_main_rt is for testing RunTime API (RT), test_main_ct is for testing CompileTime API (CT) + +# BLAS config +set(blas_TEST_LIST + blas_level1 + blas_level2 + blas_level3 + blas_batch + blas_extensions) + +if(${ONEMATH_SYCL_IMPLEMENTATION} STREQUAL "dpc++") + list(APPEND blas_TEST_LIST blas_sycl_graph) +endif() + +set(blas_TEST_LINK "") + +# LAPACK config +set(lapack_TEST_LIST + lapack_source) + 
+if(LAPACKE_FOUND) + set(lapack_TEST_LINK "") +endif() + +# RNG config +set(rng_TEST_LIST + rng_statistics + rng_service) +set(rng_DEVICE_TEST_LIST + rng_device_moments + rng_device_service +) + +set(rng_TEST_LINK "") + +# DFT config +set(dft_TEST_LIST + dft_source) + +set(dft_TEST_LINK "") + +# Sparse BLAS config +set(sparse_blas_TEST_LIST + spblas_source) + +set(sparse_blas_TEST_LINK "") + +foreach(domain ${TEST_TARGET_DOMAINS}) + # Generate RT and CT test lists + set(${domain}_TEST_LIST_RT ${${domain}_TEST_LIST}) + set(${domain}_TEST_LIST_CT ${${domain}_TEST_LIST}) + set(${domain}_DEVICE_TEST_LIST_CT ${${domain}_DEVICE_TEST_LIST}) + list(TRANSFORM ${domain}_TEST_LIST_RT APPEND _rt) + list(TRANSFORM ${domain}_TEST_LIST_CT APPEND _ct) + list(TRANSFORM ${domain}_DEVICE_TEST_LIST_CT APPEND _ct) + + add_executable(test_main_${domain}_ct main_test.cpp) + target_include_directories(test_main_${domain}_ct PUBLIC ${GTEST_INCLUDE_DIR}) + + if (USE_ADD_SYCL_TO_TARGET_INTEGRATION) + add_sycl_to_target(TARGET test_main_${domain}_ct SOURCES main_test.cpp) + else() + target_compile_options(test_main_${domain}_ct PRIVATE -fsycl) + endif() + + if(BUILD_SHARED_LIBS) + add_executable(test_main_${domain}_rt main_test.cpp) + target_include_directories(test_main_${domain}_rt PUBLIC ${GTEST_INCLUDE_DIR}) + if(NOT ${ONEMATH_SYCL_IMPLEMENTATION} STREQUAL "adaptivecpp") + target_compile_options(test_main_${domain}_rt PRIVATE -fsycl) + endif() + target_link_libraries(test_main_${domain}_rt PUBLIC + gtest + gtest_main + ${CMAKE_DL_LIBS} + ${${domain}_TEST_LINK} + ONEMATH::SYCL::SYCL + onemath + ${${domain}_TEST_LIST_RT} + ) + if (USE_ADD_SYCL_TO_TARGET_INTEGRATION) + add_sycl_to_target(TARGET test_main_${domain}_rt SOURCES main_test.cpp) + endif() + endif() + + if(ENABLE_MKLCPU_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_mklcpu) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_mklcpu) + endif() + + if(ENABLE_MKLGPU_BACKEND) + 
add_dependencies(test_main_${domain}_ct onemath_${domain}_mklgpu) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_mklgpu) + endif() + + if(domain STREQUAL "blas" AND ENABLE_CUBLAS_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_cublas) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_cublas) + endif() + + if(domain STREQUAL "blas" AND ENABLE_ROCBLAS_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_rocblas) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_rocblas) + endif() + + if(domain STREQUAL "blas" AND ENABLE_NETLIB_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_netlib) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_netlib) + endif() + + if(domain STREQUAL "blas" AND ENABLE_ARMPL_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_armpl) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_armpl) + endif() + + if(domain STREQUAL "blas" AND ENABLE_GENERIC_BLAS_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_generic) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_generic) + endif() + + if(domain STREQUAL "lapack" AND ENABLE_CUSOLVER_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_cusolver) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_cusolver) + endif() + + if(domain STREQUAL "lapack" AND ENABLE_ROCSOLVER_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_rocsolver) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_rocsolver) + endif() + + if(domain STREQUAL "lapack" AND ENABLE_ARMPL_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_armpl) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_armpl) + endif() + + if(domain STREQUAL "rng" AND ENABLE_CURAND_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_curand) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_curand) + endif() + + 
if(domain STREQUAL "rng" AND ENABLE_ROCRAND_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_rocrand) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_rocrand) + endif() + + if(domain STREQUAL "rng" AND ENABLE_ARMPL_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_armpl) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_armpl) + endif() + + if(domain STREQUAL "dft" AND ENABLE_CUFFT_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_cufft) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_cufft) + endif() + + if(domain STREQUAL "dft" AND ENABLE_ROCFFT_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_dft_rocfft) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_dft_rocfft) + endif() + + if(domain STREQUAL "dft" AND ENABLE_PORTFFT_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_dft_portfft) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_dft_portfft) + endif() + + if(domain STREQUAL "dft" AND ENABLE_ARMPL_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_armpl) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_armpl) + endif() + + if(domain STREQUAL "sparse_blas" AND ENABLE_CUSPARSE_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_cusparse) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_cusparse) + endif() + + if(domain STREQUAL "sparse_blas" AND ENABLE_ROCSPARSE_BACKEND) + add_dependencies(test_main_${domain}_ct onemath_${domain}_rocsparse) + list(APPEND ONEMATH_LIBRARIES_${domain} onemath_${domain}_rocsparse) + endif() + + target_link_libraries(test_main_${domain}_ct PUBLIC + gtest + gtest_main + ${CMAKE_DL_LIBS} + ${${domain}_TEST_LINK} + ${ONEMATH_LIBRARIES_${domain}} + ONEMATH::SYCL::SYCL + ${${domain}_TEST_LIST_CT} + ${${domain}_DEVICE_TEST_LIST_CT} + ) + + if(NOT ${ONEMATH_SYCL_IMPLEMENTATION} STREQUAL "adaptivecpp") + target_link_options(test_main_${domain}_ct PUBLIC 
-fsycl-device-code-split=per_kernel) + endif() + + string(TOUPPER ${domain} DOMAIN_PREFIX) + + if(domain STREQUAL "blas" AND CBLAS_FOUND) + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + elseif(domain STREQUAL "lapack" AND LAPACKE_FOUND) + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${LAPACKE64_LIB_DIR}:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH}) + else() + set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}) + endif() + + if(BUILD_SHARED_LIBS) + set_target_properties(test_main_${domain}_rt + PROPERTIES BUILD_RPATH $) + # Find individual tests within executable + gtest_discover_tests(test_main_${domain}_rt + PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib + PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${TEST_LD_LIBRARY_PATH} + PROPERTIES TEST_PREFIX ${DOMAIN_PREFIX}/RT/ + DISCOVERY_TIMEOUT 30 + ) + endif() + + gtest_discover_tests(test_main_${domain}_ct + PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib + PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${TEST_LD_LIBRARY_PATH} + PROPERTIES TEST_PREFIX ${DOMAIN_PREFIX}/CT/ + DISCOVERY_TIMEOUT 30 + ) + + if(BUILD_SHARED_LIBS) + if (USE_ADD_SYCL_TO_TARGET_INTEGRATION) + add_sycl_to_target(TARGET test_main_${domain}_rt) + endif() + endif() +endforeach() diff --git a/tests/unit_tests/blas/CMakeLists.txt b/tests/unit_tests/blas/CMakeLists.txt index a2553fda1..a43e3b1b0 100644 --- a/tests/unit_tests/blas/CMakeLists.txt +++ b/tests/unit_tests/blas/CMakeLists.txt @@ -22,6 +22,11 @@ add_compile_definitions( ONEMATH_REF_BLAS_LIBNAME="${ONEMATH_REF_BLAS_LIBNAME}" ONEMATH_REF_CBLAS_LIBNAME="${ONEMATH_REF_CBLAS_LIBNAME}") +if(ENABLE_OPENBLAS_BACKEND) + message(STATUS "Adding OpenBLAS include directory to BLAS unit tests") + include_directories(${OPENBLAS_INCLUDE}) +endif() + add_subdirectory(level1) add_subdirectory(level2) add_subdirectory(level3) diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp index c029e3907..315077601 100644 --- 
a/tests/unit_tests/include/test_helper.hpp +++ b/tests/unit_tests/include/test_helper.hpp @@ -82,7 +82,7 @@ #endif #if defined(ONEMATH_ENABLE_MKLCPU_BACKEND) || defined(ONEMATH_ENABLE_NETLIB_BACKEND) || \ - defined(ONEMATH_ENABLE_ARMPL_BACKEND) + defined(ONEMATH_ENABLE_OPENBLAS_BACKEND) || defined(ONEMATH_ENABLE_ARMPL_BACKEND) #ifdef ONEMATH_ENABLE_MKLCPU_BACKEND #define TEST_RUN_INTELCPU_SELECT_NO_ARGS(q, func) \ func(oneapi::math::backend_selector{ q }) @@ -90,7 +90,10 @@ func(oneapi::math::backend_selector{ q }, __VA_ARGS__) #elif defined(ONEMATH_ENABLE_NETLIB_BACKEND) #define TEST_RUN_INTELCPU_SELECT(q, func, ...) \ - func(oneapi::math::backend_selector{ q }, __VA_ARGS__) + func(oneapi::math::backend_selector{ q }, __VA_ARGS__) +#elif defined(ONEMATH_ENABLE_OPENBLAS_BACKEND) +#define TEST_RUN_INTELCPU_SELECT(q, func, ...) \ + func(oneapi::math::backend_selector{ q }, __VA_ARGS__) #elif defined(ONEMATH_ENABLE_ARMPL_BACKEND) #define TEST_RUN_INTELCPU_SELECT_NO_ARGS(q, func) \ func(oneapi::math::backend_selector{ q }) diff --git a/tests/unit_tests/lapack/CMakeLists.txt b/tests/unit_tests/lapack/CMakeLists.txt index cec4e89f8..304cc69db 100644 --- a/tests/unit_tests/lapack/CMakeLists.txt +++ b/tests/unit_tests/lapack/CMakeLists.txt @@ -17,8 +17,8 @@ # SPDX-License-Identifier: Apache-2.0 #=============================================================================== add_compile_definitions( - ONEMATH_REF_LAPACK_LIBNAME="${ONEMATH_REF_LAPACK_LIBNAME}" - ONEMATH_REF_LAPACKE_LIBNAME="${ONEMATH_REF_LAPACKE_LIBNAME}" + ONEMATH_REF_LAPACK_LIBNAME="${LAPACK64_file}" + ONEMATH_REF_LAPACKE_LIBNAME="${LAPACKE64_file}" ONEMATH_REF_CBLAS_LIBNAME="${ONEMATH_REF_CBLAS_LIBNAME}") add_subdirectory(source) add_subdirectory(common) diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp index 841fd60b2..759f359bd 100644 --- a/tests/unit_tests/main_test.cpp +++ b/tests/unit_tests/main_test.cpp @@ -112,7 +112,8 @@ int main(int argc, char** argv) { if 
(unique_devices.find(dev.get_info()) == unique_devices.end()) { unique_devices.insert(dev.get_info()); -#if !defined(ONEMATH_ENABLE_MKLCPU_BACKEND) && \ +#if !defined(ONEMATH_ENABLE_MKLCPU_BACKEND) && \ + !defined(ONEMATH_ENABLE_OPENBLAS_BACKEND) && \ !defined(ONEMATH_ENABLE_GENERIC_BLAS_BACKEND_INTEL_CPU) && \ !defined(ONEMATH_ENABLE_PORTFFT_BACKEND) && !defined(ONEMATH_ENABLE_NETLIB_BACKEND) && \ !defined(ONEMATH_ENABLE_ARMPL_BACKEND)