diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index f8ef32c0b..fa79a55ac 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -141,6 +141,8 @@ void bli_cntx_init_zen( cntx_t* cntx ) // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_zen_int, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 53b83e1fa..827ffcad4 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -154,6 +154,8 @@ void bli_cntx_init_zen2( cntx_t* cntx ) //copy BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_zen_int, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, //set BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 331054dff..ec2a5fe6a 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -130,6 +130,8 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_zen_int, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_zen_int, // setv BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c index 8e105e601..979d6897a 100644 --- a/kernels/zen/1/bli_copyv_zen_int.c +++ b/kernels/zen/1/bli_copyv_zen_int.c @@ -334,3 +334,533 @@ void bli_dcopyv_zen_int } } +void bli_ccopyv_zen_int +( + conj_t conjx, + dim_t n, + const void* x, inc_t incx, + void* y, inc_t incy, + const cntx_t* cntx +) +{ + const scomplex* x0 = x; + scomplex* y0 = y; + // If the vector dimension is zero return early. + if ( bli_zero_dim1( n ) ) + { + return; + } + + // Setting the local pointers and iterator + dim_t i = 0; + + // Handling conjugate separately + if ( bli_is_conj( conjx ) ) + { + if ( incx == 1 && incy == 1 ) + { + const dim_t n_elem_per_reg = 4; + __m256 x_vec[8]; + + __m256 conj_reg = _mm256_setr_ps(1, -1, 1, -1, 1, -1, 1, -1); + + for (; (i + 31) < n; i += 32) + { + /* 8 float values = 4 float complex values are loaded*/ + x_vec[0] = _mm256_loadu_ps((float *)x0); + x_vec[1] = _mm256_loadu_ps((float *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_ps((float *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_ps((float *)(x0 + 3 * n_elem_per_reg)); + x_vec[4] = _mm256_loadu_ps((float *)(x0 + 4 * n_elem_per_reg)); + x_vec[5] = _mm256_loadu_ps((float *)(x0 + 5 * n_elem_per_reg)); + x_vec[6] = _mm256_loadu_ps((float *)(x0 + 6 * n_elem_per_reg)); + x_vec[7] = _mm256_loadu_ps((float *)(x0 + 7 * n_elem_per_reg)); + + /* Perform conjugation by multiplying the imaginary + part with -1 and real part with 1*/ + x_vec[0] = _mm256_mul_ps(x_vec[0], conj_reg); + x_vec[1] = _mm256_mul_ps(x_vec[1], conj_reg); + x_vec[2] = _mm256_mul_ps(x_vec[2], conj_reg); + x_vec[3] = _mm256_mul_ps(x_vec[3], conj_reg); + x_vec[4] = _mm256_mul_ps(x_vec[4], conj_reg); + x_vec[5] = _mm256_mul_ps(x_vec[5], conj_reg); + x_vec[6] = _mm256_mul_ps(x_vec[6], conj_reg); + x_vec[7] = _mm256_mul_ps(x_vec[7], conj_reg); + + _mm256_storeu_ps((float *)y0, x_vec[0]); + _mm256_storeu_ps((float *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_ps((float *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_ps((float *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + _mm256_storeu_ps((float *)(y0 + 4 * n_elem_per_reg), x_vec[4]); + _mm256_storeu_ps((float *)(y0 + 5 * n_elem_per_reg), x_vec[5]); + _mm256_storeu_ps((float *)(y0 + 6 * n_elem_per_reg), x_vec[6]); + _mm256_storeu_ps((float *)(y0 + 7 * n_elem_per_reg), x_vec[7]); + + x0 += 8 * n_elem_per_reg; + y0 += 8 * n_elem_per_reg; + } + + for (; (i + 15) < n; i += 16) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + x_vec[1] = _mm256_loadu_ps((float *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_ps((float *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_ps((float *)(x0 + 3 * n_elem_per_reg)); + + x_vec[0] = _mm256_mul_ps(x_vec[0], conj_reg); + x_vec[1] = _mm256_mul_ps(x_vec[1], conj_reg); + x_vec[2] = _mm256_mul_ps(x_vec[2], conj_reg); + x_vec[3] = _mm256_mul_ps(x_vec[3], conj_reg); + + x0 += 4 * n_elem_per_reg; + + _mm256_storeu_ps((float *)y0, x_vec[0]); + _mm256_storeu_ps((float *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_ps((float *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_ps((float *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + + y0 += 4 * n_elem_per_reg; + } + + for (; (i + 7) < n; i += 8) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + x_vec[1] = _mm256_loadu_ps((float *)(x0 + n_elem_per_reg)); + + x0 += 2 * n_elem_per_reg; + + x_vec[0] = _mm256_mul_ps(x_vec[0], conj_reg); + x_vec[1] = _mm256_mul_ps(x_vec[1], conj_reg); + + _mm256_storeu_ps((float *)y0, x_vec[0]); + _mm256_storeu_ps((float *)(y0 + n_elem_per_reg), x_vec[1]); + + y0 += 2 * n_elem_per_reg; + } + + for (; (i + 3) < n; i += 4) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + + x_vec[0] = _mm256_mul_ps(x_vec[0], conj_reg); + + x0 += n_elem_per_reg; + + _mm256_storeu_ps((float *)y0, x_vec[0]); + + y0 += n_elem_per_reg; + } + + } + + // Handling fringe cases or non-unit strided inputs + for (; i < n; i += 1) + { + scomplex temp = *x0; + temp.imag = -temp.imag; + *y0 = temp; + + x0 += incx; + y0 += incy; + } + } + else + { + if (incx == 1 && incy == 1) + { + const dim_t n_elem_per_reg = 4; + __m256 x_vec[8]; + + for (; (i + 31) < n; i += 32) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + x_vec[1] = _mm256_loadu_ps((float *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_ps((float *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_ps((float *)(x0 + 3 * n_elem_per_reg)); + x_vec[4] = _mm256_loadu_ps((float *)(x0 + 4 * n_elem_per_reg)); + x_vec[5] = _mm256_loadu_ps((float *)(x0 + 5 * n_elem_per_reg)); + x_vec[6] = _mm256_loadu_ps((float *)(x0 + 6 * n_elem_per_reg)); + x_vec[7] = _mm256_loadu_ps((float *)(x0 + 7 * n_elem_per_reg)); + + x0 += 8 * n_elem_per_reg; + + _mm256_storeu_ps((float *)y0, x_vec[0]); + _mm256_storeu_ps((float *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_ps((float *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_ps((float *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + _mm256_storeu_ps((float *)(y0 + 4 * n_elem_per_reg), x_vec[4]); + _mm256_storeu_ps((float *)(y0 + 5 * n_elem_per_reg), x_vec[5]); + _mm256_storeu_ps((float *)(y0 + 6 * n_elem_per_reg), x_vec[6]); + _mm256_storeu_ps((float *)(y0 + 7 * n_elem_per_reg), x_vec[7]); + + y0 += 8 * n_elem_per_reg; + } + + for (; (i + 15) < n; i += 16) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + x_vec[1] = _mm256_loadu_ps((float *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_ps((float *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_ps((float *)(x0 + 3 * n_elem_per_reg)); + + x0 += 4 * n_elem_per_reg; + + _mm256_storeu_ps((float *)y0, x_vec[0]); + _mm256_storeu_ps((float *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_ps((float *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_ps((float *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + + y0 += 4 * n_elem_per_reg; + } + + for (; (i + 7) < n; i += 8) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + x_vec[1] = _mm256_loadu_ps((float *)(x0 + n_elem_per_reg)); + + x0 += 2 * n_elem_per_reg; + + _mm256_storeu_ps((float *)y0, x_vec[0]); + _mm256_storeu_ps((float *)(y0 + n_elem_per_reg), x_vec[1]); + + y0 += 2 * n_elem_per_reg; + } + + for (; (i + 3) < n; i += 4) + { + x_vec[0] = _mm256_loadu_ps((float *)x0); + + x0 += n_elem_per_reg; + + _mm256_storeu_ps((float *)y0, x_vec[0]); + + y0 += n_elem_per_reg; + } + + } + for (; i < n; i += 1) + { + *y0 = *x0; + + x0 += incx; + y0 += incy; + } + } +} + +void bli_zcopyv_zen_int +( + conj_t conjx, + dim_t n, + const void* x, inc_t incx, + void* y, inc_t incy, + const cntx_t* cntx +) +{ + const dcomplex* x0 = x; + dcomplex* y0 = y; + // If the vector dimension is zero return early. + if (bli_zero_dim1(n)) + { + return; + } + + dim_t i = 0; + + if (bli_is_conj(conjx)) + { + + if (incx == 1 && incy == 1) + { + const dim_t n_elem_per_reg = 2; + __m256d x_vec[8]; + + __m256d conj_reg = _mm256_setr_pd(1, -1, 1, -1); + + for (; (i + 15) < n; i += 16) + { + /* 4 double values = 2 double complex values are loaded*/ + x_vec[0] = _mm256_loadu_pd((double *)x0); + x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg)); + x_vec[4] = _mm256_loadu_pd((double *)(x0 + 4 * n_elem_per_reg)); + x_vec[5] = _mm256_loadu_pd((double *)(x0 + 5 * n_elem_per_reg)); + x_vec[6] = _mm256_loadu_pd((double *)(x0 + 6 * n_elem_per_reg)); + x_vec[7] = _mm256_loadu_pd((double *)(x0 + 7 * n_elem_per_reg)); + + /* Perform conjugation by multiplying the imaginary + part with -1 and real part with 1*/ + x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg); + x_vec[1] = _mm256_mul_pd(x_vec[1], conj_reg); + x_vec[2] = _mm256_mul_pd(x_vec[2], conj_reg); + x_vec[3] = _mm256_mul_pd(x_vec[3], conj_reg); + x_vec[4] = _mm256_mul_pd(x_vec[4], conj_reg); + x_vec[5] = _mm256_mul_pd(x_vec[5], conj_reg); + x_vec[6] = _mm256_mul_pd(x_vec[6], conj_reg); + x_vec[7] = _mm256_mul_pd(x_vec[7], conj_reg); + + _mm256_storeu_pd((double *)y0, x_vec[0]); + _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + _mm256_storeu_pd((double *)(y0 + 4 * n_elem_per_reg), x_vec[4]); + _mm256_storeu_pd((double *)(y0 + 5 * n_elem_per_reg), x_vec[5]); + _mm256_storeu_pd((double *)(y0 + 6 * n_elem_per_reg), x_vec[6]); + _mm256_storeu_pd((double *)(y0 + 7 * n_elem_per_reg), x_vec[7]); + + x0 += 8 * n_elem_per_reg; + y0 += 8 * n_elem_per_reg; + } + + for (; (i + 7) < n; i += 8) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg)); + + x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg); + x_vec[1] = _mm256_mul_pd(x_vec[1], conj_reg); + x_vec[2] = _mm256_mul_pd(x_vec[2], conj_reg); + x_vec[3] = _mm256_mul_pd(x_vec[3], conj_reg); + + x0 += 4 * n_elem_per_reg; + + _mm256_storeu_pd((double *)y0, x_vec[0]); + _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + + y0 += 4 * n_elem_per_reg; + } + + for (; (i + 3) < n; i += 4) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg)); + + x0 += 2 * n_elem_per_reg; + + x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg); + x_vec[1] = _mm256_mul_pd(x_vec[1], conj_reg); + + _mm256_storeu_pd((double *)y0, x_vec[0]); + _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]); + + y0 += 2 * n_elem_per_reg; + } + + for (; (i + 1) < n; i += 2) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + + x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg); + + x0 += n_elem_per_reg; + + _mm256_storeu_pd((double *)y0, x_vec[0]); + + y0 += n_elem_per_reg; + } + + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur as soon + // as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + } + else + { + /*Since double complex elements are of size 128 bits, vectorization + can be done using XMM registers when incx and incy are not 1. This is done + in the else condition.*/ + __m128d conj_reg = _mm_setr_pd(1, -1); + __m128d x_vec[4]; + + for (; (i + 3) < n; i += 4) + { + /* 2 double values = 1 double complex value(s) are(is) loaded*/ + x_vec[0] = _mm_loadu_pd((double *)x0); + x_vec[1] = _mm_loadu_pd((double *)(x0 + incx)); + x_vec[2] = _mm_loadu_pd((double *)(x0 + 2 * incx)); + x_vec[3] = _mm_loadu_pd((double *)(x0 + 3 * incx)); + + x_vec[0] = _mm_mul_pd(x_vec[0], conj_reg); + x_vec[1] = _mm_mul_pd(x_vec[1], conj_reg); + x_vec[2] = _mm_mul_pd(x_vec[2], conj_reg); + x_vec[3] = _mm_mul_pd(x_vec[3], conj_reg); + + _mm_storeu_pd((double *)y0, x_vec[0]); + _mm_storeu_pd((double *)(y0 + incy), x_vec[1]); + _mm_storeu_pd((double *)(y0 + 2 * incy), x_vec[2]); + _mm_storeu_pd((double *)(y0 + 3 * incy), x_vec[3]); + + x0 += 4 * incx; + y0 += 4 * incy; + } + + for (; (i + 1) < n; i += 2) + { + x_vec[0] = _mm_loadu_pd((double *)x0); + x_vec[1] = _mm_loadu_pd((double *)(x0 + incx)); + + x_vec[0] = _mm_mul_pd(x_vec[0], conj_reg); + x_vec[1] = _mm_mul_pd(x_vec[1], conj_reg); + + _mm_storeu_pd((double *)y0, x_vec[0]); + _mm_storeu_pd((double *)(y0 + incy), x_vec[1]); + + x0 += 2 * incx; + y0 += 2 * incy; + } + } + + __m128d conj_reg = _mm_setr_pd(1, -1); + __m128d x_vec[1]; + + for (; i < n; i += 1) + { + x_vec[0] = _mm_loadu_pd((double *)x0); + + x_vec[0] = _mm_mul_pd(x_vec[0], conj_reg); + + _mm_storeu_pd((double *)y0, x_vec[0]); + + x0 += incx; + y0 += incy; + } + } + else + { + + if (incx == 1 && incy == 1) + { + const dim_t n_elem_per_reg = 2; + __m256d x_vec[8]; + + for (; (i + 15) < n; i += 16) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg)); + x_vec[4] = _mm256_loadu_pd((double *)(x0 + 4 * n_elem_per_reg)); + x_vec[5] = _mm256_loadu_pd((double *)(x0 + 5 * n_elem_per_reg)); + x_vec[6] = _mm256_loadu_pd((double *)(x0 + 6 * n_elem_per_reg)); + x_vec[7] = _mm256_loadu_pd((double *)(x0 + 7 * n_elem_per_reg)); + + x0 += 8 * n_elem_per_reg; + + _mm256_storeu_pd((double *)y0, x_vec[0]); + _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + _mm256_storeu_pd((double *)(y0 + 4 * n_elem_per_reg), x_vec[4]); + _mm256_storeu_pd((double *)(y0 + 5 * n_elem_per_reg), x_vec[5]); + _mm256_storeu_pd((double *)(y0 + 6 * n_elem_per_reg), x_vec[6]); + _mm256_storeu_pd((double *)(y0 + 7 * n_elem_per_reg), x_vec[7]); + + y0 += 8 * n_elem_per_reg; + } + + for (; (i + 7) < n; i += 8) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg)); + x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg)); + x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg)); + + x0 += 4 * n_elem_per_reg; + + _mm256_storeu_pd((double *)y0, x_vec[0]); + _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]); + _mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]); + _mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]); + + y0 += 4 * n_elem_per_reg; + } + + for (; (i + 3) < n; i += 4) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg)); + + x0 += 2 * n_elem_per_reg; + + _mm256_storeu_pd((double *)y0, x_vec[0]); + _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]); + + y0 += 2 * n_elem_per_reg; + } + + for (; (i + 1) < n; i += 2) + { + x_vec[0] = _mm256_loadu_pd((double *)x0); + + x0 += n_elem_per_reg; + + _mm256_storeu_pd((double *)y0, x_vec[0]); + + y0 += n_elem_per_reg; + } + + // Issue vzeroupper instruction to clear upper lanes of ymm registers. + // This avoids a performance penalty caused by false dependencies when + // transitioning from AVX to SSE instructions (which may occur as soon + // as the n_left cleanup loop below if BLIS is compiled with + // -mfpmath=sse). + _mm256_zeroupper(); + } + else + { + /*Since double complex elements are of size 128 bits, vectorization + can be done using XMM registers when incx and incy are not 1. This is done + in the else condition.*/ + __m128d x_vec[4]; + + for (; (i + 3) < n; i += 4) + { + x_vec[0] = _mm_loadu_pd((double *)x0); + x_vec[1] = _mm_loadu_pd((double *)(x0 + incx)); + x_vec[2] = _mm_loadu_pd((double *)(x0 + 2 * incx)); + x_vec[3] = _mm_loadu_pd((double *)(x0 + 3 * incx)); + + x0 += 4 * incx; + + _mm_storeu_pd((double *)y0, x_vec[0]); + _mm_storeu_pd((double *)(y0 + incy), x_vec[1]); + _mm_storeu_pd((double *)(y0 + 2 * incy), x_vec[2]); + _mm_storeu_pd((double *)(y0 + 3 * incy), x_vec[3]); + + y0 += 4 * incy; + } + + for (; (i + 1) < n; i += 2) + { + x_vec[0] = _mm_loadu_pd((double *)x0); + x_vec[1] = _mm_loadu_pd((double *)(x0 + incx)); + + x0 += 2 * incx; + + _mm_storeu_pd((double *)y0, x_vec[0]); + _mm_storeu_pd((double *)(y0 + incy), x_vec[1]); + + y0 += 2 * incy; + } + } + __m128d x_vec[1]; + + for (; i < n; i += 1) + { + x_vec[0] = _mm_loadu_pd((double *)x0); + + x0 += incx; + + _mm_storeu_pd((double *)y0, x_vec[0]); + + y0 += incy; + } + } +} diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 97f243776..8c854006d 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -83,6 +83,8 @@ SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) +COPYV_KER_PROT( scomplex, c, copyv_zen_int ) +COPYV_KER_PROT( dcomplex, z, copyv_zen_int ) // SETV_KER_PROT(float, s, setv_zen_int)