19 #error Implementation only for AVX capable architectures
22 #include <immintrin.h>
31 const unsigned quot = n / 8;
32 const unsigned rem = n % 8;
33 __m256d t0 = _mm256_setzero_pd();
34 __m256d t1 = _mm256_setzero_pd();
35 for (
unsigned k = 0; k < quot; k++) {
36 __m256d f0 = _mm256_loadu_pd(u);
37 __m256d f1 = _mm256_loadu_pd(v);
38 f0 = _mm256_mul_pd(f0, f1);
39 t0 = _mm256_add_pd(t0, f0);
42 __m256d f2 = _mm256_loadu_pd(u);
43 __m256d f3 = _mm256_loadu_pd(v);
44 f2 = _mm256_mul_pd(f2, f3);
45 t1 = _mm256_add_pd(t1, f2);
49 t0 = _mm256_hadd_pd(t0, t1);
50 alignas(32)
double tmp[4];
51 _mm256_store_pd(tmp, t0);
52 double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
53 for (
unsigned k = 0; k < rem; k++) {
54 result += *u++ * *v++;