19 #error Implementation only for FMA capable architectures
22 #include <immintrin.h>
31 const unsigned quot = n / 8;
32 const unsigned rem = n % 8;
33 __m256d t0 = _mm256_setzero_pd();
34 __m256d t1 = _mm256_setzero_pd();
35 for (
unsigned k = 0; k < quot; k++) {
36 __m256d f0 = _mm256_loadu_pd(u);
37 __m256d f1 = _mm256_loadu_pd(v);
38 t0 = _mm256_fmadd_pd(f0, f1, t0);
41 __m256d f2 = _mm256_loadu_pd(u);
42 __m256d f3 = _mm256_loadu_pd(v);
43 t1 = _mm256_fmadd_pd(f2, f3, t1);
47 t0 = _mm256_hadd_pd(t0, t1);
48 alignas(32)
double tmp[4];
49 _mm256_store_pd(tmp, t0);
50 double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
51 for (
unsigned k = 0; k < rem; k++) {
52 result += *u++ * *v++;