18 #if !defined(__SSE4_1__)
19 #error Implementation only for SSE 4.1 capable architectures
22 #include <emmintrin.h>
23 #include <smmintrin.h>
// NOTE(review): this view looks like a numbered listing with lines dropped —
// the function signature, the closing braces, and every update of `offset`
// are not visible here. Confirm against the full file before relying on the
// notes below.
//
// Visible logic: elements of u and v are multiplied two doubles at a time in
// an SSE register and accumulated in `sum`; the two lanes are then added
// together and further single products are added to the scalar `result`.

// Largest start index at which a two-double load still stays below n
// (presumably n is the element count of u and v — verify against the caller).
32 int max_offset = n - 2;
// Two-lane accumulator, initialised to {0.0, 0.0}.
36 __m128d sum = _mm_setzero_pd();
// The vector path is entered only when at least one full pair is available.
37 if (offset <= max_offset) {
// _mm_load_pd requires 16-byte-aligned addresses, so the aligned loads are
// used only when the low four address bits of both pointers are clear.
40 if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
41 (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
// NOTE(review): unlike the unaligned branch below (`sum = _mm_loadu_pd(u)`),
// no load of u into `sum` is visible before this multiply; as shown, `sum`
// would still be zero here. Presumably that line is missing from this view —
// verify.
44 __m128d floats2 = _mm_load_pd(v);
// Lane-wise product for the first pair.
46 sum = _mm_mul_pd(sum, floats2);
// Remaining full pairs: multiply lane-wise and add into the accumulator.
// NOTE(review): the increment of `offset` is not visible in this view.
47 while (offset <= max_offset) {
48 __m128d floats1 = _mm_load_pd(u + offset);
49 floats2 = _mm_load_pd(v + offset);
51 floats1 = _mm_mul_pd(floats1, floats2);
52 sum = _mm_add_pd(sum, floats1);
// Same computation with _mm_loadu_pd, which has no alignment requirement.
// First pair: load from u, then multiply lane-wise by the pair from v.
56 sum = _mm_loadu_pd(u);
57 __m128d floats2 = _mm_loadu_pd(v);
59 sum = _mm_mul_pd(sum, floats2);
// Remaining full pairs, accumulated as in the aligned loop above.
60 while (offset <= max_offset) {
61 __m128d floats1 = _mm_loadu_pd(u + offset);
62 floats2 = _mm_loadu_pd(v + offset);
64 floats1 = _mm_mul_pd(floats1, floats2);
65 sum = _mm_add_pd(sum, floats1);
// Horizontal add of `sum` with itself: afterwards each lane holds
// lane0 + lane1 of the accumulator.
70 sum = _mm_hadd_pd(sum, sum);
// Extract the low lane as the scalar running total.
72 double result = _mm_cvtsd_f64(sum);
// Scalar product for the element at `offset`, added to the total
// (the enclosing loop header, if any, is not visible in this view).
75 result += u[offset] * v[offset];