28 fprintf(stderr,
"DotProductAVX can't be used on Android\n");
33 #else // !defined(__AVX__) 35 #include <immintrin.h> 43 double DotProductAVX(
const double* u,
const double* v,
int n) {
44 int max_offset = n - 4;
48 __m256d sum = _mm256_setzero_pd();
49 if (offset <= max_offset) {
52 if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
53 (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
55 __m256d floats1 = _mm256_load_pd(u);
56 __m256d floats2 = _mm256_load_pd(v);
58 sum = _mm256_mul_pd(floats1, floats2);
59 while (offset <= max_offset) {
60 floats1 = _mm256_load_pd(u + offset);
61 floats2 = _mm256_load_pd(v + offset);
63 __m256d product = _mm256_mul_pd(floats1, floats2);
64 sum = _mm256_add_pd(sum, product);
68 __m256d floats1 = _mm256_loadu_pd(u);
69 __m256d floats2 = _mm256_loadu_pd(v);
71 sum = _mm256_mul_pd(floats1, floats2);
72 while (offset <= max_offset) {
73 floats1 = _mm256_loadu_pd(u + offset);
74 floats2 = _mm256_loadu_pd(v + offset);
76 __m256d product = _mm256_mul_pd(floats1, floats2);
77 sum = _mm256_add_pd(sum, product);
84 __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
85 sum = _mm256_hadd_pd(sum, sum2);
86 sum = _mm256_hadd_pd(sum, sum);
91 auto cast_sum = _mm256_castpd_si256(sum);
92 #pragma GCC diagnostic push 93 #pragma GCC diagnostic ignored "-Wstrict-aliasing" 94 *(
reinterpret_cast<int64_t*
>(&result)) =
95 #if defined(_WIN32) || defined(__i386__) 99 ((uint64_t*)&cast_sum)[0]
101 _mm256_extract_epi64(cast_sum, 0)
104 #pragma GCC diagnostic pop 106 result += u[offset] * v[offset];
#endif  // !defined(__AVX__)