// Dot-product kernels implemented with SSE4.1 intrinsics.
//
// When the translation unit is compiled without SSE4.1 code generation
// (__SSE4_1__ undefined) the intrinsics cannot be used, so both entry points
// are replaced by stubs that report the misuse on stderr and abort.

#if !defined(__SSE4_1__)

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Stub used when SSE4.1 is not enabled at compile time.
// Prints a diagnostic to stderr and terminates the process; never returns.
// (Message text is preserved as-is.)
double DotProductSSE(const double* u, const double* v, int n) {
  (void)u;
  (void)v;
  (void)n;
  fprintf(stderr, "DotProductSSE can't be used on Android\n");
  // Abort rather than fall off the end of a non-void function.
  abort();
}

// Stub used when SSE4.1 is not enabled at compile time.
// Prints a diagnostic to stderr and terminates the process; never returns.
// (Message text is preserved as-is.)
int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
  (void)u;
  (void)v;
  (void)n;
  fprintf(stderr, "IntDotProductSSE can't be used on Android\n");
  // Abort rather than fall off the end of a non-void function.
  abort();
}

#else  // !defined(__SSE4_1__)

#include <emmintrin.h>
#include <smmintrin.h>
#include <stdint.h>

// Computes and returns the dot product of the n-element vectors u and v.
//
// Elements are consumed two at a time into a pair of running sums held in one
// 128-bit register; any odd trailing element is handled by a scalar loop.
// Reads u[0..n-1] and v[0..n-1]. Returns 0.0 when n <= 0.
double DotProductSSE(const double* u, const double* v, int n) {
  // Largest offset at which a full pair of doubles can still be loaded.
  const int max_offset = n - 2;
  int offset = 0;
  __m128d sum = _mm_setzero_pd();
  if (offset <= max_offset) {
    offset = 2;
    // _mm_load_pd requires 16-byte aligned addresses. Each step advances by
    // 16 bytes, so checking the base pointers covers every later load.
    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
      // Seed the accumulator with the product of the first pair.
      sum = _mm_load_pd(u);
      __m128d floats2 = _mm_load_pd(v);
      sum = _mm_mul_pd(sum, floats2);
      while (offset <= max_offset) {
        __m128d floats1 = _mm_load_pd(u + offset);
        floats2 = _mm_load_pd(v + offset);
        offset += 2;
        floats1 = _mm_mul_pd(floats1, floats2);
        sum = _mm_add_pd(sum, floats1);
      }
    } else {
      // Same computation using loads that tolerate unaligned addresses.
      sum = _mm_loadu_pd(u);
      __m128d floats2 = _mm_loadu_pd(v);
      sum = _mm_mul_pd(sum, floats2);
      while (offset <= max_offset) {
        __m128d floats1 = _mm_loadu_pd(u + offset);
        floats2 = _mm_loadu_pd(v + offset);
        offset += 2;
        floats1 = _mm_mul_pd(floats1, floats2);
        sum = _mm_add_pd(sum, floats1);
      }
    }
  }
  // Add the two lanes together; the total ends up in the low lane.
  sum = _mm_hadd_pd(sum, sum);
  double result = _mm_cvtsd_f64(sum);
  // Scalar tail for the element (if any) that did not fill a pair.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}

// Computes and returns the dot product of the n-element int8_t vectors u and
// v, accumulated in 32-bit integers.
//
// Elements are consumed eight at a time: each group is sign-extended to
// 16 bits, multiplied, and pairwise-added into four 32-bit running sums.
// Leftover elements are handled by a scalar loop.
// Reads u[0..n-1] and v[0..n-1]. Returns 0 when n <= 0.
int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
  // Largest offset at which a full group of 8 bytes can still be loaded.
  const int max_offset = n - 8;
  int offset = 0;
  __m128i sum = _mm_setzero_si128();
  if (offset <= max_offset) {
    offset = 8;
    // _mm_loadl_epi64 reads only the low 8 bytes and has no alignment
    // requirement, so no aligned/unaligned split is needed here.
    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
    // Sign-extend 8 x int8 to 8 x int16.
    sum = _mm_cvtepi8_epi16(packed1);
    packed2 = _mm_cvtepi8_epi16(packed2);
    // Multiply 16-bit lanes and add adjacent products into 4 x int32.
    sum = _mm_madd_epi16(sum, packed2);
    while (offset <= max_offset) {
      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
      offset += 8;
      packed1 = _mm_cvtepi8_epi16(packed1);
      packed2 = _mm_cvtepi8_epi16(packed2);
      packed1 = _mm_madd_epi16(packed1, packed2);
      sum = _mm_add_epi32(sum, packed1);
    }
  }
  // Two horizontal adds fold the four 32-bit lanes into lane 0.
  sum = _mm_hadd_epi32(sum, sum);
  sum = _mm_hadd_epi32(sum, sum);
  int32_t result = _mm_cvtsi128_si32(sum);
  // Scalar tail for the elements that did not fill a group of 8.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}

#endif  // !defined(__SSE4_1__)