tesseract  4.0.0-1-g2a2b
dotproductavx.cpp
Go to the documentation of this file.
1 // File: dotproductavx.cpp
3 // Description: Architecture-specific dot-product function.
4 // Author: Ray Smith
5 // Created: Wed Jul 22 10:48:05 PDT 2015
6 //
7 // (C) Copyright 2015, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #if !defined(__AVX__)
20 // Implementation for non-avx archs.
21 
22 #include "dotproductavx.h"
23 #include <cstdio>
24 #include <cstdlib>
25 
26 namespace tesseract {
// Fallback compiled when this file is built without AVX support (__AVX__ is
// not defined). It must never be called: it reports the misuse on stderr and
// aborts the process, so it never returns a value to the caller.
double DotProductAVX(const double* u, const double* v, int n) {
  // The arguments are intentionally ignored; the casts only silence
  // unused-parameter warnings.
  (void)u;
  (void)v;
  (void)n;
  // The guard for this branch is the absence of __AVX__, not an Android
  // build, so say so in the diagnostic.
  fprintf(stderr, "DotProductAVX can't be used: built without AVX support\n");
  abort();
}
31 } // namespace tesseract
32 
33 #else // !defined(__AVX__)
34 // Implementation for avx capable archs.
35 #include <immintrin.h>
36 #include <cstdint>
37 #include "dotproductavx.h"
38 
39 namespace tesseract {
40 
41 // Computes and returns the dot product of the n-vectors u and v.
42 // Uses Intel AVX intrinsics to access the SIMD instruction set.
43 double DotProductAVX(const double* u, const double* v, int n) {
44  int max_offset = n - 4;
45  int offset = 0;
46  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
47  // v, and multiplying them together in parallel.
48  __m256d sum = _mm256_setzero_pd();
49  if (offset <= max_offset) {
50  offset = 4;
51  // Aligned load is reputedly faster but requires 32 byte aligned input.
52  if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
53  (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
54  // Use aligned load.
55  __m256d floats1 = _mm256_load_pd(u);
56  __m256d floats2 = _mm256_load_pd(v);
57  // Multiply.
58  sum = _mm256_mul_pd(floats1, floats2);
59  while (offset <= max_offset) {
60  floats1 = _mm256_load_pd(u + offset);
61  floats2 = _mm256_load_pd(v + offset);
62  offset += 4;
63  __m256d product = _mm256_mul_pd(floats1, floats2);
64  sum = _mm256_add_pd(sum, product);
65  }
66  } else {
67  // Use unaligned load.
68  __m256d floats1 = _mm256_loadu_pd(u);
69  __m256d floats2 = _mm256_loadu_pd(v);
70  // Multiply.
71  sum = _mm256_mul_pd(floats1, floats2);
72  while (offset <= max_offset) {
73  floats1 = _mm256_loadu_pd(u + offset);
74  floats2 = _mm256_loadu_pd(v + offset);
75  offset += 4;
76  __m256d product = _mm256_mul_pd(floats1, floats2);
77  sum = _mm256_add_pd(sum, product);
78  }
79  }
80  }
81  // Add the 4 product sums together horizontally. Not so easy as with sse, as
82  // there is no add across the upper/lower 128 bit boundary, so permute to
83  // move the upper 128 bits to lower in another register.
84  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
85  sum = _mm256_hadd_pd(sum, sum2);
86  sum = _mm256_hadd_pd(sum, sum);
87  double result;
88  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
89  // instruction, as that introduces a 70 cycle delay. All this casting is to
90  // fool the intrinsics into thinking we are extracting the bottom int64.
91  auto cast_sum = _mm256_castpd_si256(sum);
92 #pragma GCC diagnostic push
93 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
94  *(reinterpret_cast<int64_t*>(&result)) =
95 #if defined(_WIN32) || defined(__i386__)
96  // This is a very simple workaround that is activated
97  // for all platforms that do not have _mm256_extract_epi64.
98  // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
99  ((uint64_t*)&cast_sum)[0]
100 #else
101  _mm256_extract_epi64(cast_sum, 0)
102 #endif
103  ;
104 #pragma GCC diagnostic pop
105  while (offset < n) {
106  result += u[offset] * v[offset];
107  ++offset;
108  }
109  return result;
110 }
111 
112 } // namespace tesseract.
113 
114 #endif // !defined(__AVX__)
double DotProductAVX(const double *u, const double *v, int n)