tesseract  4.0.0-1-g2a2b
intsimdmatrix.cpp
Go to the documentation of this file.
1 // File: intsimdmatrix.cpp
3 // Description: Base class for 8-bit int SIMD matrix multipliers.
4 // Author: Ray Smith
5 // Created: Tue Aug 15 08:01:32 PST 2017
6 //
7 // (C) Copyright 2017, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
#include "intsimdmatrix.h"

#include <cstdint>             // for int8_t, INT8_MAX

#include "genericvector.h"     // for GenericVector
#include "intsimdmatrixavx2.h" // for IntSimdMatrixAVX2
#include "intsimdmatrixsse.h"  // for IntSimdMatrixSSE
#include "matrix.h"            // for GENERIC_2D_ARRAY
#include "simddetect.h"        // for SIMDDetect
25 
26 namespace tesseract {
27 
28 // Factory makes and returns an IntSimdMatrix (sub)class of the best
29 // available type for the current architecture.
30 /* static */
32  IntSimdMatrix* multiplier = nullptr;
34  multiplier = new IntSimdMatrixAVX2();
35  } else if (SIMDDetect::IsSSEAvailable()) {
36  multiplier = new IntSimdMatrixSSE();
37  } else {
38  // Default c++ implementation.
39  multiplier = new IntSimdMatrix();
40  }
41  return multiplier;
42 }
43 
44 // Computes a reshaped copy of the weight matrix w. If there are no
45 // partial_funcs_, it does nothing.
47  if (partial_funcs_.empty()) return;
48  int num_out = w.dim1();
49  int num_in = w.dim2() - 1;
50  // The rounded-up sizes of the reshaped weight matrix, excluding biases.
51  int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
52  int rounded_num_out = RoundOutputs(num_out);
53  // Add the bias and compute the required size.
54  shaped_w_.resize((rounded_num_in + 1) * rounded_num_out, 0);
55  int shaped_index = 0;
56  int output = 0;
57  // Each number of registers needs a different format! Iterates over the
58  // different numbers of registers (each a power of 2).
59  for (int num_registers = max_output_registers_; num_registers >= 1;
60  num_registers /= 2) {
61  // The number of outputs that we will generate with this many registers.
62  int num_outputs_per_register_set =
63  num_registers * num_outputs_per_register_;
64  // Use the max number of registers until we have to go fewer.
65  while (output + num_outputs_per_register_set <= rounded_num_out) {
66  // Accumulating outputs in registers saves iterating over the inputs, so
67  // we only have to do it once per output register set.
68  for (int input = 0; input < num_in; input += num_inputs_per_group_) {
69  // Iterate over the number of outputs in a register set.
70  for (int j = 0; j < num_outputs_per_register_set; ++j) {
71  // Inner-most loop corresponds to the number of inputs in an input
72  // group.
73  for (int i = 0; i < num_inputs_per_group_; ++i) {
74  int8_t weight = 0;
75  if (output + j < num_out && input + i < num_in)
76  weight = w(output + j, input + i);
77  shaped_w_[shaped_index++] = weight;
78  }
79  }
80  }
81  // Append the bias weights for the register set.
82  for (int j = 0; j < num_outputs_per_register_set; ++j) {
83  int8_t weight = 0;
84  if (output + j < num_out) weight = w(output + j, num_in);
85  shaped_w_[shaped_index++] = weight;
86  }
87  output += num_outputs_per_register_set;
88  }
89  }
90 }
91 
92 // Computes matrix.vector v = Wu.
93 // u is of size W.dim2() - 1 and the output v is of size W.dim1().
94 // u is imagined to have an extra element at the end with value 1, to
95 // implement the bias, but it doesn't actually have it.
97  const GenericVector<double>& scales,
98  const int8_t* u, double* v) const {
99  int num_out = w.dim1();
100  int num_in = w.dim2() - 1;
101  if (partial_funcs_.empty()) {
102  // Base implementation.
103  for (int i = 0; i < num_out; ++i) {
104  const int8_t* wi = w[i];
105  int total = 0;
106  for (int j = 0; j < num_in; ++j) total += wi[j] * u[j];
107  // Add in the bias and correct for integer values.
108  v[i] = (static_cast<double>(total) / INT8_MAX + wi[num_in]) * scales[i];
109  }
110  } else {
111  const int8_t* w_data = shaped_w_.data();
112  const double* scales_data = &scales[0];
113  // Each call to a partial_func_ produces group_size outputs, except the
114  // last one, which can produce less.
116  int rounded_num_in = Roundup(num_in, num_inputs_per_group_);
117  int rounded_num_out = RoundOutputs(num_out);
118  int output = 0;
119  for (auto fn : partial_funcs_) {
120  // The amount of w_data consumed by each call to fn.
121  int w_step = (rounded_num_in + 1) * group_size;
122  // Run with this group size, until it would produce too much output, then
123  // switch to a smaller size.
124  for (; output + group_size <= rounded_num_out; output += group_size) {
125  (*fn)(w_data, scales_data, u, rounded_num_in, num_out - output, v);
126  w_data += w_step;
127  scales_data += group_size;
128  v += group_size;
129  }
130  group_size /= 2;
131  }
132  }
133 }
134 
135 } // namespace tesseract
std::vector< PartialFunc > partial_funcs_
static IntSimdMatrix * GetFastestMultiplier()
void Init(const GENERIC_2D_ARRAY< int8_t > &w)
void MatrixDotVector(const GENERIC_2D_ARRAY< int8_t > &w, const GenericVector< double > &scales, const int8_t *u, double *v) const
int RoundOutputs(int size) const
Definition: intsimdmatrix.h:85
static int Roundup(int input, int factor)
std::vector< int8_t > shaped_w_
int dim1() const
Definition: matrix.h:206
static bool IsSSEAvailable()
Definition: simddetect.h:40
int dim2() const
Definition: matrix.h:207
static bool IsAVX2Available()
Definition: simddetect.h:30