tesseract  4.0.0-1-g2a2b
weightmatrix.cpp
Go to the documentation of this file.
1 // File: weightmatrix.cpp
3 // Description: Hides distinction between float/int implementations.
4 // Author: Ray Smith
5 // Created: Tue Jun 17 11:46:20 PST 2014
6 //
7 // (C) Copyright 2014, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #include "weightmatrix.h"
20 
21 #include "dotproductavx.h"
22 #include "dotproductsse.h"
23 #include "intsimdmatrix.h"
24 #include "simddetect.h"
25 #include "statistc.h"
26 #include "tprintf.h"
27 
28 namespace tesseract {
29 
#if defined(ANDROID)
// Fallback for Android toolchains whose libm lacks log2: computes the
// base-2 logarithm via the natural log identity log2(n) = ln(n)/ln(2).
static inline double log2(double n) {
  return log(n) / log(2.0);
}
#endif // ANDROID
35 
// Number of iterations after which the correction effectively becomes unity,
// i.e. the Adam bias-correction factor applied in Update() is ~1 and is
// skipped from then on.
const int kAdamCorrectionIterations = 200000;
// Epsilon in Adam to prevent division by zero when the squared-gradient
// accumulator is (near) zero.
const double kAdamEpsilon = 1e-8;
40 
41 // Copies the whole input transposed, converted to double, into *this.
43  int width = input.dim1();
44  int num_features = input.dim2();
45  ResizeNoInit(num_features, width);
46  for (int t = 0; t < width; ++t) WriteStrided(t, input[t]);
47 }
48 
49 // Destructor.
50 // It is defined here, so the compiler can create a single vtable
51 // instead of weak vtables in every compilation unit.
53 
54 // Sets up the network for training. Initializes weights using weights of
55 // scale `range` picked according to the random number generator `randomizer`.
56 int WeightMatrix::InitWeightsFloat(int no, int ni, bool use_adam,
57  float weight_range, TRand* randomizer) {
58  int_mode_ = false;
59  wf_.Resize(no, ni, 0.0);
60  if (randomizer != nullptr) {
61  for (int i = 0; i < no; ++i) {
62  for (int j = 0; j < ni; ++j) {
63  wf_[i][j] = randomizer->SignedRand(weight_range);
64  }
65  }
66  }
67  use_adam_ = use_adam;
68  InitBackward();
69  return ni * no;
70 }
71 
72 // Changes the number of outputs to the size of the given code_map, copying
73 // the old weight matrix entries for each output from code_map[output] where
74 // non-negative, and uses the mean (over all outputs) of the existing weights
75 // for all outputs with negative code_map entries. Returns the new number of
76 // weights.
77 int WeightMatrix::RemapOutputs(const std::vector<int>& code_map) {
78  GENERIC_2D_ARRAY<double> old_wf(wf_);
79  int old_no = wf_.dim1();
80  int new_no = code_map.size();
81  int ni = wf_.dim2();
82  std::vector<double> means(ni, 0.0);
83  for (int c = 0; c < old_no; ++c) {
84  const double* weights = wf_[c];
85  for (int i = 0; i < ni; ++i) means[i] += weights[i];
86  }
87  for (double& mean : means) mean /= old_no;
88  wf_.ResizeNoInit(new_no, ni);
89  InitBackward();
90  for (int dest = 0; dest < new_no; ++dest) {
91  int src = code_map[dest];
92  const double* src_data = src >= 0 ? old_wf[src] : means.data();
93  memcpy(wf_[dest], src_data, ni * sizeof(*src_data));
94  }
95  return ni * new_no;
96 }
97 
98 // Converts a float network to an int network. Each set of input weights that
99 // corresponds to a single output weight is converted independently:
100 // Compute the max absolute value of the weight set.
101 // Scale so the max absolute value becomes INT8_MAX.
102 // Round to integer.
103 // Store a multiplicative scale factor (as a double) that will reproduce
104 // the original value, subject to rounding errors.
106  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
107  scales_.init_to_size(wi_.dim1(), 0.0);
108  int dim2 = wi_.dim2();
109  for (int t = 0; t < wi_.dim1(); ++t) {
110  double* f_line = wf_[t];
111  int8_t* i_line = wi_[t];
112  double max_abs = 0.0;
113  for (int f = 0; f < dim2; ++f) {
114  double abs_val = fabs(f_line[f]);
115  if (abs_val > max_abs) max_abs = abs_val;
116  }
117  double scale = max_abs / INT8_MAX;
118  scales_[t] = scale;
119  if (scale == 0.0) scale = 1.0;
120  for (int f = 0; f < dim2; ++f) {
121  i_line[f] = IntCastRounded(f_line[f] / scale);
122  }
123  }
124  wf_.Resize(1, 1, 0.0);
125  int_mode_ = true;
126  multiplier_.reset(IntSimdMatrix::GetFastestMultiplier());
127  if (multiplier_ != nullptr) multiplier_->Init(wi_);
128 }
129 
130 // Allocates any needed memory for running Backward, and zeroes the deltas,
131 // thus eliminating any existing momentum.
133  int no = int_mode_ ? wi_.dim1() : wf_.dim1();
134  int ni = int_mode_ ? wi_.dim2() : wf_.dim2();
135  dw_.Resize(no, ni, 0.0);
136  updates_.Resize(no, ni, 0.0);
137  wf_t_.Transpose(wf_);
138  if (use_adam_) dw_sq_sum_.Resize(no, ni, 0.0);
139 }
140 
// Bit flags combined into the mode byte written by Serialize and decoded by
// DeSerialize.
// Flag on mode to indicate that this weightmatrix uses int8_t.
const int kInt8Flag = 1;
// Flag on mode to indicate that this weightmatrix uses adam.
// (Bit value 2 is unused here — presumably reserved/historic; confirm
// against other serializers before reusing it.)
const int kAdamFlag = 4;
// Flag on mode to indicate that this weightmatrix uses double. Set
// independently of kInt8Flag as even in int mode the scales can
// be float or double.
const int kDoubleFlag = 128;
149 
150 // Writes to the given file. Returns false in case of error.
151 bool WeightMatrix::Serialize(bool training, TFile* fp) const {
152  // For backward compatibility, add kDoubleFlag to mode to indicate the doubles
153  // format, without errs, so we can detect and read old format weight matrices.
154  uint8_t mode =
155  (int_mode_ ? kInt8Flag : 0) | (use_adam_ ? kAdamFlag : 0) | kDoubleFlag;
156  if (!fp->Serialize(&mode)) return false;
157  if (int_mode_) {
158  if (!wi_.Serialize(fp)) return false;
159  if (!scales_.Serialize(fp)) return false;
160  } else {
161  if (!wf_.Serialize(fp)) return false;
162  if (training && !updates_.Serialize(fp)) return false;
163  if (training && use_adam_ && !dw_sq_sum_.Serialize(fp)) return false;
164  }
165  return true;
166 }
167 
168 // Reads from the given file. Returns false in case of error.
169 
170 bool WeightMatrix::DeSerialize(bool training, TFile* fp) {
171  uint8_t mode;
172  if (!fp->DeSerialize(&mode)) return false;
173  int_mode_ = (mode & kInt8Flag) != 0;
174  use_adam_ = (mode & kAdamFlag) != 0;
175  if ((mode & kDoubleFlag) == 0) return DeSerializeOld(training, fp);
176  if (int_mode_) {
177  if (!wi_.DeSerialize(fp)) return false;
178  if (!scales_.DeSerialize(fp)) return false;
179  multiplier_.reset(IntSimdMatrix::GetFastestMultiplier());
180  if (multiplier_ != nullptr) multiplier_->Init(wi_);
181  } else {
182  if (!wf_.DeSerialize(fp)) return false;
183  if (training) {
184  InitBackward();
185  if (!updates_.DeSerialize(fp)) return false;
186  if (use_adam_ && !dw_sq_sum_.DeSerialize(fp)) return false;
187  }
188  }
189  return true;
190 }
191 
// As DeSerialize, but reads an old (float) format WeightMatrix for
// backward compatibility. The read order must exactly match the legacy
// writer, so the statement sequence here is intentionally untouched.
bool WeightMatrix::DeSerializeOld(bool training, TFile* fp) {
  GENERIC_2D_ARRAY<float> float_array;
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    GenericVector<float> old_scales;
    if (!old_scales.DeSerialize(fp)) return false;
    // Widen the legacy float scales into the double scales_ member.
    scales_.resize_no_init(old_scales.size());
    for (int i = 0; i < old_scales.size(); ++i) scales_[i] = old_scales[i];
  } else {
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &wf_);
  }
  if (training) {
    InitBackward();
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &updates_);
    // Errs was only used in int training, which is now dead.
    // The read still happens to keep the file position correct.
    if (!float_array.DeSerialize(fp)) return false;
  }
  return true;
}
215 
// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  // Forward product: add_bias_fwd=true, skip_bias_back=false.
  MatrixDotVectorInternal(wf_, true, false, u, v);
}
225 
// Int-mode version of MatrixDotVector: delegates to the SIMD multiplier
// configured by ConvertToInt/DeSerialize, which applies the per-row scales.
void WeightMatrix::MatrixDotVector(const int8_t* u, double* v) const {
  ASSERT_HOST(int_mode_);
  ASSERT_HOST(multiplier_ != nullptr);
  multiplier_->MatrixDotVector(wi_, scales_, u, v);
}
231 
232 // MatrixDotVector for peep weights, MultiplyAccumulate adds the
233 // component-wise products of *this[0] and v to inout.
234 void WeightMatrix::MultiplyAccumulate(const double* v, double* inout) {
235  ASSERT_HOST(!int_mode_);
236  ASSERT_HOST(wf_.dim1() == 1);
237  int n = wf_.dim2();
238  const double* u = wf_[0];
239  for (int i = 0; i < n; ++i) {
240  inout[i] += u[i] * v[i];
241  }
242 }
243 
// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  // Uses the transposed copy (kept in sync by InitBackward/Update) so the
  // backward product runs as contiguous row dot products.
  MatrixDotVectorInternal(wf_t_, false, true, u, v);
}
252 
253 // Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from
254 // u and v. In terms of the neural network, u is the gradients and v is the
255 // inputs.
256 // Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
257 // Runs parallel if requested. Note that u and v must be transposed.
259  const TransposedArray& v,
260  bool in_parallel) {
261  ASSERT_HOST(!int_mode_);
262  int num_outputs = dw_.dim1();
263  ASSERT_HOST(u.dim1() == num_outputs);
264  ASSERT_HOST(u.dim2() == v.dim2());
265  int num_inputs = dw_.dim2() - 1;
266  int num_samples = u.dim2();
267  // v is missing the last element in dim1.
268  ASSERT_HOST(v.dim1() == num_inputs);
269 #ifdef _OPENMP
270 #pragma omp parallel for num_threads(4) if (in_parallel)
271 #endif
272  for (int i = 0; i < num_outputs; ++i) {
273  double* dwi = dw_[i];
274  const double* ui = u[i];
275  for (int j = 0; j < num_inputs; ++j) {
276  dwi[j] = DotProduct(ui, v[j], num_samples);
277  }
278  // The last element of v is missing, presumed 1.0f.
279  double total = 0.0;
280  for (int k = 0; k < num_samples; ++k) total += ui[k];
281  dwi[num_inputs] = total;
282  }
283 }
284 
// Updates the weights using the given learning rate and momentum.
// num_samples is the quotient to be used in the adam computation iff
// use_adam_ is true.
void WeightMatrix::Update(double learning_rate, double momentum,
                          double adam_beta, int num_samples) {
  ASSERT_HOST(!int_mode_);
  if (use_adam_ && num_samples > 0 && num_samples < kAdamCorrectionIterations) {
    // Adam bias correction: boosts the learning rate in early iterations to
    // compensate for the zero-initialized moment estimates; beyond
    // kAdamCorrectionIterations the factor is effectively 1 and is skipped.
    learning_rate *= sqrt(1.0 - pow(adam_beta, num_samples));
    learning_rate /= 1.0 - pow(momentum, num_samples);
  }
  if (use_adam_ && num_samples > 0 && momentum > 0.0) {
    // Adam path: momentum acts as beta1, adam_beta as beta2.
    dw_sq_sum_.SumSquares(dw_, adam_beta);
    dw_ *= learning_rate * (1.0 - momentum);
    updates_ *= momentum;
    updates_ += dw_;
    // NOTE(review): epsilon is scaled by learning_rate here, which differs
    // from textbook Adam — presumably deliberate to keep it proportional to
    // the scaled update; confirm before changing.
    wf_.AdamUpdate(updates_, dw_sq_sum_, learning_rate * kAdamEpsilon);
  } else {
    // Plain momentum/SGD path.
    dw_ *= learning_rate;
    updates_ += dw_;
    // NOTE(review): with momentum == 0.0 the weights are never modified here
    // (updates_ accumulates and is then zeroed) — presumably callers always
    // pass momentum > 0, or a negative momentum for a different scheme;
    // verify against callers before relying on this branch.
    if (momentum > 0.0) wf_ += updates_;
    if (momentum >= 0.0) updates_ *= momentum;
  }
  // Keep the transposed copy in sync for VectorDotMatrix.
  wf_t_.Transpose(wf_);
}
309 
310 // Adds the dw_ in other to the dw_ is *this.
312  ASSERT_HOST(dw_.dim1() == other.dw_.dim1());
313  ASSERT_HOST(dw_.dim2() == other.dw_.dim2());
314  dw_ += other.dw_;
315 }
316 
317 // Sums the products of weight updates in *this and other, splitting into
318 // positive (same direction) in *same and negative (different direction) in
319 // *changed.
320 void WeightMatrix::CountAlternators(const WeightMatrix& other, double* same,
321  double* changed) const {
322  int num_outputs = updates_.dim1();
323  int num_inputs = updates_.dim2();
324  ASSERT_HOST(num_outputs == other.updates_.dim1());
325  ASSERT_HOST(num_inputs == other.updates_.dim2());
326  for (int i = 0; i < num_outputs; ++i) {
327  const double* this_i = updates_[i];
328  const double* other_i = other.updates_[i];
329  for (int j = 0; j < num_inputs; ++j) {
330  double product = this_i[j] * other_i[j];
331  if (product < 0.0)
332  *changed -= product;
333  else
334  *same += product;
335  }
336  }
337 }
338 
339 // Helper computes an integer histogram bucket for a weight and adds it
340 // to the histogram.
341 const int kHistogramBuckets = 16;
342 static void HistogramWeight(double weight, STATS* histogram) {
343  int bucket = kHistogramBuckets - 1;
344  if (weight != 0.0) {
345  double logval = -log2(fabs(weight));
346  bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
347  }
348  histogram->add(bucket, 1);
349 }
350 
351 void WeightMatrix::Debug2D(const char* msg) {
352  STATS histogram(0, kHistogramBuckets);
353  if (int_mode_) {
354  for (int i = 0; i < wi_.dim1(); ++i) {
355  for (int j = 0; j < wi_.dim2(); ++j) {
356  HistogramWeight(wi_[i][j] * scales_[i], &histogram);
357  }
358  }
359  } else {
360  for (int i = 0; i < wf_.dim1(); ++i) {
361  for (int j = 0; j < wf_.dim2(); ++j) {
362  HistogramWeight(wf_[i][j], &histogram);
363  }
364  }
365  }
366  tprintf("%s\n", msg);
367  histogram.print();
368 }
369 
370 // Computes and returns the dot product of the two n-vectors u and v.
371 /* static */
372 double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
373  // Note: because the order of addition is different among the 3 DotProduct
374  // functions, the results can (and do) vary slightly (although they agree
375  // to within about 4e-15). This produces different results when running
376  // training, despite all random inputs being precisely equal.
377  // To get consistent results, use just one of these DotProduct functions.
378  // On a test multi-layer network, serial is 57% slower than sse, and avx
379  // is about 8% faster than sse. This suggests that the time is memory
380  // bandwidth constrained and could benefit from holding the reused vector
381  // in AVX registers.
382  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
383  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
384  double total = 0.0;
385  for (int k = 0; k < n; ++k) total += u[k] * v[k];
386  return total;
387 }
388 
389 // Utility function converts an array of float to the corresponding array
390 // of double.
391 /* static */
394  int dim1 = wf.dim1();
395  int dim2 = wf.dim2();
396  wd->ResizeNoInit(dim1, dim2);
397  for (int i = 0; i < dim1; ++i) {
398  const float* wfi = wf[i];
399  double* wdi = (*wd)[i];
400  for (int j = 0; j < dim2; ++j) wdi[j] = static_cast<double>(wfi[j]);
401  }
402 }
403 
404 // Computes matrix.vector v = Wu.
405 // u is of size W.dim2() - add_bias_fwd and the output v is of size
406 // W.dim1() - skip_bias_back.
407 // If add_bias_fwd, u is imagined to have an extra element at the end with value
408 // 1, to implement the bias, weight.
409 // If skip_bias_back, we are actullay performing the backwards product on a
410 // transposed matrix, so we need to drop the v output corresponding to the last
411 // element in dim1.
412 void WeightMatrix::MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w,
413  bool add_bias_fwd,
414  bool skip_bias_back, const double* u,
415  double* v) {
416  int num_results = w.dim1() - skip_bias_back;
417  int extent = w.dim2() - add_bias_fwd;
418  for (int i = 0; i < num_results; ++i) {
419  const double* wi = w[i];
420  double total = DotProduct(wi, u, extent);
421  if (add_bias_fwd) total += wi[extent]; // The bias value.
422  v[i] = total;
423  }
424 }
425 
426 } // namespace tesseract.
const int kDoubleFlag
void resize_no_init(int size)
Definition: genericvector.h:65
int size() const
Definition: genericvector.h:71
void SumSquares(const GENERIC_2D_ARRAY< T > &src, const T &decay_factor)
Definition: matrix.h:368
double DotProductAVX(const double *u, const double *v, int n)
const int kAdamCorrectionIterations
void Transpose(const GENERIC_2D_ARRAY< double > &input)
static double DotProduct(const double *u, const double *v, int n)
const int kInt8Flag
void CountAlternators(const WeightMatrix &other, double *same, double *changed) const
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:103
static IntSimdMatrix * GetFastestMultiplier()
int RemapOutputs(const std::vector< int > &code_map)
bool DeSerialize(bool swap, FILE *fp)
const double kAdamEpsilon
void VectorDotMatrix(const double *u, double *v) const
Definition: statistc.h:33
void Update(double learning_rate, double momentum, double adam_beta, int num_samples)
bool Serialize(FILE *fp) const
bool Serialize(bool training, TFile *fp) const
double DotProductSSE(const double *u, const double *v, int n)
void ResizeNoInit(int size1, int size2, int pad=0)
Definition: matrix.h:91
int InitWeightsFloat(int no, int ni, bool use_adam, float weight_range, TRand *randomizer)
void Debug2D(const char *msg)
void AdamUpdate(const GENERIC_2D_ARRAY< T > &sum, const GENERIC_2D_ARRAY< T > &sqsum, const T &epsilon)
Definition: matrix.h:379
void MultiplyAccumulate(const double *v, double *inout)
void init_to_size(int size, const T &t)
int IntCastRounded(double x)
Definition: helpers.h:168
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:147
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool Serialize(FILE *fp) const
Definition: matrix.h:144
void MatrixDotVector(const double *u, double *v) const
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
int dim1() const
Definition: matrix.h:206
bool DeSerialize(bool swap, FILE *fp)
Definition: matrix.h:161
static bool IsSSEAvailable()
Definition: simddetect.h:40
static bool IsAVXAvailable()
Definition: simddetect.h:28
const int kAdamFlag
bool DeSerializeOld(bool training, TFile *fp)
double SignedRand(double range)
Definition: helpers.h:61
void print() const
Definition: statistc.cpp:533
bool DeSerialize(bool training, TFile *fp)
int dim2() const
Definition: matrix.h:207
void AddDeltas(const WeightMatrix &other)
void WriteStrided(int t, const float *data)
Definition: weightmatrix.h:40
void Resize(int size1, int size2, const T &empty)
Definition: matrix.h:105
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:111
static void FloatToDouble(const GENERIC_2D_ARRAY< float > &wf, GENERIC_2D_ARRAY< double > *wd)
const int kHistogramBuckets
void SumOuterTransposed(const TransposedArray &u, const TransposedArray &v, bool parallel)
#define ASSERT_HOST(x)
Definition: errcode.h:84