tesseract  5.0.0-alpha-619-ge9db
params_model.cpp
Go to the documentation of this file.
1 // File: params_model.cpp
3 // Description: Trained language model parameters.
4 // Author: David Eger
5 //
6 // (C) Copyright 2012, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "params_model.h"
20 
21 #include <cctype>
22 #include <cmath>
23 #include <cstdio>
24 
25 #include "bitvector.h"
26 #include "tprintf.h"
27 
28 namespace tesseract {
29 
// Scale factor to apply to params model scores.
static constexpr float kScoreScaleFactor = 100.0f;
// Minimum cost result to return.
static constexpr float kMinFinalCost = 0.001f;
// Maximum cost result to return.
static constexpr float kMaxFinalCost = 100.0f;
36 
38  for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) {
39  tprintf("ParamsModel for pass %d lang %s\n", p, lang_.c_str());
40  for (int i = 0; i < weights_vec_[p].size(); ++i) {
41  tprintf("%s = %g\n", kParamsTrainingFeatureTypeName[i],
42  weights_vec_[p][i]);
43  }
44  }
45 }
46 
47 void ParamsModel::Copy(const ParamsModel &other_model) {
48  for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) {
49  weights_vec_[p] = other_model.weights_for_pass(
50  static_cast<PassEnum>(p));
51  }
52 }
53 
54 // Given a (modifiable) line, parse out a key / value pair.
55 // Return true on success.
56 bool ParamsModel::ParseLine(char *line, char** key, float *val) {
57  if (line[0] == '#')
58  return false;
59  int end_of_key = 0;
60  while (line[end_of_key] &&
61  !(isascii(line[end_of_key]) && isspace(line[end_of_key]))) {
62  end_of_key++;
63  }
64  if (!line[end_of_key]) {
65  tprintf("ParamsModel::Incomplete line %s\n", line);
66  return false;
67  }
68  line[end_of_key++] = 0;
69  *key = line;
70  if (sscanf(line + end_of_key, " %f", val) != 1)
71  return false;
72  return true;
73 }
74 
75 // Applies params model weights to the given features.
76 // Assumes that features is an array of size PTRAIN_NUM_FEATURE_TYPES.
77 // The cost is set to a number that can be multiplied by the outline length,
78 // as with the old ratings scheme. This enables words of different length
79 // and combinations of words to be compared meaningfully.
80 float ParamsModel::ComputeCost(const float features[]) const {
81  float unnorm_score = 0.0;
82  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
83  unnorm_score += weights_vec_[pass_][f] * features[f];
84  }
85  return ClipToRange(-unnorm_score / kScoreScaleFactor,
86  kMinFinalCost, kMaxFinalCost);
87 }
88 
89 bool ParamsModel::Equivalent(const ParamsModel &that) const {
90  float epsilon = 0.0001;
91  for (int p = 0; p < PTRAIN_NUM_PASSES; ++p) {
92  if (weights_vec_[p].size() != that.weights_vec_[p].size()) return false;
93  for (int i = 0; i < weights_vec_[p].size(); i++) {
94  if (weights_vec_[p][i] != that.weights_vec_[p][i] &&
95  fabs(weights_vec_[p][i] - that.weights_vec_[p][i]) > epsilon)
96  return false;
97  }
98  }
99  return true;
100 }
101 
102 bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
103  const int kMaxLineSize = 100;
104  char line[kMaxLineSize];
105  BitVector present;
107  lang_ = lang;
108  // Load weights for passes with adaption on.
109  GenericVector<float> &weights = weights_vec_[pass_];
111 
112  while (fp->FGets(line, kMaxLineSize) != nullptr) {
113  char *key = nullptr;
114  float value;
115  if (!ParseLine(line, &key, &value))
116  continue;
117  int idx = ParamsTrainingFeatureByName(key);
118  if (idx < 0) {
119  tprintf("ParamsModel::Unknown parameter %s\n", key);
120  continue;
121  }
122  if (!present[idx]) {
123  present.SetValue(idx, true);
124  }
125  weights[idx] = value;
126  }
127  bool complete = (present.NumSetBits() == PTRAIN_NUM_FEATURE_TYPES);
128  if (!complete) {
129  for (int i = 0; i < PTRAIN_NUM_FEATURE_TYPES; i++) {
130  if (!present[i]) {
131  tprintf("Missing field %s.\n", kParamsTrainingFeatureTypeName[i]);
132  }
133  }
134  lang_ = "";
135  weights.truncate(0);
136  }
137  return complete;
138 }
139 
140 bool ParamsModel::SaveToFile(const char *full_path) const {
141  const GenericVector<float> &weights = weights_vec_[pass_];
143  tprintf("Refusing to save ParamsModel that has not been initialized.\n");
144  return false;
145  }
146  FILE *fp = fopen(full_path, "wb");
147  if (!fp) {
148  tprintf("Could not open %s for writing.\n", full_path);
149  return false;
150  }
151  bool all_good = true;
152  for (int i = 0; i < weights.size(); i++) {
153  if (fprintf(fp, "%s %f\n", kParamsTrainingFeatureTypeName[i], weights[i])
154  < 0) {
155  all_good = false;
156  }
157  }
158  fclose(fp);
159  return all_good;
160 }
161 
162 } // namespace tesseract
tesseract::ParamsModel::Equivalent
bool Equivalent(const ParamsModel &that) const
Definition: params_model.cpp:89
ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:106
tesseract::ParamsModel::Copy
void Copy(const ParamsModel &other_model)
Definition: params_model.cpp:47
tesseract::ParamsTrainingFeatureByName
int ParamsTrainingFeatureByName(const char *name)
Definition: params_training_featdef.cpp:26
tesseract::ParamsModel::weights_for_pass
const GenericVector< float > & weights_for_pass(PassEnum pass) const
Definition: params_model.h:69
tesseract::BitVector::NumSetBits
int NumSetBits() const
Definition: bitvector.cpp:216
tesseract::BitVector::Init
void Init(int length)
Definition: bitvector.cpp:139
tesseract::PTRAIN_NUM_FEATURE_TYPES
Definition: params_training_featdef.h:70
tesseract::ParamsModel::SaveToFile
bool SaveToFile(const char *full_path) const
Definition: params_model.cpp:140
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::TFile
Definition: serialis.h:75
tesseract::ParamsModel::Print
void Print()
Definition: params_model.cpp:37
tesseract::ParamsModel
Definition: params_model.h:31
tesseract::ParamsModel::weights
const GenericVector< float > & weights() const
Definition: params_model.h:66
tesseract
Definition: baseapi.h:65
tesseract::BitVector::SetValue
void SetValue(int index, bool value)
Definition: bitvector.h:75
tprintf.h
bitvector.h
tesseract::BitVector
Definition: bitvector.h:30
GenericVector< float >
tesseract::ParamsModel::PTRAIN_NUM_PASSES
Definition: params_model.h:38
tesseract::ParamsModel::ComputeCost
float ComputeCost(const float features[]) const
Definition: params_model.cpp:80
params_model.h
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
tesseract::TFile::FGets
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:262
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ParamsModel::LoadFromFp
bool LoadFromFp(const char *lang, TFile *fp)
Definition: params_model.cpp:102
GenericVector::size
int size() const
Definition: genericvector.h:71