All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract_cube_combiner.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tesseract_cube_combiner.cpp
3  * Description: Implementation of the Tesseract & Cube results combiner Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // The TesseractCubeCombiner class provides the functionality of combining
21 // the recognition results of Tesseract and Cube at the word level
22 
23 #include <algorithm>
24 #include <string>
25 #include <vector>
26 #include <wctype.h>
27 
29 
30 #include "cube_object.h"
31 #include "cube_reco_context.h"
32 #include "cube_utils.h"
33 #include "neural_net.h"
34 #include "tesseractclass.h"
35 #include "word_altlist.h"
36 
37 namespace tesseract {
38 
40  cube_cntxt_ = cube_cntxt;
41  combiner_net_ = NULL;
42 }
43 
45  if (combiner_net_ != NULL) {
46  delete combiner_net_;
47  combiner_net_ = NULL;
48  }
49 }
50 
52  ASSERT_HOST(cube_cntxt_);
53  // Compute the path of the combiner net
54  string data_path;
55  cube_cntxt_->GetDataFilePath(&data_path);
56  string net_file_name = data_path + cube_cntxt_->Lang() +
57  ".tesseract_cube.nn";
58 
59  // Return false if file does not exist
60  FILE *fp = fopen(net_file_name.c_str(), "rb");
61  if (fp == NULL)
62  return false;
63  else
64  fclose(fp);
65 
66  // Load and validate net
67  combiner_net_ = NeuralNet::FromFile(net_file_name);
68  if (combiner_net_ == NULL) {
69  tprintf("Could not read combiner net file %s", net_file_name.c_str());
70  return false;
71  } else if (combiner_net_->out_cnt() != 2) {
72  tprintf("Invalid combiner net file %s! Output count != 2\n",
73  net_file_name.c_str());
74  delete combiner_net_;
75  combiner_net_ = NULL;
76  return false;
77  }
78  return true;
79 }
80 
81 // Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
82 // strips punc and/or normalizes case and then converts back
83 string TesseractCubeCombiner::NormalizeString(const string &str,
84  bool remove_punc,
85  bool norm_case) {
86  // convert to UTF32
87  string_32 str32;
88  CubeUtils::UTF8ToUTF32(str.c_str(), &str32);
89  // strip punc and normalize
90  string_32 new_str32;
91  for (int idx = 0; idx < str32.length(); idx++) {
92  // if no punc removal is required or not a punctuation character
93  if (!remove_punc || iswpunct(str32[idx]) == 0) {
94  char_32 norm_char = str32[idx];
95  // normalize case if required
96  if (norm_case && iswalpha(norm_char)) {
97  norm_char = towlower(norm_char);
98  }
99  new_str32.push_back(norm_char);
100  }
101  }
102  // convert back to UTF8
103  string new_str;
104  CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str);
105  return new_str;
106 }
107 
108 // Compares 2 strings optionally ignoring punctuation
109 int TesseractCubeCombiner::CompareStrings(const string &str1,
110  const string &str2,
111  bool ignore_punc,
112  bool ignore_case) {
113  if (!ignore_punc && !ignore_case) {
114  return str1.compare(str2);
115  }
116  string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case);
117  string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case);
118  return norm_str1.compare(norm_str2);
119 }
120 
121 // Check if a string is a valid Tess dict word or not
122 bool TesseractCubeCombiner::ValidWord(const string &str) {
123  return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str())
124  > 0);
125 }
126 
127 // Public method for computing the combiner features. The agreement
128 // output parameter will be true if both answers are identical,
129 // and false otherwise.
131  int tess_confidence,
132  CubeObject *cube_obj,
133  WordAltList *cube_alt_list,
134  vector<double> *features,
135  bool *agreement) {
136  features->clear();
137  *agreement = false;
138  if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)
139  return false;
140 
141  // Get Cube's best string; return false if empty
142  char_32 *cube_best_str32 = cube_alt_list->Alt(0);
143  if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1)
144  return false;
145  string cube_best_str;
146  int cube_best_cost = cube_alt_list->AltCost(0);
147  int cube_best_bigram_cost = 0;
148  bool cube_best_bigram_cost_valid = true;
149  if (cube_cntxt_->Bigrams())
150  cube_best_bigram_cost = cube_cntxt_->Bigrams()->
151  Cost(cube_best_str32, cube_cntxt_->CharacterSet());
152  else
153  cube_best_bigram_cost_valid = false;
154  CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str);
155 
156  // Get Tesseract's UTF32 string
157  string_32 tess_str32;
158  CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32);
159 
160  // Compute agreement flag
161  *agreement = (tess_str.compare(cube_best_str) == 0);
162 
163  // Get Cube's second best string; if empty, return false
164  char_32 *cube_next_best_str32;
165  string cube_next_best_str;
166  int cube_next_best_cost = WORST_COST;
167  if (cube_alt_list->AltCount() > 1) {
168  cube_next_best_str32 = cube_alt_list->Alt(1);
169  if (cube_next_best_str32 == NULL ||
170  CubeUtils::StrLen(cube_next_best_str32) == 0) {
171  return false;
172  }
173  cube_next_best_cost = cube_alt_list->AltCost(1);
174  CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str);
175  }
176  // Rank of Tesseract's top result in Cube's alternate list
177  int tess_rank = 0;
178  for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) {
179  string alt_str;
180  CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str);
181  if (alt_str == tess_str)
182  break;
183  }
184 
185  // Cube's cost for tesseract's result. Note that this modifies the
186  // state of cube_obj, including its alternate list by calling RecognizeWord()
187  int tess_cost = cube_obj->WordCost(tess_str.c_str());
188  // Cube's bigram cost of Tesseract's string
189  int tess_bigram_cost = 0;
190  int tess_bigram_cost_valid = true;
191  if (cube_cntxt_->Bigrams())
192  tess_bigram_cost = cube_cntxt_->Bigrams()->
193  Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet());
194  else
195  tess_bigram_cost_valid = false;
196 
197  // Tesseract confidence
198  features->push_back(tess_confidence);
199  // Cube cost of Tesseract string
200  features->push_back(tess_cost);
201  // Cube Rank of Tesseract string
202  features->push_back(tess_rank);
203  // length of Tesseract OCR string
204  features->push_back(tess_str.length());
205  // Tesseract OCR string in dictionary
206  features->push_back(ValidWord(tess_str));
207  if (tess_bigram_cost_valid) {
208  // bigram cost of Tesseract string
209  features->push_back(tess_bigram_cost);
210  }
211  // Cube tess_cost of Cube best string
212  features->push_back(cube_best_cost);
213  // Cube tess_cost of Cube next best string
214  features->push_back(cube_next_best_cost);
215  // length of Cube string
216  features->push_back(cube_best_str.length());
217  // Cube string in dictionary
218  features->push_back(ValidWord(cube_best_str));
219  if (cube_best_bigram_cost_valid) {
220  // bigram cost of Cube string
221  features->push_back(cube_best_bigram_cost);
222  }
223  // case-insensitive string comparison, including punctuation
224  int compare_nocase_punc = CompareStrings(cube_best_str,
225  tess_str, false, true);
226  features->push_back(compare_nocase_punc == 0);
227  // case-sensitive string comparison, ignoring punctuation
228  int compare_case_nopunc = CompareStrings(cube_best_str,
229  tess_str, true, false);
230  features->push_back(compare_case_nopunc == 0);
231  // case-insensitive string comparison, ignoring punctuation
232  int compare_nocase_nopunc = CompareStrings(cube_best_str,
233  tess_str, true, true);
234  features->push_back(compare_nocase_nopunc == 0);
235  return true;
236 }
237 
238 // The CubeObject parameter is used for 2 purposes: 1) to retrieve
239 // cube's alt list, and 2) to compute cube's word cost for the
240 // tesseract result. The call to CubeObject::WordCost() modifies
241 // the object's alternate list, so previous state will be lost.
243  CubeObject *cube_obj) {
244  // If no combiner is loaded or the cube object is undefined,
245  // tesseract wins with probability 1.0
246  if (combiner_net_ == NULL || cube_obj == NULL) {
247  tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
248  "Cube objects not initialized; defaulting to Tesseract\n");
249  return 1.0;
250  }
251 
252  // Retrieve the alternate list from the CubeObject's current state.
253  // If the alt list empty, tesseract wins with probability 1.0
254  WordAltList *cube_alt_list = cube_obj->AlternateList();
255  if (cube_alt_list == NULL)
256  cube_alt_list = cube_obj->RecognizeWord();
257  if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
258  tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
259  "Cube returned no results; defaulting to Tesseract\n");
260  return 1.0;
261  }
262  return CombineResults(tess_res, cube_obj, cube_alt_list);
263 }
264 
265 // The alt_list parameter is expected to have been extracted from the
266 // CubeObject that recognized the word to be combined. The cube_obj
267 // parameter passed may be either same instance or a separate instance to
268 // be used only by the combiner. In both cases, its alternate
269 // list will be modified by an internal call to RecognizeWord().
271  CubeObject *cube_obj,
272  WordAltList *cube_alt_list) {
273  // If no combiner is loaded or the cube object is undefined, or the
274  // alt list is empty, tesseract wins with probability 1.0
275  if (combiner_net_ == NULL || cube_obj == NULL ||
276  cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
277  tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
278  "Cube result cannot be retrieved; defaulting to Tesseract\n");
279  return 1.0;
280  }
281 
282  // Tesseract result string, tesseract confidence, and cost of
283  // tesseract result according to cube
284  string tess_str = tess_res->best_choice->unichar_string().string();
285  // Map certainty [-20.0, 0.0] to confidence [0, 100]
286  int tess_confidence = MIN(100, MAX(1, static_cast<int>(
287  100 + (5 * tess_res->best_choice->certainty()))));
288 
289  // Compute the combiner features. If feature computation fails or
290  // answers are identical, tesseract wins with probability 1.0
291  vector<double> features;
292  bool agreement;
293  bool combiner_success = ComputeCombinerFeatures(tess_str, tess_confidence,
294  cube_obj, cube_alt_list,
295  &features, &agreement);
296  if (!combiner_success || agreement)
297  return 1.0;
298 
299  // Classify combiner feature vector and return output (probability
300  // of tesseract class).
301  double net_out[2];
302  if (!combiner_net_->FeedForward(&features[0], net_out))
303  return 1.0;
304  return net_out[1];
305 }
306 }
int WordCost(const char *str)
#define MAX(x, y)
Definition: ndminx.h:24
#define WORST_COST
Definition: cube_const.h:30
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
tesseract::Tesseract * TesseractObject() const
WERD_CHOICE * best_choice
Definition: pageres.h:219
int AltCost(int alt_idx) const
Definition: altlist.h:41
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
basic_string< char_32 > string_32
Definition: string_32.h:41
WordAltList * RecognizeWord(LangModel *lang_mod=NULL)
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
WordAltList * AlternateList() const
Definition: cube_object.h:119
bool FeedForward(const Type *inputs, Type *outputs)
Definition: neural_net.cpp:79
float certainty() const
Definition: ratngs.h:327
CharBigrams * Bigrams() const
int out_cnt() const
Definition: neural_net.h:41
Dict & getDict()
Definition: classify.h:65
char_32 * Alt(int alt_idx)
Definition: word_altlist.h:41
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:282
TesseractCubeCombiner(CubeRecoContext *cube_cntxt)
CharSet * CharacterSet() const
bool ComputeCombinerFeatures(const string &tess_res, int tess_confidence, CubeObject *cube_obj, WordAltList *cube_alt_list, vector< double > *features, bool *agreement)
int AltCount() const
Definition: altlist.h:39
signed int char_32
Definition: string_32.h:40
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool GetDataFilePath(string *path) const
const string & Lang() const
static NeuralNet * FromFile(const string file_name)
Definition: neural_net.cpp:204