tesseract  5.0.0-alpha-619-ge9db
stopper.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: stopper.cpp
3  ** Purpose: Stopping criteria for word classifier.
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 
18 #include <cstdio>
19 #include <cstring>
20 #include <cctype>
21 #include <cmath>
22 
23 #include "stopper.h"
24 #ifndef DISABLED_LEGACY_ENGINE
25 #include "ambigs.h"
26 #endif
27 #include "ccutil.h"
28 #include "dict.h"
29 #include <tesseract/helpers.h>
30 #include "matchdefs.h"
31 #include "pageres.h"
32 #include "params.h"
33 #include "ratngs.h"
34 #include <tesseract/unichar.h>
35 
36 /*----------------------------------------------------------------------------
37  Private Code
38 ----------------------------------------------------------------------------*/
39 
40 namespace tesseract {
41 
42 bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
43  XHeightConsistencyEnum xheight_consistency) {
44  float CertaintyThreshold = stopper_nondict_certainty_base;
45  int WordSize;
46 
47  if (stopper_no_acceptable_choices) return false;
48 
49  if (best_choice.length() == 0) return false;
50 
51  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
52  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
53  bool is_case_ok = case_ok(best_choice);
54 
55  if (stopper_debug_level >= 1) {
56  const char *xht = "UNKNOWN";
57  switch (xheight_consistency) {
58  case XH_GOOD: xht = "NORMAL"; break;
59  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
60  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
61  default: xht = "UNKNOWN";
62  }
63  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
64  best_choice.unichar_string().c_str(),
65  (is_valid_word ? 'y' : 'n'),
66  (is_case_ok ? 'y' : 'n'),
67  xht,
68  best_choice.min_x_height(),
69  best_choice.max_x_height());
70  }
71  // Do not accept invalid words in PASS1.
72  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
73  if (is_valid_word && is_case_ok) {
74  WordSize = LengthOfShortestAlphaRun(best_choice);
75  WordSize -= stopper_smallword_size;
76  if (WordSize < 0)
77  WordSize = 0;
78  CertaintyThreshold += WordSize * stopper_certainty_per_char;
79  }
80 
81  if (stopper_debug_level >= 1)
82  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
83  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
84 
85  if (no_dang_ambigs &&
86  best_choice.certainty() > CertaintyThreshold &&
87  xheight_consistency < XH_INCONSISTENT &&
88  UniformCertainties(best_choice)) {
89  return true;
90  } else {
91  if (stopper_debug_level >= 1) {
92  tprintf("AcceptableChoice() returned false"
93  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
94  no_dang_ambigs, best_choice.certainty(),
95  CertaintyThreshold,
96  UniformCertainties(best_choice));
97  }
98  return false;
99  }
100 }
101 
102 bool Dict::AcceptableResult(WERD_RES *word) const {
103  if (word->best_choice == nullptr) return false;
104  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
105  int WordSize;
106 
107  if (stopper_debug_level >= 1) {
108  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
109  word->best_choice->debug_string().c_str(),
110  (valid_word(*word->best_choice) ? 'y' : 'n'),
111  (case_ok(*word->best_choice) ? 'y' : 'n'),
112  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
113  word->best_choices.singleton() ? 'n' : 'y');
114  }
115 
116  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
117  return false;
118  if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
119  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
120  WordSize -= stopper_smallword_size;
121  if (WordSize < 0)
122  WordSize = 0;
123  CertaintyThreshold += WordSize * stopper_certainty_per_char;
124  }
125 
126  if (stopper_debug_level >= 1)
127  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
128  word->best_choice->certainty(), CertaintyThreshold);
129 
130  if (word->best_choice->certainty() > CertaintyThreshold &&
132  if (stopper_debug_level >= 1)
133  tprintf("ACCEPTED\n");
134  return true;
135  } else {
136  if (stopper_debug_level >= 1)
137  tprintf("REJECTED\n");
138  return false;
139  }
140 }
141 
142 #if !defined(DISABLED_LEGACY_ENGINE)
143 
144 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
145  DANGERR *fixpt,
146  bool fix_replaceable,
147  MATRIX *ratings) {
148  if (stopper_debug_level > 2) {
149  tprintf("\nRunning NoDangerousAmbig() for %s\n",
150  best_choice->debug_string().c_str());
151  }
152 
153  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
154  // for each unichar id in BestChoice.
155  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
156  int i;
157  bool ambigs_found = false;
158  // For each position in best_choice:
159  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
160  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
161  // -- look for ambiguities corresponding to wrong_ngram in the list while
162  // adding the following unichar_ids from best_choice to wrong_ngram
163  //
164  // Repeat the above procedure twice: first time look through
165  // ambigs to be replaced and replace all the ambiguities found;
166  // second time look through dangerous ambiguities and construct
167  // ambig_blob_choices with fake a blob choice for each ambiguity
168  // and pass them to dawg_permute_and_select() to search for
169  // ambiguous words in the dictionaries.
170  //
171  // Note that during the execution of the for loop (on the first pass)
172  // if replacements are made the length of best_choice might change.
173  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
174  bool replace = (fix_replaceable && pass == 0);
175  const UnicharAmbigsVector &table = replace ?
177  if (!replace) {
178  // Initialize ambig_blob_choices with lists containing a single
179  // unichar id for the corresponding position in best_choice.
180  // best_choice consisting from only the original letters will
181  // have a rating of 0.0.
182  for (i = 0; i < best_choice->length(); ++i) {
183  auto *lst = new BLOB_CHOICE_LIST();
184  BLOB_CHOICE_IT lst_it(lst);
185  // TODO(rays/antonova) Put real xheights and y shifts here.
186  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
187  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
188  ambig_blob_choices.push_back(lst);
189  }
190  }
191  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
192  int wrong_ngram_index;
193  int next_index;
194  int blob_index = 0;
195  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
196  ++i) {
197  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
198  if (stopper_debug_level > 2) {
199  tprintf("Looking for %s ngrams starting with %s:\n",
200  replace ? "replaceable" : "ambiguous",
201  getUnicharset().debug_str(curr_unichar_id).c_str());
202  }
203  int num_wrong_blobs = best_choice->state(i);
204  wrong_ngram_index = 0;
205  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
206  if (curr_unichar_id == INVALID_UNICHAR_ID ||
207  curr_unichar_id >= table.size() ||
208  table[curr_unichar_id] == nullptr) {
209  continue; // there is no ambig spec for this unichar id
210  }
211  AmbigSpec_IT spec_it(table[curr_unichar_id]);
212  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
213  const AmbigSpec *ambig_spec = spec_it.data();
214  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
215  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
216  ambig_spec->wrong_ngram);
217  if (stopper_debug_level > 2) {
218  tprintf("candidate ngram: ");
220  tprintf("current ngram from spec: ");
222  tprintf("comparison result: %d\n", compare);
223  }
224  if (compare == 0) {
225  // Record the place where we found an ambiguity.
226  if (fixpt != nullptr) {
227  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
228  fixpt->push_back(DANGERR_INFO(
229  blob_index, blob_index + num_wrong_blobs, replace,
230  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
231  leftmost_id));
232  if (stopper_debug_level > 1) {
233  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
234  blob_index + num_wrong_blobs, false,
235  getUnicharset().get_isngram(
236  ambig_spec->correct_ngram_id),
237  getUnicharset().id_to_unichar(leftmost_id));
238  }
239  }
240 
241  if (replace) {
242  if (stopper_debug_level > 2) {
243  tprintf("replace ambiguity with %s : ",
244  getUnicharset().id_to_unichar(
245  ambig_spec->correct_ngram_id));
247  ambig_spec->correct_fragments, getUnicharset());
248  }
249  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
250  ambig_spec->correct_ngram_id,
251  best_choice, ratings);
252  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
253  // We found dang ambig - update ambig_blob_choices.
254  if (stopper_debug_level > 2) {
255  tprintf("found ambiguity: ");
257  ambig_spec->correct_fragments, getUnicharset());
258  }
259  ambigs_found = true;
260  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
261  ++tmp_index) {
262  // Add a blob choice for the corresponding fragment of the
263  // ambiguity. These fake blob choices are initialized with
264  // negative ratings (which are not possible for real blob
265  // choices), so that dawg_permute_and_select() considers any
266  // word not consisting of only the original letters a better
267  // choice and stops searching for alternatives once such a
268  // choice is found.
269  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
270  bc_it.add_to_end(new BLOB_CHOICE(
271  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
272  -1, 0, 1, 0, BCC_AMBIG));
273  }
274  }
275  spec_it.forward();
276  } else if (compare == -1) {
277  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
278  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
279  // Add the next unichar id to wrong_ngram and keep looking for
280  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
281  wrong_ngram[++wrong_ngram_index] =
282  best_choice->unichar_id(next_index);
283  num_wrong_blobs += best_choice->state(next_index);
284  } else {
285  break; // no more matching ambigs in this AMBIG_SPEC_LIST
286  }
287  } else {
288  spec_it.forward();
289  }
290  } // end searching AmbigSpec_LIST
291  } // end searching best_choice
292  } // end searching replace and dangerous ambigs
293 
294  // If any ambiguities were found permute the constructed ambig_blob_choices
295  // to see if an alternative dictionary word can be found.
296  if (ambigs_found) {
297  if (stopper_debug_level > 2) {
298  tprintf("\nResulting ambig_blob_choices:\n");
299  for (i = 0; i < ambig_blob_choices.size(); ++i) {
300  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
301  tprintf("\n");
302  }
303  }
304  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
305  ambigs_found = (alt_word->rating() < 0.0);
306  if (ambigs_found) {
307  if (stopper_debug_level >= 1) {
308  tprintf ("Stopper: Possible ambiguous word = %s\n",
309  alt_word->debug_string().c_str());
310  }
311  if (fixpt != nullptr) {
312  // Note: Currently character choices combined from fragments can only
313  // be generated by NoDangrousAmbigs(). This code should be updated if
314  // the capability to produce classifications combined from character
315  // fragments is added to other functions.
316  int orig_i = 0;
317  for (i = 0; i < alt_word->length(); ++i) {
318  const UNICHARSET &uchset = getUnicharset();
319  bool replacement_is_ngram =
320  uchset.get_isngram(alt_word->unichar_id(i));
321  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
322  if (replacement_is_ngram) {
323  // we have to extract the leftmost unichar from the ngram.
324  const char *str = uchset.id_to_unichar(leftmost_id);
325  int step = uchset.step(str);
326  if (step) leftmost_id = uchset.unichar_to_id(str, step);
327  }
328  int end_i = orig_i + alt_word->state(i);
329  if (alt_word->state(i) > 1 ||
330  (orig_i + 1 == end_i && replacement_is_ngram)) {
331  // Compute proper blob indices.
332  int blob_start = 0;
333  for (int j = 0; j < orig_i; ++j)
334  blob_start += best_choice->state(j);
335  int blob_end = blob_start;
336  for (int j = orig_i; j < end_i; ++j)
337  blob_end += best_choice->state(j);
338  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
339  replacement_is_ngram, leftmost_id));
340  if (stopper_debug_level > 1) {
341  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
342  true, replacement_is_ngram,
343  uchset.id_to_unichar(leftmost_id));
344  }
345  }
346  orig_i += alt_word->state(i);
347  }
348  }
349  }
350  delete alt_word;
351  }
352  if (output_ambig_words_file_ != nullptr) {
353  fprintf(output_ambig_words_file_, "\n");
354  }
355 
356  ambig_blob_choices.delete_data_pointers();
357  return !ambigs_found;
358 }
359 
360 void Dict::EndDangerousAmbigs() {}
361 
362 #endif // !defined(DISABLED_LEGACY_ENGINE)
363 
365  reject_offset_ = 0.0;
366 }
367 
370 }
371 
372 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
373  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
374  MATRIX *ratings) {
375  int num_blobs_to_replace = 0;
376  int begin_blob_index = 0;
377  int i;
378  // Rating and certainty for the new BLOB_CHOICE are derived from the
379  // replaced choices.
380  float new_rating = 0.0f;
381  float new_certainty = 0.0f;
382  BLOB_CHOICE* old_choice = nullptr;
383  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
384  if (i >= wrong_ngram_begin_index) {
385  int num_blobs = werd_choice->state(i);
386  int col = begin_blob_index + num_blobs_to_replace;
387  int row = col + num_blobs - 1;
388  BLOB_CHOICE_LIST* choices = ratings->get(col, row);
389  ASSERT_HOST(choices != nullptr);
390  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
391  ASSERT_HOST(old_choice != nullptr);
392  new_rating += old_choice->rating();
393  new_certainty += old_choice->certainty();
394  num_blobs_to_replace += num_blobs;
395  } else {
396  begin_blob_index += werd_choice->state(i);
397  }
398  }
399  new_certainty /= wrong_ngram_size;
400  // If there is no entry in the ratings matrix, add it.
401  MATRIX_COORD coord(begin_blob_index,
402  begin_blob_index + num_blobs_to_replace - 1);
403  if (!coord.Valid(*ratings)) {
404  ratings->IncreaseBandSize(coord.row - coord.col + 1);
405  }
406  if (ratings->get(coord.col, coord.row) == nullptr)
407  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
408  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
409  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
410  if (choice != nullptr) {
411  // Already there. Upgrade if new rating better.
412  if (new_rating < choice->rating())
413  choice->set_rating(new_rating);
414  if (new_certainty < choice->certainty())
415  choice->set_certainty(new_certainty);
416  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
417  } else {
418  // Need a new choice with the correct_ngram_id.
419  choice = new BLOB_CHOICE(*old_choice);
420  choice->set_unichar_id(correct_ngram_id);
421  choice->set_rating(new_rating);
422  choice->set_certainty(new_certainty);
423  choice->set_classifier(BCC_AMBIG);
424  choice->set_matrix_cell(coord.col, coord.row);
425  BLOB_CHOICE_IT it (new_choices);
426  it.add_to_end(choice);
427  }
428  // Remove current unichar from werd_choice. On the last iteration
429  // set the correct replacement unichar instead of removing a unichar.
430  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
431  ++replaced_count) {
432  if (replaced_count + 1 == wrong_ngram_size) {
433  werd_choice->set_blob_choice(wrong_ngram_begin_index,
434  num_blobs_to_replace, choice);
435  } else {
436  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
437  }
438  }
439  if (stopper_debug_level >= 1) {
440  werd_choice->print("ReplaceAmbig() ");
441  tprintf("Modified blob_choices: ");
442  print_ratings_list("\n", new_choices, getUnicharset());
443  }
444 }
445 
446 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
447  int shortest = INT32_MAX;
448  int curr_len = 0;
449  for (int w = 0; w < WordChoice.length(); ++w) {
450  if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
451  curr_len++;
452  } else if (curr_len > 0) {
453  if (curr_len < shortest) shortest = curr_len;
454  curr_len = 0;
455  }
456  }
457  if (curr_len > 0 && curr_len < shortest) {
458  shortest = curr_len;
459  } else if (shortest == INT32_MAX) {
460  shortest = 0;
461  }
462  return shortest;
463 }
464 
465 int Dict::UniformCertainties(const WERD_CHOICE& word) {
466  float Certainty;
467  float WorstCertainty = FLT_MAX;
468  float CertaintyThreshold;
469  double TotalCertainty;
470  double TotalCertaintySquared;
471  double Variance;
472  float Mean, StdDev;
473  int word_length = word.length();
474 
475  if (word_length < 3)
476  return true;
477 
478  TotalCertainty = TotalCertaintySquared = 0.0;
479  for (int i = 0; i < word_length; ++i) {
480  Certainty = word.certainty(i);
481  TotalCertainty += Certainty;
482  TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483  if (Certainty < WorstCertainty)
484  WorstCertainty = Certainty;
485  }
486 
487  // Subtract off worst certainty from statistics.
488  word_length--;
489  TotalCertainty -= WorstCertainty;
490  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
491 
492  Mean = TotalCertainty / word_length;
493  Variance = ((word_length * TotalCertaintySquared -
494  TotalCertainty * TotalCertainty) /
495  (word_length * (word_length - 1)));
496  if (Variance < 0.0)
497  Variance = 0.0;
498  StdDev = sqrt(Variance);
499 
500  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
501  if (CertaintyThreshold > stopper_nondict_certainty_base)
502  CertaintyThreshold = stopper_nondict_certainty_base;
503 
504  if (word.certainty() < CertaintyThreshold) {
505  if (stopper_debug_level >= 1)
506  tprintf("Stopper: Non-uniform certainty = %4.1f"
507  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
508  word.certainty(), Mean, StdDev, CertaintyThreshold);
509  return false;
510  } else {
511  return true;
512  }
513 }
514 
515 } // namespace tesseract
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
pageres.h
tesseract::AmbigSpec::wrong_ngram
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:124
tesseract::XH_SUBNORMAL
Definition: dict.h:78
dict.h
Mean
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:602
tesseract::CASE_AMBIG
Definition: ambigs.h:42
tesseract::Dict::UniformCertainties
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:479
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
WERD_CHOICE
Definition: ratngs.h:261
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:351
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dict::stopper_certainty_per_char
double stopper_certainty_per_char
Definition: dict.h:635
tesseract::Dict::dawg_permute_and_select
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:182
tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:108
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
params.h
FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:182
MATRIX
Definition: matrix.h:574
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
DANGERR_INFO
Definition: stopper.h:33
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
WERD_RES
Definition: pageres.h:160
tesseract::Dict::EndDangerousAmbigs
void EndDangerousAmbigs()
Definition: stopper.cpp:374
MAX_AMBIG_SIZE
#define MAX_AMBIG_SIZE
Definition: ambigs.h:31
tesseract::XH_GOOD
Definition: dict.h:78
UNICHARSET::get_isngram
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:516
MATRIX::IncreaseBandSize
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:47
stopper.h
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
tesseract::Dict::AcceptableChoice
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:56
tesseract::Dict::SettupStopperPass1
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:378
WERD_CHOICE::state
int state(int index) const
Definition: ratngs.h:307
UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:232
tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:78
BCC_AMBIG
Definition: ratngs.h:45
BLOB_CHOICE::set_classifier
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:155
ratngs.h
tesseract::AmbigSpec::correct_fragments
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:125
tesseract::Dict::stopper_no_acceptable_choices
bool stopper_no_acceptable_choices
Definition: dict.h:641
tesseract::Dict::AcceptableResult
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:116
tesseract::AmbigSpec::correct_ngram_id
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:126
BLOB_CHOICE::set_rating
void set_rating(float newrat)
Definition: ratngs.h:142
BLOB_CHOICE::set_matrix_cell
void set_matrix_cell(int col, int row)
Definition: ratngs.h:151
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:324
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
BLOB_CHOICE::set_certainty
void set_certainty(float newrat)
Definition: ratngs.h:145
ccutil.h
tesseract::Dict::stopper_allowable_character_badness
double stopper_allowable_character_badness
Definition: dict.h:637
tesseract::Dict::LengthOfShortestAlphaRun
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:460
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::UnicharAmbigs::dang_ambigs
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:227
tesseract::XH_INCONSISTENT
Definition: dict.h:78
UNICHARSET
Definition: unicharset.h:145
tesseract::Dict::stopper_smallword_size
int stopper_smallword_size
Definition: dict.h:633
tesseract::AmbigSpec::wrong_ngram_size
int wrong_ngram_size
Definition: ambigs.h:128
helpers.h
tesseract
Definition: baseapi.h:65
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::Dict::ReplaceAmbig
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:386
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::UnicharIdArrayUtils::compare
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:55
GenericVector< DANGERR_INFO >
tesseract::UnicharIdArrayUtils::print
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:91
tesseract::AmbigSpec
Definition: ambigs.h:107
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
tesseract::Dict::stopper_phase2_certainty_rejection_offset
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:631
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
BLOB_CHOICE
Definition: ratngs.h:49
MATRIX_COORD
Definition: matrix.h:604
unichar.h
GenericVector::get
T & get(int index) const
Definition: genericvector.h:716
tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:61
print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:835
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
tesseract::UnicharAmbigs::replace_ambigs
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:146
tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778
tesseract::Dict::NoDangerousAmbig
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:158
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
ambigs.h
WERD_CHOICE::remove_unichar_id
void remove_unichar_id(int index)
Definition: ratngs.h:472
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Dict::stopper_nondict_certainty_base
double stopper_nondict_certainty_base
Definition: dict.h:629
tesseract::AmbigSpec::type
AmbigType type
Definition: ambigs.h:127
tesseract::Dict::SettupStopperPass2
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:382
BLOB_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:139
matchdefs.h
WERD_CHOICE::set_blob_choice
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:314
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:327