tesseract  5.0.0-alpha-619-ge9db
permdawg.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: permdawg.cpp (Formerly permdawg.c)
4  * Description: Scale word choices by a dictionary
5  * Author: Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 /*----------------------------------------------------------------------
20  I n c l u d e s
21 ----------------------------------------------------------------------*/
22 
23 #include "dawg.h"
24 #include "stopper.h"
25 #include "tprintf.h"
26 #include "params.h"
27 
28 #include <algorithm>
29 #include <cctype>
30 #include "dict.h"
31 
32 /*----------------------------------------------------------------------
33  F u n c t i o n s
34 ----------------------------------------------------------------------*/
35 namespace tesseract {
36 
44  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
45  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
46  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
47  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
48  auto *more_args = static_cast<DawgArgs *>(void_more_args);
49  word_ending = (char_choice_index == char_choices.size()-1);
50  int word_index = word->length() - 1;
51  if (best_choice->rating() < *limit) return;
52  // Look up char in DAWG
53 
54  // If the current unichar is an ngram first try calling
55  // letter_is_okay() for each unigram it contains separately.
56  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
57  bool checked_unigrams = false;
58  if (getUnicharset().get_isngram(orig_uch_id)) {
59  if (dawg_debug_level) {
60  tprintf("checking unigrams in an ngram %s\n",
61  getUnicharset().debug_str(orig_uch_id).c_str());
62  }
63  int num_unigrams = 0;
64  word->remove_last_unichar_id();
66  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
67  // Since the string came out of the unicharset, failure is impossible.
68  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
69  nullptr));
70  bool unigrams_ok = true;
71  // Construct DawgArgs that reflect the current state.
72  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73  DawgPositionVector unigram_updated_dawgs;
74  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
75  &unigram_updated_dawgs,
76  more_args->permuter);
77  // Check unigrams in the ngram with letter_is_okay().
78  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
79  UNICHAR_ID uch_id = encoding[i];
80  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
81  ++num_unigrams;
82  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
83  unigrams_ok = (this->*letter_is_okay_)(
84  &unigram_dawg_args, *word->unicharset(),
85  word->unichar_id(word_index+num_unigrams-1),
86  word_ending && i == encoding.size() - 1);
87  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
88  if (dawg_debug_level) {
89  tprintf("unigram %s is %s\n",
90  getUnicharset().debug_str(uch_id).c_str(),
91  unigrams_ok ? "OK" : "not OK");
92  }
93  }
94  // Restore the word and copy the updated dawg state if needed.
95  while (num_unigrams-- > 0) word->remove_last_unichar_id();
96  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
97  if (unigrams_ok) {
98  checked_unigrams = true;
99  more_args->permuter = unigram_dawg_args.permuter;
100  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
101  }
102  }
103 
104  // Check which dawgs from the dawgs_ vector contain the word
105  // up to and including the current unichar.
106  if (checked_unigrams || (this->*letter_is_okay_)(
107  more_args, *word->unicharset(), word->unichar_id(word_index),
108  word_ending)) {
109  // Add a new word choice
110  if (word_ending) {
111  if (dawg_debug_level) {
112  tprintf("found word = %s\n", word->debug_string().c_str());
113  }
114  if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
115  if (output_ambig_words_file_ == nullptr) {
116  output_ambig_words_file_ =
117  fopen(output_ambig_words_file.c_str(), "wb+");
118  if (output_ambig_words_file_ == nullptr) {
119  tprintf("Failed to open output_ambig_words_file %s\n",
120  output_ambig_words_file.c_str());
121  exit(1);
122  }
123  STRING word_str;
124  word->string_and_lengths(&word_str, nullptr);
125  word_str += " ";
126  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127  }
128  STRING word_str;
129  word->string_and_lengths(&word_str, nullptr);
130  word_str += " ";
131  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
132  }
133  WERD_CHOICE *adjusted_word = word;
134  adjusted_word->set_permuter(more_args->permuter);
135  update_best_choice(*adjusted_word, best_choice);
136  } else { // search the next letter
137  // Make updated_* point to the next entries in the DawgPositionVector
138  // arrays (that were originally created in dawg_permute_and_select)
139  ++(more_args->updated_dawgs);
140  // Make active_dawgs and constraints point to the updated ones.
141  ++(more_args->active_dawgs);
142  permute_choices(debug, char_choices, char_choice_index + 1,
143  prev_char_frag_info, word, certainties, limit,
144  best_choice, attempts_left, more_args);
145  // Restore previous state to explore another letter in this position.
146  --(more_args->updated_dawgs);
147  --(more_args->active_dawgs);
148  }
149  } else {
150  if (dawg_debug_level) {
151  tprintf("last unichar not OK at index %d in %s\n",
152  word_index, word->debug_string().c_str());
153  }
154  }
155 }
156 
157 
168  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
169  auto *best_choice = new WERD_CHOICE(&getUnicharset());
170  best_choice->make_bad();
171  best_choice->set_rating(rating_limit);
172  if (char_choices.size() == 0 || char_choices.size() > MAX_WERD_LENGTH)
173  return best_choice;
174  auto *active_dawgs =
175  new DawgPositionVector[char_choices.size() + 1];
176  init_active_dawgs(&(active_dawgs[0]), true);
177  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
179 
180  float certainties[MAX_WERD_LENGTH];
182  int attempts_left = max_permuter_attempts;
183  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
184  char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
185  &attempts_left, &dawg_args);
186  delete[] active_dawgs;
187  return best_choice;
188 }
189 
197  const char *debug,
198  const BLOB_CHOICE_LIST_VECTOR &char_choices,
199  int char_choice_index,
200  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
201  WERD_CHOICE *word,
202  float certainties[],
203  float *limit,
204  WERD_CHOICE *best_choice,
205  int *attempts_left,
206  void *more_args) {
207  if (debug) {
208  tprintf("%s permute_choices: char_choice_index=%d"
209  " limit=%g rating=%g, certainty=%g word=%s\n",
210  debug, char_choice_index, *limit, word->rating(),
211  word->certainty(), word->debug_string().c_str());
212  }
213  if (char_choice_index < char_choices.size()) {
214  BLOB_CHOICE_IT blob_choice_it;
215  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
216  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
217  blob_choice_it.forward()) {
218  (*attempts_left)--;
219  append_choices(debug, char_choices, *(blob_choice_it.data()),
220  char_choice_index, prev_char_frag_info, word,
221  certainties, limit, best_choice, attempts_left, more_args);
222  if (*attempts_left <= 0) {
223  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
224  break;
225  }
226  }
227  }
228 }
229 
239  const char *debug,
240  const BLOB_CHOICE_LIST_VECTOR &char_choices,
241  const BLOB_CHOICE &blob_choice,
242  int char_choice_index,
243  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
244  WERD_CHOICE *word,
245  float certainties[],
246  float *limit,
247  WERD_CHOICE *best_choice,
248  int *attempts_left,
249  void *more_args) {
250  int word_ending = (char_choice_index == char_choices.size() - 1);
251 
252  // Deal with fragments.
253  CHAR_FRAGMENT_INFO char_frag_info;
254  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
255  blob_choice.certainty(), prev_char_frag_info, debug,
256  word_ending, &char_frag_info)) {
257  return; // blob_choice must be an invalid fragment
258  }
259  // Search the next letter if this character is a fragment.
260  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
261  permute_choices(debug, char_choices, char_choice_index + 1,
262  &char_frag_info, word, certainties, limit,
263  best_choice, attempts_left, more_args);
264  return;
265  }
266 
267  // Add the next unichar.
268  float old_rating = word->rating();
269  float old_certainty = word->certainty();
270  uint8_t old_permuter = word->permuter();
271  certainties[word->length()] = char_frag_info.certainty;
273  char_frag_info.unichar_id, char_frag_info.num_fragments,
274  char_frag_info.rating, char_frag_info.certainty);
275 
276  // Explore the next unichar.
277  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
278  &char_frag_info, word_ending, word, certainties,
279  limit, best_choice, attempts_left, more_args);
280 
281  // Remove the unichar we added to explore other choices in it's place.
282  word->remove_last_unichar_id();
283  word->set_rating(old_rating);
284  word->set_certainty(old_certainty);
285  word->set_permuter(old_permuter);
286 }
287 
313 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
314  float curr_rating, float curr_certainty,
315  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
316  const char *debug, int word_ending,
317  CHAR_FRAGMENT_INFO *char_frag_info) {
318  const CHAR_FRAGMENT *this_fragment =
319  getUnicharset().get_fragment(curr_unichar_id);
320  const CHAR_FRAGMENT *prev_fragment =
321  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
322 
323  // Print debug info for fragments.
324  if (debug && (prev_fragment || this_fragment)) {
325  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
326  getUnicharset().debug_str(curr_unichar_id).c_str(),
327  word_ending);
328  if (prev_fragment) {
329  tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
330  }
331  if (this_fragment) {
332  tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
333  }
334  }
335 
336  char_frag_info->unichar_id = curr_unichar_id;
337  char_frag_info->fragment = this_fragment;
338  char_frag_info->rating = curr_rating;
339  char_frag_info->certainty = curr_certainty;
340  char_frag_info->num_fragments = 1;
341  if (prev_fragment && !this_fragment) {
342  if (debug) tprintf("Skip choice with incomplete fragment\n");
343  return false;
344  }
345  if (this_fragment) {
346  // We are dealing with a fragment.
347  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
348  if (prev_fragment) {
349  if (!this_fragment->is_continuation_of(prev_fragment)) {
350  if (debug) tprintf("Non-matching fragment piece\n");
351  return false;
352  }
353  if (this_fragment->is_ending()) {
354  char_frag_info->unichar_id =
355  getUnicharset().unichar_to_id(this_fragment->get_unichar());
356  char_frag_info->fragment = nullptr;
357  if (debug) {
358  tprintf("Built character %s from fragments\n",
359  getUnicharset().debug_str(
360  char_frag_info->unichar_id).c_str());
361  }
362  } else {
363  if (debug) tprintf("Record fragment continuation\n");
364  char_frag_info->fragment = this_fragment;
365  }
366  // Update certainty and rating.
367  char_frag_info->rating =
368  prev_char_frag_info->rating + curr_rating;
369  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
370  char_frag_info->certainty =
371  std::min(curr_certainty, prev_char_frag_info->certainty);
372  } else {
373  if (this_fragment->is_beginning()) {
374  if (debug) tprintf("Record fragment beginning\n");
375  } else {
376  if (debug) {
377  tprintf("Non-starting fragment piece with no prev_fragment\n");
378  }
379  return false;
380  }
381  }
382  }
383  if (word_ending && char_frag_info->fragment) {
384  if (debug) tprintf("Word can not end with a fragment\n");
385  return false;
386  }
387  return true;
388 }
389 
390 } // namespace tesseract
tesseract::Dict::fragment_state_okay
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:328
MAX_WERD_LENGTH
#define MAX_WERD_LENGTH
Definition: dict.h:39
CHAR_FRAGMENT_INFO::num_fragments
int num_fragments
Definition: dict.h:46
tesseract::Dict::max_permuter_attempts
int max_permuter_attempts
Definition: dict.h:658
dict.h
tesseract::Dict::go_deeper_fxn_
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
CHAR_FRAGMENT_INFO::fragment
const CHAR_FRAGMENT * fragment
Definition: dict.h:45
tesseract::Dict::output_ambig_words_file
char * output_ambig_words_file
Definition: dict.h:620
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
CHAR_FRAGMENT::is_continuation_of
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
WERD_CHOICE
Definition: ratngs.h:261
CHAR_FRAGMENT::to_string
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:1044
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Dict::dawg_permute_and_select
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:182
WERD_CHOICE::set_certainty
void set_certainty(float new_val)
Definition: ratngs.h:360
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
params.h
CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:70
WERD_CHOICE::make_bad
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:431
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
NO_PERM
Definition: ratngs.h:231
STRING
Definition: strngs.h:45
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::DawgArgs
Definition: dict.h:80
stopper.h
WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:440
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:86
tesseract::Dict::go_deeper_dawg_fxn
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:58
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
dawg.h
tesseract::Dict::update_best_choice
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
tesseract::Dict::permute_choices
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:211
tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:85
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
CHAR_FRAGMENT::is_ending
bool is_ending() const
Definition: unicharset.h:108
tesseract::Dict::dawg_debug_level
int dawg_debug_level
Definition: dict.h:622
WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:357
tesseract
Definition: baseapi.h:65
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
tprintf.h
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::DawgPositionVector
Definition: dawg.h:373
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
GenericVector
Definition: baseapi.h:40
CHAR_FRAGMENT_INFO::certainty
float certainty
Definition: dict.h:48
WERD_CHOICE::remove_last_unichar_id
void remove_last_unichar_id()
Definition: ratngs.h:471
tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
CHAR_FRAGMENT
Definition: unicharset.h:48
CHAR_FRAGMENT_INFO::unichar_id
UNICHAR_ID unichar_id
Definition: dict.h:44
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
BLOB_CHOICE
Definition: ratngs.h:49
CHAR_FRAGMENT_INFO
Definition: dict.h:43
GenericVector::get
T & get(int index) const
Definition: genericvector.h:716
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
tesseract::Dict::append_choices
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:253
tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:84
CHAR_FRAGMENT_INFO::rating
float rating
Definition: dict.h:47
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
CHAR_FRAGMENT::is_beginning
bool is_beginning() const
Definition: unicharset.h:105
GenericVector::size
int size() const
Definition: genericvector.h:71
WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:363
WERD_CHOICE::append_unichar_id
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:470
tesseract::Dict::letter_is_okay_
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372