tesseract  4.0.0-1-g2a2b
permdawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permdawg.cpp (Formerly permdawg.c)
5  * Description: Scale word choices by a dictionary
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "dawg.h"
30 #include "globals.h"
31 #include "stopper.h"
32 #include "tprintf.h"
33 #include "params.h"
34 
35 #include <algorithm>
36 #include <cctype>
37 #include "dict.h"
38 
39 /*----------------------------------------------------------------------
40  F u n c t i o n s
41 ----------------------------------------------------------------------*/
42 namespace tesseract {
43 
51  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
52  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
53  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
54  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
55  DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args);
56  word_ending = (char_choice_index == char_choices.size()-1);
57  int word_index = word->length() - 1;
58  if (best_choice->rating() < *limit) return;
59  // Look up char in DAWG
60 
61  // If the current unichar is an ngram first try calling
62  // letter_is_okay() for each unigram it contains separately.
63  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
64  bool checked_unigrams = false;
65  if (getUnicharset().get_isngram(orig_uch_id)) {
66  if (dawg_debug_level) {
67  tprintf("checking unigrams in an ngram %s\n",
68  getUnicharset().debug_str(orig_uch_id).string());
69  }
70  int num_unigrams = 0;
71  word->remove_last_unichar_id();
73  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
74  // Since the string came out of the unicharset, failure is impossible.
75  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
76  nullptr));
77  bool unigrams_ok = true;
78  // Construct DawgArgs that reflect the current state.
79  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
80  DawgPositionVector unigram_updated_dawgs;
81  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
82  &unigram_updated_dawgs,
83  more_args->permuter);
84  // Check unigrams in the ngram with letter_is_okay().
85  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
86  UNICHAR_ID uch_id = encoding[i];
87  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
88  ++num_unigrams;
89  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
90  unigrams_ok = (this->*letter_is_okay_)(
91  &unigram_dawg_args, *word->unicharset(),
92  word->unichar_id(word_index+num_unigrams-1),
93  word_ending && i == encoding.size() - 1);
94  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
95  if (dawg_debug_level) {
96  tprintf("unigram %s is %s\n",
97  getUnicharset().debug_str(uch_id).string(),
98  unigrams_ok ? "OK" : "not OK");
99  }
100  }
101  // Restore the word and copy the updated dawg state if needed.
102  while (num_unigrams-- > 0) word->remove_last_unichar_id();
103  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
104  if (unigrams_ok) {
105  checked_unigrams = true;
106  more_args->permuter = unigram_dawg_args.permuter;
107  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
108  }
109  }
110 
111  // Check which dawgs from the dawgs_ vector contain the word
112  // up to and including the current unichar.
113  if (checked_unigrams || (this->*letter_is_okay_)(
114  more_args, *word->unicharset(), word->unichar_id(word_index),
115  word_ending)) {
116  // Add a new word choice
117  if (word_ending) {
118  if (dawg_debug_level) {
119  tprintf("found word = %s\n", word->debug_string().string());
120  }
121  if (strcmp(output_ambig_words_file.string(), "") != 0) {
122  if (output_ambig_words_file_ == nullptr) {
123  output_ambig_words_file_ =
124  fopen(output_ambig_words_file.string(), "wb+");
125  if (output_ambig_words_file_ == nullptr) {
126  tprintf("Failed to open output_ambig_words_file %s\n",
127  output_ambig_words_file.string());
128  exit(1);
129  }
130  STRING word_str;
131  word->string_and_lengths(&word_str, nullptr);
132  word_str += " ";
133  fprintf(output_ambig_words_file_, "%s", word_str.string());
134  }
135  STRING word_str;
136  word->string_and_lengths(&word_str, nullptr);
137  word_str += " ";
138  fprintf(output_ambig_words_file_, "%s", word_str.string());
139  }
140  WERD_CHOICE *adjusted_word = word;
141  adjusted_word->set_permuter(more_args->permuter);
142  update_best_choice(*adjusted_word, best_choice);
143  } else { // search the next letter
144  // Make updated_* point to the next entries in the DawgPositionVector
145  // arrays (that were originally created in dawg_permute_and_select)
146  ++(more_args->updated_dawgs);
147  // Make active_dawgs and constraints point to the updated ones.
148  ++(more_args->active_dawgs);
149  permute_choices(debug, char_choices, char_choice_index + 1,
150  prev_char_frag_info, word, certainties, limit,
151  best_choice, attempts_left, more_args);
152  // Restore previous state to explore another letter in this position.
153  --(more_args->updated_dawgs);
154  --(more_args->active_dawgs);
155  }
156  } else {
157  if (dawg_debug_level) {
158  tprintf("last unichar not OK at index %d in %s\n",
159  word_index, word->debug_string().string());
160  }
161  }
162 }
163 
164 
175  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
176  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
177  best_choice->make_bad();
178  best_choice->set_rating(rating_limit);
179  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
180  return best_choice;
181  DawgPositionVector *active_dawgs =
182  new DawgPositionVector[char_choices.length() + 1];
183  init_active_dawgs(&(active_dawgs[0]), true);
184  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
186 
187  float certainties[MAX_WERD_LENGTH];
189  int attempts_left = max_permuter_attempts;
190  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
191  char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
192  &attempts_left, &dawg_args);
193  delete[] active_dawgs;
194  return best_choice;
195 }
196 
204  const char *debug,
205  const BLOB_CHOICE_LIST_VECTOR &char_choices,
206  int char_choice_index,
207  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
208  WERD_CHOICE *word,
209  float certainties[],
210  float *limit,
211  WERD_CHOICE *best_choice,
212  int *attempts_left,
213  void *more_args) {
214  if (debug) {
215  tprintf("%s permute_choices: char_choice_index=%d"
216  " limit=%g rating=%g, certainty=%g word=%s\n",
217  debug, char_choice_index, *limit, word->rating(),
218  word->certainty(), word->debug_string().string());
219  }
220  if (char_choice_index < char_choices.length()) {
221  BLOB_CHOICE_IT blob_choice_it;
222  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
223  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
224  blob_choice_it.forward()) {
225  (*attempts_left)--;
226  append_choices(debug, char_choices, *(blob_choice_it.data()),
227  char_choice_index, prev_char_frag_info, word,
228  certainties, limit, best_choice, attempts_left, more_args);
229  if (*attempts_left <= 0) {
230  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
231  break;
232  }
233  }
234  }
235 }
236 
246  const char *debug,
247  const BLOB_CHOICE_LIST_VECTOR &char_choices,
248  const BLOB_CHOICE &blob_choice,
249  int char_choice_index,
250  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
251  WERD_CHOICE *word,
252  float certainties[],
253  float *limit,
254  WERD_CHOICE *best_choice,
255  int *attempts_left,
256  void *more_args) {
257  int word_ending = (char_choice_index == char_choices.length() - 1);
258 
259  // Deal with fragments.
260  CHAR_FRAGMENT_INFO char_frag_info;
261  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
262  blob_choice.certainty(), prev_char_frag_info, debug,
263  word_ending, &char_frag_info)) {
264  return; // blob_choice must be an invalid fragment
265  }
266  // Search the next letter if this character is a fragment.
267  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
268  permute_choices(debug, char_choices, char_choice_index + 1,
269  &char_frag_info, word, certainties, limit,
270  best_choice, attempts_left, more_args);
271  return;
272  }
273 
274  // Add the next unichar.
275  float old_rating = word->rating();
276  float old_certainty = word->certainty();
277  uint8_t old_permuter = word->permuter();
278  certainties[word->length()] = char_frag_info.certainty;
280  char_frag_info.unichar_id, char_frag_info.num_fragments,
281  char_frag_info.rating, char_frag_info.certainty);
282 
283  // Explore the next unichar.
284  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
285  &char_frag_info, word_ending, word, certainties,
286  limit, best_choice, attempts_left, more_args);
287 
288  // Remove the unichar we added to explore other choices in it's place.
289  word->remove_last_unichar_id();
290  word->set_rating(old_rating);
291  word->set_certainty(old_certainty);
292  word->set_permuter(old_permuter);
293 }
294 
321  float curr_rating, float curr_certainty,
322  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
323  const char *debug, int word_ending,
324  CHAR_FRAGMENT_INFO *char_frag_info) {
325  const CHAR_FRAGMENT *this_fragment =
326  getUnicharset().get_fragment(curr_unichar_id);
327  const CHAR_FRAGMENT *prev_fragment =
328  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
329 
330  // Print debug info for fragments.
331  if (debug && (prev_fragment || this_fragment)) {
332  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
333  getUnicharset().debug_str(curr_unichar_id).string(),
334  word_ending);
335  if (prev_fragment) {
336  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
337  }
338  if (this_fragment) {
339  tprintf("this_fragment %s\n", this_fragment->to_string().string());
340  }
341  }
342 
343  char_frag_info->unichar_id = curr_unichar_id;
344  char_frag_info->fragment = this_fragment;
345  char_frag_info->rating = curr_rating;
346  char_frag_info->certainty = curr_certainty;
347  char_frag_info->num_fragments = 1;
348  if (prev_fragment && !this_fragment) {
349  if (debug) tprintf("Skip choice with incomplete fragment\n");
350  return false;
351  }
352  if (this_fragment) {
353  // We are dealing with a fragment.
354  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
355  if (prev_fragment) {
356  if (!this_fragment->is_continuation_of(prev_fragment)) {
357  if (debug) tprintf("Non-matching fragment piece\n");
358  return false;
359  }
360  if (this_fragment->is_ending()) {
361  char_frag_info->unichar_id =
362  getUnicharset().unichar_to_id(this_fragment->get_unichar());
363  char_frag_info->fragment = nullptr;
364  if (debug) {
365  tprintf("Built character %s from fragments\n",
366  getUnicharset().debug_str(
367  char_frag_info->unichar_id).string());
368  }
369  } else {
370  if (debug) tprintf("Record fragment continuation\n");
371  char_frag_info->fragment = this_fragment;
372  }
373  // Update certainty and rating.
374  char_frag_info->rating =
375  prev_char_frag_info->rating + curr_rating;
376  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
377  char_frag_info->certainty =
378  std::min(curr_certainty, prev_char_frag_info->certainty);
379  } else {
380  if (this_fragment->is_beginning()) {
381  if (debug) tprintf("Record fragment beginning\n");
382  } else {
383  if (debug) {
384  tprintf("Non-starting fragment piece with no prev_fragment\n");
385  }
386  return false;
387  }
388  }
389  }
390  if (word_ending && char_frag_info->fragment) {
391  if (debug) tprintf("Word can not end with a fragment\n");
392  return false;
393  }
394  return true;
395 }
396 
397 } // namespace tesseract
bool is_beginning() const
Definition: unicharset.h:106
#define MAX_WERD_LENGTH
Definition: dict.h:36
PermuterType permuter
Definition: dict.h:83
float certainty() const
Definition: ratngs.h:83
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:320
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:99
float certainty
Definition: dict.h:45
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
const char * string() const
Definition: strngs.cpp:196
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
uint8_t permuter() const
Definition: ratngs.h:346
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:569
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
static STRING to_string(const char *unichar, int pos, int total, bool natural)
float rating() const
Definition: ratngs.h:327
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:171
float certainty() const
Definition: ratngs.h:330
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
const char * get_unichar() const
Definition: unicharset.h:71
char * output_ambig_words_file
Definition: dict.h:603
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
UNICHAR_ID unichar_id
Definition: dict.h:41
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:50
T & get(int index) const
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:205
DawgPositionVector * updated_dawgs
Definition: dict.h:82
int length() const
Definition: genericvector.h:85
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
bool is_ending() const
Definition: unicharset.h:109
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449
void remove_last_unichar_id()
Definition: ratngs.h:483
int num_fragments
Definition: dict.h:43
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:245
float rating() const
Definition: ratngs.h:80
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
int dawg_debug_level
Definition: dict.h:605
Definition: strngs.h:45
const STRING debug_string() const
Definition: ratngs.h:505
int max_permuter_attempts
Definition: dict.h:645
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
DawgPositionVector * active_dawgs
Definition: dict.h:81
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:174
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
void set_certainty(float new_val)
Definition: ratngs.h:372
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_rating(float new_val)
Definition: ratngs.h:369
float rating
Definition: dict.h:44
void set_permuter(uint8_t perm)
Definition: ratngs.h:375