tesseract  4.0.0-1-g2a2b
dict.cpp
Go to the documentation of this file.
1 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <cstdio>
20 
21 #include "dict.h"
22 #include "unicodes.h"
23 
24 #include "tprintf.h"
25 
26 namespace tesseract {
27 
28 class Image;
29 
31  : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
32  probability_in_context_(&tesseract::Dict::def_probability_in_context),
33  params_model_classify_(nullptr),
34  ccutil_(ccutil),
35  wildcard_unichar_id_(INVALID_UNICHAR_ID),
36  apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37  question_unichar_id_(INVALID_UNICHAR_ID),
38  slash_unichar_id_(INVALID_UNICHAR_ID),
39  hyphen_unichar_id_(INVALID_UNICHAR_ID),
40  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41  getCCUtil()->params()),
42  STRING_INIT_MEMBER(user_words_suffix, "",
43  "A suffix of user-provided words located in tessdata.",
44  getCCUtil()->params()),
45  STRING_MEMBER(user_patterns_file, "",
46  "A filename of user-provided patterns.",
47  getCCUtil()->params()),
48  STRING_INIT_MEMBER(user_patterns_suffix, "",
49  "A suffix of user-provided patterns located in "
50  "tessdata.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55  getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57  getCCUtil()->params()),
58  BOOL_INIT_MEMBER(load_punc_dawg, true,
59  "Load dawg with punctuation"
60  " patterns.",
61  getCCUtil()->params()),
62  BOOL_INIT_MEMBER(load_number_dawg, true,
63  "Load dawg with number"
64  " patterns.",
65  getCCUtil()->params()),
66  BOOL_INIT_MEMBER(load_bigram_dawg, true,
67  "Load dawg with special word "
68  "bigrams.",
69  getCCUtil()->params()),
70  double_MEMBER(xheight_penalty_subscripts, 0.125,
71  "Score penalty (0.1 = 10%) added if there are subscripts "
72  "or superscripts in a word, but it is otherwise OK.",
73  getCCUtil()->params()),
74  double_MEMBER(xheight_penalty_inconsistent, 0.25,
75  "Score penalty (0.1 = 10%) added if an xheight is "
76  "inconsistent.",
77  getCCUtil()->params()),
78  double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
79  "Score multiplier for word matches which have good case and"
80  " are frequent in the given language (lower is better).",
81  getCCUtil()->params()),
82  double_MEMBER(segment_penalty_dict_case_ok, 1.1,
83  "Score multiplier for word matches that have good case "
84  "(lower is better).",
85  getCCUtil()->params()),
86  double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
87  "Default score multiplier for word matches, which may have "
88  "case issues (lower is better).",
89  getCCUtil()->params()),
90  double_MEMBER(segment_penalty_dict_nonword, 1.25,
91  "Score multiplier for glyph fragment segmentations which "
92  "do not match a dictionary word (lower is better).",
93  getCCUtil()->params()),
94  double_MEMBER(segment_penalty_garbage, 1.50,
95  "Score multiplier for poorly cased strings that are not in"
96  " the dictionary and generally look like garbage (lower is"
97  " better).",
98  getCCUtil()->params()),
99  STRING_MEMBER(output_ambig_words_file, "",
100  "Output file for ambiguities found in the dictionary",
101  getCCUtil()->params()),
102  INT_MEMBER(dawg_debug_level, 0,
103  "Set to 1 for general debug info"
104  ", to 2 for more details, to 3 to see all the debug messages",
105  getCCUtil()->params()),
106  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107  getCCUtil()->params()),
108  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
109  getCCUtil()->params()),
110  BOOL_MEMBER(use_only_first_uft8_step, false,
111  "Use only the first UTF8 step of the given string"
112  " when computing log probabilities.",
113  getCCUtil()->params()),
114  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
115  getCCUtil()->params()),
116  double_MEMBER(stopper_nondict_certainty_base, -2.50,
117  "Certainty threshold for non-dict words",
118  getCCUtil()->params()),
119  double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
120  "Reject certainty offset", getCCUtil()->params()),
121  INT_MEMBER(stopper_smallword_size, 2,
122  "Size of dict word to be treated as non-dict word",
123  getCCUtil()->params()),
124  double_MEMBER(stopper_certainty_per_char, -0.50,
125  "Certainty to add"
126  " for each dict char above small word size.",
127  getCCUtil()->params()),
128  double_MEMBER(stopper_allowable_character_badness, 3.0,
129  "Max certaintly variation allowed in a word (in sigma)",
130  getCCUtil()->params()),
131  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
132  getCCUtil()->params()),
133  BOOL_MEMBER(stopper_no_acceptable_choices, false,
134  "Make AcceptableChoice() always return false. Useful"
135  " when there is a need to explore all segmentations",
136  getCCUtil()->params()),
137  INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
138  "Max words to keep in list", getCCUtil()->params()),
139  STRING_MEMBER(word_to_debug, "",
140  "Word for which stopper debug"
141  " information should be printed to stdout",
142  getCCUtil()->params()),
143  STRING_MEMBER(word_to_debug_lengths, "",
144  "Lengths of unichars in word_to_debug",
145  getCCUtil()->params()),
146  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
147  getCCUtil()->params()),
148  BOOL_MEMBER(segment_nonalphabetic_script, false,
149  "Don't use any alphabetic-specific tricks."
150  " Set to true in the traineddata config file for"
151  " scripts that are cursive or inherently fixed-pitch",
152  getCCUtil()->params()),
153  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
154  getCCUtil()->params()),
155  double_MEMBER(doc_dict_pending_threshold, 0.0,
156  "Worst certainty for using pending dictionary",
157  getCCUtil()->params()),
158  double_MEMBER(doc_dict_certainty_threshold, -2.25,
159  "Worst certainty for words that can be inserted into the"
160  " document dictionary",
161  getCCUtil()->params()),
162  INT_MEMBER(max_permuter_attempts, 10000,
163  "Maximum number of different"
164  " character choices to consider during permutation."
165  " This limit is especially useful when user patterns"
166  " are specified, since overly generic patterns can result in"
167  " dawg search exploring an overly large number of options.",
168  getCCUtil()->params()) {
169  dang_ambigs_table_ = nullptr;
170  replace_ambigs_table_ = nullptr;
171  reject_offset_ = 0.0;
172  go_deeper_fxn_ = nullptr;
173  hyphen_word_ = nullptr;
174  last_word_on_line_ = false;
175  document_words_ = nullptr;
176  dawg_cache_ = nullptr;
177  dawg_cache_is_ours_ = false;
178  pending_words_ = nullptr;
179  bigram_dawg_ = nullptr;
180  freq_dawg_ = nullptr;
181  punc_dawg_ = nullptr;
182  unambig_dawg_ = nullptr;
183  wordseg_rating_adjust_factor_ = -1.0f;
184  output_ambig_words_file_ = nullptr;
185 }
186 
188  End();
189  delete hyphen_word_;
190  if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
191 }
192 
194  // This global cache (a singleton) will outlive every Tesseract instance
195  // (even those that someone else might declare as global statics).
196  static DawgCache cache;
197  return &cache;
198 }
199 
200 // Sets up ready for a Load or LoadLSTM.
201 void Dict::SetupForLoad(DawgCache *dawg_cache) {
202  if (dawgs_.length() != 0) this->End();
203 
204  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
205  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
206  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
207  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
208 
209  if (dawg_cache != nullptr) {
210  dawg_cache_ = dawg_cache;
211  dawg_cache_is_ours_ = false;
212  } else {
213  dawg_cache_ = new DawgCache();
214  dawg_cache_is_ours_ = true;
215  }
216 }
217 
218 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
219 void Dict::Load(const STRING &lang, TessdataManager *data_file) {
220  // Load dawgs_.
221  if (load_punc_dawg) {
222  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
223  dawg_debug_level, data_file);
224  if (punc_dawg_) dawgs_ += punc_dawg_;
225  }
226  if (load_system_dawg) {
227  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
228  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
229  if (system_dawg) dawgs_ += system_dawg;
230  }
231  if (load_number_dawg) {
232  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
233  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
234  if (number_dawg) dawgs_ += number_dawg;
235  }
236  if (load_bigram_dawg) {
237  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
238  dawg_debug_level, data_file);
239  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
240  // dawgs_!!
241  }
242  if (load_freq_dawg) {
243  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
244  dawg_debug_level, data_file);
245  if (freq_dawg_) dawgs_ += freq_dawg_;
246  }
247  if (load_unambig_dawg) {
248  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
249  dawg_debug_level, data_file);
250  if (unambig_dawg_) dawgs_ += unambig_dawg_;
251  }
252 
253  STRING name;
254  if (((STRING &)user_words_suffix).length() > 0 ||
255  ((STRING &)user_words_file).length() > 0) {
256  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
257  getUnicharset().size(), dawg_debug_level);
258  if (((STRING &)user_words_file).length() > 0) {
259  name = user_words_file;
260  } else {
262  name += user_words_suffix;
263  }
264  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
266  tprintf("Error: failed to load %s\n", name.string());
267  delete trie_ptr;
268  } else {
269  dawgs_ += trie_ptr;
270  }
271  }
272 
273  if (((STRING &)user_patterns_suffix).length() > 0 ||
274  ((STRING &)user_patterns_file).length() > 0) {
275  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
276  getUnicharset().size(), dawg_debug_level);
277  trie_ptr->initialize_patterns(&(getUnicharset()));
278  if (((STRING &)user_patterns_file).length() > 0) {
279  name = user_patterns_file;
280  } else {
282  name += user_patterns_suffix;
283  }
284  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
285  tprintf("Error: failed to load %s\n", name.string());
286  delete trie_ptr;
287  } else {
288  dawgs_ += trie_ptr;
289  }
290  }
291 
292  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
293  getUnicharset().size(), dawg_debug_level);
294  dawgs_ += document_words_;
295 
296  // This dawg is temporary and should not be searched by letter_is_ok.
297  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
298  getUnicharset().size(), dawg_debug_level);
299 }
300 
301 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
302 void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
303  // Load dawgs_.
304  if (load_punc_dawg) {
305  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
306  dawg_debug_level, data_file);
307  if (punc_dawg_) dawgs_ += punc_dawg_;
308  }
309  if (load_system_dawg) {
310  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
311  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
312  if (system_dawg) dawgs_ += system_dawg;
313  }
314  if (load_number_dawg) {
315  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
316  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
317  if (number_dawg) dawgs_ += number_dawg;
318  }
319 }
320 
321 // Completes the loading process after Load() and/or LoadLSTM().
322 // Returns false if no dictionaries were loaded.
324  if (dawgs_.empty()) return false;
325  // Construct a list of corresponding successors for each dawg. Each entry, i,
326  // in the successors_ vector is a vector of integers that represent the
327  // indices into the dawgs_ vector of the successors for dawg i.
328  successors_.reserve(dawgs_.length());
329  for (int i = 0; i < dawgs_.length(); ++i) {
330  const Dawg *dawg = dawgs_[i];
331  SuccessorList *lst = new SuccessorList();
332  for (int j = 0; j < dawgs_.length(); ++j) {
333  const Dawg *other = dawgs_[j];
334  if (dawg != nullptr && other != nullptr &&
335  (dawg->lang() == other->lang()) &&
336  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
337  }
338  successors_ += lst;
339  }
340  return true;
341 }
342 
343 void Dict::End() {
344  if (dawgs_.length() == 0)
345  return; // Not safe to call twice.
346  for (int i = 0; i < dawgs_.size(); i++) {
347  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
348  delete dawgs_[i];
349  }
350  }
351  dawg_cache_->FreeDawg(bigram_dawg_);
352  if (dawg_cache_is_ours_) {
353  delete dawg_cache_;
354  dawg_cache_ = nullptr;
355  }
356  successors_.delete_data_pointers();
357  dawgs_.clear();
358  successors_.clear();
359  document_words_ = nullptr;
360  delete pending_words_;
361  pending_words_ = nullptr;
362 }
363 
364 // Returns true if in light of the current state unichar_id is allowed
365 // according to at least one of the dawgs in the dawgs_ vector.
366 // See more extensive comments in dict.h where this function is declared.
367 int Dict::def_letter_is_okay(void* void_dawg_args,
368  const UNICHARSET& unicharset,
369  UNICHAR_ID unichar_id,
370  bool word_end) const {
371  DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
372 
373  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
374 
375  if (dawg_debug_level >= 3) {
376  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
377  " num active dawgs=%d\n",
378  getUnicharset().debug_str(unichar_id).string(), word_end,
379  dawg_args->active_dawgs->length());
380  }
381 
382  // Do not accept words that contain kPatternUnicharID.
383  // (otherwise pattern dawgs would not function correctly).
384  // Do not accept words containing INVALID_UNICHAR_IDs.
385  if (unichar_id == Dawg::kPatternUnicharID ||
386  unichar_id == INVALID_UNICHAR_ID) {
387  dawg_args->permuter = NO_PERM;
388  return NO_PERM;
389  }
390 
391  // Initialization.
392  PermuterType curr_perm = NO_PERM;
393  dawg_args->updated_dawgs->clear();
394  dawg_args->valid_end = false;
395 
396  // Go over the active_dawgs vector and insert DawgPosition records
397  // with the updated ref (an edge with the corresponding unichar id) into
398  // dawg_args->updated_pos.
399  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
400  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
401  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
402  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
403 
404  if (!dawg && !punc_dawg) {
405  // shouldn't happen.
406  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
407  continue;
408  }
409  if (!dawg) {
410  // We're in the punctuation dawg. A core dawg has not been chosen.
411  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
412  EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
413  punc_node, Dawg::kPatternUnicharID, word_end);
414  if (punc_transition_edge != NO_EDGE) {
415  // Find all successors, and see which can transition.
416  const SuccessorList &slist = *(successors_[pos.punc_index]);
417  for (int s = 0; s < slist.length(); ++s) {
418  int sdawg_index = slist[s];
419  const Dawg *sdawg = dawgs_[sdawg_index];
420  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
421  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
422  if (dawg_edge != NO_EDGE) {
423  if (dawg_debug_level >=3) {
424  tprintf("Letter found in dawg %d\n", sdawg_index);
425  }
426  dawg_args->updated_dawgs->add_unique(
427  DawgPosition(sdawg_index, dawg_edge,
428  pos.punc_index, punc_transition_edge, false),
429  dawg_debug_level > 0,
430  "Append transition from punc dawg to current dawgs: ");
431  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
432  if (sdawg->end_of_word(dawg_edge) &&
433  punc_dawg->end_of_word(punc_transition_edge))
434  dawg_args->valid_end = true;
435  }
436  }
437  }
438  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
439  word_end);
440  if (punc_edge != NO_EDGE) {
441  if (dawg_debug_level >=3) {
442  tprintf("Letter found in punctuation dawg\n");
443  }
444  dawg_args->updated_dawgs->add_unique(
445  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
446  dawg_debug_level > 0,
447  "Extend punctuation dawg: ");
448  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
449  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
450  }
451  continue;
452  }
453 
454  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
455  // We can end the main word here.
456  // If we can continue on the punc ref, add that possibility.
457  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
458  EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
459  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
460  if (punc_edge != NO_EDGE) {
461  dawg_args->updated_dawgs->add_unique(
463  pos.punc_index, punc_edge, true),
464  dawg_debug_level > 0,
465  "Return to punctuation dawg: ");
466  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
467  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
468  }
469  }
470 
471  if (pos.back_to_punc) continue;
472 
473  // If we are dealing with the pattern dawg, look up all the
474  // possible edges, not only for the exact unichar_id, but also
475  // for all its character classes (alpha, digit, etc).
476  if (dawg->type() == DAWG_TYPE_PATTERN) {
477  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
478  &curr_perm);
479  // There can't be any successors to dawg that is of type
480  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
481  continue;
482  }
483 
484  // Find the edge out of the node for the unichar_id.
485  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
486  EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
487  : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
488  word_end);
489 
490  if (dawg_debug_level >= 3) {
491  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
492  pos.dawg_index, node, edge);
493  }
494 
495  if (edge != NO_EDGE) { // the unichar was found in the current dawg
496  if (dawg_debug_level >=3) {
497  tprintf("Letter found in dawg %d\n", pos.dawg_index);
498  }
499  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
500  if (dawg_debug_level >= 3) {
501  tprintf("Punctuation constraint not satisfied at end of word.\n");
502  }
503  continue;
504  }
505  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
506  if (dawg->end_of_word(edge) &&
507  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
508  dawg_args->valid_end = true;
509  dawg_args->updated_dawgs->add_unique(
510  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
511  false),
512  dawg_debug_level > 0,
513  "Append current dawg to updated active dawgs: ");
514  }
515  } // end for
516  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
517  // or if we found the current letter in a non-punctuation dawg. This
518  // allows preserving information on which dawg the "core" word came from.
519  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
520  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
521  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
522  dawg_args->permuter = curr_perm;
523  }
524  if (dawg_debug_level >= 2) {
525  tprintf("Returning %d for permuter code for this character.\n",
526  dawg_args->permuter);
527  }
528  return dawg_args->permuter;
529 }
530 
531 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
532  UNICHAR_ID unichar_id, bool word_end,
533  DawgArgs *dawg_args,
534  PermuterType *curr_perm) const {
535  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
536  // Try to find the edge corresponding to the exact unichar_id and to all the
537  // edges corresponding to the character class of unichar_id.
538  GenericVector<UNICHAR_ID> unichar_id_patterns;
539  unichar_id_patterns.push_back(unichar_id);
540  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
541  &unichar_id_patterns);
542  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
543  // On the first iteration check all the outgoing edges.
544  // On the second iteration check all self-loops.
545  for (int k = 0; k < 2; ++k) {
546  EDGE_REF edge = (k == 0)
547  ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
548  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
549  if (edge == NO_EDGE) continue;
550  if (dawg_debug_level >= 3) {
551  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
552  pos.dawg_index, node, edge);
553  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
554  }
555  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
556  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
557  dawg_args->updated_dawgs->add_unique(
558  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
559  pos.back_to_punc),
560  dawg_debug_level > 0,
561  "Append current dawg to updated active dawgs: ");
562  }
563  }
564 }
565 
566 // Fill the given active_dawgs vector with dawgs that could contain the
567 // beginning of the word. If hyphenated() returns true, copy the entries
568 // from hyphen_active_dawgs_ instead.
570  bool ambigs_mode) const {
571  int i;
572  if (hyphenated()) {
573  *active_dawgs = hyphen_active_dawgs_;
574  if (dawg_debug_level >= 3) {
575  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
576  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
577  hyphen_active_dawgs_[i].dawg_index,
578  hyphen_active_dawgs_[i].dawg_ref);
579  }
580  }
581  } else {
582  default_dawgs(active_dawgs, ambigs_mode);
583  }
584 }
585 
587  bool suppress_patterns) const {
588  bool punc_dawg_available =
589  (punc_dawg_ != nullptr) &&
590  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
591 
592  for (int i = 0; i < dawgs_.length(); i++) {
593  if (dawgs_[i] != nullptr &&
594  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
595  int dawg_ty = dawgs_[i]->type();
596  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
597  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
598  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
599  if (dawg_debug_level >= 3) {
600  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
601  NO_EDGE);
602  }
603  } else if (!punc_dawg_available || !subsumed_by_punc) {
604  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
605  if (dawg_debug_level >= 3) {
606  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
607  }
608  }
609  }
610  }
611 }
612 
613 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
614  // Do not add hyphenated word parts to the document dawg.
615  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
616  // called when the first part of the hyphenated word is
617  // discovered and while the second part of the word is recognized.
618  // hyphen_word_ is cleared in cc_recg() before the next word on
619  // the line is recognized.
620  if (hyphen_word_) return;
621 
622  int stringlen = best_choice.length();
623 
624  if (valid_word(best_choice) || stringlen < 2)
625  return;
626 
627  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
628  if (best_choice.length() >= kDocDictMaxRepChars) {
629  int num_rep_chars = 1;
630  UNICHAR_ID uch_id = best_choice.unichar_id(0);
631  for (int i = 1; i < best_choice.length(); ++i) {
632  if (best_choice.unichar_id(i) != uch_id) {
633  num_rep_chars = 1;
634  uch_id = best_choice.unichar_id(i);
635  } else {
636  ++num_rep_chars;
637  if (num_rep_chars == kDocDictMaxRepChars) return;
638  }
639  }
640  }
641 
642  if (best_choice.certainty() < doc_dict_certainty_threshold ||
643  stringlen == 2) {
644  if (best_choice.certainty() < doc_dict_pending_threshold)
645  return;
646 
647  if (!pending_words_->word_in_dawg(best_choice)) {
648  if (stringlen > 2 ||
649  (stringlen == 2 &&
650  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
651  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
652  pending_words_->add_word_to_dawg(best_choice);
653  }
654  return;
655  }
656  }
657 
658  if (save_doc_words) {
659  STRING filename(getCCUtil()->imagefile);
660  filename += ".doc";
661  FILE *doc_word_file = fopen(filename.string(), "a");
662  if (doc_word_file == nullptr) {
663  tprintf("Error: Could not open file %s\n", filename.string());
664  ASSERT_HOST(doc_word_file);
665  }
666  fprintf(doc_word_file, "%s\n",
667  best_choice.debug_string().string());
668  fclose(doc_word_file);
669  }
670  document_words_->add_word_to_dawg(best_choice);
671 }
672 
674  bool nonword,
675  XHeightConsistencyEnum xheight_consistency,
676  float additional_adjust,
677  bool modify_rating,
678  bool debug) {
679  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
680  word->GetTopScriptID() == getUnicharset().han_sid());
681  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
682  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
683 
684  float adjust_factor = additional_adjust;
685  float new_rating = word->rating();
686  new_rating += kRatingPad;
687  const char *xheight_triggered = "";
688  if (word->length() > 1) {
689  // Calculate x-height and y-offset consistency penalties.
690  switch (xheight_consistency) {
691  case XH_INCONSISTENT:
692  adjust_factor += xheight_penalty_inconsistent;
693  xheight_triggered = ", xhtBAD";
694  break;
695  case XH_SUBNORMAL:
696  adjust_factor += xheight_penalty_subscripts;
697  xheight_triggered = ", xhtSUB";
698  break;
699  case XH_GOOD:
700  // leave the factor alone - all good!
701  break;
702  }
703  // TODO(eger): if nonword is true, but there is a "core" that is a dict
704  // word, negate nonword status.
705  } else {
706  if (debug) {
707  tprintf("Consistency could not be calculated.\n");
708  }
709  }
710  if (debug) {
711  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
712  word->unichar_string().string(), word->rating(),
713  xheight_triggered);
714  }
715 
716  if (nonword) { // non-dictionary word
717  if (case_is_ok && punc_is_ok) {
718  adjust_factor += segment_penalty_dict_nonword;
719  new_rating *= adjust_factor;
720  if (debug) tprintf(", W");
721  } else {
722  adjust_factor += segment_penalty_garbage;
723  new_rating *= adjust_factor;
724  if (debug) {
725  if (!case_is_ok) tprintf(", C");
726  if (!punc_is_ok) tprintf(", P");
727  }
728  }
729  } else { // dictionary word
730  if (case_is_ok) {
731  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
733  adjust_factor += segment_penalty_dict_frequent_word;
734  new_rating *= adjust_factor;
735  if (debug) tprintf(", F");
736  } else {
737  adjust_factor += segment_penalty_dict_case_ok;
738  new_rating *= adjust_factor;
739  if (debug) tprintf(", ");
740  }
741  } else {
742  adjust_factor += segment_penalty_dict_case_bad;
743  new_rating *= adjust_factor;
744  if (debug) tprintf(", C");
745  }
746  }
747  new_rating -= kRatingPad;
748  if (modify_rating) word->set_rating(new_rating);
749  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
750  word->set_adjust_factor(adjust_factor);
751 }
752 
753 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
754  const WERD_CHOICE *word_ptr = &word;
755  WERD_CHOICE temp_word(word.unicharset());
756  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
757  copy_hyphen_info(&temp_word);
758  temp_word += word;
759  word_ptr = &temp_word;
760  }
761  if (word_ptr->length() == 0) return NO_PERM;
762  // Allocate vectors for holding current and updated
763  // active_dawgs and initialize them.
764  DawgPositionVector *active_dawgs = new DawgPositionVector[2];
765  init_active_dawgs(&(active_dawgs[0]), false);
766  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
767  int last_index = word_ptr->length() - 1;
768  // Call letter_is_okay for each letter in the word.
769  for (int i = hyphen_base_size(); i <= last_index; ++i) {
770  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
771  word_ptr->unichar_id(i),
772  i == last_index))) break;
773  // Swap active_dawgs, constraints with the corresponding updated vector.
774  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
775  dawg_args.updated_dawgs = &(active_dawgs[0]);
776  ++(dawg_args.active_dawgs);
777  } else {
778  ++(dawg_args.updated_dawgs);
779  dawg_args.active_dawgs = &(active_dawgs[0]);
780  }
781  }
782  delete[] active_dawgs;
783  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
784  dawg_args.permuter : NO_PERM;
785 }
786 
787 bool Dict::valid_bigram(const WERD_CHOICE &word1,
788  const WERD_CHOICE &word2) const {
789  if (bigram_dawg_ == nullptr) return false;
790 
791  // Extract the core word from the middle of each word with any digits
792  // replaced with question marks.
793  int w1start, w1end, w2start, w2end;
794  word1.punct_stripped(&w1start, &w1end);
795  word2.punct_stripped(&w2start, &w2end);
796 
797  // We don't want to penalize a single guillemet, hyphen, etc.
798  // But our bigram list doesn't have any information about punctuation.
799  if (w1start >= w1end) return word1.length() < 3;
800  if (w2start >= w2end) return word2.length() < 3;
801 
802  const UNICHARSET& uchset = getUnicharset();
803  GenericVector<UNICHAR_ID> bigram_string;
804  bigram_string.reserve(w1end + w2end + 1);
805  for (int i = w1start; i < w1end; i++) {
806  const GenericVector<UNICHAR_ID>& normed_ids =
807  getUnicharset().normed_ids(word1.unichar_id(i));
808  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
809  bigram_string.push_back(question_unichar_id_);
810  else
811  bigram_string += normed_ids;
812  }
813  bigram_string.push_back(UNICHAR_SPACE);
814  for (int i = w2start; i < w2end; i++) {
815  const GenericVector<UNICHAR_ID>& normed_ids =
816  getUnicharset().normed_ids(word2.unichar_id(i));
817  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
818  bigram_string.push_back(question_unichar_id_);
819  else
820  bigram_string += normed_ids;
821  }
822  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
823  for (int i = 0; i < bigram_string.size(); ++i) {
824  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
825  0.0f, 0.0f);
826  }
827  return bigram_dawg_->word_in_dawg(normalized_word);
828 }
829 
831  if (word.length() == 0) return NO_PERM;
832  int i;
833  WERD_CHOICE new_word(word.unicharset());
834  int last_index = word.length() - 1;
835  int new_len = 0;
836  for (i = 0; i <= last_index; ++i) {
837  UNICHAR_ID unichar_id = (word.unichar_id(i));
838  if (getUnicharset().get_ispunctuation(unichar_id)) {
839  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
840  } else if (!getUnicharset().get_isalpha(unichar_id) &&
841  !getUnicharset().get_isdigit(unichar_id)) {
842  return false; // neither punc, nor alpha, nor digit
843  } else if ((new_len = new_word.length()) == 0 ||
844  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
845  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
846  }
847  }
848  for (i = 0; i < dawgs_.size(); ++i) {
849  if (dawgs_[i] != nullptr &&
850  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
851  dawgs_[i]->word_in_dawg(new_word)) return true;
852  }
853  return false;
854 }
855 
858  const UNICHARSET &u_set = getUnicharset();
859  if (u_set.han_sid() > 0) return false;
860  if (u_set.katakana_sid() > 0) return false;
861  if (u_set.thai_sid() > 0) return false;
862  return true;
863 }
864 
865 } // namespace tesseract
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
void End()
Definition: dict.cpp:343
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:342
PermuterType permuter
Definition: dict.h:83
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
double segment_penalty_dict_case_ok
Definition: dict.h:588
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:294
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:306
const CCUtil * getCCUtil() const
Definition: dict.h:92
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:185
bool valid_end
Definition: dict.h:85
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:673
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
double segment_penalty_dict_frequent_word
Definition: dict.h:584
const char * string() const
Definition: strngs.cpp:196
double doc_dict_pending_threshold
Definition: dict.h:638
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
STRING language_data_path_prefix
Definition: ccutil.h:67
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:303
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:569
double segment_penalty_dict_case_bad
Definition: dict.h:592
void reserve(int size)
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:288
double segment_penalty_dict_nonword
Definition: dict.h:596
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:71
float rating() const
Definition: ratngs.h:327
bool load_number_dawg
Definition: dict.h:573
float certainty() const
Definition: ratngs.h:330
bool load_punc_dawg
Definition: dict.h:572
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:787
bool load_freq_dawg
Definition: dict.h:569
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:291
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:131
char * user_words_file
Definition: dict.h:561
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:389
#define REFFORMAT
Definition: dawg.h:93
EDGE_REF dawg_ref
Definition: dawg.h:374
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:174
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:423
double segment_penalty_garbage
Definition: dict.h:601
bool load_unambig_dawg
Definition: dict.h:570
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:297
int64_t EDGE_REF
Definition: dawg.h:55
int64_t NODE_REF
Definition: dawg.h:56
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:531
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
char * user_words_suffix
Definition: dict.h:563
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:286
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:205
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:383
DawgPositionVector * updated_dawgs
Definition: dict.h:82
int length() const
Definition: genericvector.h:85
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e. not Chinese/Japanese or Thai).
Definition: dict.cpp:857
bool load_bigram_dawg
Definition: dict.h:575
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
bool FinishLoad()
Definition: dict.cpp:323
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:196
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
XHeightConsistencyEnum
Definition: dict.h:75
int length() const
Definition: ratngs.h:303
int GetTopScriptID() const
Definition: ratngs.cpp:670
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:302
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:367
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
int push_back(T object)
double doc_dict_certainty_threshold
Definition: dict.h:640
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:830
double xheight_penalty_subscripts
Definition: dict.h:578
bool load_system_dawg
Definition: dict.h:568
GenericVector< int > SuccessorList
Definition: dawg.h:69
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:137
int dawg_debug_level
Definition: dict.h:605
Definition: strngs.h:45
int null_sid() const
Definition: unicharset.h:878
const STRING debug_string() const
Definition: ratngs.h:505
int han_sid() const
Definition: unicharset.h:883
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:127
void delete_data_pointers()
DawgPositionVector * active_dawgs
Definition: dict.h:81
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
const STRING & unichar_string() const
Definition: ratngs.h:541
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
const STRING & lang() const
Definition: dawg.h:129
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:613
char * user_patterns_file
Definition: dict.h:565
char * user_patterns_suffix
Definition: dict.h:567
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:586
int katakana_sid() const
Definition: unicharset.h:885
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
DawgType type() const
Definition: dawg.h:128
Dict(CCUtil *image_ptr)
Definition: dict.cpp:30
void set_adjust_factor(float factor)
Definition: ratngs.h:309
PermuterType permuter() const
Definition: dawg.h:130
double xheight_penalty_inconsistent
Definition: dict.h:581
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
CCUtil ccutil
PermuterType
Definition: ratngs.h:242
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:399
EDGE_REF punc_ref
Definition: dawg.h:376
int thai_sid() const
Definition: unicharset.h:886
virtual bool end_of_word(EDGE_REF edge_ref) const =0
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:357
bool save_doc_words
Definition: dict.h:636
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_rating(float new_val)
Definition: ratngs.h:369
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:433
void set_permuter(uint8_t perm)
Definition: ratngs.h:375