tesseract  5.0.0-alpha-619-ge9db
dict.cpp
Go to the documentation of this file.
1 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <cstdio>
20 
21 #include "dict.h"
22 #include "unicodes.h"
23 
24 #include "tprintf.h"
25 
26 namespace tesseract {
27 
28 class Image;
29 
31  : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
32  probability_in_context_(&tesseract::Dict::def_probability_in_context),
33  params_model_classify_(nullptr),
34  ccutil_(ccutil),
35  wildcard_unichar_id_(INVALID_UNICHAR_ID),
36  apostrophe_unichar_id_(INVALID_UNICHAR_ID),
37  question_unichar_id_(INVALID_UNICHAR_ID),
38  slash_unichar_id_(INVALID_UNICHAR_ID),
39  hyphen_unichar_id_(INVALID_UNICHAR_ID),
40  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
41  getCCUtil()->params()),
42  STRING_INIT_MEMBER(user_words_suffix, "",
43  "A suffix of user-provided words located in tessdata.",
44  getCCUtil()->params()),
45  STRING_MEMBER(user_patterns_file, "",
46  "A filename of user-provided patterns.",
47  getCCUtil()->params()),
48  STRING_INIT_MEMBER(user_patterns_suffix, "",
49  "A suffix of user-provided patterns located in "
50  "tessdata.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
55  getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
57  getCCUtil()->params()),
58  BOOL_INIT_MEMBER(load_punc_dawg, true,
59  "Load dawg with punctuation"
60  " patterns.",
61  getCCUtil()->params()),
62  BOOL_INIT_MEMBER(load_number_dawg, true,
63  "Load dawg with number"
64  " patterns.",
65  getCCUtil()->params()),
66  BOOL_INIT_MEMBER(load_bigram_dawg, true,
67  "Load dawg with special word "
68  "bigrams.",
69  getCCUtil()->params()),
70  double_MEMBER(xheight_penalty_subscripts, 0.125,
71  "Score penalty (0.1 = 10%) added if there are subscripts "
72  "or superscripts in a word, but it is otherwise OK.",
73  getCCUtil()->params()),
74  double_MEMBER(xheight_penalty_inconsistent, 0.25,
75  "Score penalty (0.1 = 10%) added if an xheight is "
76  "inconsistent.",
77  getCCUtil()->params()),
78  double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
79  "Score multiplier for word matches which have good case and"
80  " are frequent in the given language (lower is better).",
81  getCCUtil()->params()),
82  double_MEMBER(segment_penalty_dict_case_ok, 1.1,
83  "Score multiplier for word matches that have good case "
84  "(lower is better).",
85  getCCUtil()->params()),
86  double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
87  "Default score multiplier for word matches, which may have "
88  "case issues (lower is better).",
89  getCCUtil()->params()),
90  double_MEMBER(segment_penalty_dict_nonword, 1.25,
91  "Score multiplier for glyph fragment segmentations which "
92  "do not match a dictionary word (lower is better).",
93  getCCUtil()->params()),
94  double_MEMBER(segment_penalty_garbage, 1.50,
95  "Score multiplier for poorly cased strings that are not in"
96  " the dictionary and generally look like garbage (lower is"
97  " better).",
98  getCCUtil()->params()),
99  STRING_MEMBER(output_ambig_words_file, "",
100  "Output file for ambiguities found in the dictionary",
101  getCCUtil()->params()),
102  INT_MEMBER(dawg_debug_level, 0,
103  "Set to 1 for general debug info"
104  ", to 2 for more details, to 3 to see all the debug messages",
105  getCCUtil()->params()),
106  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
107  getCCUtil()->params()),
108  BOOL_MEMBER(use_only_first_uft8_step, false,
109  "Use only the first UTF8 step of the given string"
110  " when computing log probabilities.",
111  getCCUtil()->params()),
112  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
113  getCCUtil()->params()),
114  double_MEMBER(stopper_nondict_certainty_base, -2.50,
115  "Certainty threshold for non-dict words",
116  getCCUtil()->params()),
117  double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
118  "Reject certainty offset", getCCUtil()->params()),
119  INT_MEMBER(stopper_smallword_size, 2,
120  "Size of dict word to be treated as non-dict word",
121  getCCUtil()->params()),
122  double_MEMBER(stopper_certainty_per_char, -0.50,
123  "Certainty to add"
124  " for each dict char above small word size.",
125  getCCUtil()->params()),
126  double_MEMBER(stopper_allowable_character_badness, 3.0,
127  "Max certaintly variation allowed in a word (in sigma)",
128  getCCUtil()->params()),
129  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
130  getCCUtil()->params()),
131  BOOL_MEMBER(stopper_no_acceptable_choices, false,
132  "Make AcceptableChoice() always return false. Useful"
133  " when there is a need to explore all segmentations",
134  getCCUtil()->params()),
135  INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
136  "Max words to keep in list", getCCUtil()->params()),
137  STRING_MEMBER(word_to_debug, "",
138  "Word for which stopper debug"
139  " information should be printed to stdout",
140  getCCUtil()->params()),
141  BOOL_MEMBER(segment_nonalphabetic_script, false,
142  "Don't use any alphabetic-specific tricks."
143  " Set to true in the traineddata config file for"
144  " scripts that are cursive or inherently fixed-pitch",
145  getCCUtil()->params()),
146  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
147  getCCUtil()->params()),
148  double_MEMBER(doc_dict_pending_threshold, 0.0,
149  "Worst certainty for using pending dictionary",
150  getCCUtil()->params()),
151  double_MEMBER(doc_dict_certainty_threshold, -2.25,
152  "Worst certainty for words that can be inserted into the"
153  " document dictionary",
154  getCCUtil()->params()),
155  INT_MEMBER(max_permuter_attempts, 10000,
156  "Maximum number of different"
157  " character choices to consider during permutation."
158  " This limit is especially useful when user patterns"
159  " are specified, since overly generic patterns can result in"
160  " dawg search exploring an overly large number of options.",
161  getCCUtil()->params()) {
162  reject_offset_ = 0.0;
163  go_deeper_fxn_ = nullptr;
164  hyphen_word_ = nullptr;
165  last_word_on_line_ = false;
166  document_words_ = nullptr;
167  dawg_cache_ = nullptr;
168  dawg_cache_is_ours_ = false;
169  pending_words_ = nullptr;
170  bigram_dawg_ = nullptr;
171  freq_dawg_ = nullptr;
172  punc_dawg_ = nullptr;
173  unambig_dawg_ = nullptr;
174  wordseg_rating_adjust_factor_ = -1.0f;
175  output_ambig_words_file_ = nullptr;
176 }
177 
179  End();
180  delete hyphen_word_;
181  if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
182 }
183 
185  // This global cache (a singleton) will outlive every Tesseract instance
186  // (even those that someone else might declare as global statics).
187  static DawgCache cache;
188  return &cache;
189 }
190 
191 // Sets up ready for a Load or LoadLSTM.
192 void Dict::SetupForLoad(DawgCache* dawg_cache) {
193  if (dawgs_.size() != 0) this->End();
194 
195  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
196  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
197  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
198  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
199 
200  if (dawg_cache != nullptr) {
201  dawg_cache_ = dawg_cache;
202  dawg_cache_is_ours_ = false;
203  } else {
204  dawg_cache_ = new DawgCache();
205  dawg_cache_is_ours_ = true;
206  }
207 }
208 
209 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
// Fetches, through dawg_cache_ (assigned by SetupForLoad()), the squished
// dawgs selected by the load_*_dawg parameters, then adds the optional user
// word trie, the optional user pattern trie and the document-word trie to
// dawgs_. The pending-words trie is created but not added to dawgs_.
//   lang: passed to every GetSquishedDawg() call and Trie constructor.
//   data_file: passed through to GetSquishedDawg().
210 void Dict::Load(const STRING& lang, TessdataManager* data_file) {
211  // Load dawgs_.
212  if (load_punc_dawg) {
213  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
214  dawg_debug_level, data_file);
215  if (punc_dawg_) dawgs_ += punc_dawg_;
216  }
217  if (load_system_dawg) {
218  Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
219  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
220  if (system_dawg) dawgs_ += system_dawg;
221  }
222  if (load_number_dawg) {
223  Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
224  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
225  if (number_dawg) dawgs_ += number_dawg;
226  }
227  if (load_bigram_dawg) {
228  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
229  dawg_debug_level, data_file);
230  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
231  // dawgs_!!
232  }
233  if (load_freq_dawg) {
234  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
235  dawg_debug_level, data_file);
236  if (freq_dawg_) dawgs_ += freq_dawg_;
237  }
238  if (load_unambig_dawg) {
239  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
240  dawg_debug_level, data_file);
241  if (unambig_dawg_) dawgs_ += unambig_dawg_;
242  }
243 
// User word list: an explicit user_words_file takes precedence over
// user_words_suffix.
244  STRING name;
245  if (!user_words_suffix.empty() || !user_words_file.empty()) {
246  Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
247  getUnicharset().size(), dawg_debug_level);
248  if (!user_words_file.empty()) {
249  name = user_words_file;
250  } else {
// NOTE(review): listing line 251 is absent from this rendering, so whatever
// seeds `name` before the suffix is appended is not visible here.
252  name += user_words_suffix;
253  }
254  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
// NOTE(review): listing line 255 (presumably the rest of this call) is absent
// from this rendering.
256  tprintf("Error: failed to load %s\n", name.c_str());
257  delete trie_ptr;
258  } else {
259  dawgs_ += trie_ptr;
260  }
261  }
262 
// User pattern list: same precedence rule, using the pattern parameters.
263  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264  Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
265  getUnicharset().size(), dawg_debug_level);
266  trie_ptr->initialize_patterns(&(getUnicharset()));
267  if (!user_patterns_file.empty()) {
268  name = user_patterns_file;
269  } else {
// NOTE(review): listing line 270 is absent from this rendering.
271  name += user_patterns_suffix;
272  }
273  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
274  tprintf("Error: failed to load %s\n", name.c_str());
275  delete trie_ptr;
276  } else {
277  dawgs_ += trie_ptr;
278  }
279  }
280 
// The document-word trie is always created and registered in dawgs_.
281  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
282  getUnicharset().size(), dawg_debug_level);
283  dawgs_ += document_words_;
284 
285  // This dawg is temporary and should not be searched by letter_is_okay.
286  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
287  getUnicharset().size(), dawg_debug_level);
288 }
289 
290 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
// Only the LSTM punctuation, system and number dawgs are requested from
// dawg_cache_ here; the optional user word and user pattern tries are added
// the same way Load() adds them. No document or pending trie is created.
//   lang: passed to every GetSquishedDawg() call and Trie constructor.
//   data_file: passed through to GetSquishedDawg().
291 void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) {
292  // Load dawgs_.
293  if (load_punc_dawg) {
294  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
295  dawg_debug_level, data_file);
296  if (punc_dawg_) dawgs_ += punc_dawg_;
297  }
298  if (load_system_dawg) {
299  Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
300  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
301  if (system_dawg) dawgs_ += system_dawg;
302  }
303  if (load_number_dawg) {
304  Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
305  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
306  if (number_dawg) dawgs_ += number_dawg;
307  }
308 
309  // stolen from Dict::Load (but needs params_ from Tesseract
310  // langdata/config/api):
311  STRING name;
312  if (!user_words_suffix.empty() || !user_words_file.empty()) {
313  Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
314  getUnicharset().size(), dawg_debug_level);
315  if (!user_words_file.empty()) {
316  name = user_words_file;
317  } else {
// NOTE(review): listing line 318 is absent from this rendering, so whatever
// seeds `name` before the suffix is appended is not visible here.
319  name += user_words_suffix;
320  }
321  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
// NOTE(review): listing line 322 (presumably the rest of this call) is absent
// from this rendering.
323  tprintf("Error: failed to load %s\n", name.c_str());
324  delete trie_ptr;
325  } else {
326  dawgs_ += trie_ptr;
327  }
328  }
329 
330  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
331  Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
332  getUnicharset().size(), dawg_debug_level);
333  trie_ptr->initialize_patterns(&(getUnicharset()));
334  if (!user_patterns_file.empty()) {
335  name = user_patterns_file;
336  } else {
// NOTE(review): listing line 337 is absent from this rendering.
338  name += user_patterns_suffix;
339  }
340  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
341  tprintf("Error: failed to load %s\n", name.c_str());
342  delete trie_ptr;
343  } else {
344  dawgs_ += trie_ptr;
345  }
346  }
347 }
348 
349 // Completes the loading process after Load() and/or LoadLSTM().
350 // Returns false if no dictionaries were loaded.
352  if (dawgs_.empty()) return false;
353  // Construct a list of corresponding successors for each dawg. Each entry, i,
354  // in the successors_ vector is a vector of integers that represent the
355  // indices into the dawgs_ vector of the successors for dawg i.
356  successors_.reserve(dawgs_.size());
357  for (int i = 0; i < dawgs_.size(); ++i) {
358  const Dawg* dawg = dawgs_[i];
359  auto* lst = new SuccessorList();
360  for (int j = 0; j < dawgs_.size(); ++j) {
361  const Dawg* other = dawgs_[j];
362  if (dawg != nullptr && other != nullptr &&
363  (dawg->lang() == other->lang()) &&
364  kDawgSuccessors[dawg->type()][other->type()])
365  *lst += j;
366  }
367  successors_ += lst;
368  }
369  return true;
370 }
371 
372 void Dict::End() {
373  if (dawgs_.size() == 0) return; // Not safe to call twice.
374  for (int i = 0; i < dawgs_.size(); i++) {
375  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
376  delete dawgs_[i];
377  }
378  }
379  dawg_cache_->FreeDawg(bigram_dawg_);
380  if (dawg_cache_is_ours_) {
381  delete dawg_cache_;
382  dawg_cache_ = nullptr;
383  }
384  successors_.delete_data_pointers();
385  dawgs_.clear();
386  successors_.clear();
387  document_words_ = nullptr;
388  delete pending_words_;
389  pending_words_ = nullptr;
390 }
391 
392 // Returns true if in light of the current state unichar_id is allowed
393 // according to at least one of the dawgs in the dawgs_ vector.
394 // See more extensive comments in dict.h where this function is declared.
//   void_dawg_args: cast to DawgArgs*; its active_dawgs are read, and its
//       updated_dawgs, valid_end and permuter are written.
//   unicharset: must contain unichar_id (checked with ASSERT_HOST); also
//       passed to char_for_dawg().
//   unichar_id: the letter to look up.
//   word_end: forwarded to every edge_char_of() lookup.
// The return value is the resulting dawg_args->permuter; NO_PERM is returned
// immediately for kPatternUnicharID and INVALID_UNICHAR_ID.
395 int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
396  UNICHAR_ID unichar_id, bool word_end) const {
397  auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
398 
399  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
400 
401  if (dawg_debug_level >= 3) {
402  tprintf(
403  "def_letter_is_okay: current unichar=%s word_end=%d"
404  " num active dawgs=%d\n",
405  getUnicharset().debug_str(unichar_id).c_str(), word_end,
406  dawg_args->active_dawgs->size());
407  }
408 
409  // Do not accept words that contain kPatternUnicharID.
410  // (otherwise pattern dawgs would not function correctly).
411  // Do not accept words containing INVALID_UNICHAR_IDs.
412  if (unichar_id == Dawg::kPatternUnicharID ||
413  unichar_id == INVALID_UNICHAR_ID) {
414  dawg_args->permuter = NO_PERM;
415  return NO_PERM;
416  }
417 
418  // Initialization.
419  PermuterType curr_perm = NO_PERM;
420  dawg_args->updated_dawgs->clear();
421  dawg_args->valid_end = false;
422 
423  // Go over the active_dawgs vector and insert DawgPosition records
424  // with the updated ref (an edge with the corresponding unichar id) into
425  // dawg_args->updated_dawgs.
426  for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
427  const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
428  const Dawg* punc_dawg =
429  pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
430  const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
431 
432  if (!dawg && !punc_dawg) {
433  // shouldn't happen.
434  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
435  continue;
436  }
437  if (!dawg) {
438  // We're in the punctuation dawg. A core dawg has not been chosen.
439  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
440  EDGE_REF punc_transition_edge =
441  punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
442  if (punc_transition_edge != NO_EDGE) {
443  // Find all successors, and see which can transition.
444  const SuccessorList& slist = *(successors_[pos.punc_index]);
445  for (int s = 0; s < slist.size(); ++s) {
446  int sdawg_index = slist[s];
447  const Dawg* sdawg = dawgs_[sdawg_index];
448  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
// Each successor dawg is probed from node 0.
449  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
450  if (dawg_edge != NO_EDGE) {
451  if (dawg_debug_level >= 3) {
452  tprintf("Letter found in dawg %d\n", sdawg_index);
453  }
454  dawg_args->updated_dawgs->add_unique(
455  DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
456  punc_transition_edge, false),
457  dawg_debug_level > 0,
458  "Append transition from punc dawg to current dawgs: ");
459  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
460  if (sdawg->end_of_word(dawg_edge) &&
461  punc_dawg->end_of_word(punc_transition_edge))
462  dawg_args->valid_end = true;
463  }
464  }
465  }
// Independently of any transition above, try to stay inside the punctuation
// dawg with the literal unichar_id.
466  EDGE_REF punc_edge =
467  punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
468  if (punc_edge != NO_EDGE) {
469  if (dawg_debug_level >= 3) {
470  tprintf("Letter found in punctuation dawg\n");
471  }
472  dawg_args->updated_dawgs->add_unique(
473  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
474  dawg_debug_level > 0, "Extend punctuation dawg: ");
475  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
476  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
477  }
478  continue;
479  }
480 
481  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
482  // We can end the main word here.
483  // If we can continue on the punc ref, add that possibility.
484  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
485  EDGE_REF punc_edge =
486  punc_node == NO_EDGE
487  ? NO_EDGE
488  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
489  if (punc_edge != NO_EDGE) {
490  dawg_args->updated_dawgs->add_unique(
// NOTE(review): listing line 491 (presumably the start of the DawgPosition
// argument) is absent from this rendering.
492  punc_edge, true),
493  dawg_debug_level > 0, "Return to punctuation dawg: ");
494  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
495  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
496  }
497  }
498 
// Positions flagged back_to_punc are not extended in the core dawg.
499  if (pos.back_to_punc) continue;
500 
501  // If we are dealing with the pattern dawg, look up all the
502  // possible edges, not only for the exact unichar_id, but also
503  // for all its character classes (alpha, digit, etc).
504  if (dawg->type() == DAWG_TYPE_PATTERN) {
505  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
506  &curr_perm);
507  // There can't be any successors to dawg that is of type
508  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
509  continue;
510  }
511 
512  // Find the edge out of the node for the unichar_id.
513  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
514  EDGE_REF edge =
515  (node == NO_EDGE)
516  ? NO_EDGE
517  : dawg->edge_char_of(
518  node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
519 
520  if (dawg_debug_level >= 3) {
521  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
522  pos.dawg_index, node, edge);
523  }
524 
525  if (edge != NO_EDGE) { // the unichar was found in the current dawg
526  if (dawg_debug_level >= 3) {
527  tprintf("Letter found in dawg %d\n", pos.dawg_index);
528  }
529  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
530  if (dawg_debug_level >= 3) {
531  tprintf("Punctuation constraint not satisfied at end of word.\n");
532  }
533  continue;
534  }
535  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
536  if (dawg->end_of_word(edge) &&
537  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
538  dawg_args->valid_end = true;
539  dawg_args->updated_dawgs->add_unique(
540  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
541  false),
542  dawg_debug_level > 0,
543  "Append current dawg to updated active dawgs: ");
544  }
545  } // end for
546  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
547  // or if we found the current letter in a non-punctuation dawg. This
548  // allows preserving information on which dawg the "core" word came from.
549  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
550  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
551  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
552  dawg_args->permuter = curr_perm;
553  }
554  if (dawg_debug_level >= 2) {
555  tprintf("Returning %d for permuter code for this character.\n",
556  dawg_args->permuter);
557  }
558  return dawg_args->permuter;
559 }
560 
561 void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
562  UNICHAR_ID unichar_id, bool word_end,
563  DawgArgs* dawg_args,
564  PermuterType* curr_perm) const {
565  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
566  // Try to find the edge corresponding to the exact unichar_id and to all the
567  // edges corresponding to the character class of unichar_id.
568  GenericVector<UNICHAR_ID> unichar_id_patterns;
569  unichar_id_patterns.push_back(unichar_id);
570  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
571  &unichar_id_patterns);
572  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
573  // On the first iteration check all the outgoing edges.
574  // On the second iteration check all self-loops.
575  for (int k = 0; k < 2; ++k) {
576  EDGE_REF edge =
577  (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
578  : dawg->pattern_loop_edge(pos.dawg_ref,
579  unichar_id_patterns[i], word_end);
580  if (edge == NO_EDGE) continue;
581  if (dawg_debug_level >= 3) {
582  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
583  pos.dawg_index, node, edge);
584  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
585  }
586  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
587  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
588  dawg_args->updated_dawgs->add_unique(
589  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
590  pos.back_to_punc),
591  dawg_debug_level > 0,
592  "Append current dawg to updated active dawgs: ");
593  }
594  }
595 }
596 
597 // Fill the given active_dawgs vector with dawgs that could contain the
598 // beginning of the word. If hyphenated() returns true, copy the entries
599 // from hyphen_active_dawgs_ instead.
601  bool ambigs_mode) const {
602  int i;
603  if (hyphenated()) {
604  *active_dawgs = hyphen_active_dawgs_;
605  if (dawg_debug_level >= 3) {
606  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
607  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
608  hyphen_active_dawgs_[i].dawg_index,
609  hyphen_active_dawgs_[i].dawg_ref);
610  }
611  }
612  } else {
613  default_dawgs(active_dawgs, ambigs_mode);
614  }
615 }
616 
618  bool suppress_patterns) const {
619  bool punc_dawg_available =
620  (punc_dawg_ != nullptr) &&
621  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
622 
623  for (int i = 0; i < dawgs_.size(); i++) {
624  if (dawgs_[i] != nullptr &&
625  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
626  int dawg_ty = dawgs_[i]->type();
627  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
628  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
629  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
630  if (dawg_debug_level >= 3) {
631  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
632  NO_EDGE);
633  }
634  } else if (!punc_dawg_available || !subsumed_by_punc) {
635  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
636  if (dawg_debug_level >= 3) {
637  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
638  }
639  }
640  }
641  }
642 }
643 
644 void Dict::add_document_word(const WERD_CHOICE& best_choice) {
645  // Do not add hyphenated word parts to the document dawg.
646  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
647  // called when the first part of the hyphenated word is
648  // discovered and while the second part of the word is recognized.
649  // hyphen_word_ is cleared in cc_recg() before the next word on
650  // the line is recognized.
651  if (hyphen_word_) return;
652 
653  int stringlen = best_choice.length();
654 
655  if (valid_word(best_choice) || stringlen < 2) return;
656 
657  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
658  if (best_choice.length() >= kDocDictMaxRepChars) {
659  int num_rep_chars = 1;
660  UNICHAR_ID uch_id = best_choice.unichar_id(0);
661  for (int i = 1; i < best_choice.length(); ++i) {
662  if (best_choice.unichar_id(i) != uch_id) {
663  num_rep_chars = 1;
664  uch_id = best_choice.unichar_id(i);
665  } else {
666  ++num_rep_chars;
667  if (num_rep_chars == kDocDictMaxRepChars) return;
668  }
669  }
670  }
671 
672  if (best_choice.certainty() < doc_dict_certainty_threshold ||
673  stringlen == 2) {
674  if (best_choice.certainty() < doc_dict_pending_threshold) return;
675 
676  if (!pending_words_->word_in_dawg(best_choice)) {
677  if (stringlen > 2 ||
678  (stringlen == 2 &&
679  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
680  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
681  pending_words_->add_word_to_dawg(best_choice);
682  }
683  return;
684  }
685  }
686 
687  if (save_doc_words) {
688  STRING filename(getCCUtil()->imagefile);
689  filename += ".doc";
690  FILE* doc_word_file = fopen(filename.c_str(), "a");
691  if (doc_word_file == nullptr) {
692  tprintf("Error: Could not open file %s\n", filename.c_str());
693  ASSERT_HOST(doc_word_file);
694  }
695  fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
696  fclose(doc_word_file);
697  }
698  document_words_->add_word_to_dawg(best_choice);
699 }
700 
// Computes an adjusted rating for word from a multiplicative adjust factor.
// The factor starts at additional_adjust, gains an x-height penalty for words
// longer than one unichar, and gains exactly one segment_penalty_* value
// chosen from nonword, case_ok() and valid_punctuation(). kRatingPad is added
// to the rating before the multiplication and subtracted again afterwards.
//   word: its rating is overwritten only when modify_rating is true; its
//       adjust factor is always stored via set_adjust_factor().
//   nonword: true when the word was not found in a dictionary.
//   xheight_consistency: selects the x-height penalty, if any.
//   debug: when true, the decision is traced with tprintf().
701 void Dict::adjust_word(WERD_CHOICE* word, bool nonword,
702  XHeightConsistencyEnum xheight_consistency,
703  float additional_adjust, bool modify_rating,
704  bool debug) {
// Words whose top script is Han skip the case and punctuation checks.
705  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
706  word->GetTopScriptID() == getUnicharset().han_sid());
707  bool case_is_ok = (is_han || case_ok(*word));
708  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
709 
710  float adjust_factor = additional_adjust;
711  float new_rating = word->rating();
712  new_rating += kRatingPad;
713  const char* xheight_triggered = "";
714  if (word->length() > 1) {
715  // Calculate x-height and y-offset consistency penalties.
716  switch (xheight_consistency) {
717  case XH_INCONSISTENT:
718  adjust_factor += xheight_penalty_inconsistent;
719  xheight_triggered = ", xhtBAD";
720  break;
721  case XH_SUBNORMAL:
722  adjust_factor += xheight_penalty_subscripts;
723  xheight_triggered = ", xhtSUB";
724  break;
725  case XH_GOOD:
726  // leave the factor alone - all good!
727  break;
728  }
729  // TODO(eger): if nonword is true, but there is a "core" that is a dict
730  // word, negate nonword status.
731  } else {
732  if (debug) {
733  tprintf("Consistency could not be calculated.\n");
734  }
735  }
736  if (debug) {
737  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
738  word->unichar_string().c_str(), word->rating(), xheight_triggered);
739  }
740 
741  if (nonword) { // non-dictionary word
742  if (case_is_ok && punc_is_ok) {
743  adjust_factor += segment_penalty_dict_nonword;
744  new_rating *= adjust_factor;
745  if (debug) tprintf(", W");
746  } else {
747  adjust_factor += segment_penalty_garbage;
748  new_rating *= adjust_factor;
749  if (debug) {
750  if (!case_is_ok) tprintf(", C");
751  if (!punc_is_ok) tprintf(", P");
752  }
753  }
754  } else { // dictionary word
755  if (case_is_ok) {
756  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
// NOTE(review): listing line 757 is absent from this rendering; whatever it
// contained is not visible here.
758  adjust_factor += segment_penalty_dict_frequent_word;
759  new_rating *= adjust_factor;
760  if (debug) tprintf(", F");
761  } else {
762  adjust_factor += segment_penalty_dict_case_ok;
763  new_rating *= adjust_factor;
764  if (debug) tprintf(", ");
765  }
766  } else {
767  adjust_factor += segment_penalty_dict_case_bad;
768  new_rating *= adjust_factor;
769  if (debug) tprintf(", C");
770  }
771  }
772  new_rating -= kRatingPad;
773  if (modify_rating) word->set_rating(new_rating);
774  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
775  word->set_adjust_factor(adjust_factor);
776 }
777 
778 int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const {
779  const WERD_CHOICE* word_ptr = &word;
780  WERD_CHOICE temp_word(word.unicharset());
781  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
782  copy_hyphen_info(&temp_word);
783  temp_word += word;
784  word_ptr = &temp_word;
785  }
786  if (word_ptr->length() == 0) return NO_PERM;
787  // Allocate vectors for holding current and updated
788  // active_dawgs and initialize them.
789  auto* active_dawgs = new DawgPositionVector[2];
790  init_active_dawgs(&(active_dawgs[0]), false);
791  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
792  int last_index = word_ptr->length() - 1;
793  // Call letter_is_okay for each letter in the word.
794  for (int i = hyphen_base_size(); i <= last_index; ++i) {
795  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
796  word_ptr->unichar_id(i), i == last_index)))
797  break;
798  // Swap active_dawgs, constraints with the corresponding updated vector.
799  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
800  dawg_args.updated_dawgs = &(active_dawgs[0]);
801  ++(dawg_args.active_dawgs);
802  } else {
803  ++(dawg_args.updated_dawgs);
804  dawg_args.active_dawgs = &(active_dawgs[0]);
805  }
806  }
807  delete[] active_dawgs;
808  return valid_word_permuter(dawg_args.permuter, numbers_ok)
809  ? dawg_args.permuter
810  : NO_PERM;
811 }
812 
813 bool Dict::valid_bigram(const WERD_CHOICE& word1,
814  const WERD_CHOICE& word2) const {
815  if (bigram_dawg_ == nullptr) return false;
816 
817  // Extract the core word from the middle of each word with any digits
818  // replaced with question marks.
819  int w1start, w1end, w2start, w2end;
820  word1.punct_stripped(&w1start, &w1end);
821  word2.punct_stripped(&w2start, &w2end);
822 
823  // We don't want to penalize a single guillemet, hyphen, etc.
824  // But our bigram list doesn't have any information about punctuation.
825  if (w1start >= w1end) return word1.length() < 3;
826  if (w2start >= w2end) return word2.length() < 3;
827 
828  const UNICHARSET& uchset = getUnicharset();
829  GenericVector<UNICHAR_ID> bigram_string;
830  bigram_string.reserve(w1end + w2end + 1);
831  for (int i = w1start; i < w1end; i++) {
832  const GenericVector<UNICHAR_ID>& normed_ids =
833  getUnicharset().normed_ids(word1.unichar_id(i));
834  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
835  bigram_string.push_back(question_unichar_id_);
836  else
837  bigram_string += normed_ids;
838  }
839  bigram_string.push_back(UNICHAR_SPACE);
840  for (int i = w2start; i < w2end; i++) {
841  const GenericVector<UNICHAR_ID>& normed_ids =
842  getUnicharset().normed_ids(word2.unichar_id(i));
843  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
844  bigram_string.push_back(question_unichar_id_);
845  else
846  bigram_string += normed_ids;
847  }
848  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
849  for (int i = 0; i < bigram_string.size(); ++i) {
850  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
851  0.0f);
852  }
853  return bigram_dawg_->word_in_dawg(normalized_word);
854 }
855 
857  if (word.length() == 0) return NO_PERM;
858  int i;
859  WERD_CHOICE new_word(word.unicharset());
860  int last_index = word.length() - 1;
861  int new_len = 0;
862  for (i = 0; i <= last_index; ++i) {
863  UNICHAR_ID unichar_id = (word.unichar_id(i));
864  if (getUnicharset().get_ispunctuation(unichar_id)) {
865  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
866  } else if (!getUnicharset().get_isalpha(unichar_id) &&
867  !getUnicharset().get_isdigit(unichar_id)) {
868  return false; // neither punc, nor alpha, nor digit
869  } else if ((new_len = new_word.length()) == 0 ||
870  new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
871  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
872  }
873  }
874  for (i = 0; i < dawgs_.size(); ++i) {
875  if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
876  dawgs_[i]->word_in_dawg(new_word))
877  return true;
878  }
879  return false;
880 }
881 
884  const UNICHARSET& u_set = getUnicharset();
885  if (u_set.han_sid() > 0) return false;
886  if (u_set.katakana_sid() > 0) return false;
887  if (u_set.thai_sid() > 0) return false;
888  return true;
889 }
890 
891 } // namespace tesseract
tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:95
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
tesseract::Dict::ProcessPatternEdges
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:561
tesseract::Dawg::unichar_id_to_patterns
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:179
tesseract::DAWG_TYPE_PUNCTUATION
Definition: dawg.h:67
tesseract::Dawg::end_of_word
virtual bool end_of_word(EDGE_REF edge_ref) const =0
tesseract::XH_SUBNORMAL
Definition: dict.h:78
dict.h
WERD_CHOICE::set_adjust_factor
void set_adjust_factor(float factor)
Definition: ratngs.h:297
tesseract::Dict::go_deeper_fxn_
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
tesseract::Dict::segment_penalty_dict_case_bad
double segment_penalty_dict_case_bad
Definition: dict.h:609
UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::TESSDATA_SYSTEM_DAWG
Definition: tessdatamanager.h:64
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
tesseract::TessdataManager
Definition: tessdatamanager.h:126
tesseract::Dict::xheight_penalty_subscripts
double xheight_penalty_subscripts
Definition: dict.h:595
tesseract::TESSDATA_BIGRAM_DAWG
Definition: tessdatamanager.h:71
tesseract::DawgPosition
Definition: dawg.h:348
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::Trie::initialize_patterns
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:351
INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:312
tesseract::Trie
Definition: trie.h:54
tesseract::Dict::segment_penalty_garbage
double segment_penalty_garbage
Definition: dict.h:618
WERD_CHOICE::GetTopScriptID
int GetTopScriptID() const
Definition: ratngs.cpp:669
tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
tesseract::Dawg::lang
const STRING & lang() const
Definition: dawg.h:123
tesseract::Dict::doc_dict_certainty_threshold
double doc_dict_certainty_threshold
Definition: dict.h:653
PermuterType
PermuterType
Definition: ratngs.h:230
tesseract::Trie::read_pattern_list
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:408
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
NO_PERM
Definition: ratngs.h:231
STRING
Definition: strngs.h:45
tesseract::DawgPosition::punc_ref
EDGE_REF punc_ref
Definition: dawg.h:366
tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:448
tesseract::Dict::Load
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210
tesseract::XH_GOOD
Definition: dict.h:78
tesseract::DawgArgs
Definition: dict.h:80
tesseract::Dict::add_document_word
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:644
COMPOUND_PERM
Definition: ratngs.h:243
tesseract::DawgCache::FreeDawg
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:440
tesseract::Dict::user_patterns_file
char * user_patterns_file
Definition: dict.h:582
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:617
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
tesseract::Dict::GlobalDawgCache
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
tesseract::CCUtil::language_data_path_prefix
STRING language_data_path_prefix
Definition: ccutil.h:56
tesseract::Dict::segment_penalty_dict_nonword
double segment_penalty_dict_nonword
Definition: dict.h:613
tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:474
tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:78
tesseract::TESSDATA_LSTM_SYSTEM_DAWG
Definition: tessdatamanager.h:76
tesseract::DAWG_TYPE_WORD
Definition: dawg.h:68
tesseract::DawgCache
Definition: dawg_cache.h:30
tesseract::Dict::xheight_penalty_inconsistent
double xheight_penalty_inconsistent
Definition: dict.h:598
BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
tesseract::TESSDATA_PUNC_DAWG
Definition: tessdatamanager.h:63
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
tesseract::Dict::load_system_dawg
bool load_system_dawg
Definition: dict.h:585
tesseract::SuccessorList
GenericVector< int > SuccessorList
Definition: dawg.h:63
tesseract::Dict::user_words_suffix
char * user_words_suffix
Definition: dict.h:580
WERD_CHOICE::punct_stripped
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:385
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:86
STRING_MEMBER
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:318
tesseract::Dict::End
void End()
Definition: dict.cpp:372
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::DawgCache::GetSquishedDawg
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:44
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:58
tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:85
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
UNICHARSET::thai_sid
int thai_sid() const
Definition: unicharset.h:882
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::TESSDATA_LSTM_NUMBER_DAWG
Definition: tessdatamanager.h:77
UNICHAR_SPACE
Definition: unicharset.h:34
tesseract::XH_INCONSISTENT
Definition: dict.h:78
tesseract::Dict::segment_penalty_dict_case_ok
double segment_penalty_dict_case_ok
Definition: dict.h:605
tesseract::Dict::def_letter_is_okay
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:395
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
tesseract::Dict::load_punc_dawg
bool load_punc_dawg
Definition: dict.h:589
UNICHARSET
Definition: unicharset.h:145
tesseract::Dict::SetupForLoad
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:321
tesseract::Dict::load_number_dawg
bool load_number_dawg
Definition: dict.h:590
tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
tesseract::Dict::valid_punctuation
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:856
tesseract::Trie::add_word_to_dawg
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:183
tesseract::Dict::dawg_debug_level
int dawg_debug_level
Definition: dict.h:622
tesseract::Dict::segment_penalty_dict_frequent_word
double segment_penalty_dict_frequent_word
Definition: dict.h:601
WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:357
tesseract::Dawg::pattern_loop_edge
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:190
tesseract
Definition: baseapi.h:65
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
tesseract::DawgPosition::dawg_index
int8_t dawg_index
Definition: dawg.h:367
unicodes.h
tesseract::Dict::Dict
Dict(CCUtil *image_ptr)
Definition: dict.cpp:30
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
tesseract::DawgPosition::punc_index
int8_t punc_index
Definition: dawg.h:368
tprintf.h
tesseract::DawgPositionVector::add_unique
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:381
tesseract::DawgPositionVector
Definition: dawg.h:373
tesseract::Dawg::word_in_dawg
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
USER_PATTERN_PERM
Definition: ratngs.h:238
GenericVector< int >
GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:679
STRING_INIT_MEMBER
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:330
tesseract::Dict
Definition: dict.h:91
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
tesseract::DawgPosition::back_to_punc
bool back_to_punc
Definition: dawg.h:370
tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
tesseract::DawgArgs::valid_end
bool valid_end
Definition: dict.h:88
tesseract::Dict::LoadLSTM
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:291
tesseract::TESSDATA_UNAMBIG_DAWG
Definition: tessdatamanager.h:72
tesseract::TESSDATA_LSTM_PUNC_DAWG
Definition: tessdatamanager.h:75
tesseract::Dict::FinishLoad
bool FinishLoad()
Definition: dict.cpp:351
tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:438
tesseract::Dict::doc_dict_pending_threshold
double doc_dict_pending_threshold
Definition: dict.h:651
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
tesseract::Dawg
Definition: dawg.h:113
tesseract::Dict::save_doc_words
bool save_doc_words
Definition: dict.h:649
tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:701
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:61
UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:825
tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813
tesseract::Dict::load_bigram_dawg
bool load_bigram_dawg
Definition: dict.h:592
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
tesseract::DAWG_TYPE_PATTERN
Definition: dawg.h:70
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
tesseract::DawgPosition::dawg_ref
EDGE_REF dawg_ref
Definition: dawg.h:365
tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:120
tesseract::Dict::~Dict
~Dict()
Definition: dict.cpp:178
tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:84
tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:778
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
tesseract::Dict::IsSpaceDelimitedLang
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (not CJ, or T).
Definition: dict.cpp:883
BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:315
DOC_DAWG_PERM
Definition: ratngs.h:240
PUNC_PERM
Definition: ratngs.h:232
tesseract::Dict::load_freq_dawg
bool load_freq_dawg
Definition: dict.h:586
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Dict::user_words_file
char * user_words_file
Definition: dict.h:578
WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:363
tesseract::Dawg::permuter
PermuterType permuter() const
Definition: dawg.h:124
tesseract::CCUtil
Definition: ccutil.h:40
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::Trie::read_and_add_word_list
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:295
tesseract::TESSDATA_NUMBER_DAWG
Definition: tessdatamanager.h:65
UNICHARSET::size
int size() const
Definition: unicharset.h:341
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50
tesseract::Dict::user_patterns_suffix
char * user_patterns_suffix
Definition: dict.h:584
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Dict::load_unambig_dawg
bool load_unambig_dawg
Definition: dict.h:587
tesseract::TESSDATA_FREQ_DAWG
Definition: tessdatamanager.h:66
tesseract::Dict::letter_is_okay_
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372