All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
dict.h
Go to the documentation of this file.
1 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #include "ambigs.h"
23 #include "dawg.h"
24 #include "dawg_cache.h"
25 #include "host.h"
26 #include "oldlist.h"
27 #include "ratngs.h"
28 #include "stopper.h"
29 #include "trie.h"
30 #include "unicharset.h"
32 
33 class MATRIX;
34 class WERD_RES;
35 
36 #define MAX_WERD_LENGTH (inT64) 128
37 #define NO_RATING -1
38 
44  float rating;
45  float certainty;
46 };
47 
48 namespace tesseract {
49 
51 
52 //
53 // Constants
54 //
55 static const int kRatingPad = 4;
56 static const char kDictWildcard[] = "\u2606"; // WHITE STAR
57 static const int kDictMaxWildcards = 2; // max wildcards for a word
58 // TODO(daria): If hyphens are different in different languages and can be
59 // inferred from training data we should load their values dynamically.
60 static const char kHyphenSymbol[] = "-";
61 static const char kSlashSymbol[] = "/";
62 static const char kQuestionSymbol[] = "?";
63 static const char kApostropheSymbol[] = "'";
64 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
65 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
66 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
67 static const int kDocDictMaxRepChars = 4;
68 
69 // Enum for describing whether the x-height for the word is consistent:
70 // 0 - everything is good.
71 // 1 - there are one or two secondary (but consistent) baselines
72 // [think subscript and superscript], or there is an oversized
73 // first character.
74 // 2 - the word is inconsistent.
76 
77 struct DawgArgs {
79  : active_dawgs(d), updated_dawgs(up), permuter(p) {}
80 
84 };
85 
86 class Dict {
87  public:
88  Dict(CCUtil* image_ptr);
89  ~Dict();
90  const CCUtil* getCCUtil() const {
91  return ccutil_;
92  }
94  return ccutil_;
95  }
96  const UNICHARSET& getUnicharset() const {
97  return getCCUtil()->unicharset;
98  }
100  return getCCUtil()->unicharset;
101  }
103  return getCCUtil()->unichar_ambigs;
104  }
105 
106  // Returns true if unichar_id is a word compounding character like - or /.
107  inline bool compound_marker(UNICHAR_ID unichar_id) {
108  const GenericVector<UNICHAR_ID>& normed_ids =
109  getUnicharset().normed_ids(unichar_id);
110  return normed_ids.size() == 1 &&
111  (normed_ids[0] == hyphen_unichar_id_ ||
112  normed_ids[0] == slash_unichar_id_);
113  }
114  // Returns true if unichar_id is an apostrophe-like character that may
115  // separate prefix/suffix words from a main body word.
116  inline bool is_apostrophe(UNICHAR_ID unichar_id) {
117  const GenericVector<UNICHAR_ID>& normed_ids =
118  getUnicharset().normed_ids(unichar_id);
119  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
120  }
121 
122  /* hyphen.cpp ************************************************************/
123 
125  inline bool hyphenated() const { return
126  !last_word_on_line_ && hyphen_word_;
127  }
129  inline int hyphen_base_size() const {
130  return this->hyphenated() ? hyphen_word_->length() : 0;
131  }
135  inline void copy_hyphen_info(WERD_CHOICE *word) const {
136  if (this->hyphenated()) {
137  *word = *hyphen_word_;
138  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
139  }
140  }
142  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
143  if (!last_word_on_line_ || first_pos)
144  return false;
145  const GenericVector<UNICHAR_ID>& normed_ids =
146  getUnicharset().normed_ids(unichar_id);
147  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
148  }
150  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
151  int word_index = word.length() - 1;
152  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
153  }
157  void reset_hyphen_vars(bool last_word_on_line);
160  void set_hyphen_word(const WERD_CHOICE &word,
161  const DawgPositionVector &active_dawgs);
162 
163  /* permdawg.cpp ************************************************************/
164  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
165  // When this function is refactored, permdawg.cpp can be removed.
166 
169  inline void update_best_choice(const WERD_CHOICE &word,
170  WERD_CHOICE *best_choice) {
171  if (word.rating() < best_choice->rating()) {
172  *best_choice = word;
173  }
174  }
178  void init_active_dawgs(DawgPositionVector *active_dawgs,
179  bool ambigs_mode) const;
180  // Fill the given vector with the default collection of any-length dawgs
181  void default_dawgs(DawgPositionVector *anylength_dawgs,
182  bool suppress_patterns) const;
183 
184 
191  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
195  void go_deeper_dawg_fxn(
196  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
197  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
198  bool word_ending, WERD_CHOICE *word, float certainties[],
199  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
200  void *void_more_args);
201 
203  void (Dict::*go_deeper_fxn_)(const char *debug,
204  const BLOB_CHOICE_LIST_VECTOR &char_choices,
205  int char_choice_index,
206  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
207  bool word_ending, WERD_CHOICE *word,
208  float certainties[], float *limit,
209  WERD_CHOICE *best_choice, int *attempts_left,
210  void *void_more_args);
211  //
212  // Helper functions for dawg_permute_and_select().
213  //
214  void permute_choices(
215  const char *debug,
216  const BLOB_CHOICE_LIST_VECTOR &char_choices,
217  int char_choice_index,
218  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
219  WERD_CHOICE *word,
220  float certainties[],
221  float *limit,
222  WERD_CHOICE *best_choice,
223  int *attempts_left,
224  void *more_args);
225 
226  void append_choices(
227  const char *debug,
228  const BLOB_CHOICE_LIST_VECTOR &char_choices,
229  const BLOB_CHOICE &blob_choice,
230  int char_choice_index,
231  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
232  WERD_CHOICE *word,
233  float certainties[],
234  float *limit,
235  WERD_CHOICE *best_choice,
236  int *attempts_left,
237  void *more_args);
238 
239  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
240  float curr_rating, float curr_certainty,
241  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
242  const char *debug, int word_ending,
243  CHAR_FRAGMENT_INFO *char_frag_info);
244 
245  /* stopper.cpp *************************************************************/
246  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
247  DANGERR *fixpt,
248  bool fix_replaceable,
249  MATRIX* ratings);
250  // Replaces the corresponding wrong ngram in werd_choice with the correct
251  // one. The whole correct n-gram is inserted into the ratings matrix and
252  // the werd_choice: no more fragments! Rating and certainty of new entries
253  // in matrix and werd_choice are the sum and mean of the wrong ngram
254  // respectively.
255  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
256  // mystring", with a new entry in the ratings matrix for ".
257  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
258  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
259  MATRIX *ratings);
260 
262  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
270  int UniformCertainties(const WERD_CHOICE& word);
272  bool AcceptableChoice(const WERD_CHOICE& best_choice,
273  XHeightConsistencyEnum xheight_consistency);
277  bool AcceptableResult(WERD_RES* word);
278  void EndDangerousAmbigs();
280  void DebugWordChoices();
282  void SettupStopperPass1();
284  void SettupStopperPass2();
285  /* context.cpp *************************************************************/
287  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
290  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
291 
292  /* dict.cpp ****************************************************************/
293 
296  static DawgCache *GlobalDawgCache();
297  void Load(DawgCache *dawg_cache);
298  void End();
299 
300  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
302  if (pending_words_ != NULL)
303  pending_words_->clear();
304  if (document_words_ != NULL)
305  document_words_->clear();
306  }
307 
343  //
344  int def_letter_is_okay(void* void_dawg_args,
345  UNICHAR_ID unichar_id, bool word_end) const;
346 
347  int (Dict::*letter_is_okay_)(void* void_dawg_args,
348  UNICHAR_ID unichar_id, bool word_end) const;
350  int LetterIsOkay(void* void_dawg_args,
351  UNICHAR_ID unichar_id, bool word_end) const {
352  return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
353  }
354 
355 
357  double (Dict::*probability_in_context_)(const char* lang,
358  const char* context,
359  int context_bytes,
360  const char* character,
361  int character_bytes);
363  double ProbabilityInContext(const char* context,
364  int context_bytes,
365  const char* character,
366  int character_bytes) {
367  return (this->*probability_in_context_)(
368  getCCUtil()->lang.string(),
369  context, context_bytes,
370  character, character_bytes);
371  }
372 
375  const char* lang, const char* context, int context_bytes,
376  const char* character, int character_bytes) {
377  (void) context;
378  (void) context_bytes;
379  (void) character;
380  (void) character_bytes;
381  return 0.0;
382  }
383  double ngram_probability_in_context(const char* lang,
384  const char* context,
385  int context_bytes,
386  const char* character,
387  int character_bytes);
388 
389  // Interface with params model.
390  float (Dict::*params_model_classify_)(const char *lang, void *path);
391  float ParamsModelClassify(const char *lang, void *path);
392  // Call params_model_classify_ member function.
393  float CallParamsModelClassify(void *path) {
394  ASSERT_HOST(params_model_classify_ != NULL); // ASSERT_HOST -> assert
395  return (this->*params_model_classify_)(
396  getCCUtil()->lang.string(), path);
397  }
398 
399  inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
400  inline const UNICHAR_ID WildcardID() const {
401  return wildcard_unichar_id_;
402  }
404  inline const int NumDawgs() const { return dawgs_.size(); }
406  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
408  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
410  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
412  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
413  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
414  NODE_REF node = dawg->next_node(edge_ref);
415  if (node == 0) node = NO_EDGE; // end of word
416  return node;
417  }
418 
419  // Given a unichar from a string and a given dawg, return the unichar
420  // we should use to match in that dawg type. (for example, in the number
421  // dawg, all numbers are transformed to kPatternUnicharId).
422  inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
423  if (!dawg) return ch;
424  switch (dawg->type()) {
425  case DAWG_TYPE_NUMBER:
427  default:
428  return ch;
429  }
430  }
431 
437  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
438  UNICHAR_ID unichar_id, bool word_end,
439  DawgPositionVector *updated_dawgs,
440  PermuterType *current_permuter) const;
441 
445 
447  inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
448  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
449  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
450  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
451  (numbers_ok && perm == NUMBER_PERM));
452  }
453  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
454  int valid_word(const WERD_CHOICE &word) const {
455  return valid_word(word, false); // return NO_PERM for words with digits
456  }
457  int valid_word_or_number(const WERD_CHOICE &word) const {
458  return valid_word(word, true); // return NUMBER_PERM for valid numbers
459  }
461  int valid_word(const char *string) const {
462  WERD_CHOICE word(string, getUnicharset());
463  return valid_word(word);
464  }
465  // Do the two WERD_CHOICEs form a meaningful bigram?
466  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
471  bool valid_punctuation(const WERD_CHOICE &word);
473  int good_choice(const WERD_CHOICE &choice);
475  void add_document_word(const WERD_CHOICE &best_choice);
477  void adjust_word(WERD_CHOICE *word,
478  bool nonword, XHeightConsistencyEnum xheight_consistency,
479  float additional_adjust,
480  bool modify_rating,
481  bool debug);
483  inline void SetWordsegRatingAdjustFactor(float f) {
484  wordseg_rating_adjust_factor_ = f;
485  }
486 
487  private:
489  CCUtil* ccutil_;
496  UnicharAmbigs *dang_ambigs_table_;
498  UnicharAmbigs *replace_ambigs_table_;
500  FLOAT32 reject_offset_;
501  // Cached UNICHAR_IDs:
502  UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
503  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
504  UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
505  UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
506  UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
507  // Hyphen-related variables.
508  WERD_CHOICE *hyphen_word_;
509  DawgPositionVector hyphen_active_dawgs_;
510  bool last_word_on_line_;
511  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
512  // matching. The first member of each list is taken as canonical. For
513  // example, the first list contains hyphens and dashes with the first symbol
514  // being the ASCII hyphen minus.
515  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
516  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
517  DawgCache *dawg_cache_;
518  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
519  // Dawgs.
520  DawgVector dawgs_;
521  SuccessorListsVector successors_;
522  Trie *pending_words_;
523  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
524  // any of them are present on the best choices list for a word pair.
525  // the bigrams are stored as space-separated words where:
526  // (1) leading and trailing punctuation has been removed from each word and
527  // (2) any digits have been replaced with '?' marks.
528  Dawg *bigram_dawg_;
531  // TODO(daria): need to support multiple languages in the future,
532  // so maybe will need to maintain a list of dawgs of each kind.
533  Dawg *freq_dawg_;
534  Dawg *unambig_dawg_;
535  Dawg *punc_dawg_;
536  Trie *document_words_;
539  float wordseg_rating_adjust_factor_;
540  // File for recording ambiguities discovered during dictionary search.
541  FILE *output_ambig_words_file_;
542 
543  public:
547  STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
549  "A suffix of user-provided words located in tessdata.");
551  "A filename of user-provided patterns.");
553  "A suffix of user-provided patterns located in tessdata.");
554  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
555  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
556  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
558  "Load dawg with punctuation patterns.");
559  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
561  "Load dawg with special word bigrams.");
563  "Score penalty (0.1 = 10%) added if there are subscripts "
564  "or superscripts in a word, but it is otherwise OK.");
566  "Score penalty (0.1 = 10%) added if an xheight is "
567  "inconsistent.");
569  "Score multiplier for word matches which have good case and"
570  "are frequent in the given language (lower is better).");
571 
573  "Score multiplier for word matches that have good case "
574  "(lower is better).");
575 
577  "Default score multiplier for word matches, which may have "
578  "case issues (lower is better).");
579 
580  // TODO(daria): remove this param when ngram permuter is deprecated.
582  "Multipler to for the best choice from the ngram model.");
583 
585  "Score multiplier for glyph fragment segmentations which "
586  "do not match a dictionary word (lower is better).");
587 
589  "Score multiplier for poorly cased strings that are not in"
590  " the dictionary and generally look like garbage (lower is"
591  " better).");
593  "Output file for ambiguities found in the dictionary");
594  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
595  ", to 2 for more details, to 3 to see all the debug messages");
596  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
597  INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
599  "Use only the first UTF8 step of the given string"
600  " when computing log probabilities.");
601  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
603  "Certainty threshold for non-dict words");
605  "Reject certainty offset");
607  "Size of dict word to be treated as non-dict word");
609  "Certainty to add for each dict char above small word size.");
611  "Max certaintly variation allowed in a word (in sigma)");
612  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
614  "Make AcceptableChoice() always return false. Useful"
615  " when there is a need to explore all segmentations");
617  "Deprecated- backward compatability only");
618  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
619  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
620  " should be printed to stdout");
622  "Lengths of unichars in word_to_debug");
623  INT_VAR_H(fragments_debug, 0, "Debug character fragments");
625  "Don't use any alphabetic-specific tricks."
626  "Set to true in the traineddata config file for"
627  " scripts that are cursive or inherently fixed-pitch");
628  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
630  "Worst certainty for using pending dictionary");
631  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
632  " for words that can be inserted into the document dictionary");
633  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
634  " character choices to consider during permutation."
635  " This limit is especially useful when user patterns"
636  " are specified, since overly generic patterns can result in"
637  " dawg search exploring an overly large number of options.");
638 };
639 } // namespace tesseract
640 
641 #endif // TESSERACT_DICT_DICT_H_
Dict(CCUtil *image_ptr)
Definition: dict.cpp:33
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:378
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:605
DawgPositionVector * active_dawgs
Definition: dict.h:81
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:357
int hyphen_debug_level
Definition: dict.h:596
int size() const
Definition: genericvector.h:72
float rating() const
Definition: ratngs.h:324
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:52
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:374
float FLOAT32
Definition: host.h:111
int max_permuter_attempts
Definition: dict.h:637
char * word_to_debug_lengths
Definition: dict.h:622
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:150
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:461
const CCUtil * getCCUtil() const
Definition: dict.h:90
void EndDangerousAmbigs()
Definition: stopper.cpp:368
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:169
int length() const
Definition: ratngs.h:300
char * word_to_debug
Definition: dict.h:620
bool load_bigram_dawg
Definition: dict.h:561
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
#define INT_VAR_H(name, val, comment)
Definition: params.h:265
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
Definition: dict.cpp:486
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:76
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
PermuterType
Definition: ratngs.h:240
double stopper_allowable_character_badness
Definition: dict.h:611
UNICHARSET unicharset
Definition: ccutil.h:72
double segment_penalty_dict_case_ok
Definition: dict.h:574
XHeightConsistencyEnum
Definition: dict.h:75
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
const UNICHAR_ID WildcardID() const
Definition: dict.h:400
bool segment_nonalphabetic_script
Definition: dict.h:627
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:336
double xheight_penalty_inconsistent
Definition: dict.h:567
GenericVector< Dawg * > DawgVector
Definition: dict.h:50
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
UNICHAR_ID unichar_id
Definition: dict.h:41
double segment_penalty_ngram_best_choice
Definition: dict.h:582
double segment_penalty_dict_case_bad
Definition: dict.h:578
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:78
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540
void clear()
Definition: trie.cpp:66
bool save_doc_words
Definition: dict.h:628
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:204
int stopper_smallword_size
Definition: dict.h:607
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203
int fragments_debug
Definition: dict.h:623
char * user_patterns_file
Definition: dict.h:551
#define STRING_VAR_H(name, val, comment)
Definition: params.h:271
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:483
#define ASSERT_HOST(x)
Definition: errcode.h:84
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:454
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
double stopper_certainty_per_char
Definition: dict.h:609
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:390
void End()
Definition: dict.cpp:310
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:51
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:370
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
CCUtil * getCCUtil()
Definition: dict.h:93
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:452
double doc_dict_pending_threshold
Definition: dict.h:630
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:135
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:781
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
float ParamsModelClassify(const char *lang, void *path)
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102
double xheight_penalty_subscripts
Definition: dict.h:564
double ngram_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
int max_viterbi_list_size
Definition: dict.h:597
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:129
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:457
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152
bool use_only_first_uft8_step
Definition: dict.h:600
double stopper_nondict_certainty_base
Definition: dict.h:603
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:422
double segment_penalty_dict_frequent_word
Definition: dict.h:570
UNICHARSET & getUnicharset()
Definition: dict.h:99
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:408
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:246
int stopper_debug_level
Definition: dict.h:612
char * user_patterns_suffix
Definition: dict.h:553
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:175
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:374
bool load_system_dawg
Definition: dict.h:554
#define double_VAR_H(name, val, comment)
Definition: params.h:274
int UNICHAR_ID
Definition: unichar.h:33
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738
bool stopper_no_acceptable_choices
Definition: dict.h:615
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350
double doc_dict_certainty_threshold
Definition: dict.h:632
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:410
int dawg_debug_level
Definition: dict.h:595
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363
bool load_unambig_dawg
Definition: dict.h:556
bool load_freq_dawg
Definition: dict.h:555
const int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:404
int num_fragments
Definition: dict.h:43
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
char * user_words_suffix
Definition: dict.h:549
void DebugWordChoices()
Prints the current choices for this word to stdout.
char * output_ambig_words_file
Definition: dict.h:593
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
DawgType type() const
Definition: dawg.h:127
double segment_penalty_dict_nonword
Definition: dict.h:586
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:322
float CallParamsModelClassify(void *path)
Definition: dict.h:393
float rating
Definition: dict.h:44
bool AcceptableResult(WERD_RES *word)
Definition: stopper.cpp:111
double segment_penalty_garbage
Definition: dict.h:591
STRING lang
Definition: ccutil.h:69
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406
Definition: matrix.h:289
DawgPositionVector * updated_dawgs
Definition: dict.h:82
void print() const
Definition: ratngs.h:563
inT64 EDGE_REF
Definition: dawg.h:54
char * user_words_file
Definition: dict.h:547
bool save_raw_choices
Definition: dict.h:617
#define NULL
Definition: host.h:144
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
int tessedit_truncate_wordchoice_log
Definition: dict.h:618
inT64 NODE_REF
Definition: dawg.h:55
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:567
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
const char * string() const
Definition: strngs.cpp:193
float certainty
Definition: dict.h:45
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:471
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:268
bool load_number_dawg
Definition: dict.h:559
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625
void ResetDocumentDictionary()
Definition: dict.h:301
PermuterType permuter
Definition: dict.h:83
bool load_punc_dawg
Definition: dict.h:558
double certainty_scale
Definition: dict.h:601
unsigned char uinT8
Definition: host.h:99
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:399