tesseract  4.0.0-1-g2a2b
tesseractclass.h
Go to the documentation of this file.
1 // File: tesseractclass.h
3 // Description: The Tesseract class. It holds/owns everything needed
4 // to run Tesseract on a single language, and also a set of
5 // sub-Tesseracts to run sub-languages. For thread safety, *every*
6 // global variable goes in here, directly, or indirectly.
7 // This makes it safe to run multiple Tesseracts in different
8 // threads in parallel, and keeps the different language
9 // instances separate.
10 // Author: Ray Smith
11 // Created: Fri Mar 07 08:17:01 PST 2008
12 //
13 // (C) Copyright 2008, Google Inc.
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 // http://www.apache.org/licenses/LICENSE-2.0
18 // Unless required by applicable law or agreed to in writing, software
19 // distributed under the License is distributed on an "AS IS" BASIS,
20 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 // See the License for the specific language governing permissions and
22 // limitations under the License.
23 //
25 
26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_
28 
29 #include <cstdint> // for int16_t, int32_t, uint16_t
30 #include <cstdio> // for FILE
31 #include "allheaders.h" // for pixDestroy, pixGetWidth, pixGetHe...
32 #include "control.h" // for ACCEPTABLE_WERD_TYPE
33 #include "debugpixa.h" // for DebugPixa
34 #include "devanagari_processing.h" // for ShiroRekhaSplitter
35 #include "docqual.h" // for GARBAGE_LEVEL
36 #include "genericvector.h" // for GenericVector, PointerVector
37 #include "host.h" // for BOOL8
38 #include "pageres.h" // for WERD_RES (ptr only), PAGE_RES (pt...
39 #include "params.h" // for BOOL_VAR_H, BoolParam, DoubleParam
40 #include "points.h" // for FCOORD
41 #include "publictypes.h" // for OcrEngineMode, PageSegMode, OEM_L...
42 #include "ratngs.h" // for ScriptPos, WERD_CHOICE (ptr only)
43 #include "strngs.h" // for STRING
44 #include "tessdatamanager.h" // for TessdataManager
45 #include "textord.h" // for Textord
46 #include "unichar.h" // for UNICHAR_ID
47 #include "wordrec.h" // for Wordrec
48 
49 class BLOCK_LIST;
50 class ETEXT_DESC;
51 struct OSResults;
52 class PAGE_RES;
53 class PAGE_RES_IT;
54 struct Pix;
55 class ROW;
56 class SVMenuNode;
57 class TBOX;
58 class TO_BLOCK_LIST;
59 class WERD;
60 class WERD_CHOICE;
61 class WERD_RES;
62 
63 
64 // Top-level class for all tesseract global instance data.
65 // This class either holds or points to all data used by an instance
66 // of Tesseract, including the memory allocator. When this is
67 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
68 //
69 // NOTE to developers: Do not create cyclic dependencies through this class!
70 // The directory dependency tree must remain a tree! To keep this clean,
71 // lower-level code (eg in ccutil, the bottom level) must never need to
72 // know about the content of a higher-level directory.
73 // The following scheme will grant the easiest access to lower-level
74 // global members without creating a cyclic dependency:
75 //
76 // Class Hierarchy (^ = inheritance):
77 //
78 // CCUtil (ccutil/ccutil.h)
79 // ^ Members include: UNICHARSET
80 // CUtil (cutil/cutil_class.h)
81 // ^ Members include: TBLOB*, TEXTBLOCK*
82 // CCStruct (ccstruct/ccstruct.h)
83 // ^ Members include: Image
84 // Classify (classify/classify.h)
85 // ^ Members include: Dict
86 // WordRec (wordrec/wordrec.h)
87 // ^ Members include: WERD*, DENORM*
88 // Tesseract (ccmain/tesseractclass.h)
89 // Members include: Pix*
90 //
91 // Other important classes:
92 //
93 // TessBaseAPI (api/baseapi.h)
94 // Members include: BLOCK_LIST*, PAGE_RES*,
95 // Tesseract*, ImageThresholder*
96 // Dict (dict/dict.h)
97 // Members include: Image* (private)
98 //
99 // NOTE: that each level contains members that correspond to global
100 // data that is defined (and used) at that level, not necessarily where
101 // the type is defined so for instance:
102 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
103 // goes inside the Textord class, not the cc_util class.
104 
105 namespace tesseract {
106 
107 class ColumnFinder;
108 class DocumentData;
109 class EquationDetect;
110 class ImageData;
111 class LSTMRecognizer;
112 class Tesseract;
113 
114 // A collection of various variables for statistics and debugging.
118  doc_blob_quality(0),
119  doc_outline_errs(0),
120  doc_char_quality(0),
121  good_char_count(0),
123  word_count(0),
124  dict_words(0),
125  tilde_crunch_written(false),
126  last_char_was_newline(true),
127  last_char_was_tilde(false),
129 
136  int32_t word_count; // count of words in the document
137  int32_t dict_words; // number of dictionary words in the document
138  STRING dump_words_str; // accumulator used by dump_words()
139  // Flags used by write_results()
144 };
145 
146 // Struct to hold all the pointers to relevant data for processing a word.
147 struct WordData {
148  WordData() : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}
149  explicit WordData(const PAGE_RES_IT& page_res_it)
150  : word(page_res_it.word()), row(page_res_it.row()->row),
151  block(page_res_it.block()->block), prev_word(nullptr) {}
152  WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)
153  : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}
154 
160 };
161 
162 // Definition of a Tesseract WordRecognizer. The WordData provides the context
163 // of row/block, in_word holds an initialized, possibly pre-classified word,
164 // that the recognizer may or may not consume (but if so it sets *in_word=nullptr)
165 // and produces one or more output words in out_words, which may be the
166 // consumed in_word, or may be generated independently.
167 // This api allows both a conventional tesseract classifier to work, or a
168 // line-level classifier that generates multiple words from a merged input.
169 typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,
170  WERD_RES** in_word,
171  PointerVector<WERD_RES>* out_words);
172 
173 class Tesseract : public Wordrec {
174  public:
175  Tesseract();
176  ~Tesseract();
177 
178  // Return appropriate dictionary
179  Dict& getDict() override;
180 
181  // Clear as much used memory as possible without resetting the adaptive
182  // classifier or losing any other classifier data.
183  void Clear();
184  // Clear all memory of adaption for this and all subclassifiers.
186  // Clear the document dictionary for this and all subclassifiers.
188 
189  // Set the equation detector.
190  void SetEquationDetect(EquationDetect* detector);
191 
192  // Simple accessors.
193  const FCOORD& reskew() const {
194  return reskew_;
195  }
196  // Destroy any existing pix and return a pointer to the pointer.
198  pixDestroy(&pix_binary_);
199  return &pix_binary_;
200  }
201  Pix* pix_binary() const {
202  return pix_binary_;
203  }
204  Pix* pix_grey() const {
205  return pix_grey_;
206  }
207  void set_pix_grey(Pix* grey_pix) {
208  pixDestroy(&pix_grey_);
209  pix_grey_ = grey_pix;
210  }
211  Pix* pix_original() const { return pix_original_; }
212  // Takes ownership of the given original_pix.
213  void set_pix_original(Pix* original_pix) {
214  pixDestroy(&pix_original_);
215  pix_original_ = original_pix;
216  // Clone to sublangs as well.
217  for (int i = 0; i < sub_langs_.size(); ++i)
218  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
219  : nullptr);
220  }
221  // Returns a pointer to a Pix representing the best available resolution image
222  // of the page, with best available bit depth as second priority. Result can
223  // be of any bit depth, but never color-mapped, as that has always been
224  // removed. Note that in grey and color, 0 is black and 255 is
225  // white. If the input was binary, then black is 1 and white is 0.
226  // To tell the difference pixGetDepth() will return 32, 8 or 1.
227  // In any case, the return value is a borrowed Pix, and should not be
228  // deleted or pixDestroyed.
229  Pix* BestPix() const {
230  if (pixGetWidth(pix_original_) == ImageWidth())
231  return pix_original_;
232  else if (pix_grey_ != nullptr)
233  return pix_grey_;
234  else
235  return pix_binary_;
236  }
237  void set_pix_thresholds(Pix* thresholds) {
238  pixDestroy(&pix_thresholds_);
239  pix_thresholds_ = thresholds;
240  }
241  int source_resolution() const {
242  return source_resolution_;
243  }
244  void set_source_resolution(int ppi) {
245  source_resolution_ = ppi;
246  }
247  int ImageWidth() const {
248  return pixGetWidth(pix_binary_);
249  }
250  int ImageHeight() const {
251  return pixGetHeight(pix_binary_);
252  }
253  Pix* scaled_color() const {
254  return scaled_color_;
255  }
256  int scaled_factor() const {
257  return scaled_factor_;
258  }
259  void SetScaledColor(int factor, Pix* color) {
260  scaled_factor_ = factor;
261  scaled_color_ = color;
262  }
263  const Textord& textord() const {
264  return textord_;
265  }
267  return &textord_;
268  }
269 
270  bool right_to_left() const {
271  return right_to_left_;
272  }
273  int num_sub_langs() const {
274  return sub_langs_.size();
275  }
276  Tesseract* get_sub_lang(int index) const {
277  return sub_langs_[index];
278  }
279  // Returns true if any language uses Tesseract (as opposed to LSTM).
280  bool AnyTessLang() const {
281  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
282  for (int i = 0; i < sub_langs_.size(); ++i) {
283  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
284  }
285  return false;
286  }
287  // Returns true if any language uses the LSTM.
288  bool AnyLSTMLang() const {
289  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true;
290  for (int i = 0; i < sub_langs_.size(); ++i) {
291  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
292  return true;
293  }
294  return false;
295  }
296 
297  void SetBlackAndWhitelist();
298 
299  // Perform steps to prepare underlying binary image/other data structures for
300  // page segmentation. Uses the strategy specified in the global variable
301  // pageseg_devanagari_split_strategy to perform splitting while preparing for
302  // page segmentation.
303  void PrepareForPageseg();
304 
305  // Perform steps to prepare underlying binary image/other data structures for
306  // Tesseract OCR. The current segmentation is required by this method.
307  // Uses the strategy specified in the global variable
308  // ocr_devanagari_split_strategy for performing splitting while preparing for
309  // Tesseract ocr.
310  void PrepareForTessOCR(BLOCK_LIST* block_list,
311  Tesseract* osd_tess, OSResults* osr);
312 
313  int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
314  Tesseract* osd_tess, OSResults* osr);
315  void SetupWordScripts(BLOCK_LIST* blocks);
316  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
317  TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
318  Tesseract* osd_tess, OSResults* osr);
320  PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
321  OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
322  Pix** music_mask_pix);
323  // par_control.cpp
324  void PrerecAllWordsPar(const GenericVector<WordData>& words);
325 
327  // Generates training data for training a line recognizer, eg LSTM.
328  // Breaks the page into lines, according to the boxes, and writes them to a
329  // serialized DocumentData based on output_basename.
330  void TrainLineRecognizer(const STRING& input_imagename,
331  const STRING& output_basename,
332  BLOCK_LIST *block_list);
333  // Generates training data for training a line recognizer, eg LSTM.
334  // Breaks the boxes into lines, normalizes them, converts to ImageData and
335  // appends them to the given training_data.
336  void TrainFromBoxes(const GenericVector<TBOX>& boxes,
337  const GenericVector<STRING>& texts,
338  BLOCK_LIST *block_list,
339  DocumentData* training_data);
340 
341  // Returns an ImageData containing the image of the given textline,
342  // and ground truth boxes/truth text if available in the input.
343  // The image is not normalized in any way.
344  ImageData* GetLineData(const TBOX& line_box,
345  const GenericVector<TBOX>& boxes,
346  const GenericVector<STRING>& texts,
347  int start_box, int end_box,
348  const BLOCK& block);
349  // Helper gets the image of a rectangle, using the block.re_rotation() if
350  // needed to get to the image, and rotating the result back to horizontal
351  // layout. (CJK characters will be on their left sides) The vertical text flag
352  // is set in the returned ImageData if the text was originally vertical, which
353  // can be used to invoke a different CJK recognition engine. The revised_box
354  // is also returned to enable calculation of output bounding boxes.
355  ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding,
356  TBOX* revised_box) const;
357  // Recognizes a word or group of words, converting to WERD_RES in *words.
358  // Analogous to classify_word_pass1, but can handle a group of words as well.
359  void LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
360  PointerVector<WERD_RES>* words);
361  // Apply segmentation search to the given set of words, within the constraints
362  // of the existing ratings matrix. If there is already a best_choice on a word
363  // leaves it untouched and just sets the done/accepted etc flags.
365 
367  bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
368  const char* word_config, int pass);
369  // Sets up the words ready for whichever engine is to be run
370  void SetupAllWordsPassN(int pass_n,
371  const TBOX* target_word_box,
372  const char* word_config,
373  PAGE_RES* page_res,
374  GenericVector<WordData>* words);
375  // Sets up the single word ready for whichever engine is to be run.
376  void SetupWordPassN(int pass_n, WordData* word);
377  // Runs word recognition on all the words.
378  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
379  PAGE_RES_IT* pr_it,
380  GenericVector<WordData>* words);
381  bool recog_all_words(PAGE_RES* page_res,
382  ETEXT_DESC* monitor,
383  const TBOX* target_word_box,
384  const char* word_config,
385  int dopasses);
386  void rejection_passes(PAGE_RES* page_res,
387  ETEXT_DESC* monitor,
388  const TBOX* target_word_box,
389  const char* word_config);
390  void bigram_correction_pass(PAGE_RES *page_res);
391  void blamer_pass(PAGE_RES* page_res);
392  // Sets script positions and detects smallcaps on all output words.
393  void script_pos_pass(PAGE_RES* page_res);
394  // Helper to recognize the word using the given (language-specific) tesseract.
395  // Returns positive if this recognizer found more new best words than the
396  // number kept from best_words.
397  int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
398  bool debug, WERD_RES** in_word,
399  PointerVector<WERD_RES>* best_words);
400  // Moves good-looking "noise"/diacritics from the reject list to the main
401  // blob list on the current word. Returns true if anything was done, and
402  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
403  bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
404  bool* make_next_word_fuzzy);
405  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
406  // Input: a set of noisy outlines that probably belong to the real_word.
407  // Output: outlines that overlapped blobs are set to nullptr and put back into
408  // the word, either in the blobs or in the reject list.
410  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
411  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
412  GenericVector<bool>* overlapped_any_blob,
413  GenericVector<C_BLOB*>* target_blobs);
414  // Attempts to assign non-overlapping outlines to their nearest blobs or
415  // make new blobs out of them.
417  int pass, WERD* real_word, PAGE_RES_IT* pr_it,
418  GenericVector<bool>* word_wanted,
419  GenericVector<C_BLOB*>* target_blobs);
420  // Starting with ok_outlines set to indicate which outlines overlap the blob,
421  // chooses the optimal set (approximately) and returns true if any outlines
422  // are desired, in which case ok_outlines indicates which ones.
423  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
424  PAGE_RES_IT* pr_it, C_BLOB* blob,
425  const GenericVector<C_OUTLINE*>& outlines,
426  int num_outlines,
427  GenericVector<bool>* ok_outlines);
428  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
429  // the inclusion of the outlines, and returns the certainty of the raw choice.
430  float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
431  const GenericVector<C_OUTLINE*>& outlines,
432  int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
433  STRING* best_str);
434  // Classifies the given blob (part of word_data->word->word) as an individual
435  // word, using languages, chopper etc, returning only the certainty of the
436  // best raw choice, and undoing all the work done to fake out the word.
437  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
438  STRING* best_str, float* c2);
439  void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
440  WordData* word_data);
441  void classify_word_pass1(const WordData& word_data,
442  WERD_RES** in_word,
443  PointerVector<WERD_RES>* out_words);
444  void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
445  TBOX &selection_box);
446 
447  void fix_rep_char(PAGE_RES_IT* page_res_it);
448 
450  const char *s,
451  const char *lengths);
452  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
453  void classify_word_pass2(const WordData& word_data,
454  WERD_RES** in_word,
455  PointerVector<WERD_RES>* out_words);
456  void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
457  WERD_RES* word, WERD_RES* new_word);
458  bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
459  bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
460  // Runs recognition with the test baseline shift and x-height and returns true
461  // if there was an improvement in recognition result.
462  bool TestNewNormalization(int original_misfits, float baseline_shift,
463  float new_x_ht, WERD_RES *word, BLOCK* block,
464  ROW *row);
465  bool recog_interactive(PAGE_RES_IT* pr_it);
466 
467  // Set fonts of this word.
468  void set_word_fonts(WERD_RES *word);
469  void font_recognition_pass(PAGE_RES* page_res);
470  void dictionary_correction_pass(PAGE_RES* page_res);
471  bool check_debug_pt(WERD_RES* word, int location);
472 
474  bool SubAndSuperscriptFix(WERD_RES *word_res);
475  void GetSubAndSuperscriptCandidates(const WERD_RES *word,
476  int *num_rebuilt_leading,
477  ScriptPos *leading_pos,
478  float *leading_certainty,
479  int *num_rebuilt_trailing,
480  ScriptPos *trailing_pos,
481  float *trailing_certainty,
482  float *avg_certainty,
483  float *unlikely_threshold);
484  WERD_RES *TrySuperscriptSplits(int num_chopped_leading,
485  float leading_certainty,
486  ScriptPos leading_pos,
487  int num_chopped_trailing,
488  float trailing_certainty,
489  ScriptPos trailing_pos,
490  WERD_RES *word,
491  bool *is_good,
492  int *retry_leading,
493  int *retry_trailing);
494  bool BelievableSuperscript(bool debug,
495  const WERD_RES &word,
496  float certainty_threshold,
497  int *left_ok,
498  int *right_ok) const;
499 
501 
502  void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
503  void write_results(PAGE_RES_IT& page_res_it, // full info
504  char newline_type, // type of newline
505  bool force_eol // override tilde crunch?
506  );
507  void set_unlv_suspects(WERD_RES *word);
508  UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?
509  bool acceptable_number_string(const char* s,
510  const char* lengths);
511  int16_t count_alphanums(const WERD_CHOICE &word);
512  int16_t count_alphas(const WERD_CHOICE &word);
514  void read_config_file(const char *filename, SetParamConstraint constraint);
515  // Initialize for potentially a set of languages defined by the language
516  // string and recursively any additional languages required by any language
517  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
518  // See init_tesseract_internal for args.
519  int init_tesseract(const char* arg0, const char* textbase,
520  const char* language, OcrEngineMode oem, char** configs,
521  int configs_size, const GenericVector<STRING>* vars_vec,
522  const GenericVector<STRING>* vars_values,
523  bool set_only_init_params, TessdataManager* mgr);
524  int init_tesseract(const char *datapath,
525  const char *language,
526  OcrEngineMode oem) {
527  TessdataManager mgr;
528  return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr, nullptr,
529  false, &mgr);
530  }
531  // Common initialization for a single language.
532  // arg0 is the datapath for the tessdata directory, which could be the
533  // path of the tessdata directory with no trailing /, or (if tessdata
534  // lives in the same directory as the executable) the path of the executable,
535  // hence the name arg0.
536  // textbase is an optional output file basename (used only for training)
537  // language is the language code to load.
538  // oem controls which engine(s) will operate on the image
539  // configs (argv) is an array of config filenames to load variables from.
540  // May be nullptr.
541  // configs_size (argc) is the number of elements in configs.
542  // vars_vec is an optional vector of variables to set.
543  // vars_values is an optional corresponding vector of values for the variables
544  // in vars_vec.
545  // If set_only_init_params is true, then only the initialization variables
546  // will be set.
547  int init_tesseract_internal(const char* arg0, const char* textbase,
548  const char* language, OcrEngineMode oem,
549  char** configs, int configs_size,
550  const GenericVector<STRING>* vars_vec,
551  const GenericVector<STRING>* vars_values,
552  bool set_only_init_params, TessdataManager* mgr);
553 
554  // Set the universal_id member of each font to be unique among all
555  // instances of the same font loaded.
556  void SetupUniversalFontIds();
557 
558  int init_tesseract_lm(const char* arg0, const char* textbase,
559  const char* language, TessdataManager* mgr);
560 
561  void recognize_page(STRING& image_name);
562  void end_tesseract();
563 
564  bool init_tesseract_lang_data(const char* arg0, const char* textbase,
565  const char* language, OcrEngineMode oem,
566  char** configs, int configs_size,
567  const GenericVector<STRING>* vars_vec,
568  const GenericVector<STRING>* vars_values,
569  bool set_only_init_params,
570  TessdataManager* mgr);
571 
572  void ParseLanguageString(const char* lang_str,
573  GenericVector<STRING>* to_load,
574  GenericVector<STRING>* not_to_load);
575 
578  #ifndef GRAPHICS_DISABLED
579  void pgeditor_main(int width, int height, PAGE_RES* page_res);
580  #endif // GRAPHICS_DISABLED
581  void process_image_event( // action in image win
582  const SVEvent &event);
583  bool process_cmd_win_event( // UI command semantics
584  int32_t cmd_event, // which menu item?
585  char* new_value // any prompt data
586  );
587  void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
588  void do_re_display(
589  bool (tesseract::Tesseract::* word_painter)(PAGE_RES_IT* pr_it));
590  bool word_display(PAGE_RES_IT* pr_it);
591  bool word_bln_display(PAGE_RES_IT* pr_it);
593  bool word_set_display(PAGE_RES_IT* pr_it);
594  // #ifndef GRAPHICS_DISABLED
595  bool word_dumper(PAGE_RES_IT* pr_it);
596  // #endif // GRAPHICS_DISABLED
597  void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
599  // make rej map for word
600  void make_reject_map(WERD_RES *word, ROW *row, int16_t pass);
601  bool one_ell_conflict(WERD_RES* word_res, bool update_map);
602  int16_t first_alphanum_index(const char *word,
603  const char *word_lengths);
604  int16_t first_alphanum_offset(const char *word,
605  const char *word_lengths);
606  int16_t alpha_count(const char *word,
607  const char *word_lengths);
608  bool word_contains_non_1_digit(const char* word,
609  const char* word_lengths);
610  void dont_allow_1Il(WERD_RES *word);
611  int16_t count_alphanums( //how many alphanums
612  WERD_RES *word);
613  void flip_0O(WERD_RES *word);
614  bool non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
615  bool non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
616  bool repeated_nonalphanum_wd(WERD_RES* word, ROW* row);
617  void nn_match_word( //Match a word
618  WERD_RES *word,
619  ROW *row);
620  void nn_recover_rejects(WERD_RES *word, ROW *row);
621  void set_done( //set done flag
622  WERD_RES *word,
623  int16_t pass);
624  int16_t safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict?
625  void flip_hyphens(WERD_RES *word);
626  void reject_I_1_L(WERD_RES *word);
627  void reject_edge_blobs(WERD_RES *word);
628  void reject_mostly_rejects(WERD_RES *word);
630  bool word_adaptable( //should we adapt?
631  WERD_RES* word,
632  uint16_t mode);
633 
635  void recog_word_recursive(WERD_RES* word);
636  void recog_word(WERD_RES *word);
637  void split_and_recog_word(WERD_RES* word);
638  void split_word(WERD_RES *word,
639  int split_pt,
640  WERD_RES **right_piece,
641  BlamerBundle **orig_blamer_bundle) const;
642  void join_words(WERD_RES *word,
643  WERD_RES *word2,
644  BlamerBundle *orig_bb) const;
646  bool digit_or_numeric_punct(WERD_RES *word, int char_position);
647  int16_t eval_word_spacing(WERD_RES_LIST &word_res_list);
648  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
649  int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
650  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
651  void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
652  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
653  void fix_fuzzy_spaces( //find fuzzy words
654  ETEXT_DESC *monitor, //progress monitor
655  int32_t word_count, //count of words in doc
656  PAGE_RES *page_res);
657  void dump_words(WERD_RES_LIST &perm, int16_t score,
658  int16_t mode, bool improved);
660  int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
661  float blob_noise_score(TBLOB *blob);
662  void break_noisiest_blob_word(WERD_RES_LIST &words);
664  GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
665  bool potential_word_crunch(WERD_RES* word,
666  GARBAGE_LEVEL garbage_level,
667  bool ok_dict_word);
668  void tilde_crunch(PAGE_RES_IT &page_res_it);
669  void unrej_good_quality_words( //unreject potential
670  PAGE_RES_IT &page_res_it);
671  void doc_and_block_rejection( //reject big chunks
672  PAGE_RES_IT &page_res_it,
673  bool good_quality_doc);
674  void quality_based_rejection(PAGE_RES_IT &page_res_it,
675  bool good_quality_doc);
676  void convert_bad_unlv_chs(WERD_RES *word_res);
677  void tilde_delete(PAGE_RES_IT &page_res_it);
678  int16_t word_blob_quality(WERD_RES *word, ROW *row);
679  void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count,
680  int16_t *accepted_match_count);
681  void unrej_good_chs(WERD_RES *word, ROW *row);
682  int16_t count_outline_errs(char c, int16_t outline_count);
683  int16_t word_outline_errs(WERD_RES *word);
684  bool terrible_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level);
685  CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode);
686  int16_t failure_count(WERD_RES *word);
687  bool noise_outlines(TWERD* word);
689  void
691  PAGE_RES* page_res, // blocks to check
692  //function to call
693  TBOX& selection_box,
694  bool (tesseract::Tesseract::* word_processor)(PAGE_RES_IT* pr_it));
696  void tess_add_doc_word( //test acceptability
697  WERD_CHOICE *word_choice //after context
698  );
699  void tess_segment_pass_n(int pass_n, WERD_RES *word);
700  bool tess_acceptable_word(WERD_RES *word);
701 
703  // Applies the box file based on the image name fname, and resegments
704  // the words in the block_list (page), with:
705  // blob-mode: one blob per line in the box file, words as input.
706  // word/line-mode: one blob per space-delimited unit after the #, and one word
707  // per line in the box file. (See comment above for box file format.)
708  // If find_segmentation is true, (word/line mode) then the classifier is used
709  // to re-segment words/lines to match the space-delimited truth string for
710  // each box. In this case, the input box may be for a word or even a whole
711  // text line, and the output words will contain multiple blobs corresponding
712  // to the space-delimited input string.
713  // With find_segmentation false, no classifier is needed, but the chopper
714  // can still be used to correctly segment touching characters with the help
715  // of the input boxes.
716  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
717  // from normal classification, ie. with a word, chopped_word, rebuild_word,
718  // seam_array, denorm, box_word, and best_state, but NO best_choice or
719  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
720  // Instead, the correct_text member of WERD_RES is set, and this may be later
721  // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
722  // is not required before calling ApplyBoxTraining.
723  PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
724  BLOCK_LIST *block_list);
725 
726  // Any row xheight that is significantly different from the median is set
727  // to the median.
728  void PreenXHeights(BLOCK_LIST *block_list);
729 
730  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
731  // All fuzzy spaces are removed, and all the words are maximally chopped.
733  BLOCK_LIST *block_list);
734  // Tests the chopper by exhaustively running chop_one_blob.
735  // The word_res will contain filled chopped_word, seam_array, denorm,
736  // box_word and best_state for the maximally chopped word.
737  void MaximallyChopWord(const GenericVector<TBOX>& boxes,
738  BLOCK* block, ROW* row, WERD_RES* word_res);
739  // Gather consecutive blobs that match the given box into the best_state
740  // and corresponding correct_text.
741  // Fights over which box owns which blobs are settled by pre-chopping and
742  // applying the blobs to box or next_box with the least non-overlap.
743  // Returns false if the box was in error, which can only be caused by
744  // failing to find an appropriate blob for a box.
745  // This means that occasionally, blobs may be incorrectly segmented if the
746  // chopper fails to find a suitable chop point.
747  bool ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
748  const TBOX& box, const TBOX* next_box,
749  const char* correct_text);
750  // Consume all source blobs that strongly overlap the given box,
751  // putting them into a new word, with the correct_text label.
752  // Fights over which box owns which blobs are settled by
753  // applying the blobs to box or next_box with the least non-overlap.
754  // Returns false if the box was in error, which can only be caused by
755  // failing to find an overlapping blob for a box.
756  bool ResegmentWordBox(BLOCK_LIST* block_list,
757  const TBOX& box, const TBOX* next_box,
758  const char* correct_text);
759  // Resegments the words by running the classifier in an attempt to find the
760  // correct segmentation that produces the required string.
761  void ReSegmentByClassification(PAGE_RES* page_res);
762  // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
763  // Returns false if an invalid UNICHAR_ID is encountered.
764  bool ConvertStringToUnichars(const char* utf8,
765  GenericVector<UNICHAR_ID>* class_ids);
766  // Resegments the word to achieve the target_text from the classifier.
767  // Returns false if the re-segmentation fails.
768  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
769  // applies a full search on the classifier results to find the best classified
770  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
771  // substitutions ARE used.
772  bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
773  WERD_RES* word_res);
774  // Recursive helper to find a match to the target_text (from text_index
775  // position) in the choices (from choices_pos position).
776  // Choices is an array of GenericVectors, of length choices_length, with each
777  // element representing a starting position in the word, and the
778  // GenericVector holding classification results for a sequence of consecutive
779  // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
780  void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
781  int choices_pos, int choices_length,
782  const GenericVector<UNICHAR_ID>& target_text,
783  int text_index,
784  float rating, GenericVector<int>* segmentation,
785  float* best_rating, GenericVector<int>* best_segmentation);
786  // Counts up the labelled words and the blobs within.
787  // Deletes all unused or emptied words, counting the unused ones.
788  // Resets W_BOL and W_EOL flags correctly.
789  // Builds the rebuild_word and rebuilds the box_word.
790  void TidyUp(PAGE_RES* page_res);
791  // Logs a bad box by line in the box file and box coords.
792  void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
793  const char *err_msg);
794  // Creates a fake best_choice entry in each WERD_RES with the correct text.
795  void CorrectClassifyWords(PAGE_RES* page_res);
796  // Call LearnWord to extract features for labelled blobs within each word.
797  // Features are stored in an internal buffer.
798  void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
799 
801  // Returns the number of misfit blob tops in this word.
802  int CountMisfitTops(WERD_RES *word_res);
803  // Returns a new x-height in pixels (original image coords) that is
804  // maximally compatible with the result in word_res.
805  // Returns 0.0f if no x-height is found that is better than the current
806  // estimate.
807  float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift);
809  // TODO(ocr-team): Find and remove obsolete parameters.
811  "Take segmentation and labeling from box file");
813  "Conversion of word/line box file to char box file");
815  "Generate training data from boxed chars");
817  "Generate more boxes from boxed chars");
819  "Break input into lines and remap boxes if present");
821  "Dump intermediate images made during page segmentation");
823  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
824  " 5=line, 6=word, 7=char"
825  " (Values from PageSegMode enum in publictypes.h)");
827  "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
828  " to loading and running the most accurate available.");
830  "Blacklist of chars not to recognize");
832  "Whitelist of chars to recognize");
834  "List of chars to override tessedit_char_blacklist");
836  "Perform training for ambiguities");
839  "Whether to use the top-line splitting process for Devanagari "
840  "documents while performing page-segmentation.");
843  "Whether to use the top-line splitting process for Devanagari "
844  "documents while performing ocr.");
846  "Write all parameters to the given file.");
848  "Generate and print debug information for adaption");
849  INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
850  INT_VAR_H(applybox_debug, 1, "Debug level");
851  INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
853  "Exposure value follows this pattern in the image"
854  " filename. The name of the image files are expected"
855  " to be in the form [lang].[fontname].exp[num].tif");
857  "Learn both character fragments (as is done in the"
858  " special low exposure mode) as well as unfragmented"
859  " characters.");
861  "Each bounding box is assumed to contain ngrams. Only"
862  " learn the ngrams whose outlines overlap horizontally.");
863  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
864  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
865  BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
867  "Try to improve fuzzy spaces");
869  "Don't bother with word plausibility");
870  BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
871  BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
873  "Add words to the document dictionary");
874  BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
875  BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
877  "Enable correction based on the word bigram dictionary.");
879  "Enable single word correction based on the dictionary.");
880  INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
881  "correction.");
883  "Remove and conditionally reassign small outlines when they"
884  " confuse layout analysis, determining diacritics vs noise");
885  INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
886  // Worst (min) certainty, for which a diacritic is allowed to make the base
887  // character worse and still be included.
888  double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
889  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
890  // make the base character worse and still be included.
891  double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
892  // Worst (min) certainty, for which a diacritic is allowed to make a new
893  // stand-alone blob.
894  double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
895  // Factor of certainty margin for adding diacritics to not count as worse.
897  "Scaling on certainty diff from Hingepoint");
898  INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
899  INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
900  INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
901  BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
902  STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
903  STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
904  STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
905  double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
906  double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
908  "good_quality_doc lte outline error limit");
909  double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
910  INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
912  "Adaptation decision algorithm for tess");
914  "Do minimal rejection on pass 1 output");
915  BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
916  BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
918  "Adaptation decision algorithm for tess");
919  BOOL_VAR_H(test_pt, false, "Test for point");
920  double_VAR_H(test_pt_x, 99999.99, "xcoord");
921  double_VAR_H(test_pt_y, 99999.99, "ycoord");
922  INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
923  INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
925  "Run paragraph detection on the post-text-recognition "
926  "(more accurate)");
927  BOOL_VAR_H(lstm_use_matrix, 1, "Use ratings matrix/beam searct with lstm");
928  STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
929  STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
931  "Allow outline errs in unrejection?");
933  "Reduce rejection on good docs");
934  BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
936  "%rej allowed before rej whole doc");
938  "%rej allowed before rej whole block");
940  "%rej allowed before rej whole row");
942  "Number of row rejects in whole word rejects"
943  "which prevents whole row rejection");
945  "Only rej partially rejected words in block rejection");
947  "Only rej partially rejected words in row rejection");
949  "Use word segmentation quality metric");
951  "Use word segmentation quality metric");
953  "Only preserve wds longer than this");
955  "Apply row rejection to good docs");
957  "rej good doc wd if more than this fraction rejected");
959  "Reject all bad quality wds");
962  "Output data to debug file");
963  BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
965  "good_quality_doc gte good char limit");
967  "Mark v.bad words for tilde crunch");
968  BOOL_VAR_H(hocr_font_info, false,
969  "Add font info to hocr output");
970  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
971  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
972  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
973  BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
975  "crunch garbage cert lt this");
976  double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
977  double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
978  double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
979  BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
980  double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
981  double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
982  double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
983  double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
984  double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
986  "Del if word gt xht x this above bl");
987  double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
988  double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
989  INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
990  INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
991  BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
992  BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
994  "Don't pot crunch sensible strings");
995  BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
997  "Don't crunch words with long lower case strings");
999  "Don't crunch words with long lower case strings");
1000  INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
1001  INT_VAR_H(crunch_debug, 0, "As it says");
1003  "How many non-noise blbs either side?");
1004  double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
1005  BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
1006  INT_VAR_H(fixsp_done_mode, 1, "What constitues done for spacing");
1007  INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
1009  "Punct. chs expected WITHIN numbers");
1011  "Max allowed deviation of blob top outside of font data");
1012  INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
1013  INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
1014  double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse "
1015  "certainty does a superscript position glyph need to be for us "
1016  "to try classifying it as a char with a different baseline?");
1017  double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in "
1018  "badness do we think sufficient to choose a superscript over "
1019  "what we'd thought. For example, a value of 0.6 means we want "
1020  "to reduce badness of certainty by 40%");
1022  "A superscript scaled down more than this is unbelievably "
1023  "small. For example, 0.3 means we expect the font size to "
1024  "be no smaller than 30% of the text line font size.");
1026  "Maximum top of a character measured as a multiple of x-height "
1027  "above the baseline for us to reconsider whether it's a "
1028  "subscript.");
1030  "Minimum bottom of a character measured as a multiple of "
1031  "x-height above the baseline for us to reconsider whether it's "
1032  "a superscript.");
1034  "Write block separators in output");
1036  "Write repetition char code");
1037  BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
1038  BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
1039  BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
1040  BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
1041  BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
1042  BOOL_VAR_H(textonly_pdf, false,
1043  "Create PDF with only one invisible text layer");
1044  INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
1045  INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
1047  "Specify minimum characters to try during OSD");
1049  "Output char for unidentified blobs");
1050  INT_VAR_H(suspect_level, 99, "Suspect marker level");
1052  "Min suspect level for rejecting spaces");
1053  INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this");
1054  BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
1055  double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
1056  double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
1057  BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
1058  BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
1060  "Make output have exactly one word per WERD");
1062  "Don't reject ANYTHING AT ALL");
1063  BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
1064  INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
1065  BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
1066  BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
1068  "Aspect ratio dot/hyphen test");
1070  "Aspect ratio dot/hyphen test");
1071  BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
1072  BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
1073  BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
1074  BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
1075  BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
1076  BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
1077  BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
1078  BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
1080  INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
1082  "Allow NN to unrej");
1083  STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
1084  INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
1085  BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
1087  "-1 -> All pages, else specific page to process");
1088  BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
1089  BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
1090  STRING_VAR_H(file_type, ".tif", "Filename extension");
1091  BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
1093  "List of languages to load with this one");
1095  "In multilingual mode use params model of the primary language");
1096  // Min acceptable orientation margin (difference in scores between top and 2nd
1097  // choice in OSResults::orientations) to believe the page orientation.
1099  "Min acceptable orientation margin");
1100  BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
1101  BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model");
1103  "Allow feature extractors to see the original outline");
1105  "Only initialize with the config file. Useful if the instance is "
1106  "not going to be used for OCR but say only for layout analysis.");
1107  BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
1108  BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
1110  "Force using vertical text page mode");
1112  "Fraction of textlines deemed vertical to use vertical page "
1113  "mode");
1115  "Fraction of height used as a minimum gap for aligned blobs.");
1116  INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
1118  "Preserve multiple interword spaces");
1120  "Page separator (default is form feed control character)");
1122  "Allows to include alternative symbols choices in the hOCR output. "
1123  "Valid input values are 0, 1 and 2. 0 is the default value. "
1124  "With 1 the alternative symbol choices per timestep are included. "
1125  "With 2 the alternative symbol choices are accumulated per character.");
1126 
1128  FILE *init_recog_training(const STRING &fname);
1129  void recog_training_segmented(const STRING &fname,
1130  PAGE_RES *page_res,
1131  volatile ETEXT_DESC *monitor,
1132  FILE *output_file);
1133  void ambigs_classify_and_output(const char *label,
1134  PAGE_RES_IT* pr_it,
1135  FILE *output_file);
1136 
1137  private:
1138  // The filename of a backup config file. If not null, then we currently
1139  // have a temporary debug config file loaded, and backup_config_file_
1140  // will be loaded, and set to null when debug is complete.
1141  const char* backup_config_file_;
1142  // The filename of a config file to read when processing a debug word.
1143  STRING word_config_;
1144  // Image used for input to layout analysis and tesseract recognition.
1145  // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
1146  Pix* pix_binary_;
1147  // Grey-level input image if the input was not binary, otherwise nullptr.
1148  Pix* pix_grey_;
1149  // Original input image. Color if the input was color.
1150  Pix* pix_original_;
1151  // Thresholds that were used to generate the thresholded image from grey.
1152  Pix* pix_thresholds_;
1153  // Debug images. If non-empty, will be written on destruction.
1154  DebugPixa pixa_debug_;
1155  // Input image resolution after any scaling. The resolution is not well
1156  // transmitted by operations on Pix, so we keep an independent record here.
1157  int source_resolution_;
1158  // The shiro-rekha splitter object which is used to split top-lines in
1159  // Devanagari words to provide a better word and grapheme segmentation.
1160  ShiroRekhaSplitter splitter_;
1161  // Page segmentation/layout
1162  Textord textord_;
1163  // True if the primary language uses right_to_left reading order.
1164  bool right_to_left_;
1165  Pix* scaled_color_;
1166  int scaled_factor_;
1167  FCOORD deskew_;
1168  FCOORD reskew_;
1169  TesseractStats stats_;
1170  // Sub-languages to be tried in addition to this.
1171  GenericVector<Tesseract*> sub_langs_;
1172  // Most recently used Tesseract out of this and sub_langs_. The default
1173  // language for the next word.
1174  Tesseract* most_recently_used_;
1175  // The size of the font table, ie max possible font id + 1.
1176  int font_table_size_;
1177  // Equation detector. Note: this pointer is NOT owned by the class.
1178  EquationDetect* equ_detect_;
1179  // LSTM recognizer, if available.
1180  LSTMRecognizer* lstm_recognizer_;
1181  // Output "page" number (actually line number) using TrainLineRecognizer.
1182  int train_line_page_num_;
1183 };
1184 
1185 } // namespace tesseract
1186 
1187 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:280
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:159
double superscript_bettered_certainty
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:500
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:474
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:966
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1287
int scaled_factor() const
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:394
void ReSegmentByClassification(PAGE_RES *page_res)
#define INT_VAR_H(name, val, comment)
Definition: params.h:264
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2060
int UNICHAR_ID
Definition: unichar.h:35
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:102
Pix * pix_grey() const
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:234
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1591
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1074
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:373
char * tessedit_write_params_to_file
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:531
Definition: blobs.h:402
void SetScaledColor(int factor, Pix *color)
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:74
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
void set_pix_original(Pix *original_pix)
Dict & getDict() override
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1239
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:222
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:759
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:267
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:72
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1420
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:462
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:504
Definition: rect.h:34
Pix * pix_original() const
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:565
#define double_VAR_H(name, val, comment)
Definition: params.h:273
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:910
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:514
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:298
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:373
Pix * scaled_color() const
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
bool tessedit_enable_bigram_correction
void set_pix_grey(Pix *grey_pix)
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:163
double tessedit_reject_doc_percent
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1981
double tessedit_reject_block_percent
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:959
#define STRING_VAR_H(name, val, comment)
Definition: params.h:270
bool crunch_early_convert_bad_unlv_chs
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:978
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:269
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:418
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:587
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:479
void process_image_event(const SVEvent &event)
Definition: pgedit.cpp:559
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:794
double tessedit_reject_row_percent
bool tessedit_resegment_from_line_boxes
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:660
Pix * BestPix() const
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:105
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:790
PointerVector< WERD_RES > lang_words
bool tessedit_preserve_blk_rej_perfect_wds
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:250
bool applybox_learn_chars_and_char_frags_mode
double tessedit_good_doc_still_rowrej_wd
SetParamConstraint
Definition: params.h:36
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1649
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:487
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
bool textord_tabfind_force_vertical_text
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:61
void SetEquationDetect(EquationDetect *detector)
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:296
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:40
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:161
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:43
void nn_match_word(WERD_RES *word, ROW *row)
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
const Textord & textord() const
const FCOORD & reskew() const
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:125
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
void SetupWordScripts(BLOCK_LIST *blocks)
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:790
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1538
double textord_tabfind_vertical_text_ratio
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1481
CRUNCH_MODE
Definition: pageres.h:159
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:612
double tessedit_whole_wd_rej_row_percent
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void SetupUniversalFontIds()
Definition: tessedit.cpp:441
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:35
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:740
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:715
int source_resolution() const
#define FALSE
Definition: capi.h:52
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:327
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:138
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:308
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:247
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:533
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
bool process_cmd_win_event(int32_t cmd_event, char *new_value)
Definition: pgedit.cpp:387
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:262
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:727
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:542
bool right_to_left() const
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool tessedit_preserve_row_rej_perfect_wds
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:944
WordData(const PAGE_RES_IT &page_res_it)
void set_pix_thresholds(Pix *thresholds)
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:127
char * ok_repeated_ch_non_alphanum_wds
void set_done(WERD_RES *word, int16_t pass)
Definition: werd.h:59
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
Pix * pix_binary() const
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:129
unsigned char BOOL8
Definition: host.h:34
Definition: ocrrow.h:36
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:396
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:175
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:73
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:258
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1764
Definition: ocrblock.h:30
Textord * mutable_textord()
void flip_0O(WERD_RES *word)
Definition: reject.cpp:678
FILE * init_recog_training(const STRING &fname)
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:578
void set_source_resolution(int ppi)
void TidyUp(PAGE_RES *page_res)
GARBAGE_LEVEL
Definition: docqual.h:29
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:618
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1725
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:67
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1152
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:62
void recognize_page(STRING &image_name)
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:920
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:473
Definition: strngs.h:45
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:233
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:182
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:78
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:710
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:218
Tesseract * get_sub_lang(int index) const
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:70
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:268
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:226
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
Definition: points.h:189
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:590
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:599
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1868
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
bool AnyTessLang() const
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:139
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:176
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:297
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:680
void nn_recover_rejects(WERD_RES *word, ROW *row)
Definition: blobs.h:268
void CorrectClassifyWords(PAGE_RES *page_res)
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:895
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:93
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:949
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:716
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
double rej_whole_of_mostly_reject_word_fract
double textord_tabfind_aligned_gap_fraction
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2117
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
bool AnyLSTMLang() const
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:621
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:383
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:39
void PreenXHeights(BLOCK_LIST *block_list)
int num_sub_langs() const
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:645
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1019
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:116
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:198
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:860
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:43
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1504
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:637
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1338