26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_ 27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_ 31 #include "allheaders.h" 109 class EquationDetect;
111 class LSTMRecognizer;
198 pixDestroy(&pix_binary_);
208 pixDestroy(&pix_grey_);
209 pix_grey_ = grey_pix;
214 pixDestroy(&pix_original_);
215 pix_original_ = original_pix;
217 for (
int i = 0; i < sub_langs_.size(); ++i)
230 if (pixGetWidth(pix_original_) ==
ImageWidth())
231 return pix_original_;
232 else if (pix_grey_ !=
nullptr)
238 pixDestroy(&pix_thresholds_);
239 pix_thresholds_ = thresholds;
242 return source_resolution_;
245 source_resolution_ = ppi;
248 return pixGetWidth(pix_binary_);
251 return pixGetHeight(pix_binary_);
254 return scaled_color_;
257 return scaled_factor_;
260 scaled_factor_ = factor;
261 scaled_color_ = color;
271 return right_to_left_;
274 return sub_langs_.size();
277 return sub_langs_[index];
282 for (
int i = 0; i < sub_langs_.size(); ++i) {
290 for (
int i = 0; i < sub_langs_.size(); ++i) {
317 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
321 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
322 Pix** music_mask_pix);
331 const STRING& output_basename,
332 BLOCK_LIST *block_list);
338 BLOCK_LIST *block_list,
347 int start_box,
int end_box,
356 TBOX* revised_box)
const;
368 const char* word_config,
int pass);
371 const TBOX* target_word_box,
372 const char* word_config,
383 const TBOX* target_word_box,
384 const char* word_config,
388 const TBOX* target_word_box,
389 const char* word_config);
404 bool* make_next_word_fuzzy);
438 STRING* best_str,
float* c2);
445 TBOX &selection_box);
451 const char *lengths);
476 int *num_rebuilt_leading,
478 float *leading_certainty,
479 int *num_rebuilt_trailing,
481 float *trailing_certainty,
482 float *avg_certainty,
483 float *unlikely_threshold);
485 float leading_certainty,
487 int num_chopped_trailing,
488 float trailing_certainty,
493 int *retry_trailing);
496 float certainty_threshold,
498 int *right_ok)
const;
510 const char* lengths);
525 const char *language,
528 return init_tesseract(datapath,
nullptr, language, oem,
nullptr, 0,
nullptr,
nullptr,
549 char** configs,
int configs_size,
566 char** configs,
int configs_size,
569 bool set_only_init_params,
578 #ifndef GRAPHICS_DISABLED 580 #endif // GRAPHICS_DISABLED 603 const char *word_lengths);
605 const char *word_lengths);
607 const char *word_lengths);
609 const char* word_lengths);
657 void dump_words(WERD_RES_LIST &perm, int16_t score,
658 int16_t mode,
bool improved);
673 bool good_quality_doc);
675 bool good_quality_doc);
680 int16_t *accepted_match_count);
724 BLOCK_LIST *block_list);
733 BLOCK_LIST *block_list);
748 const TBOX& box,
const TBOX* next_box,
749 const char* correct_text);
757 const TBOX& box,
const TBOX* next_box,
758 const char* correct_text);
781 int choices_pos,
int choices_length,
793 const char *err_msg);
811 "Take segmentation and labeling from box file");
813 "Conversion of word/line box file to char box file");
815 "Generate training data from boxed chars");
817 "Generate more boxes from boxed chars");
819 "Break input into lines and remap boxes if present");
821 "Dump intermediate images made during page segmentation");
823 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 824 " 5=line, 6=word, 7=char" 825 " (Values from PageSegMode enum in publictypes.h)");
827 "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" 828 " to loading and running the most accurate available.");
830 "Blacklist of chars not to recognize");
832 "Whitelist of chars to recognize");
834 "List of chars to override tessedit_char_blacklist");
836 "Perform training for ambiguities");
839 "Whether to use the top-line splitting process for Devanagari " 840 "documents while performing page-segmentation.");
843 "Whether to use the top-line splitting process for Devanagari " 844 "documents while performing ocr.");
846 "Write all parameters to the given file.");
848 "Generate and print debug information for adaption");
853 "Exposure value follows this pattern in the image" 854 " filename. The name of the image files are expected" 855 " to be in the form [lang].[fontname].exp[num].tif");
857 "Learn both character fragments (as is done in the" 858 " special low exposure mode) as well as unfragmented" 861 "Each bounding box is assumed to contain ngrams. Only" 862 " learn the ngrams whose outlines overlap horizontally.");
867 "Try to improve fuzzy spaces");
869 "Don't bother with word plausibility");
873 "Add words to the document dictionary");
877 "Enable correction based on the word bigram dictionary.");
879 "Enable single word correction based on the dictionary.");
883 "Remove and conditionally reassign small outlines when they" 884 " confuse layout analysis, determining diacritics vs noise");
897 "Scaling on certainty diff from Hingepoint");
908 "good_quality_doc lte outline error limit");
912 "Adaptation decision algorithm for tess");
914 "Do minimal rejection on pass 1 output");
918 "Adaptation decision algorithm for tess");
925 "Run paragraph detection on the post-text-recognition " 931 "Allow outline errs in unrejection?");
933 "Reduce rejection on good docs");
936 "%rej allowed before rej whole doc");
938 "%rej allowed before rej whole block");
940 "%rej allowed before rej whole row");
942 "Number of row rejects in whole word rejects" 943 "which prevents whole row rejection");
945 "Only rej partially rejected words in block rejection");
947 "Only rej partially rejected words in row rejection");
949 "Use word segmentation quality metric");
951 "Use word segmentation quality metric");
953 "Only preserve wds longer than this");
955 "Apply row rejection to good docs");
957 "rej good doc wd if more than this fraction rejected");
959 "Reject all bad quality wds");
962 "Output data to debug file");
965 "good_quality_doc gte good char limit");
967 "Mark v.bad words for tilde crunch");
969 "Add font info to hocr output");
975 "crunch garbage cert lt this");
986 "Del if word gt xht x this above bl");
994 "Don't pot crunch sensible strings");
997 "Don't crunch words with long lower case strings");
999 "Don't crunch words with long lower case strings");
1003 "How many non-noise blbs either side?");
1009 "Punct. chs expected WITHIN numbers");
1011 "Max allowed deviation of blob top outside of font data");
1015 "certainty does a superscript position glyph need to be for us " 1016 "to try classifying it as a char with a different baseline?");
1018 "badness do we think sufficient to choose a superscript over " 1019 "what we'd thought. For example, a value of 0.6 means we want " 1020 "to reduce badness of certainty by 40%");
1022 "A superscript scaled down more than this is unbelievably " 1023 "small. For example, 0.3 means we expect the font size to " 1024 "be no smaller than 30% of the text line font size.");
1026 "Maximum top of a character measured as a multiple of x-height " 1027 "above the baseline for us to reconsider whether it's a " 1030 "Minimum bottom of a character measured as a multiple of " 1031 "x-height above the baseline for us to reconsider whether it's " 1034 "Write block separators in output");
1036 "Write repetition char code");
1043 "Create PDF with only one invisible text layer");
1047 "Specify minimum characters to try during OSD");
1049 "Output char for unidentified blobs");
1052 "Min suspect level for rejecting spaces");
1060 "Make output have exactly one word per WERD");
1062 "Don't reject ANYTHING AT ALL");
1068 "Aspect ratio dot/hyphen test");
1070 "Aspect ratio dot/hyphen test");
1082 "Allow NN to unrej");
1087 "-1 -> All pages, else specific page to process");
1093 "List of languages to load with this one");
1095 "In multilingual mode use params model of the primary language");
1099 "Min acceptable orientation margin");
1103 "Allow feature extractors to see the original outline");
1105 "Only initialize with the config file. Useful if the instance is " 1106 "not going to be used for OCR but say only for layout analysis.");
1110 "Force using vertical text page mode");
1112 "Fraction of textlines deemed vertical to use vertical page " 1115 "Fraction of height used as a minimum gap for aligned blobs.");
1118 "Preserve multiple interword spaces");
1120 "Page separator (default is form feed control character)");
1122 "Allows to include alternative symbols choices in the hOCR output. " 1123 "Valid input values are 0, 1 and 2. 0 is the default value. " 1124 "With 1 the alternative symbol choices per timestep are included. " 1125 "With 2 the alternative symbol choices are accumulated per character.");
1141 const char* backup_config_file_;
1152 Pix* pix_thresholds_;
1157 int source_resolution_;
1164 bool right_to_left_;
1176 int font_table_size_;
1182 int train_line_page_num_;
1187 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_ bool applybox_learn_ngrams_mode
bool textord_tabfind_vertical_text
void set_unlv_suspects(WERD_RES *word)
double subscript_max_y_top
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
double superscript_bettered_certainty
Pix ** mutable_pix_binary()
int16_t alpha_count(const char *word, const char *word_lengths)
int crunch_leave_uc_strings
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
int16_t first_alphanum_index(const char *word, const char *word_lengths)
int16_t failure_count(WERD_RES *word)
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
int scaled_factor() const
bool acceptable_number_string(const char *s, const char *lengths)
bool tessedit_create_hocr
void ReSegmentByClassification(PAGE_RES *page_res)
bool paragraph_text_based
#define INT_VAR_H(name, val, comment)
char * tessedit_char_whitelist
void font_recognition_pass(PAGE_RES *page_res)
bool tessedit_prefer_joined_punct
bool tessedit_write_images
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
bool tessedit_minimal_rejection
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
bool tessedit_zero_kelvin_rejection
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
char * tessedit_load_sublangs
char * tessedit_write_params_to_file
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
void dont_allow_1Il(WERD_RES *word)
double superscript_worse_certainty
void SetScaledColor(int factor, Pix *color)
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
Assume a single uniform block of text. (Default.)
void set_pix_original(Pix *original_pix)
bool rej_1Il_use_dict_word
Dict & getDict() override
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
bool tessedit_fix_hyphens
double min_orientation_margin
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
bool tessedit_rejection_debug
bool tessedit_debug_block_rejection
bool word_display(PAGE_RES_IT *pr_it)
bool write_results_empty_block
#define BOOL_VAR_H(name, val, comment)
void tess_add_doc_word(WERD_CHOICE *word_choice)
int tessedit_ocr_engine_mode
char * tessedit_char_unblacklist
double noise_cert_disjoint
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
bool suspect_constrain_1Il
bool crunch_leave_accept_strings
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Pix * pix_original() const
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
#define double_VAR_H(name, val, comment)
bool tessedit_word_for_word
bool tessedit_debug_quality_metrics
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
bool crunch_include_numerals
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
bool tessedit_timing_debug
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
int16_t count_alphas(const WERD_CHOICE &word)
bool tessedit_dump_choices
Pix * scaled_color() const
bool tessedit_zero_rejection
bool tessedit_resegment_from_boxes
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
bool tessedit_enable_bigram_correction
void set_pix_grey(Pix *grey_pix)
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
double tessedit_reject_doc_percent
void set_word_fonts(WERD_RES *word)
double tessedit_reject_block_percent
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
bool tessedit_adaption_debug
double suspect_accept_rating
#define STRING_VAR_H(name, val, comment)
bool crunch_early_convert_bad_unlv_chs
bool enable_noise_removal
bool noise_outlines(TWERD *word)
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
int pageseg_devanagari_split_strategy
bool tessedit_override_permuter
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
double superscript_scaledown_ratio
void tilde_crunch(PAGE_RES_IT &page_res_it)
bool tessedit_minimal_rej_pass1
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
int crunch_pot_indicators
void process_image_event(const SVEvent &event)
bool SubAndSuperscriptFix(WERD_RES *word_res)
int tessedit_preserve_min_wd_len
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
double tessedit_reject_row_percent
bool tessedit_resegment_from_line_boxes
bool tessedit_make_boxes_from_boxes
double crunch_del_low_word
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
void convert_bad_unlv_chs(WERD_RES *word_res)
char * chs_trailing_punct2
bool docqual_excuse_outline_errs
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
bool tessedit_dump_pageseg_images
float blob_noise_score(TBLOB *blob)
PointerVector< WERD_RES > lang_words
char * numeric_punctuation
bool tessedit_preserve_blk_rej_perfect_wds
bool tessedit_init_config_only
void SearchWords(PointerVector< WERD_RES > *words)
bool applybox_learn_chars_and_char_frags_mode
double tessedit_good_doc_still_rowrej_wd
int quality_min_initial_alphas_reqd
double superscript_min_y_bottom
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
double tessedit_lower_flip_hyphen
bool interactive_display_mode
bool textord_tabfind_force_vertical_text
int16_t word_blob_quality(WERD_RES *word, ROW *row)
double fixsp_small_outlines_size
bool preserve_interword_spaces
void SetEquationDetect(EquationDetect *detector)
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
char * tessedit_char_blacklist
void recog_word(WERD_RES *word)
int tessedit_image_border
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
bool tessedit_enable_doc_dict
bool tessedit_redo_xheight
void nn_match_word(WERD_RES *word, ROW *row)
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
bool debug_acceptable_wds
bool tessedit_test_adaption
const Textord & textord() const
bool rej_alphas_in_number_perm
const FCOORD & reskew() const
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
bool unlv_tilde_crunching
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
void SetupWordScripts(BLOCK_LIST *blocks)
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
double textord_tabfind_vertical_text_ratio
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool tessedit_write_block_separators
bool tessedit_ambigs_training
int16_t safe_dict_word(const WERD_RES *werd_res)
double tessedit_whole_wd_rej_row_percent
int debug_fix_space_level
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void SetupUniversalFontIds()
bool word_adaptable(WERD_RES *word, uint16_t mode)
void script_pos_pass(PAGE_RES *page_res)
void ResetAdaptiveClassifier()
bool tessedit_train_from_boxes
bool rej_1Il_trust_permuter_type
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
int source_resolution() const
bool tessedit_matcher_log
bool textord_use_cjk_fp_model
int16_t doc_good_char_quality
void pgeditor_main(int width, int height, PAGE_RES *page_res)
void split_and_recog_word(WERD_RES *word)
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
SVMenuNode * build_menu_new()
bool fixspace_thinks_word_done(WERD_RES *word)
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
bool process_cmd_win_event(int32_t cmd_event, char *new_value)
bool textord_equation_detect
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
int paragraph_debug_level
bool word_bln_display(PAGE_RES_IT *pr_it)
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
bool right_to_left() const
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
bool tessedit_use_primary_params_model
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool tessedit_fix_fuzzy_spaces
bool tessedit_preserve_row_rej_perfect_wds
bool word_set_display(PAGE_RES_IT *pr_it)
WordData(const PAGE_RES_IT &page_res_it)
void set_pix_thresholds(Pix *thresholds)
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
bool last_char_was_newline
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
int16_t count_outline_errs(char c, int16_t outline_count)
char * ok_repeated_ch_non_alphanum_wds
void set_done(WERD_RES *word, int16_t pass)
int tessedit_bigram_debug
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
int min_characters_to_try
bool tessedit_reject_bad_qual_wds
double suspect_rating_per_ch
double tessedit_upper_flip_hyphen
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
int16_t word_outline_errs(WERD_RES *word)
UNICHAR_ID get_rep_char(WERD_RES *word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool crunch_leave_ok_strings
void ResetDocumentDictionary()
Textord * mutable_textord()
void flip_0O(WERD_RES *word)
FILE * init_recog_training(const STRING &fname)
double crunch_del_high_word
void reject_mostly_rejects(WERD_RES *word)
void set_source_resolution(int ppi)
void TidyUp(PAGE_RES *page_res)
int32_t adaption_word_number
int crunch_long_repetitions
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
bool tessedit_debug_fonts
void fix_rep_char(PAGE_RES_IT *page_res_it)
double crunch_terrible_rating
bool crunch_early_merge_tess_fails
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
bool tess_acceptable_word(WERD_RES *word)
bool tessedit_unrej_any_wd
void recognize_page(STRING &image_name)
bool word_dumper(PAGE_RES_IT *pr_it)
void bigram_correction_pass(PAGE_RES *page_res)
double crunch_del_min_width
char * chs_trailing_punct1
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
void SetupWordPassN(int pass_n, WordData *word)
bool tessedit_enable_dict_correction
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Tesseract * get_sub_lang(int index) const
int CountMisfitTops(WERD_RES *word_res)
void reject_edge_blobs(WERD_RES *word)
double crunch_poor_garbage_cert
bool tessedit_good_quality_unrej
int tessedit_pageseg_mode
bool tessedit_dont_blkrej_good_wds
int ocr_devanagari_split_strategy
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
void recog_word_recursive(WERD_RES *word)
void tilde_delete(PAGE_RES_IT &page_res_it)
char * conflict_set_I_l_1
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
int crunch_leave_lc_strings
double noise_cert_basechar
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
bool tessedit_train_line_recognizer
bool check_debug_pt(WERD_RES *word, int location)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
bool tessedit_dont_rowrej_good_wds
void SetBlackAndWhitelist()
bool rej_use_tess_accepted
char * applybox_exposure_pattern
bool tessedit_use_reject_spaces
double crunch_poor_garbage_rate
bool tessedit_debug_doc_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
void read_config_file(const char *filename, SetParamConstraint constraint)
double quality_outline_pc
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
void nn_recover_rejects(WERD_RES *word, ROW *row)
bool tessedit_row_rej_good_docs
bool textord_tabfind_show_vlines
void CorrectClassifyWords(PAGE_RES *page_res)
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
double crunch_small_outlines_size
bool tessedit_consistent_reps
int tessedit_tess_adaption_mode
void blamer_pass(PAGE_RES *page_res)
bool recog_interactive(PAGE_RES_IT *pr_it)
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
double rej_whole_of_mostly_reject_word_fract
bool tessedit_create_boxfile
double textord_tabfind_aligned_gap_fraction
bool tessedit_display_outwords
int multilang_debug_level
void dictionary_correction_pass(PAGE_RES *page_res)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
void flip_hyphens(WERD_RES *word)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
int16_t count_alphanums(const WERD_CHOICE &word)
bool tessedit_write_rep_codes
void PrerecAllWordsPar(const GenericVector< WordData > &words)
void PreenXHeights(BLOCK_LIST *block_list)
int fixsp_non_noise_limit
bool crunch_terrible_garbage
int num_sub_langs() const
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool poly_allow_detailed_fx
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
bool tilde_crunch_written
void unrej_good_chs(WERD_RES *word, ROW *row)
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
int tessedit_test_adaption_mode
int x_ht_acceptance_tolerance
void reject_I_1_L(WERD_RES *word)
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
double crunch_pot_poor_rate
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
double crunch_pot_poor_cert
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)