26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
29 #include "allheaders.h"
39 class BLOB_CHOICE_LIST_CLIST;
100 #ifndef ANDROID_BUILD
101 class CubeLineObject;
103 class CubeRecoContext;
105 class EquationDetect;
107 #ifndef ANDROID_BUILD
108 class TesseractCubeCombiner;
202 pixDestroy(&pix_grey_);
203 pix_grey_ = grey_pix;
213 return pix_grey_ !=
NULL ? pix_grey_ : pix_binary_;
216 pixDestroy(&pix_thresholds_);
217 pix_thresholds_ = thresholds;
220 return source_resolution_;
223 source_resolution_ = ppi;
226 return pixGetWidth(pix_binary_);
229 return pixGetHeight(pix_binary_);
232 return scaled_color_;
235 return scaled_factor_;
238 scaled_factor_ = factor;
239 scaled_color_ = color;
249 return right_to_left_;
252 return sub_langs_.size();
255 return sub_langs_[index];
260 for (
int i = 0; i < sub_langs_.size(); ++i) {
287 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
291 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
292 Pix** music_mask_pix);
298 const char* word_config,
int pass);
301 const TBOX* target_word_box,
302 const char* word_config,
313 const TBOX* target_word_box,
314 const char* word_config,
318 const TBOX* target_word_box,
319 const char* word_config);
335 bool* make_next_word_fuzzy);
369 STRING* best_str,
float* c2);
376 TBOX &selection_box);
382 const char *lengths);
407 int *num_rebuilt_leading,
409 float *leading_certainty,
410 int *num_rebuilt_trailing,
412 float *trailing_certainty,
413 float *avg_certainty,
414 float *unlikely_threshold);
416 float leading_certainty,
418 int num_chopped_trailing,
419 float trailing_certainty,
424 int *retry_trailing);
427 float certainty_threshold,
429 int *right_ok)
const;
432 #ifndef ANDROID_BUILD
452 const char* cube_best_str,
455 Boxa** char_boxes,
CharSamp*** char_samples);
469 const char *lengths);
479 const char *textbase,
480 const char *language,
486 bool set_only_init_params);
488 const char *language,
510 const char *textbase,
511 const char *language,
517 bool set_only_init_params);
524 const char *textbase,
525 const char *language);
531 const char *textbase,
532 const char *language,
538 bool set_only_init_params);
546 #ifndef GRAPHICS_DISABLED
548 #endif // GRAPHICS_DISABLED
571 const char *word_lengths);
573 const char *word_lengths);
575 const char *word_lengths);
577 const char *word_lengths);
641 BOOL8 good_quality_doc);
643 BOOL8 good_quality_doc);
648 inT16 *accepted_match_count);
661 TBOX & selection_box,
692 BLOCK_LIST *block_list);
701 BLOCK_LIST *block_list);
716 const TBOX& box,
const TBOX& next_box,
717 const char* correct_text);
725 const TBOX& box,
const TBOX& next_box,
726 const char* correct_text);
749 int choices_pos,
int choices_length,
761 const char *err_msg);
779 "Take segmentation and labeling from box file");
781 "Conversion of word/line box file to char box file");
783 "Generate training data from boxed chars");
785 "Generate more boxes from boxed chars");
787 "Dump intermediate images made during page segmentation");
789 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
790 " 5=line, 6=word, 7=char"
791 " (Values from PageSegMode enum in publictypes.h)");
793 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
794 " to loading and running only Tesseract (no Cube, no combiner)."
795 " (Values from OcrEngineMode enum in tesseractclass.h)");
797 "Blacklist of chars not to recognize");
799 "Whitelist of chars to recognize");
801 "List of chars to override tessedit_char_blacklist");
803 "Perform training for ambiguities");
806 "Whether to use the top-line splitting process for Devanagari "
807 "documents while performing page-segmentation.");
810 "Whether to use the top-line splitting process for Devanagari "
811 "documents while performing ocr.");
813 "Write all parameters to the given file.");
815 "Generate and print debug information for adaption");
820 "Exposure value follows this pattern in the image"
821 " filename. The name of the image files are expected"
822 " to be in the form [lang].[fontname].exp[num].tif");
824 "Learn both character fragments (as is done in the"
825 " special low exposure mode) as well as unfragmented"
828 "Each bounding box is assumed to contain ngrams. Only"
829 " learn the ngrams whose outlines overlap horizontally.");
834 "Try to improve fuzzy spaces");
836 "Dont bother with word plausibility");
840 "Add words to the document dictionary");
844 "Enable correction based on the word bigram dictionary.");
846 "Enable single word correction based on the dictionary.");
850 "Remove and conditionally reassign small outlines when they"
851 " confuse layout analysis, determining diacritics vs noise");
864 "Scaling on certainty diff from Hingepoint");
875 "good_quality_doc lte outline error limit");
879 "Adaptation decision algorithm for tess");
881 "Do minimal rejection on pass 1 output");
885 "Adaptation decision algorithm for tess");
891 "Run paragraph detection on the post-text-recognition "
897 "Allow outline errs in unrejection?");
899 "Reduce rejection on good docs");
902 "%rej allowed before rej whole doc");
904 "%rej allowed before rej whole block");
906 "%rej allowed before rej whole row");
908 "Number of row rejects in whole word rejects"
909 "which prevents whole row rejection");
911 "Only rej partially rejected words in block rejection");
913 "Only rej partially rejected words in row rejection");
915 "Use word segmentation quality metric");
917 "Use word segmentation quality metric");
919 "Only preserve wds longer than this");
921 "Apply row rejection to good docs");
923 "rej good doc wd if more than this fraction rejected");
925 "Reject all bad quality wds");
928 "Output data to debug file");
931 "good_quality_doc gte good char limit");
933 "Mark v.bad words for tilde crunch");
935 "Add font info to hocr output");
941 "crunch garbage cert lt this");
952 "Del if word gt xht x this above bl");
960 "Dont pot crunch sensible strings");
963 "Dont crunch words with long lower case strings");
965 "Dont crunch words with long lower case strings");
969 "How many non-noise blbs either side?");
975 "Punct. chs expected WITHIN numbers");
977 "Max allowed deviation of blob top outside of font data");
981 "certainty does a superscript position glyph need to be for us "
982 "to try classifying it as a char with a different baseline?");
984 "badness do we think sufficient to choose a superscript over "
985 "what we'd thought. For example, a value of 0.6 means we want "
986 "to reduce badness of certainty by 40%");
988 "A superscript scaled down more than this is unbelievably "
989 "small. For example, 0.3 means we expect the font size to "
990 "be no smaller than 30% of the text line font size.");
992 "Maximum top of a character measured as a multiple of x-height "
993 "above the baseline for us to reconsider whether it's a "
996 "Minimum bottom of a character measured as a multiple of "
997 "x-height above the baseline for us to reconsider whether it's "
1000 "Write block separators in output");
1002 "Write repetition char code");
1008 "Output char for unidentified blobs");
1011 "Min suspect level for rejecting spaces");
1013 "Dont Suspect dict wds longer than this");
1020 "Make output have exactly one word per WERD");
1022 "Dont reject ANYTHING AT ALL");
1028 "Aspect ratio dot/hyphen test");
1030 "Aspect ratio dot/hyphen test");
1042 "Allow NN to unrej");
1047 "-1 -> All pages, else specifc page to process");
1053 "Debug level for TessdataManager functions.");
1055 "List of languages to load with this one");
1057 "In multilingual mode use params model of the primary language");
1061 "Min acceptable orientation margin");
1065 "Allow feature extractors to see the original outline");
1067 "Only initialize with the config file. Useful if the instance is "
1068 "not going to be used for OCR but say only for layout analysis.");
1072 "Force using vertical text page mode");
1074 "Fraction of textlines deemed vertical to use vertical page "
1077 "Fraction of height used as a minimum gap for aligned blobs.");
1080 "Preserve multiple interword spaces");
1082 "Include page separator string in output text after each "
1085 "Page separator (default is form feed control character)");
1096 "find horizontal lines such as headers in vertical page mode");
1099 " dawgs (e.g. for non-space delimited languages)");
1103 " current best rate to prune other hypotheses");
1105 "Turn on word script consistency permuter");
1107 "incorporate segmentation cost in word rating?");
1109 "Score multipler for script consistency within a word. "
1110 "Being a 'reward' factor, it should be <= 1. "
1111 "Smaller value implies bigger reward.");
1113 "Turn on fixed-length phrasebook search permuter");
1115 "Turn on character type (property) consistency permuter");
1117 "Score multipler for char type consistency within a word. ");
1119 "Score multipler for ngram permuter's best choice"
1120 " (only used in the Han script path).");
1122 "Activate character-level n-gram-based permuter");
1125 "Depth of blob choice lists to explore"
1126 " when fixed length dawgs are on");
1128 "use new state cost heuristics for segmentation state evaluation");
1130 "base factor for adding segmentation cost into word rating."
1131 "It's a multiplying factor, the larger the value above 1, "
1132 "the bigger the effect of segmentation cost.");
1134 "weight associated with char rating in combined cost of state");
1136 "weight associated with width evidence in combined cost of"
1139 "weight associated with seam cut in combined cost of state");
1141 "max char width-to-height ratio allowed in segmentation");
1143 "Enable new segmentation search path.");
1145 "Maximum character width-to-height ratio for"
1146 "fixed pitch fonts");
1159 #ifndef ANDROID_BUILD
1167 const char* backup_config_file_;
1178 Pix* pix_thresholds_;
1181 int source_resolution_;
1188 bool right_to_left_;
1200 int font_table_size_;
1201 #ifndef ANDROID_BUILD
1213 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__
bool tessedit_dump_pageseg_images
double superscript_scaledown_ratio
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
void convert_bad_unlv_chs(WERD_RES *word_res)
void run_cube_combiner(PAGE_RES *page_res)
bool docqual_excuse_outline_errs
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
inT16 count_alphas(const WERD_CHOICE &word)
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
void CorrectClassifyWords(PAGE_RES *page_res)
BOOL8 word_dumper(PAGE_RES_IT *pr_it)
inT32 adaption_word_number
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool right_to_left() const
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
void reject_mostly_rejects(WERD_RES *word)
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Textord * mutable_textord()
inT16 word_outline_errs(WERD_RES *word)
int x_ht_acceptance_tolerance
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
bool tessedit_display_outwords
char * numeric_punctuation
bool tessedit_train_from_boxes
bool crunch_include_numerals
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
double crunch_terrible_rating
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
void fill_werd_res(const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res)
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
bool paragraph_text_based
bool applybox_learn_ngrams_mode
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
void ResetDocumentDictionary()
void set_pix_thresholds(Pix *thresholds)
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
void dont_allow_1Il(WERD_RES *word)
bool create_cube_box_word(Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool tessedit_create_boxfile
double noise_cert_basechar
double segment_reward_chartype
bool tessedit_zero_kelvin_rejection
double bestrate_pruning_factor
bool tessedit_test_adaption
#define INT_VAR_H(name, val, comment)
void SetBlackAndWhitelist()
double segment_reward_ngram_best_choice
char * ok_repeated_ch_non_alphanum_wds
bool tessedit_fix_hyphens
Pix ** mutable_pix_binary()
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
BOOL8 word_display(PAGE_RES_IT *pr_it)
void ResetAdaptiveClassifier()
double noise_cert_disjoint
int quality_min_initial_alphas_reqd
PointerVector< WERD_RES > lang_words
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
bool tessedit_debug_block_rejection
char * tessedit_write_params_to_file
bool textord_equation_detect
int crunch_long_repetitions
void blamer_pass(PAGE_RES *page_res)
bool tessedit_minimal_rej_pass1
inT16 doc_good_char_quality
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
bool tessedit_consistent_reps
void recognize_page(STRING &image_name)
void recog_word(WERD_RES *word)
char * tessedit_char_blacklist
bool rej_alphas_in_number_perm
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
bool tessedit_timing_debug
void set_pix_grey(Pix *grey_pix)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
void set_source_resolution(int ppi)
TessdataManager tessdata_manager
bool tessedit_use_reject_spaces
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool crunch_leave_ok_strings
void tilde_crunch(PAGE_RES_IT &page_res_it)
Tesseract * get_sub_lang(int index) const
int debug_fix_space_level
int source_resolution() const
FILE * init_recog_training(const STRING &fname)
bool tessedit_write_images
WordData(const PAGE_RES_IT &page_res_it)
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
bool tessedit_word_for_word
bool rej_1Il_trust_permuter_type
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
int tessedit_pageseg_mode
bool tessedit_rejection_debug
inT16 safe_dict_word(const WERD_RES *werd_res)
int CountMisfitTops(WERD_RES *word_res)
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
bool tessedit_create_hocr
void SetScaledColor(int factor, Pix *color)
#define STRING_VAR_H(name, val, comment)
void split_and_recog_word(WERD_RES *word)
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
void script_pos_pass(PAGE_RES *page_res)
double suspect_accept_rating
bool load_fixed_length_dawgs
inT16 count_alphanums(const WERD_CHOICE &word)
Pix * scaled_color() const
bool tessedit_reject_bad_qual_wds
bool tessedit_enable_dict_correction
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
bool ngram_permuter_activated
bool tessedit_good_quality_unrej
double tessedit_whole_wd_rej_row_percent
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
bool enable_new_segsearch
double crunch_small_outlines_size
double tessedit_reject_block_percent
double segment_reward_script
double textord_tabfind_vertical_text_ratio
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
char * tessedit_char_unblacklist
inT16 alpha_count(const char *word, const char *word_lengths)
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
void TidyUp(PAGE_RES *page_res)
bool tessedit_zero_rejection
int crunch_pot_indicators
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool tessedit_dont_blkrej_good_wds
bool crunch_early_convert_bad_unlv_chs
int paragraph_debug_level
int scaled_factor() const
int pageseg_devanagari_split_strategy
CubeRecoContext * GetCubeRecoContext()
inT16 first_alphanum_index(const char *word, const char *word_lengths)
BOOL8 acceptable_number_string(const char *s, const char *lengths)
void reject_edge_blobs(WERD_RES *word)
bool textord_tabfind_vertical_horizontal_mix
inT16 failure_count(WERD_RES *word)
inT16 count_outline_errs(char c, inT16 outline_count)
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
BOOL8 check_debug_pt(WERD_RES *word, int location)
bool tessedit_fix_fuzzy_spaces
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int tessedit_test_adaption_mode
void process_image_event(const SVEvent &event)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
bool tessedit_enable_bigram_correction
int num_sub_langs() const
bool tessedit_resegment_from_line_boxes
bool interactive_display_mode
bool segment_segcost_rating
int tessedit_tess_adaption_mode
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
bool permute_chartype_word
char * conflict_set_I_l_1
bool extract_cube_state(CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
void nn_match_word(WERD_RES *word, ROW *row)
BOOL8 word_blank_and_set_display(PAGE_RES_IT *pr_its)
void read_config_file(const char *filename, SetParamConstraint constraint)
const Textord & textord() const
int tessedit_ocr_engine_mode
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
bool poly_allow_detailed_fx
bool unlv_tilde_crunching
char * applybox_exposure_pattern
double heuristic_weight_width
BOOL8 process_cmd_win_event(inT32 cmd_event, char *new_value)
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
void dictionary_correction_pass(PAGE_RES *page_res)
bool debug_acceptable_wds
double crunch_poor_garbage_cert
double tessedit_lower_flip_hyphen
void pgeditor_main(int width, int height, PAGE_RES *page_res)
bool tessedit_unrej_any_wd
int fixsp_non_noise_limit
void font_recognition_pass(PAGE_RES *page_res)
bool tessedit_resegment_from_boxes
bool tessedit_enable_doc_dict
double crunch_pot_poor_rate
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
UNICHAR_ID get_rep_char(WERD_RES *word)
bool write_results_empty_block
#define double_VAR_H(name, val, comment)
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool tessedit_matcher_log
BOOL8 word_bln_display(PAGE_RES_IT *pr_it)
bool tessedit_ambigs_training
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void bigram_correction_pass(PAGE_RES *page_res)
bool tessedit_debug_fonts
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
void SetupWordPassN(int pass_n, WordData *word)
int crunch_leave_uc_strings
void tess_segment_pass_n(int pass_n, WERD_RES *word)
int tessdata_manager_debug_level
double fixsp_small_outlines_size
double tessedit_reject_row_percent
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Assume a single uniform block of text. (Default.)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
bool enable_noise_removal
bool tessedit_debug_doc_rejection
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
double heuristic_segcost_rating_base
char * tessedit_load_sublangs
void set_unlv_suspects(WERD_RES *word)
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
bool tessedit_override_permuter
bool crunch_early_merge_tess_fails
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
double rej_whole_of_mostly_reject_word_fract
bool tessedit_use_primary_params_model
void fix_rep_char(PAGE_RES_IT *page_res_it)
float blob_noise_score(TBLOB *blob)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
int ocr_devanagari_split_strategy
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool tessedit_row_rej_good_docs
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
double min_orientation_margin
bool tessedit_preserve_row_rej_perfect_wds
double tessedit_upper_flip_hyphen
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
void set_word_fonts(WERD_RES *word)
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
double tessedit_good_doc_still_rowrej_wd
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool rej_1Il_use_dict_word
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
void unrej_good_chs(WERD_RES *word, ROW *row)
void SetEquationDetect(EquationDetect *detector)
double heuristic_max_char_wh_ratio
void nn_recover_rejects(WERD_RES *word, ROW *row)
bool tessedit_init_config_only
bool tilde_crunch_written
bool textord_tabfind_show_vlines
void flip_hyphens(WERD_RES *word)
BOOL8 noise_outlines(TWERD *word)
double suspect_rating_per_ch
bool permute_fixed_length_dawg
bool tessedit_dont_rowrej_good_wds
bool suspect_constrain_1Il
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
void recog_word_recursive(WERD_RES *word)
double crunch_pot_poor_cert
bool rej_use_tess_accepted
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool tess_acceptable_word(WERD_RES *word)
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void SetupUniversalFontIds()
bool cube_recognize(CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
double tessedit_reject_doc_percent
void SetupWordScripts(BLOCK_LIST *blocks)
bool tessedit_adaption_debug
bool tessedit_redo_xheight
bool tessedit_write_rep_codes
const FCOORD & reskew() const
double crunch_del_low_word
void PrerecAllWordsPar(const GenericVector< WordData > &words)
void cube_combine_word(CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
int tessedit_preserve_min_wd_len
double crunch_del_high_word
bool textord_tabfind_vertical_text
double crunch_del_min_width
char * tessedit_char_whitelist
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
double superscript_worse_certainty
int crunch_leave_lc_strings
bool preserve_interword_spaces
bool tessedit_dump_choices
char * chs_trailing_punct1
bool tessedit_write_block_separators
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
double crunch_poor_garbage_rate
bool last_char_was_newline
void reject_I_1_L(WERD_RES *word)
double quality_outline_pc
void set_done(WERD_RES *word, inT16 pass)
bool tessedit_minimal_rejection
int tessedit_bigram_debug
void PreenXHeights(BLOCK_LIST *block_list)
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
SVMenuNode * build_menu_new()
bool crunch_leave_accept_strings
void tilde_delete(PAGE_RES_IT &page_res_it)
bool textord_tabfind_force_vertical_text
double segsearch_max_fixed_pitch_char_wh_ratio
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
CubeObject * cube_recognize_word(BLOCK *block, WERD_RES *word)
bool crunch_terrible_garbage
bool tessedit_debug_quality_metrics
bool applybox_learn_chars_and_char_frags_mode
bool tessedit_make_boxes_from_boxes
#define BOOL_VAR_H(name, val, comment)
bool tessedit_prefer_joined_punct
char * chs_trailing_punct2
double subscript_max_y_top
double heuristic_weight_seamcut
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
bool tessedit_preserve_blk_rej_perfect_wds
bool textord_use_cjk_fp_model
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
double textord_tabfind_aligned_gap_fraction
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language)
void flip_0O(WERD_RES *word)
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
double superscript_min_y_bottom
void ReSegmentByClassification(PAGE_RES *page_res)
int tessedit_image_border
double heuristic_weight_rating
double superscript_bettered_certainty
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
int language_model_fixed_length_choices_depth
void tess_add_doc_word(WERD_CHOICE *word_choice)