tesseract
5.0.0-alpha-619-ge9db
|
Go to the documentation of this file.
25 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
26 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_
30 #include "allheaders.h"
34 #ifndef DISABLED_LEGACY_ENGINE
106 class EquationDetect;
108 class LSTMRecognizer;
197 pixDestroy(&pix_binary_);
207 pixDestroy(&pix_grey_);
208 pix_grey_ = grey_pix;
211 return pix_original_;
215 pixDestroy(&pix_original_);
216 pix_original_ = original_pix;
218 for (
int i = 0; i < sub_langs_.size(); ++i) {
219 sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
232 if (pixGetWidth(pix_original_) ==
ImageWidth()) {
233 return pix_original_;
234 }
else if (pix_grey_ !=
nullptr) {
241 pixDestroy(&pix_thresholds_);
242 pix_thresholds_ = thresholds;
245 return source_resolution_;
248 source_resolution_ = ppi;
251 return pixGetWidth(pix_binary_);
254 return pixGetHeight(pix_binary_);
257 return scaled_color_;
260 return scaled_factor_;
263 scaled_factor_ = factor;
264 scaled_color_ = color;
274 return right_to_left_;
277 return sub_langs_.size();
280 return sub_langs_[index];
286 for (
int i = 0; i < sub_langs_.size(); ++i) {
296 for (
int i = 0; i < sub_langs_.size(); ++i) {
324 TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
328 OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
329 Pix** music_mask_pix);
339 const STRING& output_basename,
340 BLOCK_LIST* block_list);
353 int end_box,
const BLOCK& block);
361 TBOX* revised_box)
const;
373 const char* word_config,
int pass);
376 const char* word_config,
PAGE_RES* page_res,
384 const TBOX* target_word_box,
const char* word_config,
387 const TBOX* target_word_box,
const char* word_config);
402 bool* make_next_word_fuzzy);
436 STRING* best_str,
float* c2);
442 TBOX& selection_box);
448 const char* lengths);
473 float* leading_certainty,
int* num_rebuilt_trailing,
474 ScriptPos* trailing_pos,
float* trailing_certainty,
float* avg_certainty,
475 float* unlikely_threshold);
477 float leading_certainty,
ScriptPos leading_pos,
478 int num_chopped_trailing,
479 float trailing_certainty,
481 bool* is_good,
int* retry_leading,
482 int* retry_trailing);
484 float certainty_threshold,
int* left_ok,
485 int* right_ok)
const;
513 return init_tesseract(datapath,
nullptr, language, oem,
nullptr, 0,
nullptr,
514 nullptr,
false, &mgr);
534 char** configs,
int configs_size,
551 char** configs,
int configs_size,
554 bool set_only_init_params,
562 #ifndef GRAPHICS_DISABLED
564 #endif // GRAPHICS_DISABLED
588 int16_t
alpha_count(
const char* word,
const char* word_lengths);
630 void dump_words(WERD_RES_LIST& perm, int16_t score, int16_t mode,
637 #ifndef DISABLED_LEGACY_ENGINE
652 int16_t* accepted_match_count);
656 #ifndef DISABLED_LEGACY_ENGINE
697 BLOCK_LIST* block_list);
706 BLOCK_LIST* block_list);
721 const TBOX& box,
const TBOX* next_box,
722 const char* correct_text);
730 const TBOX* next_box,
const char* correct_text);
753 int choices_pos,
int choices_length,
755 int text_index,
float rating,
765 const char* err_msg);
783 "Take segmentation and labeling from box file");
785 "Conversion of word/line box file to char box file");
787 "Generate training data from boxed chars");
789 "Generate more boxes from boxed chars");
791 "Break input into lines and remap boxes if present");
793 "Dump intermediate images made during page segmentation");
795 "Try inverting the image in `LSTMRecognizeWord`");
797 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
798 " 5=line, 6=word, 7=char"
799 " (Values from PageSegMode enum in tesseract/publictypes.h)");
801 "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
802 " to loading and running the most accurate available.");
804 "Blacklist of chars not to recognize");
807 "List of chars to override tessedit_char_blacklist");
809 "Perform training for ambiguities");
812 "Whether to use the top-line splitting process for Devanagari "
813 "documents while performing page-segmentation.");
816 "Whether to use the top-line splitting process for Devanagari "
817 "documents while performing ocr.");
819 "Write all parameters to the given file.");
821 "Generate and print debug information for adaption");
826 "Exposure value follows this pattern in the image"
827 " filename. The name of the image files are expected"
828 " to be in the form [lang].[fontname].exp[num].tif");
830 "Learn both character fragments (as is done in the"
831 " special low exposure mode) as well as unfragmented"
834 "Each bounding box is assumed to contain ngrams. Only"
835 " learn the ngrams whose outlines overlap horizontally.");
841 "Don't bother with word plausibility");
844 "Add words to the document dictionary");
848 "Enable correction based on the word bigram dictionary.");
850 "Enable single word correction based on the dictionary.");
852 "Amount of debug output for bigram "
855 "Remove and conditionally reassign small outlines when they"
856 " confuse layout analysis, determining diacritics vs noise");
869 "Scaling on certainty diff from Hingepoint");
879 "good_quality_doc lte outline error limit");
883 "Adaptation decision algorithm for tess");
885 "Do minimal rejection on pass 1 output");
893 "Run paragraph detection on the post-text-recognition "
899 "Reduce rejection on good docs");
902 "%rej allowed before rej whole doc");
904 "%rej allowed before rej whole block");
906 "%rej allowed before rej whole row");
908 "Number of row rejects in whole word rejects"
909 "which prevents whole row rejection");
911 "Only rej partially rejected words in block rejection");
913 "Only rej partially rejected words in row rejection");
915 "Use word segmentation quality metric");
917 "Use word segmentation quality metric");
919 "Only preserve wds longer than this");
921 "Apply row rejection to good docs");
923 "rej good doc wd if more than this fraction rejected");
927 "Output data to debug file");
933 "Add coordinates for each character to hocr output");
955 "Don't pot crunch sensible strings");
958 "Don't crunch words with long lower case strings");
960 "Don't crunch words with long lower case strings");
970 "Max allowed deviation of blob top outside of font data");
974 "How many times worse "
975 "certainty does a superscript position glyph need to be for us "
976 "to try classifying it as a char with a different baseline?");
979 "badness do we think sufficient to choose a superscript over "
980 "what we'd thought. For example, a value of 0.6 means we want "
981 "to reduce badness of certainty by 40%");
983 "A superscript scaled down more than this is unbelievably "
984 "small. For example, 0.3 means we expect the font size to "
985 "be no smaller than 30% of the text line font size.");
987 "Maximum top of a character measured as a multiple of x-height "
988 "above the baseline for us to reconsider whether it's a "
991 "Minimum bottom of a character measured as a multiple of "
992 "x-height above the baseline for us to reconsider whether it's "
995 "Write block separators in output");
1002 "Write .box file for LSTM training");
1005 "Write WordStr format .box output file");
1008 "Create PDF with only one invisible text layer");
1012 "Specify minimum characters to try during OSD");
1022 "Make output have exactly one word per WERD");
1024 "Don't reject ANYTHING AT ALL");
1045 "-1 -> All pages, else specific page to process");
1051 "List of languages to load with this one");
1053 "In multilingual mode use params model of the primary language");
1057 "Min acceptable orientation margin");
1061 "Allow feature extractors to see the original outline");
1063 "Only initialize with the config file. Useful if the instance is "
1064 "not going to be used for OCR but say only for layout analysis.");
1068 "Force using vertical text page mode");
1070 "Fraction of textlines deemed vertical to use vertical page "
1073 "Fraction of height used as a minimum gap for aligned blobs.");
1076 "Preserve multiple interword spaces");
1078 "Page separator (default is form feed control character)");
1080 "Allows to include alternative symbols choices in the hOCR "
1082 "Valid input values are 0, 1 and 2. 0 is the default value. "
1083 "With 1 the alternative symbol choices per timestep are included. "
1084 "With 2 the alternative symbol choices are extracted from the CTC "
1085 "process instead of the lattice. The choices are mapped per "
1088 "Sets the number of cascading iterations for the Beamsearch in "
1089 "lstm_choice_mode. Note that lstm_choice_mode must be set to "
1090 "a value greater than 0 to produce results.");
1092 "Sets the rating coefficient for the lstm choices. The smaller "
1093 "the coefficient, the better are the ratings for each choice "
1094 "and less information is lost due to the cut off at 0. The "
1095 "standard value is 5.");
1097 "Detect music staff and remove intersecting components");
1111 const char* backup_config_file_;
1122 Pix* pix_thresholds_;
1127 int source_resolution_;
1134 bool right_to_left_;
1146 int font_table_size_;
1152 int train_line_page_num_;
1157 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
bool tessedit_dont_rowrej_good_wds
double superscript_bettered_certainty
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
int16_t first_alphanum_index(const char *word, const char *word_lengths)
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
char * applybox_exposure_pattern
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool poly_allow_detailed_fx
void split_and_recog_word(WERD_RES *word)
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
int quality_min_initial_alphas_reqd
double crunch_terrible_rating
bool tessedit_use_reject_spaces
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
bool tessedit_train_from_boxes
int pageseg_devanagari_split_strategy
int16_t failure_count(WERD_RES *word)
void ResetDocumentDictionary()
double tessedit_good_doc_still_rowrej_wd
double crunch_poor_garbage_cert
void set_word_fonts(WERD_RES *word)
double superscript_worse_certainty
int x_ht_acceptance_tolerance
bool tessedit_display_outwords
bool textord_use_cjk_fp_model
char * chs_trailing_punct1
bool tessedit_create_wordstrbox
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
bool tessedit_create_boxfile
bool noise_outlines(TWERD *word)
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
double lstm_rating_coefficient
void flip_0O(WERD_RES *word)
void recog_word(WERD_RES *word)
bool tessedit_debug_quality_metrics
int CountMisfitTops(WERD_RES *word_res)
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
bool tessedit_fix_fuzzy_spaces
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
void SetEquationDetect(EquationDetect *detector)
bool applybox_learn_ngrams_mode
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
void recognize_page(STRING &image_name)
void tilde_crunch(PAGE_RES_IT &page_res_it)
bool crunch_terrible_garbage
bool tessedit_write_rep_codes
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
double min_orientation_margin
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
void SetupWordScripts(BLOCK_LIST *blocks)
double superscript_min_y_bottom
int crunch_leave_lc_strings
bool recog_interactive(PAGE_RES_IT *pr_it)
void set_unlv_suspects(WERD_RES *word)
bool crunch_include_numerals
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
bool rej_1Il_trust_permuter_type
bool tessedit_zero_rejection
bool tessedit_train_line_recognizer
double suspect_rating_per_ch
void read_config_file(const char *filename, SetParamConstraint constraint)
Assume a single uniform block of text. (Default.)
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool write_results_empty_block
bool tessedit_enable_dict_correction
void SetScaledColor(int factor, Pix *color)
int tessedit_pageseg_mode
void tess_add_doc_word(WERD_CHOICE *word_choice)
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
bool tilde_crunch_written
int16_t count_alphas(const WERD_CHOICE &word)
bool rej_1Il_use_dict_word
void reject_I_1_L(WERD_RES *word)
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
int tessedit_image_border
char * ok_repeated_ch_non_alphanum_wds
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
double superscript_scaledown_ratio
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
int crunch_leave_uc_strings
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool textord_tabfind_force_vertical_text
int tessedit_bigram_debug
double fixsp_small_outlines_size
#define double_VAR_H(name, val, comment)
bool enable_noise_removal
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
double crunch_pot_poor_cert
void script_pos_pass(PAGE_RES *page_res)
Tesseract * get_sub_lang(int index) const
bool tessedit_row_rej_good_docs
int tessedit_preserve_min_wd_len
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
void PreenXHeights(BLOCK_LIST *block_list)
int lstm_choice_iterations
bool tessedit_adaption_debug
char * chs_trailing_punct2
void nn_recover_rejects(WERD_RES *word, ROW *row)
void pgeditor_main(int width, int height, PAGE_RES *page_res)
int scaled_factor() const
void SetupWordPassN(int pass_n, WordData *word)
int crunch_pot_indicators
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
bool crunch_early_convert_bad_unlv_chs
bool textord_tabfind_vertical_text
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
SVMenuNode * build_menu_new()
bool tessedit_resegment_from_line_boxes
double crunch_small_outlines_size
bool tessedit_make_boxes_from_boxes
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
int tessedit_tess_adaption_mode
bool crunch_leave_ok_strings
void set_pix_grey(Pix *grey_pix)
bool tessedit_prefer_joined_punct
int debug_fix_space_level
int16_t count_alphanums(const WERD_CHOICE &word)
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
bool pageseg_apply_music_mask
void TidyUp(PAGE_RES *page_res)
bool word_bln_display(PAGE_RES_IT *pr_it)
bool tessedit_enable_doc_dict
bool interactive_display_mode
bool suspect_constrain_1Il
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool last_char_was_newline
bool tessedit_debug_fonts
int16_t safe_dict_word(const WERD_RES *werd_res)
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
bool acceptable_number_string(const char *s, const char *lengths)
bool tessedit_test_adaption
bool tessedit_rejection_debug
bool tessedit_resegment_from_boxes
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool tessedit_reject_bad_qual_wds
void dont_allow_1Il(WERD_RES *word)
bool fixspace_thinks_word_done(WERD_RES *word)
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
char * tessedit_char_whitelist
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool crunch_leave_accept_strings
double noise_cert_disjoint
void PrerecAllWordsPar(const GenericVector< WordData > &words)
bool word_adaptable(WERD_RES *word, uint16_t mode)
bool tessedit_preserve_row_rej_perfect_wds
char * tessedit_load_sublangs
bool tessedit_dump_pageseg_images
bool tessedit_zero_kelvin_rejection
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
bool tessedit_override_permuter
double crunch_del_low_word
double noise_cert_basechar
Dict & getDict() override
bool tessedit_ambigs_training
Textord * mutable_textord()
float blob_noise_score(TBLOB *blob)
double tessedit_whole_wd_rej_row_percent
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
int crunch_long_repetitions
bool tessedit_minimal_rej_pass1
double crunch_poor_garbage_rate
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
bool tess_acceptable_word(WERD_RES *word)
bool tessedit_create_alto
bool tessedit_good_quality_unrej
WordData(const PAGE_RES_IT &page_res_it)
int source_resolution() const
void SetBlackAndWhitelist()
double rej_whole_of_mostly_reject_word_fract
bool tessedit_dont_blkrej_good_wds
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
void bigram_correction_pass(PAGE_RES *page_res)
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
PointerVector< WERD_RES > lang_words
const Textord & textord() const
bool process_cmd_win_event(int32_t cmd_event, char *new_value)
bool crunch_early_merge_tess_fails
Pix ** mutable_pix_binary()
#define INT_VAR_H(name, val, comment)
double tessedit_upper_flip_hyphen
bool tessedit_init_config_only
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
double tessedit_lower_flip_hyphen
bool applybox_learn_chars_and_char_frags_mode
bool word_display(PAGE_RES_IT *pr_it)
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
bool textord_tabfind_show_vlines
bool tessedit_debug_doc_rejection
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
bool paragraph_text_based
int ocr_devanagari_split_strategy
bool tessedit_write_images
void flip_hyphens(WERD_RES *word)
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
char * tessedit_char_blacklist
double crunch_del_min_width
int num_sub_langs() const
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
void recog_word_recursive(WERD_RES *word)
int min_characters_to_try
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
double suspect_accept_rating
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
bool tessedit_dump_choices
bool tessedit_create_lstmbox
char * tessedit_char_unblacklist
bool TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
bool tessedit_timing_debug
void nn_match_word(WERD_RES *word, ROW *row)
bool rej_use_tess_accepted
void process_image_event(const SVEvent &event)
void set_pix_original(Pix *original_pix)
bool tessedit_enable_bigram_correction
int16_t count_outline_errs(char c, int16_t outline_count)
int32_t adaption_word_number
void SetupUniversalFontIds()
double subscript_max_y_top
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
int multilang_debug_level
#define BOOL_VAR_H(name, val, comment)
double textord_tabfind_vertical_text_ratio
char * numeric_punctuation
void font_recognition_pass(PAGE_RES *page_res)
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
double tessedit_reject_row_percent
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
void reject_mostly_rejects(WERD_RES *word)
int16_t doc_good_char_quality
void blamer_pass(PAGE_RES *page_res)
void convert_bad_unlv_chs(WERD_RES *word_res)
void ReSegmentByClassification(PAGE_RES *page_res)
double tessedit_reject_doc_percent
double textord_tabfind_aligned_gap_fraction
void dictionary_correction_pass(PAGE_RES *page_res)
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
void tilde_delete(PAGE_RES_IT &page_res_it)
void CorrectClassifyWords(PAGE_RES *page_res)
void set_done(WERD_RES *word, int16_t pass)
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
void set_source_resolution(int ppi)
double crunch_del_high_word
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Pix * scaled_color() const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
char * conflict_set_I_l_1
bool tessedit_fix_hyphens
void set_pix_thresholds(Pix *thresholds)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
bool tessedit_word_for_word
void fix_rep_char(PAGE_RES_IT *page_res_it)
void SearchWords(PointerVector< WERD_RES > *words)
bool textord_equation_detect
int paragraph_debug_level
void unrej_good_chs(WERD_RES *word)
bool tessedit_unrej_any_wd
void reject_edge_blobs(WERD_RES *word)
FILE * init_recog_training(const STRING &fname)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
UNICHAR_ID get_rep_char(WERD_RES *word)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
bool unlv_tilde_crunching
double tessedit_reject_block_percent
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
bool word_set_display(PAGE_RES_IT *pr_it)
bool rej_alphas_in_number_perm
bool tessedit_preserve_blk_rej_perfect_wds
void ResetAdaptiveClassifier()
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
bool tessedit_debug_block_rejection
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
bool tessedit_minimal_rejection
bool word_dumper(PAGE_RES_IT *pr_it)
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
bool tessedit_write_block_separators
bool tessedit_use_primary_params_model
const FCOORD & reskew() const
int tessedit_ocr_engine_mode
int fixsp_non_noise_limit
bool preserve_interword_spaces
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
bool tessedit_create_hocr
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
#define STRING_VAR_H(name, val, comment)
char * tessedit_write_params_to_file
int16_t word_blob_quality(WERD_RES *word)
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Pix * pix_original() const
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
double crunch_pot_poor_rate
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
int16_t alpha_count(const char *word, const char *word_lengths)
int16_t word_outline_errs(WERD_RES *word)
bool check_debug_pt(WERD_RES *word, int location)
bool right_to_left() const
double quality_outline_pc