tesseract
5.0.0-alpha-619-ge9db
|
#include <tesseractclass.h>
|
| Tesseract () |
|
| ~Tesseract () override |
|
Dict & | getDict () override |
|
void | Clear () |
|
void | ResetAdaptiveClassifier () |
|
void | ResetDocumentDictionary () |
|
void | SetEquationDetect (EquationDetect *detector) |
|
const FCOORD & | reskew () const |
|
Pix ** | mutable_pix_binary () |
|
Pix * | pix_binary () const |
|
Pix * | pix_grey () const |
|
void | set_pix_grey (Pix *grey_pix) |
|
Pix * | pix_original () const |
|
void | set_pix_original (Pix *original_pix) |
|
Pix * | BestPix () const |
|
void | set_pix_thresholds (Pix *thresholds) |
|
int | source_resolution () const |
|
void | set_source_resolution (int ppi) |
|
int | ImageWidth () const |
|
int | ImageHeight () const |
|
Pix * | scaled_color () const |
|
int | scaled_factor () const |
|
void | SetScaledColor (int factor, Pix *color) |
|
const Textord & | textord () const |
|
Textord * | mutable_textord () |
|
bool | right_to_left () const |
|
int | num_sub_langs () const |
|
Tesseract * | get_sub_lang (int index) const |
|
bool | AnyTessLang () const |
|
bool | AnyLSTMLang () const |
|
void | SetBlackAndWhitelist () |
|
void | PrepareForPageseg () |
|
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) |
|
int | SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) |
|
void | SetupWordScripts (BLOCK_LIST *blocks) |
|
int | AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) |
|
ColumnFinder * | SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix) |
|
void | PrerecAllWordsPar (const GenericVector< WordData > &words) |
|
bool | TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list) |
|
void | TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data) |
|
ImageData * | GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block) |
|
ImageData * | GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const |
|
void | LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words) |
|
void | SearchWords (PointerVector< WERD_RES > *words) |
|
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) |
|
void | SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words) |
|
void | SetupWordPassN (int pass_n, WordData *word) |
|
bool | RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words) |
|
bool | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) |
|
void | rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config) |
|
void | bigram_correction_pass (PAGE_RES *page_res) |
|
void | blamer_pass (PAGE_RES *page_res) |
|
void | script_pos_pass (PAGE_RES *page_res) |
|
int | RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words) |
|
bool | ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) |
|
void | AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs) |
|
void | AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs) |
|
bool | SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines) |
|
float | ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str) |
|
float | ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2) |
|
void | classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) |
|
void | classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
|
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) |
|
void | fix_rep_char (PAGE_RES_IT *page_res_it) |
|
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths) |
|
void | match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block) |
|
void | classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
|
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) |
|
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) |
|
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) |
|
bool | TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row) |
|
bool | recog_interactive (PAGE_RES_IT *pr_it) |
|
void | set_word_fonts (WERD_RES *word) |
|
void | font_recognition_pass (PAGE_RES *page_res) |
|
void | dictionary_correction_pass (PAGE_RES *page_res) |
|
bool | check_debug_pt (WERD_RES *word, int location) |
|
bool | SubAndSuperscriptFix (WERD_RES *word_res) |
|
void | GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold) |
|
WERD_RES * | TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing) |
|
bool | BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const |
|
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) |
|
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol) |
|
void | set_unlv_suspects (WERD_RES *word) |
|
UNICHAR_ID | get_rep_char (WERD_RES *word) |
|
bool | acceptable_number_string (const char *s, const char *lengths) |
|
int16_t | count_alphanums (const WERD_CHOICE &word) |
|
int16_t | count_alphas (const WERD_CHOICE &word) |
|
void | read_config_file (const char *filename, SetParamConstraint constraint) |
|
int | init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) |
|
int | init_tesseract (const char *datapath, const char *language, OcrEngineMode oem) |
|
int | init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) |
|
void | SetupUniversalFontIds () |
|
int | init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr) |
|
void | recognize_page (STRING &image_name) |
|
void | end_tesseract () |
|
bool | init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) |
|
void | ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load) |
|
SVMenuNode * | build_menu_new () |
|
void | pgeditor_main (int width, int height, PAGE_RES *page_res) |
|
void | process_image_event (const SVEvent &event) |
|
bool | process_cmd_win_event (int32_t cmd_event, char *new_value) |
|
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) |
|
void | do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) |
|
bool | word_display (PAGE_RES_IT *pr_it) |
|
bool | word_bln_display (PAGE_RES_IT *pr_it) |
|
bool | word_blank_and_set_display (PAGE_RES_IT *pr_its) |
|
bool | word_set_display (PAGE_RES_IT *pr_it) |
|
bool | word_dumper (PAGE_RES_IT *pr_it) |
|
void | blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box) |
|
void | make_reject_map (WERD_RES *word, ROW *row, int16_t pass) |
|
bool | one_ell_conflict (WERD_RES *word_res, bool update_map) |
|
int16_t | first_alphanum_index (const char *word, const char *word_lengths) |
|
int16_t | first_alphanum_offset (const char *word, const char *word_lengths) |
|
int16_t | alpha_count (const char *word, const char *word_lengths) |
|
bool | word_contains_non_1_digit (const char *word, const char *word_lengths) |
|
void | dont_allow_1Il (WERD_RES *word) |
|
int16_t | count_alphanums (WERD_RES *word) |
|
void | flip_0O (WERD_RES *word) |
|
bool | non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) |
|
bool | non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) |
|
bool | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) |
|
void | nn_match_word (WERD_RES *word, ROW *row) |
|
void | nn_recover_rejects (WERD_RES *word, ROW *row) |
|
void | set_done (WERD_RES *word, int16_t pass) |
|
int16_t | safe_dict_word (const WERD_RES *werd_res) |
|
void | flip_hyphens (WERD_RES *word) |
|
void | reject_I_1_L (WERD_RES *word) |
|
void | reject_edge_blobs (WERD_RES *word) |
|
void | reject_mostly_rejects (WERD_RES *word) |
|
bool | word_adaptable (WERD_RES *word, uint16_t mode) |
|
void | recog_word_recursive (WERD_RES *word) |
|
void | recog_word (WERD_RES *word) |
|
void | split_and_recog_word (WERD_RES *word) |
|
void | split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const |
|
void | join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const |
|
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) |
|
int16_t | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) |
|
void | dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) |
|
bool | fixspace_thinks_word_done (WERD_RES *word) |
|
GARBAGE_LEVEL | garbage_word (WERD_RES *word, bool ok_dict_word) |
|
bool | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word) |
|
void | tilde_crunch (PAGE_RES_IT &page_res_it) |
|
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) |
|
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) |
|
void | quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) |
|
void | convert_bad_unlv_chs (WERD_RES *word_res) |
|
void | tilde_delete (PAGE_RES_IT &page_res_it) |
|
int16_t | word_blob_quality (WERD_RES *word) |
|
void | word_char_quality (WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count) |
|
void | unrej_good_chs (WERD_RES *word) |
|
int16_t | count_outline_errs (char c, int16_t outline_count) |
|
int16_t | word_outline_errs (WERD_RES *word) |
|
bool | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) |
|
CRUNCH_MODE | word_deletable (WERD_RES *word, int16_t &delete_mode) |
|
int16_t | failure_count (WERD_RES *word) |
|
bool | noise_outlines (TWERD *word) |
|
void | tess_segment_pass_n (int pass_n, WERD_RES *word) |
|
PAGE_RES * | ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list) |
|
void | PreenXHeights (BLOCK_LIST *block_list) |
|
PAGE_RES * | SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list) |
|
void | MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res) |
|
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text) |
|
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text) |
|
void | ReSegmentByClassification (PAGE_RES *page_res) |
|
bool | ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids) |
|
bool | FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res) |
|
void | SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation) |
|
void | TidyUp (PAGE_RES *page_res) |
|
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) |
|
void | CorrectClassifyWords (PAGE_RES *page_res) |
|
void | ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res) |
|
int | CountMisfitTops (WERD_RES *word_res) |
|
float | ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift) |
|
FILE * | init_recog_training (const STRING &fname) |
|
void | recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) |
|
void | ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file) |
|
|
The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed the evaluation is deemed perfect.
Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.
The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.
Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.
The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
|
bool | digit_or_numeric_punct (WERD_RES *word, int char_position) |
|
int16_t | eval_word_spacing (WERD_RES_LIST &word_res_list) |
|
|
Test the current word to see if it can be split by deleting noise blobs. If so, perform the split. Return with the iterator pointing to the same place if the word is unchanged, or to the last of the replacement words.
|
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) |
|
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) |
|
int16_t | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) |
|
float | blob_noise_score (TBLOB *blob) |
|
void | break_noisiest_blob_word (WERD_RES_LIST &words) |
|
|
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
- Parameters
-
| monitor | progress monitor |
| word_count | count of words in doc |
[out] | page_res | |
|
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) |
|
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) |
|
|
Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.
|
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) |
|
|
Add the given word to the document dictionary
|
void | tess_add_doc_word (WERD_CHOICE *word_choice) |
|
|
- Returns
- true if the word is regarded as "good enough".
- Parameters
-
word_choice | after context |
raw_choice | before context |
|
bool | tess_acceptable_word (WERD_RES *word) |
|
| Wordrec () |
|
| ~Wordrec () override=default |
|
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) |
|
void | FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
|
void | CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
|
void | SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | DoSegSearch (WERD_RES *word_res) |
|
void | add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams) |
|
void | choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) |
|
void | combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue) |
|
SEAM * | pick_good_seam (TBLOB *blob) |
|
void | try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) |
|
void | try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) |
|
PRIORITY | grade_split_length (SPLIT *split) |
|
PRIORITY | grade_sharpness (SPLIT *split) |
|
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) |
|
virtual BLOB_CHOICE_LIST * | classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle) |
|
void | merge_fragments (MATRIX *ratings, int16_t num_blobs) |
|
void | get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) |
|
void | merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) |
|
void | fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) |
|
void | program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict) |
|
void | cc_recog (WERD_RES *word) |
|
void | program_editdown (int32_t elasped_time) |
|
void | set_pass1 () |
|
void | set_pass2 () |
|
int | end_recog () |
|
BLOB_CHOICE_LIST * | call_matcher (TBLOB *blob) |
|
int | dict_word (const WERD_CHOICE &word) |
|
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle) |
|
PRIORITY | point_priority (EDGEPT *point) |
|
void | add_point_to_list (PointHeap *point_heap, EDGEPT *point) |
|
bool | is_inside_angle (EDGEPT *pt) |
|
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) |
|
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) |
|
void | prioritize_points (TESSLINE *outline, PointHeap *points) |
|
void | new_min_point (EDGEPT *local_min, PointHeap *points) |
|
void | new_max_point (EDGEPT *local_max, PointHeap *points) |
|
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) |
|
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams) |
|
SEAM * | chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams) |
|
SEAM * | chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number) |
|
SEAM * | improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number) |
|
SEAM * | chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number) |
|
void | chop_word_main (WERD_RES *word) |
|
void | improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending) |
|
int | select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment) |
|
int | select_blob_to_split_from_fixpt (DANGERR *fixpt) |
|
| Classify () |
|
| ~Classify () override |
|
const ShapeTable * | shape_table () const |
|
void | SetStaticClassifier (ShapeClassifier *static_classifier) |
|
void | AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices) |
|
bool | LargeSpeckle (const TBLOB &blob) |
|
ADAPT_TEMPLATES | NewAdaptedTemplates (bool InitFromUnicharset) |
|
int | GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId) |
|
int | PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results) |
|
void | ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs) |
|
void | PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) |
|
void | WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) |
|
ADAPT_TEMPLATES | ReadAdaptedTemplates (TFile *File) |
|
float | ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) |
|
void | FreeNormProtos () |
|
NORM_PROTOS * | ReadNormProtos (TFile *fp) |
|
void | ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class) |
|
INT_TEMPLATES | CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset) |
|
void | LearnWord (const char *fontname, WERD_RES *word) |
|
void | LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word) |
|
void | InitAdaptiveClassifier (TessdataManager *mgr) |
|
void | InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates) |
|
void | AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) |
|
void | MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results) |
|
void | ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results) |
|
double | ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors) |
|
void | ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) |
|
void | AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results) |
|
int | GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures) |
|
void | DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results) |
|
PROTO_ID | MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask) |
|
int | MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures) |
|
void | MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob) |
|
void | PrintAdaptiveMatchResults (const ADAPT_RESULTS &results) |
|
void | RemoveExtraPuncs (ADAPT_RESULTS *Results) |
|
void | RemoveBadMatches (ADAPT_RESULTS *Results) |
|
void | SetAdaptiveThreshold (float Threshold) |
|
void | ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features) |
|
STRING | ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const |
|
int | ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const |
|
int | ShapeIDToClassID (int shape_id) const |
|
UNICHAR_ID * | BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) |
|
int | CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results) |
|
int | CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results) |
|
UNICHAR_ID * | GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass) |
|
void | DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results) |
|
void | AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates) |
|
void | DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class) |
|
bool | AdaptableWord (WERD_RES *word) |
|
void | EndAdaptiveClassifier () |
|
void | SettupPass1 () |
|
void | SettupPass2 () |
|
void | AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices) |
|
void | ClassifyAsNoise (ADAPT_RESULTS *Results) |
|
void | ResetAdaptiveClassifierInternal () |
|
void | SwitchAdaptiveClassifier () |
|
void | StartBackupAdaptiveClassifier () |
|
int | GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array) |
|
void | ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array) |
|
bool | TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config) |
|
void | UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob) |
|
bool | AdaptiveClassifierIsFull () const |
|
bool | AdaptiveClassifierIsEmpty () const |
|
bool | LooksLikeGarbage (TBLOB *blob) |
|
void | RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox) |
|
void | ClearCharNormArray (uint8_t *char_norm_array) |
|
void | ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array) |
|
void | ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures) |
|
INT_TEMPLATES | ReadIntTemplates (TFile *fp) |
|
void | WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset) |
|
CLASS_ID | GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id) |
|
void | ShowMatchDisplay () |
|
UnicityTable< FontInfo > & | get_fontinfo_table () |
|
const UnicityTable< FontInfo > & | get_fontinfo_table () const |
|
UnicityTable< FontSet > & | get_fontset_table () |
|
void | NormalizeOutlines (LIST Outlines, float *XScale, float *YScale) |
|
FEATURE_SET | ExtractOutlineFeatures (TBLOB *Blob) |
|
FEATURE_SET | ExtractPicoFeatures (TBLOB *Blob) |
|
FEATURE_SET | ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) |
|
FEATURE_SET | ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) |
|
void | LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) |
|
bool | WriteTRFile (const STRING &filename) |
|
| CCStruct ()=default |
|
| ~CCStruct () override |
|
| CCUtil () |
|
virtual | ~CCUtil () |
|
void | main_setup (const char *argv0, const char *basename) |
| CCUtil::main_setup - set location of tessdata and name of image. More...
|
|
ParamsVectors * | params () |
|
|
static void | SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info) |
|
static void | ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts) |
|
static const double | kDescenderFraction = 0.25 |
|
static const double | kXHeightFraction = 0.5 |
|
static const double | kAscenderFraction = 0.25 |
|
static const double | kXHeightCapRatio |
|
bool | SegSearchDone (int num_futile_classifications) |
|
void | UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle) |
|
void | ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending) |
|
void | InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug) |
|
IntegerMatcher | im_ |
|
FEATURE_DEFS_STRUCT | feature_defs_ |
|
ShapeTable * | shape_table_ = nullptr |
|
Definition at line 172 of file tesseractclass.h.
◆ Tesseract()
tesseract::Tesseract::Tesseract |
( |
| ) |
|
Definition at line 52 of file tesseractclass.cpp.
54 "Take segmentation and labeling from box file",
57 "Conversion of word/line box file to char box file",
60 "Generate training data from boxed chars", this->
params()),
62 "Generate more boxes from boxed chars", this->
params()),
64 "Break input into lines and remap boxes if present",
67 "Dump intermediate images made during page segmentation",
70 "Try inverting the image in `LSTMRecognizeWord`", this->
params()),
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
76 " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
77 "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
78 " (Values from PageSegMode enum in tesseract/publictypes.h)",
81 "Which OCR engine(s) to run (Tesseract, LSTM, both)."
82 " Defaults to loading and running the most accurate"
86 "Blacklist of chars not to recognize", this->
params()),
88 "Whitelist of chars to recognize", this->
params()),
90 "List of chars to override tessedit_char_blacklist",
93 "Perform training for ambiguities", this->
params()),
96 "Whether to use the top-line splitting process for Devanagari "
97 "documents while performing page-segmentation.",
101 "Whether to use the top-line splitting process for Devanagari "
102 "documents while performing ocr.",
105 "Write all parameters to the given file.", this->
params()),
107 "Generate and print debug"
108 " information for adaption",
115 "Exposure value follows"
116 " this pattern in the image filename. The name of the image"
117 " files are expected to be in the form"
118 " [lang].[fontname].exp[num].tif",
121 "Learn both character fragments (as is done in the"
122 " special low exposure mode) as well as unfragmented"
127 " is assumed to contain ngrams. Only learn the ngrams"
128 " whose outlines overlap horizontally.",
137 "Try to improve fuzzy spaces", this->
params()),
139 "Don't bother with word plausibility", this->
params()),
143 "Add words to the document dictionary", this->
params()),
149 "Enable correction based on the word bigram dictionary.",
152 "Enable single word correction based on the dictionary.",
155 "Amount of debug output for bigram correction.",
158 "Remove and conditionally reassign small outlines when they"
159 " confuse layout analysis, determining diacritics vs noise",
167 "Hingepoint for base char certainty", this->
params()),
171 "Hingepoint for disjoint certainty", this->
params()),
175 "Threshold for new punc char certainty", this->
params()),
178 "Scaling on certainty diff from Hingepoint",
192 "good_quality_doc lte rejection limit", this->
params()),
194 "good_quality_doc gte good blobs limit", this->
params()),
196 "good_quality_doc lte outline error limit", this->
params()),
198 "good_quality_doc gte good char limit", this->
params()),
202 "Adaptation decision algorithm for tess", this->
params()),
204 "Do minimal rejection on pass 1 output", this->
params()),
215 "Run paragraph detection on the post-text-recognition "
219 "Use ratings matrix/beam search with lstm", this->
params()),
225 "Reduce rejection on good docs", this->
params()),
229 "%rej allowed before rej whole doc", this->
params()),
231 "%rej allowed before rej whole block", this->
params()),
233 "%rej allowed before rej whole row", this->
params()),
235 "Number of row rejects in whole word rejects"
236 " which prevents whole row rejection",
239 "Only rej partially rejected words in block rejection",
242 "Only rej partially rejected words in row rejection",
245 "Use word segmentation quality metric", this->
params()),
247 "Use word segmentation quality metric", this->
params()),
249 "Only preserve wds longer than this", this->
params()),
251 "Apply row rejection to good docs", this->
params()),
253 "rej good doc wd if more than this fraction rejected",
256 "Reject all bad quality wds", this->
params()),
260 "Output data to debug file", this->
params()),
264 "good_quality_doc gte good char limit", this->
params()),
266 "Mark v.bad words for tilde crunch", this->
params()),
274 "Take out ~^ early?", this->
params()),
279 "crunch garbage cert lt this", this->
params()),
281 "crunch garbage rating lt this", this->
params()),
295 "Del if word width lt xht x this", this->
params()),
297 "Del if word gt xht x this above bl", this->
params()),
299 "Del if word gt xht x this below bl", this->
params()),
305 "How many potential indicators needed", this->
params()),
311 "Don't pot crunch sensible strings", this->
params()),
315 "Don't crunch words with long lower case strings",
318 "Don't crunch words with long lower case strings",
321 "Crunch words with long repetitions", this->
params()),
324 "How many non-noise blbs either side?", this->
params()),
328 "Reward punctuation joins", this->
params()),
334 "Punct. chs expected WITHIN numbers", this->
params()),
336 "Max allowed deviation of blob top outside of font data",
339 "Min change in xht before actually trying it", this->
params()),
341 "Debug level for sub & superscript fixer", this->
params()),
344 "How many times worse "
345 "certainty does a superscript position glyph need to be for "
346 "us to try classifying it as a char with a different "
352 "badness do we think sufficient to choose a superscript "
353 "over what we'd thought. For example, a value of 0.6 means "
354 "we want to reduce badness of certainty by at least 40%",
357 "A superscript scaled down more than this is unbelievably "
358 "small. For example, 0.3 means we expect the font size to "
359 "be no smaller than 30% of the text line font size.",
362 "Maximum top of a character measured as a multiple of "
363 "x-height above the baseline for us to reconsider whether "
367 "Minimum bottom of a character measured as a multiple of "
368 "x-height above the baseline for us to reconsider whether "
369 "it's a superscript.",
372 "Write block separators in output", this->
params()),
392 "Create PDF with only one invisible text layer",
398 "Specify minimum characters to try during OSD",
401 "Output char for unidentified blobs", this->
params()),
404 "Don't suspect dict wds longer than this", this->
params()),
408 "Don't touch bad rating limit", this->
params()),
412 "Only reject tess failures", this->
params()),
416 "Make output have exactly one word per WERD", this->
params()),
418 "Don't reject ANYTHING AT ALL", this->
params()),
426 "Aspect ratio dot/hyphen test", this->
params()),
428 "Aspect ratio dot/hyphen test", this->
params()),
430 "Use DOC dawg in 11l conf. detector", this->
params()),
446 "if >this fract", this->
params()),
450 "Allow NN to unrej", this->
params()),
458 "-1 -> All pages, else specific page to process",
461 "Capture the image from the IPE", this->
params()),
468 "List of languages to load with this one", this->
params()),
470 "In multilingual mode use params model of the"
474 "Min acceptable orientation margin", this->
params()),
480 "Allow feature extractors to see the original outline",
483 "Only initialize with the config file. Useful if the "
484 "instance is not going to be used for OCR but say only "
485 "for layout analysis.",
490 "Enable vertical detection", this->
params()),
492 "Force using vertical text page mode", this->
params()),
495 "Fraction of textlines deemed vertical to use vertical page "
500 "Fraction of height used as a minimum gap for aligned blobs.",
505 "Preserve multiple interword spaces", this->
params()),
507 "Page separator (default is form feed control character)",
510 "Allows to include alternative symbols choices in the hOCR output. "
511 "Valid input values are 0, 1 and 2. 0 is the default value. "
512 "With 1 the alternative symbol choices per timestep are included. "
513 "With 2 alternative symbol choices are extracted from the CTC "
514 "process instead of the lattice. The choices are mapped per "
519 "Sets the number of cascading iterations for the Beamsearch in "
520 "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
521 "value greater than 0 to produce results.",
525 "Sets the rating coefficient for the lstm choices. The smaller the "
526 "coefficient, the better are the ratings for each choice and less "
527 "information is lost due to the cut off at 0. The standard value is "
530 "Detect music staff and remove intersecting components", this->
params()),
532 backup_config_file_(
nullptr),
533 pix_binary_(
nullptr),
535 pix_original_(
nullptr),
536 pix_thresholds_(
nullptr),
537 source_resolution_(0),
539 right_to_left_(
false),
540 scaled_color_(
nullptr),
544 most_recently_used_(
this),
546 equ_detect_(
nullptr),
547 #ifndef ANDROID_BUILD
548 lstm_recognizer_(
nullptr),
550 train_line_page_num_(0) {
◆ ~Tesseract()
tesseract::Tesseract::~Tesseract |
( |
| ) |
|
|
override |
Definition at line 553 of file tesseractclass.cpp.
555 pixDestroy(&pix_original_);
557 sub_langs_.delete_data_pointers();
558 #ifndef ANDROID_BUILD
559 delete lstm_recognizer_;
560 lstm_recognizer_ =
nullptr;
◆ acceptable_number_string()
bool tesseract::Tesseract::acceptable_number_string |
( |
const char * |
s, |
|
|
const char * |
lengths |
|
) |
| |
Definition at line 386 of file output.cpp.
389 bool prev_digit =
false;
391 if (*lengths == 1 && *s ==
'(')
395 ((*s ==
'$') || (*s ==
'.') || (*s ==
'+') || (*s ==
'-')))
398 for (; *s !=
'\0'; s += *(lengths++)) {
401 else if (prev_digit &&
402 (*lengths == 1 && ((*s ==
'.') || (*s ==
',') || (*s ==
'-'))))
404 else if (prev_digit && *lengths == 1 &&
405 (*(s + *lengths) ==
'\0') && ((*s ==
'%') || (*s ==
')')))
407 else if (prev_digit &&
408 *lengths == 1 && (*s ==
'%') &&
409 (*(lengths + 1) == 1 && *(s + *lengths) ==
')') &&
410 (*(s + *lengths + *(lengths + 1)) ==
'\0'))
◆ acceptable_word_string()
Definition at line 1744 of file control.cpp.
1748 int leading_punct_count;
1749 int upper_count = 0;
1750 int hyphen_pos = -1;
1753 if (strlen (lengths) > 20)
1759 offset += lengths[i++];
1760 leading_punct_count = i;
1763 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[i])) {
1764 offset += lengths[i++];
1767 if (upper_count > 1) {
1771 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[i])) {
1772 offset += lengths[i++];
1780 if (lengths[i] == 1 && s[offset] ==
'-') {
1782 offset += lengths[i++];
1783 if (s[offset] !=
'\0') {
1784 while ((s[offset] !=
'\0') &&
1786 offset += lengths[i++];
1788 if (i < hyphen_pos + 3)
1793 if (lengths[i] == 1 && (s[offset] ==
'\'') &&
1794 lengths[i + 1] == 1 && (s[offset + lengths[i]] ==
's')) {
1795 offset += lengths[i++];
1796 offset += lengths[i++];
1799 if (upper_count > 0)
1806 if (lengths[i] == 1 && s[offset] !=
'\0' &&
1808 offset += lengths[i++];
1809 if (lengths[i] == 1 && s[offset] !=
'\0' && i > 0 &&
1810 s[offset - lengths[i - 1]] != s[offset] &&
1812 offset += lengths[i++];
1814 if (s[offset] !=
'\0')
1823 if (s[0] !=
'\0' && char_set.
get_isupper(s, lengths[0])) {
1825 while (s[offset] !=
'\0' &&
1827 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1828 offset += lengths[i++];
1829 offset += lengths[i++];
1832 else if (s[0] !=
'\0' && char_set.
get_islower(s, lengths[0])) {
1834 while (s[offset] !=
'\0' &&
1836 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1837 offset += lengths[i++];
1838 offset += lengths[i++];
1841 if (s[offset] !=
'\0')
◆ alpha_count()
int16_t tesseract::Tesseract::alpha_count |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 494 of file reject.cpp.
502 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
◆ ambigs_classify_and_output()
void tesseract::Tesseract::ambigs_classify_and_output |
( |
const char * |
label, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
FILE * |
output_file |
|
) |
| |
Definition at line 211 of file recogtraining.cpp.
216 WordData word_data(*pr_it);
226 tprintf(
"Not outputting illegal unichar %s\n", label);
232 const auto** blob_choices =
new const BLOB_CHOICE*[dim];
235 delete[] blob_choices;
◆ AnyLSTMLang()
bool tesseract::Tesseract::AnyLSTMLang |
( |
| ) |
const |
|
inline |
Definition at line 293 of file tesseractclass.h.
296 for (
int i = 0; i < sub_langs_.size(); ++i) {
◆ AnyTessLang()
bool tesseract::Tesseract::AnyTessLang |
( |
| ) |
const |
|
inline |
Definition at line 283 of file tesseractclass.h.
286 for (
int i = 0; i < sub_langs_.size(); ++i) {
◆ ApplyBoxes()
PAGE_RES * tesseract::Tesseract::ApplyBoxes |
( |
const STRING & |
fname, |
|
|
bool |
find_segmentation, |
|
|
BLOCK_LIST * |
block_list |
|
) |
| |
Definition at line 108 of file applybox.cpp.
119 const int box_count = boxes.
size();
120 int box_failures = 0;
124 PAGE_RES* page_res = find_segmentation ?
126 clear_any_old_text(block_list);
128 for (
int i = 0; i < box_count; i++) {
129 bool foundit =
false;
130 if (page_res !=
nullptr) {
132 (i == 0) ?
nullptr : &boxes[i - 1],
134 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
135 full_texts[i].c_str());
138 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
144 "FAILURE! Couldn't find a matching blob");
148 if (page_res ==
nullptr) {
156 tprintf(
" Boxes read from boxfile: %6d\n", box_count);
157 if (box_failures > 0)
158 tprintf(
" Boxes failed resegmentation: %6d\n", box_failures);
◆ ApplyBoxTraining()
void tesseract::Tesseract::ApplyBoxTraining |
( |
const STRING & |
fontname, |
|
|
PAGE_RES * |
page_res |
|
) |
| |
◆ AssignDiacriticsToNewBlobs()
Definition at line 1063 of file control.cpp.
1071 for (
int i = 0; i < outlines.
size(); ++i) {
1072 if (outlines[i] ==
nullptr)
continue;
1075 int num_blob_outlines = 0;
1076 TBOX total_ol_box(outlines[i]->bounding_box());
1077 while (i < outlines.
size() && outlines[i] !=
nullptr) {
1078 blob_wanted[i] =
true;
1079 total_ol_box += outlines[i]->bounding_box();
1081 ++num_blob_outlines;
1085 while (!blob_it.at_last() &&
1086 blob_it.data_relative(1)->bounding_box().left() <=
1087 total_ol_box.left()) {
1093 tprintf(
"Num blobless outlines = %d\n", num_blob_outlines);
1094 C_BLOB* left_blob = blob_it.data();
1096 C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1097 if ((left_box.
x_overlap(total_ol_box) || right_blob ==
nullptr ||
1100 outlines, num_blob_outlines,
1103 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1104 if (blob_wanted[j]) {
1105 (*word_wanted)[j] =
true;
1106 (*target_blobs)[j] = left_blob;
1109 }
else if (right_blob !=
nullptr &&
1113 right_blob, outlines,
1114 num_blob_outlines, &blob_wanted)) {
1116 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1117 if (blob_wanted[j]) {
1118 (*word_wanted)[j] =
true;
1119 (*target_blobs)[j] = right_blob;
1123 outlines, num_blob_outlines,
1126 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1127 if (blob_wanted[j]) {
1128 (*word_wanted)[j] =
true;
1129 (*target_blobs)[j] =
nullptr;
◆ AssignDiacriticsToOverlappingBlobs()
Definition at line 1010 of file control.cpp.
1024 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1025 C_BLOB* blob = blob_it.data();
1028 int num_blob_outlines = 0;
1029 for (
int i = 0; i < outlines.
size(); ++i) {
1031 !(*word_wanted)[i]) {
1032 blob_wanted[i] =
true;
1033 (*overlapped_any_blob)[i] =
true;
1034 ++num_blob_outlines;
1038 tprintf(
"%d noise outlines overlap blob at:", num_blob_outlines);
1047 outlines, num_blob_outlines,
1049 for (
int i = 0; i < blob_wanted.
size(); ++i) {
1050 if (blob_wanted[i]) {
1052 (*word_wanted)[i] =
true;
1053 (*target_blobs)[i] = blob;
◆ AutoPageSeg()
int tesseract::Tesseract::AutoPageSeg |
( |
PageSegMode |
pageseg_mode, |
|
|
BLOCK_LIST * |
blocks, |
|
|
TO_BLOCK_LIST * |
to_blocks, |
|
|
BLOBNBOX_LIST * |
diacritic_blobs, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr |
|
) |
| |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If diacritic_blobs is non-null, then diacritics/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.
If osd (orientation and script detection) is true, then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).
Definition at line 214 of file pagesegmain.cpp.
215 TO_BLOCK_IT to_block_it(&temp_blocks);
216 TO_BLOCK* to_block = to_block_it.data();
217 if (musicmask_pix !=
nullptr) {
220 pixOr(photomask_pix, photomask_pix, musicmask_pix);
223 finder->SetEquationDetect(equ_detect_);
225 result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
226 to_block, photomask_pix, pix_thresholds_,
227 pix_grey_, &pixa_debug_, &found_blocks,
228 diacritic_blobs, to_blocks);
230 finder->GetDeskewVectors(&deskew_, &reskew_);
233 pixDestroy(&photomask_pix);
234 pixDestroy(&musicmask_pix);
235 if (result < 0)
return result;
238 BLOCK_IT block_it(blocks);
240 block_it.add_list_after(&found_blocks);
246 static void AddAllScriptsConverted(
const UNICHARSET& sid_set,
◆ BelievableSuperscript()
bool tesseract::Tesseract::BelievableSuperscript |
( |
bool |
debug, |
|
|
const WERD_RES & |
word, |
|
|
float |
certainty_threshold, |
|
|
int * |
left_ok, |
|
|
int * |
right_ok |
|
) |
| const |
Return whether this is believable superscript or subscript text.
We insist that:
- there are no punctuation marks,
- there are no italics,
- no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
- each character is at least as certain as certainty_threshold.
- Parameters
-
[in] | debug | If true, spew debug output |
[in] | word | The word whose best_choice we're evaluating |
[in] | certainty_threshold | If any of the characters have less certainty than this, reject. |
[out] | left_ok | How many left-side characters were ok? |
[out] | right_ok | How many right-side characters were ok? |
- Returns
- Whether the complete best choice is believable as a superscript.
Definition at line 520 of file superscript.cpp.
526 int initial_ok_run_count = 0;
527 int ok_run_count = 0;
528 float worst_certainty = 0.0f;
532 for (
int i = 0; i < wc.
length(); i++) {
536 bool bad_certainty = char_certainty < certainty_threshold;
540 if (choice && fontinfo_table.
size() > 0) {
543 bool font1_is_italic = font_id1 >= 0
546 is_italic = font1_is_italic &&
547 (font_id2 < 0 || fontinfo_table.
get(font_id2).
is_italic());
550 float height_fraction = 1.0f;
552 float normal_height = char_height;
554 int min_bot, max_bot, min_top, max_top;
558 float hi_height = max_top - max_bot;
559 float lo_height = min_top - min_bot;
560 normal_height = (hi_height + lo_height) / 2;
564 height_fraction = char_height / normal_height;
571 tprintf(
" Rejecting: superscript is italic.\n");
574 tprintf(
" Rejecting: punctuation present.\n");
578 tprintf(
" Rejecting: don't believe character %s with certainty %.2f "
579 "which is less than threshold %.2f\n", char_str,
580 char_certainty, certainty_threshold);
583 tprintf(
" Rejecting: character %s seems too small @ %.2f versus "
584 "expected %.2f\n", char_str, char_height, normal_height);
587 if (bad_certainty || bad_height || is_punc || is_italic) {
588 if (ok_run_count == i) {
589 initial_ok_run_count = ok_run_count;
595 if (char_certainty < worst_certainty) {
596 worst_certainty = char_certainty;
599 bool all_ok = ok_run_count == wc.
length();
600 if (all_ok && debug) {
601 tprintf(
" Accept: worst revised certainty is %.2f\n", worst_certainty);
604 if (left_ok) *left_ok = initial_ok_run_count;
605 if (right_ok) *right_ok = ok_run_count;
◆ BestPix()
Pix* tesseract::Tesseract::BestPix |
( |
| ) |
const |
|
inline |
Definition at line 231 of file tesseractclass.h.
232 if (pixGetWidth(pix_original_) ==
ImageWidth()) {
233 return pix_original_;
234 }
else if (pix_grey_ !=
nullptr) {
◆ bigram_correction_pass()
void tesseract::Tesseract::bigram_correction_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 467 of file control.cpp.
474 while (word_it.forward() !=
nullptr &&
475 (!word_it.word() || word_it.word()->part_of_combo)) {
478 if (!word_it.word())
break;
485 tprintf(
"Skipping because one of the words is W_REP_CHAR\n");
510 tprintf(
"Top choice \"%s %s\" verified by bigram model.\n",
516 tprintf(
"Examining alt choices for \"%s %s\".\n",
527 float best_rating = 0.0;
530 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
539 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
550 if (overrides_word1.
size() == 1 ||
553 best_idx = overrides_word1.
size() - 1;
558 if (!overrides_word1.
empty()) {
561 *overrides_word1[best_idx]) &&
563 *overrides_word2[best_idx])) {
565 tprintf(
"Top choice \"%s %s\" verified (sans case) by bigram "
566 "model.\n", orig_w1_str.
c_str(), orig_w2_str.
c_str());
570 const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
571 const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
572 if (new_w1_str != orig_w1_str) {
575 if (new_w2_str != orig_w2_str) {
579 STRING choices_description;
580 int num_bigram_choices
581 = overrides_word1.
size() * overrides_word2.
size();
582 if (num_bigram_choices == 1) {
583 choices_description =
"This was the unique bigram choice.";
587 const int kMaxChoicesToPrint = 20;
588 for (
int i = 0; i < overrides_word1.
size() &&
589 i < kMaxChoicesToPrint; i++) {
590 if (i > 0) { bigrams_list +=
", "; }
595 choices_description =
"There were many choices: {";
596 choices_description += bigrams_list;
597 choices_description +=
"}";
599 choices_description.
add_str_int(
"There were ", num_bigram_choices);
600 choices_description +=
" compatible bigrams.";
603 tprintf(
"Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
606 choices_description.
c_str());
◆ blamer_pass()
void tesseract::Tesseract::blamer_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 709 of file control.cpp.
712 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
713 page_res_it.forward()) {
721 static_cast<IncorrectResultReason>(bl)),
◆ blob_feature_display()
void tesseract::Tesseract::blob_feature_display |
( |
PAGE_RES * |
page_res, |
|
|
const TBOX & |
selection_box |
|
) |
| |
◆ blob_noise_score()
float tesseract::Tesseract::blob_noise_score |
( |
TBLOB * |
blob | ) |
|
Definition at line 786 of file fixspace.cpp.
789 int16_t outline_count = 0;
790 int16_t max_dimension;
791 int16_t largest_outline_dimension = 0;
795 box = ol->bounding_box();
797 max_dimension = box.
height();
799 max_dimension = box.
width();
802 if (largest_outline_dimension < max_dimension)
803 largest_outline_dimension = max_dimension;
806 if (outline_count > 5) {
808 largest_outline_dimension *= 2;
815 largest_outline_dimension /= 2;
818 return largest_outline_dimension;
◆ break_noisiest_blob_word()
void tesseract::Tesseract::break_noisiest_blob_word |
( |
WERD_RES_LIST & |
words | ) |
|
break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.
Definition at line 641 of file fixspace.cpp.
643 WERD_RES_IT word_it(&words);
644 WERD_RES_IT worst_word_it;
645 float worst_noise_score = 9999;
646 int worst_blob_index = -1;
651 C_BLOB_IT rej_cblob_it;
652 C_BLOB_LIST new_blob_list;
653 C_BLOB_IT new_blob_it;
654 C_BLOB_IT new_rej_cblob_it;
656 int16_t start_of_noise_blob;
659 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
661 if (blob_index > -1 && worst_noise_score > noise_score) {
662 worst_noise_score = noise_score;
663 worst_blob_index = blob_index;
664 worst_word_it = word_it;
667 if (worst_blob_index < 0) {
674 word_res = worst_word_it.data();
678 new_blob_it.set_to_list(&new_blob_list);
680 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681 new_blob_it.add_after_then_move(blob_it.extract());
683 start_of_noise_blob = blob_it.data()->bounding_box().left();
684 delete blob_it.extract();
686 new_word =
new WERD(&new_blob_list, word_res->
word);
694 (!rej_cblob_it.empty() &&
695 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696 rej_cblob_it.forward()) {
697 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
700 auto* new_word_res =
new WERD_RES(new_word);
701 new_word_res->combination =
true;
702 worst_word_it.add_before_then_move(new_word_res);
◆ build_menu_new()
SVMenuNode * tesseract::Tesseract::build_menu_new |
( |
| ) |
|
Definition at line 298 of file pgedit.cpp.
313 parent_menu = root_menu_item->
AddChild(
"DISPLAY");
332 parent_menu = root_menu_item->
AddChild(
"OTHER");
341 return root_menu_item;
◆ check_debug_pt()
bool tesseract::Tesseract::check_debug_pt |
( |
WERD_RES * |
word, |
|
|
int |
location |
|
) |
| |
Definition at line 1848 of file control.cpp.
1849 bool show_map_detail =
false;
1866 tprintf (
"classify_word_pass1 start\n");
1870 tprintf (
"make_reject_map: initial map");
1873 tprintf (
"make_reject_map: after NN");
1876 tprintf (
"classify_word_pass2 - START");
1879 tprintf (
"classify_word_pass2 - Pre Xht");
1882 tprintf (
"classify_word_pass2 - END");
1883 show_map_detail =
true;
1895 tprintf (
"After Poor quality rejection");
1898 tprintf (
"unrej_good_quality_words - START");
1901 tprintf (
"unrej_good_quality_words - END");
1904 tprintf (
"Write results pass");
1905 show_map_detail =
true;
1912 if (show_map_detail) {
1920 tprintf(
"null best choice\n");
1923 tprintf (
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
◆ classify_word_and_language()
void tesseract::Tesseract::classify_word_and_language |
( |
int |
pass_n, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
WordData * |
word_data |
|
) |
| |
Definition at line 1318 of file control.cpp.
1320 #ifdef DISABLED_LEGACY_ENGINE
1325 #endif // def DISABLED_LEGACY_ENGINE
1328 PointerVector<WERD_RES> best_words;
1331 clock_t start_t = clock();
1334 tprintf(
"%s word with lang %s at:",
1335 word->
done ?
"Already done" :
"Processing",
1345 int sub = sub_langs_.size();
1346 if (most_recently_used_ !=
this) {
1348 for (sub = 0; sub < sub_langs_.size() &&
1349 most_recently_used_ != sub_langs_[sub]; ++sub) {}
1352 *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1353 Tesseract* best_lang_tess = most_recently_used_;
1354 if (!WordsAcceptable(best_words)) {
1356 if (most_recently_used_ !=
this &&
1358 &word_data->lang_words[sub_langs_.size()],
1360 best_lang_tess =
this;
1362 for (
int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1364 if (most_recently_used_ != sub_langs_[i] &&
1366 &word_data->lang_words[i],
1368 best_lang_tess = sub_langs_[i];
1372 most_recently_used_ = best_lang_tess;
1373 if (!best_words.empty()) {
1374 if (best_words.size() == 1 && !best_words[0]->combination) {
1376 word_data->word->ConsumeWordResults(best_words[0]);
1379 word_data->word = best_words.back();
1382 ASSERT_HOST(word_data->word->box_word !=
nullptr);
1386 clock_t ocr_t = clock();
1388 tprintf(
"%s (ocr took %.2f sec)\n",
1389 word_data->word->best_choice->unichar_string().c_str(),
1390 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
◆ classify_word_pass1()
classify_word_pass1
Baseline normalize the word and pass it to Tess.
Definition at line 1400 of file control.cpp.
1403 ROW* row = word_data.row;
1404 BLOCK* block = word_data.block;
1406 ? word_data.prev_word->word->best_choice :
nullptr;
1407 #ifndef ANDROID_BUILD
1408 #ifdef DISABLED_LEGACY_ENGINE
1413 #endif // def DISABLED_LEGACY_ENGINE
1416 if (!out_words->
empty())
1425 #ifndef DISABLED_LEGACY_ENGINE
1432 #endif // ndef DISABLED_LEGACY_ENGINE
1434 #endif // ndef ANDROID_BUILD
1436 #ifndef DISABLED_LEGACY_ENGINE
1457 #endif // ndef DISABLED_LEGACY_ENGINE
◆ classify_word_pass2()
classify_word_pass2
Control what to do with the word in pass 2
Definition at line 1571 of file control.cpp.
1578 #ifndef DISABLED_LEGACY_ENGINE
1579 ROW* row = word_data.row;
1580 BLOCK* block = word_data.block;
1583 ? word_data.prev_word->word->best_choice :
nullptr;
1606 #ifndef GRAPHICS_DISABLED
1620 #endif // ndef DISABLED_LEGACY_ENGINE
◆ ClassifyBlobAsWord()
float tesseract::Tesseract::ClassifyBlobAsWord |
( |
int |
pass_n, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
C_BLOB * |
blob, |
|
|
STRING * |
best_str, |
|
|
float * |
c2 |
|
) |
| |
Definition at line 1269 of file control.cpp.
1277 while (it.word() != word_res && it.
word() !=
nullptr) it.forward();
1284 if (wd.word->raw_choice !=
nullptr) {
1285 tprintf(
"word xheight=%g, row=%g, range=[%g,%g]\n", word_res->
x_height,
1286 wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1287 wd.word->raw_choice->max_x_height());
1289 tprintf(
"Got word with null raw choice xheight=%g, row=%g\n", word_res->
x_height,
1290 wd.row->x_height());
1294 if (wd.word->raw_choice !=
nullptr) {
1295 cert = wd.word->raw_choice->certainty();
1296 float rat = wd.word->raw_choice->rating();
1297 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1298 *best_str = wd.word->raw_choice->unichar_string();
1303 it.DeleteCurrentWord();
◆ ClassifyBlobPlusOutlines()
Definition at line 1225 of file control.cpp.
1231 C_BLOB* local_blob =
nullptr;
1232 if (blob !=
nullptr) {
1234 ol_it.set_to_list(blob->
out_list());
1235 first_to_keep = ol_it.data();
1237 for (
int i = 0; i < ok_outlines.
size(); ++i) {
1238 if (ok_outlines[i]) {
1240 if (blob ==
nullptr) {
1241 local_blob =
new C_BLOB(outlines[i]);
1243 ol_it.set_to_list(blob->
out_list());
1245 ol_it.add_before_stay_put(outlines[i]);
1251 ol_it.move_to_first();
1252 if (first_to_keep ==
nullptr) {
1254 for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1259 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
◆ Clear()
void tesseract::Tesseract::Clear |
( |
| ) |
|
Definition at line 574 of file tesseractclass.cpp.
577 pixDestroy(&pix_binary_);
578 pixDestroy(&pix_grey_);
579 pixDestroy(&pix_thresholds_);
580 pixDestroy(&scaled_color_);
581 deskew_ =
FCOORD(1.0f, 0.0f);
582 reskew_ =
FCOORD(1.0f, 0.0f);
585 for (
int i = 0; i < sub_langs_.size(); ++i)
586 sub_langs_[i]->
Clear();
◆ ComputeCompatibleXheight()
float tesseract::Tesseract::ComputeCompatibleXheight |
( |
WERD_RES * |
word_res, |
|
|
float * |
baseline_shift |
|
) |
| |
Definition at line 117 of file fixxht.cpp.
131 tprintf(
"Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
133 height, min_bottom, max_bottom, min_top, max_top,
150 tprintf(
" xht range min=%d, max=%d\n", min_xht, max_xht);
154 for (
int y = min_xht; y <= max_xht; ++y)
155 top_stats.add(y, misfit_dist);
160 int min_shift = min_bottom - bottom;
161 int max_shift = max_bottom - bottom;
163 tprintf(
" bottom shift min=%d, max=%d\n", min_shift, max_shift);
168 int misfit_weight = abs(min_shift);
169 if (max_shift > min_shift)
170 misfit_weight /= max_shift - min_shift;
171 for (
int y = min_shift; y <= max_shift; ++y)
172 shift_stats.add(y, misfit_weight);
174 if (bottom_shift == 0) {
185 if (shift_stats.get_total() > top_stats.get_total()) {
188 tprintf(
"Applying bottom shift=%d\n", bottom_shift);
191 }
while (bottom_shift != 0 &&
192 top_stats.get_total() < shift_stats.get_total());
194 *baseline_shift = -bottom_shift / word_res->
denorm.
y_scale();
196 tprintf(
"baseline shift=%g\n", *baseline_shift);
198 if (top_stats.get_total() == 0)
199 return bottom_shift != 0 ? word_res->
x_height : 0.0f;
202 float new_xht = top_stats.median();
204 tprintf(
"Median xht=%f\n", new_xht);
205 tprintf(
"Mode20:A: New x-height = %f (norm), %f (orig)\n",
212 return bottom_shift != 0 ? word_res->
x_height : 0.0f;
◆ convert_bad_unlv_chs()
void tesseract::Tesseract::convert_bad_unlv_chs |
( |
WERD_RES * |
word_res | ) |
|
◆ ConvertStringToUnichars()
◆ CorrectClassifyWords()
void tesseract::Tesseract::CorrectClassifyWords |
( |
PAGE_RES * |
page_res | ) |
|
◆ count_alphanums() [1/2]
int16_t tesseract::Tesseract::count_alphanums |
( |
const WERD_CHOICE & |
word | ) |
|
◆ count_alphanums() [2/2]
int16_t tesseract::Tesseract::count_alphanums |
( |
WERD_RES * |
word | ) |
|
Definition at line 556 of file reject.cpp.
561 const WERD_CHOICE *best_choice = word_res->best_choice;
562 for (
int i = 0; i < word_res->reject_map.length(); ++i) {
563 if ((word_res->reject_map[i].accepted()) &&
564 (word_res->uch_set->get_isalpha(best_choice->
unichar_id(i)) ||
565 word_res->uch_set->get_isdigit(best_choice->
unichar_id(i)))) {
◆ count_alphas()
int16_t tesseract::Tesseract::count_alphas |
( |
const WERD_CHOICE & |
word | ) |
|
◆ count_outline_errs()
int16_t tesseract::Tesseract::count_outline_errs |
( |
char |
c, |
|
|
int16_t |
outline_count |
|
) |
| |
◆ CountMisfitTops()
int tesseract::Tesseract::CountMisfitTops |
( |
WERD_RES * |
word_res | ) |
|
Definition at line 85 of file fixxht.cpp.
90 tprintf(
"Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
92 bad ?
"Misfit" :
"OK", top, min_top, max_top,
103 float* baseline_shift) {
104 STATS top_stats(0, UINT8_MAX);
105 STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106 int bottom_shift = 0;
111 for (
int blob_id = 0; blob_id < num_blobs; ++blob_id) {
◆ debug_word()
void tesseract::Tesseract::debug_word |
( |
PAGE_RES * |
page_res, |
|
|
const TBOX & |
selection_box |
|
) |
| |
debug_word
Process the whole image, but load word_config_ for the selected word(s).
Definition at line 665 of file pgedit.cpp.
666 #ifndef DISABLED_LEGACY_ENGINE
◆ dictionary_correction_pass()
void tesseract::Tesseract::dictionary_correction_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 2092 of file control.cpp.
2095 word = word_it.forward()) {
2096 if (word->best_choices.singleton())
2100 if (word->tesseract->getDict().valid_word(*best) != 0)
2103 WERD_CHOICE_IT choice_it(&word->best_choices);
2104 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2105 choice_it.forward()) {
2107 if (word->tesseract->getDict().valid_word(*alternate)) {
2110 tprintf(
"Dictionary correction replaces best choice '%s' with '%s'\n",
2115 word->ReplaceBestChoice(alternate);
◆ digit_or_numeric_punct()
bool tesseract::Tesseract::digit_or_numeric_punct |
( |
WERD_RES * |
word, |
|
|
int |
char_position |
|
) |
| |
Definition at line 369 of file fixspace.cpp.
374 for (i = 0, offset = 0; i < char_position;
◆ do_re_display()
void tesseract::Tesseract::do_re_display |
( |
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) |
word_painter | ) |
|
do_re_display()
Redisplay page
Definition at line 349 of file pgedit.cpp.
355 image_win->
Image(pix_binary_, 0, 0);
360 for (
WERD_RES* word = pr_it.
word(); word !=
nullptr; word = pr_it.forward()) {
361 (this->*word_painter)(&pr_it);
362 if (display_baselines && pr_it.row() != pr_it.prev_row())
364 if (display_blocks && pr_it.block() != pr_it.prev_block())
365 pr_it.block()->block->pdblk.plot(image_win, block_count++,
ScrollView::RED);
◆ doc_and_block_rejection()
void tesseract::Tesseract::doc_and_block_rejection |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
bool |
good_quality_doc |
|
) |
| |
Definition at line 225 of file docqual.cpp.
233 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
239 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
248 while ((word = page_res_it.
word()) !=
nullptr) {
249 current_block = page_res_it.
block();
251 if (current_block->char_count > 0 &&
252 (current_block->rej_count * 100.0 / current_block->char_count) >
255 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
256 block_no, current_block->char_count,
257 current_block->rej_count);
259 prev_word_rejected =
false;
260 while ((word = page_res_it.
word()) !=
nullptr &&
261 (page_res_it.
block() == current_block)) {
285 prev_word_rejected &&
291 prev_word_rejected = rej_word;
296 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
303 while (page_res_it.
word() !=
nullptr &&
304 page_res_it.
block() == current_block) {
305 current_row = page_res_it.
row();
312 if (current_row->char_count > 0 &&
313 (current_row->rej_count * 100.0 / current_row->char_count) >
315 (current_row->whole_word_rej_count * 100.0 /
316 current_row->rej_count) <
319 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
320 row_no, current_row->char_count,
321 current_row->rej_count);
323 prev_word_rejected =
false;
324 while ((word = page_res_it.
word()) !=
nullptr &&
325 page_res_it.
row () == current_row) {
342 &accepted_char_quality);
355 prev_word_rejected &&
361 prev_word_rejected = rej_word;
366 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
367 row_no, current_row->char_count, current_row->rej_count);
369 while (page_res_it.
word() !=
nullptr &&
370 page_res_it.
row() == current_row)
◆ dont_allow_1Il()
void tesseract::Tesseract::dont_allow_1Il |
( |
WERD_RES * |
word | ) |
|
Definition at line 524 of file reject.cpp.
533 bool accepted_1Il =
false;
535 for (i = 0, offset = 0; i < word_len;
550 for (i = 0, offset = 0; i < word_len;
◆ dump_words()
void tesseract::Tesseract::dump_words |
( |
WERD_RES_LIST & |
perm, |
|
|
int16_t |
score, |
|
|
int16_t |
mode, |
|
|
bool |
improved |
|
) |
| |
Definition at line 475 of file fixspace.cpp.
477 int16_t mode,
bool improved) {
478 WERD_RES_IT word_res_it(&perm);
483 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484 word_res_it.forward()) {
485 if (!word_res_it.data()->part_of_combo) {
487 word_res_it.data()->best_choice->unichar_string();
496 tprintf(
"EXTRACTED (%d): \"", score);
499 tprintf(
"TESTED (%d): \"", score);
502 tprintf(
"RETURNED (%d): \"", score);
506 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507 word_res_it.forward()) {
508 if (!word_res_it.data()->part_of_combo) {
510 word_res_it.data()->best_choice->unichar_string().c_str(),
511 static_cast<int>(word_res_it.data()->best_choice->permuter()));
515 }
else if (improved) {
517 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518 word_res_it.forward()) {
519 if (!word_res_it.data()->part_of_combo) {
521 word_res_it.data()->best_choice->unichar_string().c_str(),
522 static_cast<int>(word_res_it.data()->best_choice->permuter()));
◆ end_tesseract()
void tesseract::Tesseract::end_tesseract |
( |
| ) |
|
◆ eval_word_spacing()
int16_t tesseract::Tesseract::eval_word_spacing |
( |
WERD_RES_LIST & |
word_res_list | ) |
|
Definition at line 265 of file fixspace.cpp.
267 WERD_RES_IT word_res_it(&word_res_list);
268 int16_t total_score = 0;
269 int16_t word_count = 0;
270 int16_t done_word_count = 0;
275 int16_t prev_word_score = 0;
276 bool prev_word_done =
false;
277 bool prev_char_1 =
false;
278 bool prev_char_digit =
false;
279 bool current_char_1 =
false;
280 bool current_word_ok_so_far;
281 STRING punct_chars =
"!\"`',.:;";
282 bool prev_char_punct =
false;
283 bool current_char_punct =
false;
284 bool word_done =
false;
287 word = word_res_it.data();
291 total_score += prev_word_score;
296 prev_char_digit =
false;
297 prev_word_done =
false;
305 current_word_ok_so_far =
false;
307 (prev_char_digit && (
313 total_score += prev_word_score;
316 current_word_ok_so_far = word_done;
319 if (current_word_ok_so_far) {
320 prev_word_done =
true;
321 prev_word_score = word_len;
323 prev_word_done =
false;
329 for (i = 0, prev_char_1 =
false; i < word_len; i++) {
331 if (prev_char_1 || (current_char_1 && (i > 0)))
333 prev_char_1 = current_char_1;
339 for (i = 0, offset = 0, prev_char_punct =
false; i < word_len;
343 if (prev_char_punct || (current_char_punct && i > 0))
345 prev_char_punct = current_char_punct;
349 for (i = 0, offset = 0; i < word_len - 1;
358 word_res_it.forward();
359 }
while (word_res_it.data()->part_of_combo);
360 }
while (!word_res_it.at_first());
361 total_score += prev_word_score;
364 if (done_word_count == word_count)
◆ failure_count()
int16_t tesseract::Tesseract::failure_count |
( |
WERD_RES * |
word | ) |
|
Definition at line 946 of file docqual.cpp.
953 for (; *str !=
'\0'; str++) {
◆ FindSegmentation()
◆ first_alphanum_index()
int16_t tesseract::Tesseract::first_alphanum_index |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 468 of file reject.cpp.
475 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
◆ first_alphanum_offset()
int16_t tesseract::Tesseract::first_alphanum_offset |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 481 of file reject.cpp.
488 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
◆ fix_fuzzy_space_list()
void tesseract::Tesseract::fix_fuzzy_space_list |
( |
WERD_RES_LIST & |
best_perm, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 170 of file fixspace.cpp.
176 WERD_RES_LIST current_perm;
177 int16_t current_score;
178 bool improved =
false;
181 dump_words(best_perm, best_score, 1, improved);
186 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
189 dump_words(current_perm, current_score, 2, improved);
190 if (current_score > best_score) {
193 best_score = current_score;
◆ fix_fuzzy_spaces()
void tesseract::Tesseract::fix_fuzzy_spaces |
( |
ETEXT_DESC * |
monitor, |
|
|
int32_t |
word_count, |
|
|
PAGE_RES * |
page_res |
|
) |
| |
Definition at line 73 of file fixspace.cpp.
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
83 WERD_RES_LIST fuzzy_space_words;
85 bool prevent_null_wd_fixsp;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91 block_res_it.forward()) {
92 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94 row_res_it.forward()) {
95 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96 while (!word_res_it_from.at_last()) {
97 word_res = word_res_it_from.data();
98 while (!word_res_it_from.at_last() &&
100 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
101 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
103 block_res_it.data()->block);
104 word_res = word_res_it_from.forward();
106 if (monitor !=
nullptr) {
108 monitor->
progress = 90 + 5 * word_index / word_count;
110 (monitor->
cancel !=
nullptr &&
116 if (!word_res_it_from.at_last()) {
117 word_res_it_to = word_res_it_from;
118 prevent_null_wd_fixsp =
119 word_res->word->cblob_list()->empty();
122 word_res_it_to.forward();
124 if (monitor !=
nullptr) {
126 monitor->
progress = 90 + 5 * word_index / word_count;
128 (monitor->
cancel !=
nullptr &&
132 while (!word_res_it_to.at_last () &&
133 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
134 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
137 if (word_res->word->cblob_list()->empty())
138 prevent_null_wd_fixsp =
true;
139 word_res = word_res_it_to.forward();
143 if (word_res->word->cblob_list()->empty())
144 prevent_null_wd_fixsp =
true;
145 if (prevent_null_wd_fixsp) {
146 word_res_it_from = word_res_it_to;
148 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
151 row_res_it.data()->row,
152 block_res_it.data()->block);
153 new_length = fuzzy_space_words.length();
154 word_res_it_from.add_list_before(&fuzzy_space_words);
156 !word_res_it_from.at_last() && new_length > 0;
158 word_res_it_from.forward();
165 block_res_it.data()->block);
◆ fix_noisy_space_list()
void tesseract::Tesseract::fix_noisy_space_list |
( |
WERD_RES_LIST & |
best_perm, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 595 of file fixspace.cpp.
599 WERD_RES_IT best_perm_it(&best_perm);
600 WERD_RES_LIST current_perm;
601 WERD_RES_IT current_perm_it(&current_perm);
603 int16_t current_score;
604 bool improved =
false;
608 dump_words(best_perm, best_score, 1, improved);
610 old_word_res = best_perm_it.data();
619 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
622 dump_words(current_perm, current_score, 2, improved);
623 if (current_score > best_score) {
626 best_score = current_score;
633 dump_words(best_perm, best_score, 3, improved);
◆ fix_rep_char()
void tesseract::Tesseract::fix_rep_char |
( |
PAGE_RES_IT * |
page_res_it | ) |
|
fix_rep_char() The word is a repeated character (a leader). Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.
Definition at line 1705 of file control.cpp.
1711 for (
int i = 0; i < word.
length(); ++i) {
1717 int max_count = rep_ch.MaxCount(&maxch_id);
1719 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1720 if (best_choice ==
nullptr) {
1721 tprintf(
"Failed to find a choice for %s, occurring %d times\n",
1725 word_res->
done =
true;
1731 C_BLOB* prev_blob = blob_it.data();
1732 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1733 C_BLOB* blob = blob_it.data();
1735 gap -= prev_blob->bounding_box().right();
1740 CorrectRepcharChoices(best_choice, word_res);
◆ fix_sp_fp_word()
void tesseract::Tesseract::fix_sp_fp_word |
( |
WERD_RES_IT & |
word_res_it, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 561 of file fixspace.cpp.
565 WERD_RES_LIST sub_word_list;
566 WERD_RES_IT sub_word_list_it(&sub_word_list);
571 word_res = word_res_it.data();
583 tprintf(
"FP fixspace working on \"%s\"\n",
587 sub_word_list_it.add_after_stay_put(word_res_it.extract());
589 new_length = sub_word_list.length();
590 word_res_it.add_list_before(&sub_word_list);
591 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592 word_res_it.forward();
◆ fixspace_thinks_word_done()
bool tesseract::Tesseract::fixspace_thinks_word_done |
( |
WERD_RES * |
word | ) |
|
◆ flip_0O()
void tesseract::Tesseract::flip_0O |
( |
WERD_RES * |
word | ) |
|
Definition at line 671 of file reject.cpp.
682 int num_blobs = word_res->rebuild_word->NumBlobs();
683 for (i = 0; i < best_choice->
length() && i < num_blobs; ++i) {
684 TBLOB* blob = word_res->rebuild_word->blobs[i];
685 if (word_res->uch_set->get_isupper(best_choice->
unichar_id(i)) ||
686 word_res->uch_set->get_isdigit(best_choice->
unichar_id(i))) {
693 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id(
"0");
694 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id(
"O");
695 if (unichar_0 == INVALID_UNICHAR_ID ||
696 !word_res->uch_set->get_enabled(unichar_0) ||
697 unichar_O == INVALID_UNICHAR_ID ||
698 !word_res->uch_set->get_enabled(unichar_O)) {
701 for (i = 1; i < best_choice->
length(); ++i) {
702 if (best_choice->
unichar_id(i) == unichar_0 ||
705 if ((i+1) < best_choice->
length() &&
712 (i+1) < best_choice->
length() &&
715 (i+2) < best_choice->
length() &&
724 (((i+1) < best_choice->
length() &&
725 !word_res->uch_set->get_isdigit(best_choice->
unichar_id(i+1)) &&
726 !word_res->uch_set->eq(best_choice->
unichar_id(i+1),
"l") &&
727 !word_res->uch_set->eq(best_choice->
unichar_id(i+1),
"I")) ||
728 (i == best_choice->
length() - 1))) {
733 (i+1) < best_choice->
length() &&
739 (i+2) < best_choice->
length() &&
751 (i+2) < best_choice->
length() &&
754 !word_res->uch_set->get_isupper(best_choice->
unichar_id(i+2))) {
761 (i+1) < best_choice->
length() &&
762 !word_res->uch_set->get_isupper(best_choice->
unichar_id(i+1))) {
767 (word_res->uch_set->eq(best_choice->
unichar_id(i-1),
".") ||
768 word_res->uch_set->eq(best_choice->
unichar_id(i-1),
",")) &&
769 (word_res->uch_set->get_isdigit(best_choice->
unichar_id(i-2)) ||
771 if (best_choice->
unichar_id(i-2) == unichar_O) {
774 while (i < best_choice->length() &&
◆ flip_hyphens()
void tesseract::Tesseract::flip_hyphens |
( |
WERD_RES * |
word | ) |
|
Definition at line 614 of file reject.cpp.
620 int prev_right = -9999;
628 int num_blobs = word_res->rebuild_word->NumBlobs();
629 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id(
"-");
630 for (i = 0; i < best_choice->
length() && i < num_blobs; ++i) {
631 TBLOB* blob = word_res->rebuild_word->blobs[i];
633 if (i + 1 == num_blobs)
636 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().
left();
638 if ((out_box.
width() > 8 * word_res->denorm.x_scale()) &&
639 (out_box.
left() > prev_right) && (out_box.
right() < next_left)) {
640 aspect_ratio = out_box.
width() / static_cast<float>(out_box.
height());
641 if (word_res->uch_set->eq(best_choice->
unichar_id(i),
".")) {
643 word_res->uch_set->contains_unichar_id(unichar_dash) &&
644 word_res->uch_set->get_enabled(unichar_dash)) {
647 if (word_res->reject_map[i].rejected())
648 word_res->reject_map[i].setrej_hyphen_accept();
651 word_res->reject_map[i].accepted())
653 word_res->reject_map[i].setrej_hyphen ();
655 else if (best_choice->
unichar_id(i) == unichar_dash) {
657 (word_res->reject_map[i].rejected()))
658 word_res->reject_map[i].setrej_hyphen_accept();
662 (word_res->reject_map[i].accepted()))
664 word_res->reject_map[i].setrej_hyphen();
◆ font_recognition_pass()
void tesseract::Tesseract::font_recognition_pass |
( |
PAGE_RES * |
page_res | ) |
|
font_recognition_pass
Smooth the fonts for the document.
Definition at line 2036 of file control.cpp.
2039 STATS doc_fonts(0, font_table_size_);
2042 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
2043 page_res_it.forward()) {
2044 word = page_res_it.
word();
2053 int8_t doc_font_count;
2054 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2055 if (doc_font_count == 0)
2058 const FontInfo* modal_font =
nullptr;
2059 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
2060 page_res_it.forward()) {
2061 word = page_res_it.
word();
2074 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
2075 page_res_it.forward()) {
2076 word = page_res_it.
word();
2080 if (!(
count == length || (length > 3 &&
count >= length * 3 / 4))) {
◆ fp_eval_word_spacing()
int16_t tesseract::Tesseract::fp_eval_word_spacing |
( |
WERD_RES_LIST & |
word_res_list | ) |
|
Definition at line 856 of file fixspace.cpp.
858 WERD_RES_IT word_it(&word_res_list);
864 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865 word = word_it.data();
◆ garbage_word()
Definition at line 658 of file docqual.cpp.
675 int isolated_digits = 0;
676 int isolated_alphas = 0;
677 int bad_char_count = 0;
682 int alpha_repetition_count = 0;
683 int longest_alpha_repetition_count = 0;
684 int longest_lower_run_len = 0;
685 int lower_string_count = 0;
686 int longest_upper_run_len = 0;
687 int upper_string_count = 0;
688 int total_alpha_count = 0;
689 int total_digit_count = 0;
691 for (; *str !=
'\0'; str += *(lengths++)) {
696 case SUBSEQUENT_UPPER:
698 state = SUBSEQUENT_UPPER;
699 upper_string_count++;
700 if (longest_upper_run_len < upper_string_count)
701 longest_upper_run_len = upper_string_count;
703 alpha_repetition_count++;
704 if (longest_alpha_repetition_count < alpha_repetition_count) {
705 longest_alpha_repetition_count = alpha_repetition_count;
710 alpha_repetition_count = 1;
719 alpha_repetition_count = 1;
720 upper_string_count = 1;
727 case SUBSEQUENT_LOWER:
729 state = SUBSEQUENT_LOWER;
730 lower_string_count++;
731 if (longest_lower_run_len < lower_string_count)
732 longest_lower_run_len = lower_string_count;
734 alpha_repetition_count++;
735 if (longest_alpha_repetition_count < alpha_repetition_count) {
736 longest_alpha_repetition_count = alpha_repetition_count;
741 alpha_repetition_count = 1;
750 alpha_repetition_count = 1;
751 lower_string_count = 1;
759 state = SUBSEQUENT_NUM;
772 if (*lengths == 1 && *str ==
' ')
802 total_alpha_count += total_digit_count - isolated_digits;
806 2 * (total_alpha_count - isolated_alphas) > len &&
816 strpbrk(str,
" ") ==
nullptr &&
825 ok_chars = len - bad_char_count - isolated_digits -
826 isolated_alphas - tess_rejs;
829 tprintf(
"garbage_word: \"%s\"\n",
831 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
833 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
835 if (bad_char_count == 0 &&
837 (len > isolated_digits + isolated_alphas || len <= 2))
840 if (tess_rejs > ok_chars ||
841 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
845 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
847 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
852 dodgy_chars = 2 * tess_rejs + bad_char_count;
853 if ((len == 4 && dodgy_chars > 2) ||
854 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
◆ get_rep_char()
Definition at line 251 of file output.cpp.
256 if (i < word->reject_map.length()) {
◆ get_sub_lang()
Tesseract* tesseract::Tesseract::get_sub_lang |
( |
int |
index | ) |
const |
|
inline |
◆ getDict()
Dict & tesseract::Tesseract::getDict |
( |
| ) |
|
|
overridevirtual |
◆ GetLineData()
Definition at line 135 of file linerec.cpp.
143 if (image_data ==
nullptr)
return nullptr;
150 for (
int b = start_box; b < end_box; ++b) {
152 box.
rotate(block_rotation);
159 image_data->AddBoxes(line_boxes, line_texts, page_numbers);
◆ GetRectImage()
ImageData * tesseract::Tesseract::GetRectImage |
( |
const TBOX & |
box, |
|
|
const BLOCK & |
block, |
|
|
int |
padding, |
|
|
TBOX * |
revised_box |
|
) |
| const |
Definition at line 169 of file linerec.cpp.
172 wbox.
pad(padding, padding);
176 int num_rotations = 0;
190 int width = pixGetWidth(pix);
191 int height = pixGetHeight(pix);
192 TBOX image_box(0, 0, width, height);
194 *revised_box &= image_box;
195 if (revised_box->
null_box())
return nullptr;
196 Box* clip_box = boxCreate(revised_box->
left(), height - revised_box->
top(),
198 Pix* box_pix = pixClipRectangle(pix, clip_box,
nullptr);
199 if (box_pix ==
nullptr)
return nullptr;
200 boxDestroy(&clip_box);
201 if (num_rotations > 0) {
202 Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
203 pixDestroy(&box_pix);
207 int depth = pixGetDepth(box_pix);
210 grey = pixConvertTo8(box_pix,
false);
211 pixDestroy(&box_pix);
214 bool vertical_text =
false;
215 if (num_rotations > 0) {
218 revised_box->
rotate(rotation);
219 if (num_rotations != 2)
220 vertical_text =
true;
222 return new ImageData(vertical_text, box_pix);
◆ GetSubAndSuperscriptCandidates()
void tesseract::Tesseract::GetSubAndSuperscriptCandidates |
( |
const WERD_RES * |
word, |
|
|
int * |
num_rebuilt_leading, |
|
|
ScriptPos * |
leading_pos, |
|
|
float * |
leading_certainty, |
|
|
int * |
num_rebuilt_trailing, |
|
|
ScriptPos * |
trailing_pos, |
|
|
float * |
trailing_certainty, |
|
|
float * |
avg_certainty, |
|
|
float * |
unlikely_threshold |
|
) |
| |
Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts or subscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.
- Parameters
-
[in] | word | The word to examine. |
[out] | num_rebuilt_leading | the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified. |
[out] | leading_pos | "super" or "sub" (for debugging) |
[out] | leading_certainty | the worst certainty in the leading blobs. |
[out] | num_rebuilt_trailing | the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified. |
[out] | trailing_pos | "super" or "sub" (for debugging) |
[out] | trailing_certainty | the worst certainty in the trailing blobs. |
[out] | avg_certainty | the average certainty of "normal" blobs in the word. |
[out] | unlikely_threshold | the threshold (on certainty) we used to select "bad enough" outlier characters. |
Definition at line 252 of file superscript.cpp.
262 *avg_certainty = *unlikely_threshold = 0.0f;
263 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
264 *leading_certainty = *trailing_certainty = 0.0f;
274 *leading_pos = *trailing_pos =
SP_NORMAL;
275 int leading_outliers = 0;
276 int trailing_outliers = 0;
278 float normal_certainty_total = 0.0f;
279 float worst_normal_certainty = 0.0f;
282 for (
int b = 0; b < num_blobs; ++b) {
285 if (box.
bottom() >= super_y_bottom) {
287 }
else if (box.
top() <= sub_y_top) {
293 if (char_certainty < worst_normal_certainty) {
294 worst_normal_certainty = char_certainty;
297 normal_certainty_total += char_certainty;
299 if (trailing_outliers == b) {
300 leading_outliers = trailing_outliers;
301 *leading_pos = last_pos;
303 trailing_outliers = 0;
305 if (last_pos == pos) {
308 trailing_outliers = 1;
313 *trailing_pos = last_pos;
314 if (num_normal >= 3) {
316 normal_certainty_total -= worst_normal_certainty;
318 if (num_normal > 0) {
319 *avg_certainty = normal_certainty_total / num_normal;
322 if (num_normal == 0 ||
323 (leading_outliers == 0 && trailing_outliers == 0)) {
330 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
331 *num_rebuilt_leading < leading_outliers;
332 (*num_rebuilt_leading)++) {
334 if (char_certainty > *unlikely_threshold) {
337 if (char_certainty < *leading_certainty) {
338 *leading_certainty = char_certainty;
343 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
344 *num_rebuilt_trailing < trailing_outliers;
345 (*num_rebuilt_trailing)++) {
346 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
348 if (char_certainty > *unlikely_threshold) {
351 if (char_certainty < *trailing_certainty) {
352 *trailing_certainty = char_certainty;
◆ ImageHeight()
int tesseract::Tesseract::ImageHeight |
( |
| ) |
const |
|
inline |
◆ ImageWidth()
int tesseract::Tesseract::ImageWidth |
( |
| ) |
const |
|
inline |
◆ init_recog_training()
FILE * tesseract::Tesseract::init_recog_training |
( |
const STRING & |
fname | ) |
|
Definition at line 36 of file recogtraining.cpp.
44 STRING output_fname = fname;
45 const char* lastdot = strrchr(output_fname.
c_str(),
'.');
46 if (lastdot !=
nullptr)
47 output_fname[lastdot - output_fname.
c_str()] =
'\0';
48 output_fname +=
".txt";
49 FILE* output_file = fopen(output_fname.
c_str(),
"a+");
50 if (output_file ==
nullptr) {
51 tprintf(
"Error: Could not open file %s\n", output_fname.
c_str());
◆ init_tesseract() [1/2]
Definition at line 302 of file tessedit.cpp.
304 if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
305 const char* lang_str = langs_to_load[lang_index].c_str();
307 if (!loaded_primary) {
313 int result = tess_to_init->init_tesseract_internal(
314 arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
315 vars_values, set_only_non_debug_params, mgr);
319 if (!loaded_primary) {
321 tprintf(
"Failed loading language '%s'\n", lang_str);
324 &langs_to_load, &langs_not_to_load);
325 loaded_primary =
true;
329 tprintf(
"Failed loading language '%s'\n", lang_str);
332 sub_langs_.push_back(tess_to_init);
335 &langs_to_load, &langs_not_to_load);
340 if (!loaded_primary) {
341 tprintf(
"Tesseract couldn't load any languages!\n");
344 #ifndef DISABLED_LEGACY_ENGINE
345 if (!sub_langs_.empty()) {
352 for (
int s = 0; s < sub_langs_.size(); ++s) {
353 sub_langs_[s]->language_model_->getParamsModel().Copy(
356 tprintf(
"Using params model of the primary language\n");
359 for (
int s = 0; s < sub_langs_.size(); ++s) {
360 sub_langs_[s]->language_model_->getParamsModel().Clear();
366 #endif // ndef DISABLED_LEGACY_ENGINE
◆ init_tesseract() [2/2]
int tesseract::Tesseract::init_tesseract |
( |
const char * |
datapath, |
|
|
const char * |
language, |
|
|
OcrEngineMode |
oem |
|
) |
| |
|
inline |
◆ init_tesseract_internal()
Definition at line 402 of file tessedit.cpp.
409 #ifndef DISABLED_LEGACY_ENGINE
414 for (
int i = 0; i < new_fonts.
size(); ++i) {
423 for (
int i = 0; i < lang_fonts->
size(); ++i) {
◆ init_tesseract_lang_data()
Definition at line 95 of file tessedit.cpp.
97 tprintf(
"Error opening data file %s\n", tessdata_path.c_str());
99 "Please make sure the TESSDATA_PREFIX environment variable is set"
100 " to your \"tessdata\" directory.\n");
103 #ifndef DISABLED_LEGACY_ENGINE
107 if (!mgr->IsLSTMAvailable()) {
109 }
else if (!mgr->IsBaseAvailable()) {
115 #endif // ndef DISABLED_LEGACY_ENGINE
130 for (
int i = 0; i < configs_size; ++i) {
136 if (vars_vec !=
nullptr && vars_values !=
nullptr) {
137 for (
int i = 0; i < vars_vec->
size(); ++i) {
139 (*vars_values)[i].c_str(),
140 set_params_constraint, this->
params())) {
141 tprintf(
"Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
148 if (params_file !=
nullptr) {
152 tprintf(
"Failed to open %s for writing params.\n",
169 #ifndef ANDROID_BUILD
170 # ifdef DISABLED_LEGACY_ENGINE
175 # endif // ndef DISABLED_LEGACY_ENGINE
181 tprintf(
"Error: LSTM requested, but not present!! Loading tesseract.\n");
185 #endif // ndef ANDROID_BUILD
190 #ifndef ANDROID_BUILD
192 #endif // ndef ANDROID_BUILD
194 #ifndef DISABLED_LEGACY_ENGINE
197 tprintf(
"Error: Tesseract (legacy) engine requested, but components are "
198 "not present in %s!!\n", tessdata_path.c_str());
201 #endif // ndef DISABLED_LEGACY_ENGINE
203 tprintf(
"Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
208 #ifndef DISABLED_LEGACY_ENGINE
228 static_cast<ParamsModel::PassEnum>(p));
235 #endif // ndef DISABLED_LEGACY_ENGINE
241 static bool IsStrInList(
const STRING& str,
243 for (
int i = 0; i < str_list.
size(); ++i) {
244 if (str_list[i] == str)
return true;
◆ init_tesseract_lm()
int tesseract::Tesseract::init_tesseract_lm |
( |
const char * |
arg0, |
|
|
const char * |
textbase, |
|
|
const char * |
language, |
|
|
TessdataManager * |
mgr |
|
) |
| |
◆ join_words()
Definition at line 231 of file tfacepp.cpp.
244 split_pt.
x = (prev_box.
right() + blob_box.
left()) / 2;
245 split_pt.
y = (prev_box.
top() + prev_box.
bottom() +
266 const int kAltsPerPiece = 2;
268 const int kTooManyAltChoices = 100;
271 WERD_CHOICE_LIST joined_choices;
272 WERD_CHOICE_IT jc_it(&joined_choices);
276 int total_joined_choices = num_word1_choices;
282 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
283 if (total_joined_choices >= kTooManyAltChoices &&
284 bc2_index > kAltsPerPiece)
287 for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
288 ++bc1_index, bc1_it.forward()) {
289 if (total_joined_choices >= kTooManyAltChoices &&
290 bc1_index > kAltsPerPiece)
293 *wc += *bc2_it.data();
294 jc_it.add_after_then_move(wc);
295 ++total_joined_choices;
300 bc1_it.move_to_first();
301 bc2_it.move_to_first();
302 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
303 *bc1_it.data() += *bc2_it.data();
305 bc1_it.move_to_last();
306 bc1_it.add_list_after(&joined_choices);
310 if (orig_bb !=
nullptr) {
◆ LSTMRecognizeWord()
Definition at line 228 of file linerec.cpp.
245 if (im_data ==
nullptr)
return;
◆ make_reject_map()
void tesseract::Tesseract::make_reject_map |
( |
WERD_RES * |
word, |
|
|
ROW * |
row, |
|
|
int16_t |
pass |
|
) |
| |
◆ match_current_words()
void tesseract::Tesseract::match_current_words |
( |
WERD_RES_LIST & |
words, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 222 of file fixspace.cpp.
225 WERD_RES_IT word_it(&words);
230 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231 word = word_it.data();
233 WordData word_data(block, row, word);
◆ match_word_pass_n()
void tesseract::Tesseract::match_word_pass_n |
( |
int |
pass_n, |
|
|
WERD_RES * |
word, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
match_word_pass_n
Baseline normalize the word and pass it to Tess.
Definition at line 1629 of file control.cpp.
1641 tprintf(
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
◆ MaximallyChopWord()
Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.
Definition at line 242 of file applybox.cpp.
256 tprintf(
"Maximally chopping word at:");
261 auto rating = static_cast<float>(INT8_MAX);
276 const double e = exp(1.0);
278 int right_chop_index = 0;
281 SEAM* seam =
nullptr;
283 &blob_number)) !=
nullptr) {
285 BLOB_CHOICE* left_choice = blob_choices[blob_number];
286 rating = left_choice->
rating() / e;
290 auto* right_choice =
new BLOB_CHOICE(++right_chop_index,
291 rating - 0.125f, -rating, -1,
293 blob_choices.
insert(right_choice, blob_number + 1);
◆ mutable_pix_binary()
Pix** tesseract::Tesseract::mutable_pix_binary |
( |
| ) |
|
|
inline |
◆ mutable_textord()
Textord* tesseract::Tesseract::mutable_textord |
( |
| ) |
|
|
inline |
◆ nn_match_word()
void tesseract::Tesseract::nn_match_word |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
◆ nn_recover_rejects()
void tesseract::Tesseract::nn_recover_rejects |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
◆ noise_outlines()
bool tesseract::Tesseract::noise_outlines |
( |
TWERD * |
word | ) |
|
Definition at line 958 of file docqual.cpp.
963 int16_t outline_count = 0;
964 int16_t small_outline_count = 0;
965 int16_t max_dimension;
968 for (
int b = 0; b < word->
NumBlobs(); ++b) {
972 box = ol->bounding_box();
974 max_dimension = box.
height();
976 max_dimension = box.
width();
977 if (max_dimension < small_limit)
978 small_outline_count++;
◆ non_0_digit()
◆ non_O_upper()
◆ num_sub_langs()
int tesseract::Tesseract::num_sub_langs |
( |
| ) |
const |
|
inline |
◆ one_ell_conflict()
bool tesseract::Tesseract::one_ell_conflict |
( |
WERD_RES * |
word_res, |
|
|
bool |
update_map |
|
) |
| |
Definition at line 291 of file reject.cpp.
297 int16_t first_alphanum_index_;
298 int16_t first_alphanum_offset_;
301 bool non_conflict_set_char;
302 bool conflict =
false;
311 word_len = strlen(lengths);
324 for (i = 0, offset = 0, non_conflict_set_char =
false;
325 (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326 non_conflict_set_char =
330 if (!non_conflict_set_char) {
348 dict_word_ok = (dict_word_type > 0) &&
353 (dict_perm_type && dict_word_ok)) {
356 if (lengths[first_alphanum_index_] == 1 &&
357 word[first_alphanum_offset_] ==
'I') {
363 setrej_1Il_conflict();
372 if (lengths[first_alphanum_index_] == 1 &&
373 word[first_alphanum_offset_] ==
'l') {
379 setrej_1Il_conflict();
403 if (lengths[first_alphanum_index_] == 1 &&
404 word[first_alphanum_offset_] ==
'l') {
411 else if (lengths[first_alphanum_index_] == 1 &&
412 word[first_alphanum_offset_] ==
'I') {
431 for (i = 0, offset = 0; word[offset] !=
'\0';
433 if ((!allow_1s || (word[offset] !=
'1')) &&
436 word_res->
reject_map[i].setrej_1Il_conflict ();
453 setrej_1Il_conflict ();
◆ output_pass()
void tesseract::Tesseract::output_pass |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
const TBOX * |
target_word_box |
|
) |
| |
Definition at line 35 of file output.cpp.
38 const TBOX *target_word_box) {
45 block_of_last_word =
nullptr;
46 while (page_res_it.
word () !=
nullptr) {
49 if (target_word_box) {
52 (current_word_box.
right() + current_word_box.
left()) / 2,
53 (current_word_box.
bottom() + current_word_box.
top()) / 2);
54 if (!target_word_box->
contains(center_pt)) {
60 block_of_last_word != page_res_it.
block ()) {
61 block_of_last_word = page_res_it.
block ();
80 nextword, nextblock), force_eol);
◆ ParseLanguageString()
Definition at line 270 of file tessedit.cpp.
277 target->push_back(lang_code);
288 char** configs,
int configs_size,
291 bool set_only_non_debug_params,
292 TessdataManager* mgr) {
◆ pgeditor_main()
void tesseract::Tesseract::pgeditor_main |
( |
int |
width, |
|
|
int |
height, |
|
|
PAGE_RES * |
page_res |
|
) |
| |
pgeditor_main()
Top-level editor operation: set up a new window and a corresponding event handler.
Definition at line 378 of file pgedit.cpp.
379 current_page_res = page_res;
386 build_image_window(width, height);
389 #ifndef GRAPHICS_DISABLED
◆ pix_binary()
Pix* tesseract::Tesseract::pix_binary |
( |
| ) |
const |
|
inline |
◆ pix_grey()
Pix* tesseract::Tesseract::pix_grey |
( |
| ) |
const |
|
inline |
◆ pix_original()
Pix* tesseract::Tesseract::pix_original |
( |
| ) |
const |
|
inline |
◆ potential_word_crunch()
bool tesseract::Tesseract::potential_word_crunch |
( |
WERD_RES * |
word, |
|
|
GARBAGE_LEVEL |
garbage_level, |
|
|
bool |
ok_dict_word |
|
) |
| |
Definition at line 520 of file docqual.cpp.
529 bool word_crunchable;
530 int poor_indicator_count = 0;
539 if (adjusted_len > 10)
545 tprintf(
"Potential poor rating on \"%s\"\n",
548 poor_indicator_count++;
551 if (word_crunchable &&
554 tprintf(
"Potential poor cert on \"%s\"\n",
557 poor_indicator_count++;
560 if (garbage_level !=
G_OK) {
562 tprintf(
"Potential garbage on \"%s\"\n",
565 poor_indicator_count++;
◆ PreenXHeights()
void tesseract::Tesseract::PreenXHeights |
( |
BLOCK_LIST * |
block_list | ) |
|
Any row xheight that is significantly different from the median is set to the median.
Definition at line 180 of file applybox.cpp.
182 const double median_xheight = MedianXHeight(block_list);
185 BLOCK_IT b_it(block_list);
186 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187 BLOCK* block = b_it.data();
189 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190 ROW* row = r_it.data();
191 const double diff = fabs(row->
x_height() - median_xheight);
192 if (diff > max_deviation) {
194 tprintf(
"row xheight=%g, but median xheight = %g\n",
◆ PrepareForPageseg()
void tesseract::Tesseract::PrepareForPageseg |
( |
| ) |
|
Definition at line 641 of file tesseractclass.cpp.
644 auto max_pageseg_strategy =
645 static_cast<ShiroRekhaSplitter::SplitStrategy>(
647 for (
int i = 0; i < sub_langs_.size(); ++i) {
648 auto pageseg_strategy =
649 static_cast<ShiroRekhaSplitter::SplitStrategy>(
651 if (pageseg_strategy > max_pageseg_strategy)
652 max_pageseg_strategy = pageseg_strategy;
653 pixDestroy(&sub_langs_[i]->pix_binary_);
654 sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
660 if (splitter_.
Split(
true, &pixa_debug_)) {
662 pixDestroy(&pix_binary_);
◆ PrepareForTessOCR()
void tesseract::Tesseract::PrepareForTessOCR |
( |
BLOCK_LIST * |
block_list, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr |
|
) |
| |
Definition at line 672 of file tesseractclass.cpp.
675 auto max_ocr_strategy =
676 static_cast<ShiroRekhaSplitter::SplitStrategy>(
678 for (
int i = 0; i < sub_langs_.size(); ++i) {
680 static_cast<ShiroRekhaSplitter::SplitStrategy>(
682 if (ocr_strategy > max_ocr_strategy)
683 max_ocr_strategy = ocr_strategy;
689 bool split_for_ocr = splitter_.
Split(
false, &pixa_debug_);
692 pixDestroy(&pix_binary_);
693 pix_binary_ = pixClone(splitter_.
orig_pix());
698 BLOCK block(
"",
true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
699 pixGetHeight(pix_binary_));
◆ PrerecAllWordsPar()
Definition at line 38 of file par_control.cpp.
41 for (
int w = 0; w < words.
size(); ++w) {
42 if (words[w].word->ratings !=
nullptr &&
43 words[w].word->ratings->get(0, 0) ==
nullptr) {
44 for (
int s = 0; s < words[w].lang_words.
size(); ++s) {
45 Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] :
this;
46 const WERD_RES& word = *words[w].lang_words[s];
56 #pragma omp parallel for num_threads(10)
58 for (
int b = 0; b < blobs.
size(); ++b) {
60 blobs[b].tesseract->classify_blob(blobs[b].blob,
"par",
White,
nullptr);
64 for (
int b = 0; b < blobs.
size(); ++b) {
66 blobs[b].tesseract->classify_blob(blobs[b].blob,
"par",
White,
nullptr);
◆ process_cmd_win_event()
bool tesseract::Tesseract::process_cmd_win_event |
( |
int32_t |
cmd_event, |
|
|
char * |
new_value |
|
) |
| |
Definition at line 415 of file pgedit.cpp.
458 mode =static_cast<CMD_EVENTS>(cmd_event);
463 word_config_ = parameter;
467 if (new_value[0] ==
'T')
474 if (new_value[0] ==
'T')
482 if (new_value[0] ==
'T')
489 if (new_value[0] ==
'T')
496 if (new_value[0] ==
'T')
503 if (new_value[0] ==
'T')
513 display_image =(new_value[0] ==
'T');
517 display_blocks =(new_value[0] ==
'T');
521 display_baselines =(new_value[0] ==
'T');
569 snprintf(msg,
sizeof(msg),
"Unrecognised event %" PRId32
"(%s)",
570 cmd_event, new_value);
◆ process_image_event()
void tesseract::Tesseract::process_image_event |
( |
const SVEvent & |
event | ) |
|
process_image_event()
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
Definition at line 587 of file pgedit.cpp.
603 show_point(current_page_res, event.
x, event.
y);
609 selection_box =
TBOX(down, up);
635 #ifndef DISABLED_LEGACY_ENGINE
636 image_win->
AddMessage(
"Recogging selected words");
640 #endif // ndef DISABLED_LEGACY_ENGINE
643 image_win->
AddMessage(
"Recogging selected blobs");
651 sprintf(msg,
"Mode %d not yet implemented", mode);
◆ process_selected_words()
void tesseract::Tesseract::process_selected_words |
( |
PAGE_RES * |
page_res, |
|
|
TBOX & |
selection_box, |
|
|
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) |
word_processor |
|
) |
| |
◆ ProcessTargetWord()
bool tesseract::Tesseract::ProcessTargetWord |
( |
const TBOX & |
word_box, |
|
|
const TBOX & |
target_word_box, |
|
|
const char * |
word_config, |
|
|
int |
pass |
|
) |
| |
Definition at line 120 of file control.cpp.
124 if (word_config !=
nullptr) {
126 if (backup_config_file_ ==
nullptr) {
128 FILE* config_fp = fopen(backup_config_file_,
"wb");
129 if (config_fp ==
nullptr) {
130 tprintf(
"Error, failed to open file \"%s\"\n", backup_config_file_);
140 if (backup_config_file_ !=
nullptr) {
144 backup_config_file_ =
nullptr;
147 }
else if (pass > 1 && !word_box.
major_overlap(target_word_box)) {
◆ quality_based_rejection()
void tesseract::Tesseract::quality_based_rejection |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
bool |
good_quality_doc |
|
) |
| |
◆ read_config_file()
void tesseract::Tesseract::read_config_file |
( |
const char * |
filename, |
|
|
SetParamConstraint |
constraint |
|
) |
| |
◆ ReassignDiacritics()
bool tesseract::Tesseract::ReassignDiacritics |
( |
int |
pass, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
bool * |
make_next_word_fuzzy |
|
) |
| |
Definition at line 944 of file control.cpp.
946 *make_next_word_fuzzy =
false;
960 &word_wanted, &overlapped_any_blob,
968 int num_overlapped = 0;
969 int num_overlapped_used = 0;
970 for (
int i = 0; i < overlapped_any_blob.
size(); ++i) {
971 if (overlapped_any_blob[i]) {
973 if (word_wanted[i]) ++num_overlapped_used;
977 outlines[i] =
nullptr;
983 int non_overlapped = 0;
984 int non_overlapped_used = 0;
985 for (
int i = 0; i < word_wanted.
size(); ++i) {
986 if (word_wanted[i]) ++non_overlapped_used;
987 if (outlines[i] !=
nullptr) ++non_overlapped_used;
990 tprintf(
"Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
991 num_overlapped_used, num_overlapped, non_overlapped_used,
997 make_next_word_fuzzy)) {
1002 return num_overlapped_used != 0 || non_overlapped_used != 0;
◆ recog_all_words()
bool tesseract::Tesseract::recog_all_words |
( |
PAGE_RES * |
page_res, |
|
|
ETEXT_DESC * |
monitor, |
|
|
const TBOX * |
target_word_box, |
|
|
const char * |
word_config, |
|
|
int |
dopasses |
|
) |
| |
recog_all_words()
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1 is run; if 2, pass 2 and higher are run. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s), otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification.) Returns false if we cancelled prematurely.
- Parameters
-
page_res | page structure |
monitor | progress monitor |
word_config | word_config file |
target_word_box | if not null, the rectangle whose overlapping words receive special handling |
dopasses | 0 - all passes, 1 - just pass 1, 2 - pass 2 and higher |
Definition at line 302 of file control.cpp.
314 if (dopasses==0 || dopasses==1) {
315 page_res_it.restart_page();
318 #ifndef DISABLED_LEGACY_ENGINE
329 for (
int i = 0; i < sub_langs_.size(); ++i) {
331 sub_langs_[i]->SwitchAdaptiveClassifier();
333 sub_langs_[i]->StartBackupAdaptiveClassifier();
337 #endif // ndef DISABLED_LEGACY_ENGINE
343 #ifndef DISABLED_LEGACY_ENGINE
347 #endif // ndef DISABLED_LEGACY_ENGINE
358 most_recently_used_ =
this;
362 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
363 page_res_it.forward()) {
364 if (page_res_it.word()->word->flag(
W_REP_CHAR)) {
370 if (page_res_it.word()->best_choice->permuter() ==
USER_DAWG_PERM)
375 if (page_res_it.word()->blamer_bundle !=
nullptr &&
376 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
378 page_res_it.word()->blamer_bundle->misadaption_debug());
383 if (dopasses == 1)
return true;
385 #ifndef DISABLED_LEGACY_ENGINE
390 page_res_it.restart_page();
396 most_recently_used_ =
this;
427 #endif // ndef DISABLED_LEGACY_ENGINE
434 #ifndef DISABLED_LEGACY_ENGINE
440 #endif //ndef DISABLED_LEGACY_ENGINE
442 const auto pageseg_mode = static_cast<PageSegMode>(
447 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
448 page_res_it.forward()) {
450 const POLY_BLOCK* pb = page_res_it.block()->block !=
nullptr
451 ? page_res_it.block()->block->pdblk.poly_block()
455 page_res_it.DeleteCurrentWord();
459 if (monitor !=
nullptr) {
◆ recog_interactive()
bool tesseract::Tesseract::recog_interactive |
( |
PAGE_RES_IT * |
pr_it | ) |
|
Recognize a single word in interactive mode.
- Parameters
-
pr_it | the page results iterator |
Definition at line 77 of file control.cpp.
79 int16_t good_char_qual;
81 WordData word_data(*pr_it);
84 if (lstm_recognizer_ ==
nullptr) {
85 #ifndef DISABLED_LEGACY_ENGINE
87 #endif // ndef DISABLED_LEGACY_ENGINE
91 #ifndef DISABLED_LEGACY_ENGINE
95 tprintf(
"\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96 "char_quality: %d; good_char_quality: %d\n",
101 #endif // ndef DISABLED_LEGACY_ENGINE
◆ recog_pseudo_word()
void tesseract::Tesseract::recog_pseudo_word |
( |
PAGE_RES * |
page_res, |
|
|
TBOX & |
selection_box |
|
) |
| |
◆ recog_training_segmented()
void tesseract::Tesseract::recog_training_segmented |
( |
const STRING & |
fname, |
|
|
PAGE_RES * |
page_res, |
|
|
volatile ETEXT_DESC * |
monitor, |
|
|
FILE * |
output_file |
|
) |
| |
Definition at line 84 of file recogtraining.cpp.
89 const char* lastdot = strrchr(box_fname.
c_str(),
'.');
90 if (lastdot !=
nullptr)
91 box_fname[lastdot - box_fname.
c_str()] =
'\0';
94 FILE* box_file = fopen(box_fname.
c_str(),
"r");
95 if (box_file ==
nullptr) {
96 tprintf(
"Error: Could not open file %s\n", box_fname.
c_str());
110 int examined_words = 0;
112 keep_going = read_t(&page_res_it, &tbox);
120 keep_going = read_t(&page_res_it, &tbox);
130 keep_going = read_t(&page_res_it, &tbox);
144 }
while (keep_going);
153 if (page_res_it.
word()) {
159 if (examined_words < 0.85 * total_words) {
161 "TODO(antonova): clean up recog_training_segmented; "
162 " It examined only a small fraction of the ambigs image.\n");
164 tprintf(
"recog_training_segmented: examined %d / %d words.\n", examined_words,
◆ recog_word()
void tesseract::Tesseract::recog_word |
( |
WERD_RES * |
word | ) |
|
Definition at line 41 of file tfacepp.cpp.
50 tprintf(
"recog_word ASSERT FAIL String:\"%s\"; "
51 "Strlen=%d; #Blobs=%d\n",
59 tprintf(
"Not all words have valid states relative to ratings matrix!!");
79 tprintf(
"Permuter Type Flipped from %d to %d\n",
◆ recog_word_recursive()
void tesseract::Tesseract::recog_word_recursive |
( |
WERD_RES * |
word | ) |
|
Definition at line 104 of file tfacepp.cpp.
114 tprintf(
"recog_word: Discarded long string \"%s\""
115 " (%d characters vs %d blobs)\n",
◆ RecogAllWordsPassN()
Definition at line 213 of file control.cpp.
222 for (
int w = 0; w < words->
size(); ++w) {
223 WordData* word = &(*words)[w];
224 if (w > 0) word->prev_word = &(*words)[w - 1];
225 if (monitor !=
nullptr) {
241 for (; w < words->
size(); ++w) {
247 if (word->word->tess_failed) {
249 for (s = 0; s < word->lang_words.size() &&
250 word->lang_words[s]->tess_failed; ++s) {}
252 if (s > word->lang_words.size())
continue;
255 while (pr_it->
word() !=
nullptr && pr_it->
word() != word->
word)
258 bool make_next_word_fuzzy =
false;
259 #ifndef DISABLED_LEGACY_ENGINE
265 #endif // ndef DISABLED_LEGACY_ENGINE
269 tprintf(
"Pass%d: %s [%s]\n", pass_n,
270 word->word->best_choice->unichar_string().c_str(),
271 word->word->best_choice->debug_string().c_str());
274 if (make_next_word_fuzzy && pr_it->
word() !=
nullptr) {
◆ recognize_page()
void tesseract::Tesseract::recognize_page |
( |
STRING & |
image_name | ) |
|
◆ reject_edge_blobs()
void tesseract::Tesseract::reject_edge_blobs |
( |
WERD_RES * |
word | ) |
|
Definition at line 263 of file reject.cpp.
274 for (
int blobindex = 0; blobindex < blobcount; blobindex++) {
280 word->
reject_map[blobindex].setrej_edge_char();
◆ reject_I_1_L()
void tesseract::Tesseract::reject_I_1_L |
( |
WERD_RES * |
word | ) |
|
◆ reject_mostly_rejects()
void tesseract::Tesseract::reject_mostly_rejects |
( |
WERD_RES * |
word | ) |
|
◆ rejection_passes()
void tesseract::Tesseract::rejection_passes |
( |
PAGE_RES * |
page_res, |
|
|
ETEXT_DESC * |
monitor, |
|
|
const TBOX * |
target_word_box, |
|
|
const char * |
word_config |
|
) |
| |
Definition at line 612 of file control.cpp.
624 if (monitor !=
nullptr) {
630 page_res_it.forward();
637 if (target_word_box &&
639 *target_word_box, word_config, 4)) {
640 page_res_it.forward();
645 page_res_it.rej_stat_word();
653 int16_t all_char_quality;
654 int16_t accepted_all_char_quality;
666 (blob_quality == 0) && (outline_errs >= chars_in_word))
669 page_res_it.forward();
674 (
"QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
675 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
689 bool good_quality_doc =
◆ repeated_nonalphanum_wd()
bool tesseract::Tesseract::repeated_nonalphanum_wd |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
Definition at line 580 of file reject.cpp.
584 int16_t char_quality;
585 int16_t accepted_char_quality;
602 (char_quality == accepted_char_quality))
◆ ReportFailedBox()
void tesseract::Tesseract::ReportFailedBox |
( |
int |
boxfile_lineno, |
|
|
TBOX |
box, |
|
|
const char * |
box_ch, |
|
|
const char * |
err_msg |
|
) |
| |
◆ ReportXhtFixResult()
void tesseract::Tesseract::ReportXhtFixResult |
( |
bool |
accept_new_word, |
|
|
float |
new_x_ht, |
|
|
WERD_RES * |
word, |
|
|
WERD_RES * |
new_word |
|
) |
| |
Definition at line 1461 of file control.cpp.
1463 tprintf(
"New XHT Match:%s = %s ",
1474 new_x_ht > 0.1 ?
"STILL DOUBT" :
"OK",
1475 accept_new_word ?
"ACCEPTED" :
"");
◆ ReSegmentByClassification()
void tesseract::Tesseract::ReSegmentByClassification |
( |
PAGE_RES * |
page_res | ) |
|
◆ ResegmentCharBox()
bool tesseract::Tesseract::ResegmentCharBox |
( |
PAGE_RES * |
page_res, |
|
|
const TBOX * |
prev_box, |
|
|
const TBOX & |
box, |
|
|
const TBOX * |
next_box, |
|
|
const char * |
correct_text |
|
) |
| |
Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.
Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.
- Returns
- false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.
This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.
Definition at line 328 of file applybox.cpp.
◆ ResegmentWordBox()
bool tesseract::Tesseract::ResegmentWordBox |
( |
BLOCK_LIST * |
block_list, |
|
|
const TBOX & |
box, |
|
|
const TBOX * |
next_box, |
|
|
const char * |
correct_text |
|
) |
| |
◆ ResetAdaptiveClassifier()
void tesseract::Tesseract::ResetAdaptiveClassifier |
( |
| ) |
|
Definition at line 597 of file tesseractclass.cpp.
599 for (
int i = 0; i < sub_langs_.size(); ++i) {
600 sub_langs_[i]->ResetAdaptiveClassifierInternal();
◆ ResetDocumentDictionary()
void tesseract::Tesseract::ResetDocumentDictionary |
( |
| ) |
|
Definition at line 607 of file tesseractclass.cpp.
609 for (
int i = 0; i < sub_langs_.size(); ++i) {
610 sub_langs_[i]->getDict().ResetDocumentDictionary();
◆ reskew()
const FCOORD& tesseract::Tesseract::reskew |
( |
| ) |
const |
|
inline |
◆ RetryWithLanguage()
Definition at line 903 of file control.cpp.
908 tprintf(
"Trying word using lang %s, oem %d\n",
912 PointerVector<WERD_RES> new_words;
913 (this->*recognizer)(word_data, in_word, &new_words);
914 if (new_words.empty()) {
917 new_words.push_back(*in_word);
921 for (
int i = 0; i < new_words.size(); ++i)
922 new_words[i]->DebugTopChoice(
"Lang result");
928 debug, &new_words, best_words);
◆ right_to_left()
bool tesseract::Tesseract::right_to_left |
( |
| ) |
const |
|
inline |
◆ RunOldFixXht()
bool tesseract::Tesseract::RunOldFixXht |
( |
WERD_RES * |
word, |
|
|
BLOCK * |
block, |
|
|
ROW * |
row |
|
) |
| |
◆ safe_dict_word()
int16_t tesseract::Tesseract::safe_dict_word |
( |
const WERD_RES * |
werd_res | ) |
|
◆ scaled_color()
Pix* tesseract::Tesseract::scaled_color |
( |
| ) |
const |
|
inline |
◆ scaled_factor()
int tesseract::Tesseract::scaled_factor |
( |
| ) |
const |
|
inline |
◆ script_pos_pass()
void tesseract::Tesseract::script_pos_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 733 of file control.cpp.
735 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
736 page_res_it.forward()) {
739 page_res_it.forward();
742 const float x_height = page_res_it.block()->block->x_height();
743 float word_x_height = word->
x_height;
744 if (word_x_height < word->best_choice->min_x_height() ||
752 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
754 small_cap_xheight - small_cap_delta <= word_x_height &&
755 word_x_height <= small_cap_xheight + small_cap_delta) {
765 if (num_upper > 0 && num_lower == 0)
◆ SearchForText()
void tesseract::Tesseract::SearchForText |
( |
const GenericVector< BLOB_CHOICE_LIST * > * |
choices, |
|
|
int |
choices_pos, |
|
|
int |
choices_length, |
|
|
const GenericVector< UNICHAR_ID > & |
target_text, |
|
|
int |
text_index, |
|
|
float |
rating, |
|
|
GenericVector< int > * |
segmentation, |
|
|
float * |
best_rating, |
|
|
GenericVector< int > * |
best_segmentation |
|
) |
| |
◆ SearchWords()
Definition at line 259 of file linerec.cpp.
264 const Dict* stopper_dict = lstm_recognizer_->
GetDict();
265 if (stopper_dict ==
nullptr) stopper_dict = &
getDict();
266 bool any_nonspace_delimited =
false;
267 for (
int w = 0; w < words->
size(); ++w) {
271 any_nonspace_delimited =
true;
275 for (
int w = 0; w < words->
size(); ++w) {
296 tprintf(
"Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
◆ SegmentPage()
int tesseract::Tesseract::SegmentPage |
( |
const STRING * |
input_file, |
|
|
BLOCK_LIST * |
blocks, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr |
|
) |
| |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.
Definition at line 113 of file pagesegmain.cpp.
119 BLOCK_IT block_it(blocks);
120 auto* block =
new BLOCK(
"",
true, 0, 0, 0, 0, width, height);
122 block_it.add_to_end(block);
133 BLOBNBOX_LIST diacritic_blobs;
134 int auto_page_seg_ret_val = 0;
135 TO_BLOCK_LIST to_blocks;
139 pageseg_mode, blocks, &to_blocks,
142 return auto_page_seg_ret_val;
146 deskew_ =
FCOORD(1.0f, 0.0f);
147 reskew_ =
FCOORD(1.0f, 0.0f);
149 Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
150 if (pixcleaned !=
nullptr) {
151 pixDestroy(&pix_binary_);
152 pix_binary_ = pixcleaned;
157 if (auto_page_seg_ret_val < 0) {
161 if (blocks->empty()) {
170 textord_.
TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
171 pix_thresholds_, pix_grey_, splitting || cjk_mode,
172 &diacritic_blobs, blocks, &to_blocks);
173 return auto_page_seg_ret_val;
◆ SelectGoodDiacriticOutlines()
Definition at line 1139 of file control.cpp.
1144 float target_cert = certainty_threshold;
1145 if (blob !=
nullptr) {
1149 tprintf(
"No Noise blob classified as %s=%g(%g) at:", best_str.
c_str(),
1150 target_cert, target_c2);
1160 pr_it, blob, &all_str);
1163 for (
int i = 0; i < test_outlines.
size(); ++i) {
1164 if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1166 tprintf(
"All Noise blob classified as %s=%g, delta=%g at:",
1167 all_str.
c_str(), best_cert, best_cert - target_cert);
1173 while (num_outlines > 1 && best_index >= 0 &&
1174 (blob ==
nullptr || best_cert < target_cert || blob !=
nullptr)) {
1177 for (
int i = 0; i < outlines.
size(); ++i) {
1178 if (test_outlines[i]) {
1179 test_outlines[i] =
false;
1185 for (
int j = 0; j < outlines.
size(); ++j) {
1186 if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1187 tprintf(
"%d", test_outlines[j]);
1189 tprintf(
" blob classified as %s=%g, delta=%g) at:", str.
c_str(),
1190 cert, cert - target_cert);
1193 if (cert > best_cert) {
1196 best_outlines = test_outlines;
1198 test_outlines[i] =
true;
1201 if (best_index >= 0) {
1202 test_outlines[best_index] =
false;
1206 if (best_cert >= target_cert) {
1208 *ok_outlines = best_outlines;
1210 tprintf(
"%s noise combination ", blob ?
"Adding" :
"New");
1211 for (
int i = 0; i < best_outlines.
size(); ++i) {
1212 tprintf(
"%d", best_outlines[i]);
1214 tprintf(
" yields certainty %g, beating target of %g\n", best_cert,
◆ set_done()
void tesseract::Tesseract::set_done |
( |
WERD_RES * |
word, |
|
|
int16_t |
pass |
|
) |
| |
◆ set_pix_grey()
void tesseract::Tesseract::set_pix_grey |
( |
Pix * |
grey_pix | ) |
|
|
inline |
Definition at line 206 of file tesseractclass.h.
207 pixDestroy(&pix_grey_);
208 pix_grey_ = grey_pix;
◆ set_pix_original()
void tesseract::Tesseract::set_pix_original |
( |
Pix * |
original_pix | ) |
|
|
inline |
Definition at line 214 of file tesseractclass.h.
215 pixDestroy(&pix_original_);
216 pix_original_ = original_pix;
218 for (
int i = 0; i < sub_langs_.size(); ++i) {
219 sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
◆ set_pix_thresholds()
void tesseract::Tesseract::set_pix_thresholds |
( |
Pix * |
thresholds | ) |
|
|
inline |
Definition at line 240 of file tesseractclass.h.
241 pixDestroy(&pix_thresholds_);
242 pix_thresholds_ = thresholds;
◆ set_source_resolution()
void tesseract::Tesseract::set_source_resolution |
( |
int |
ppi | ) |
|
|
inline |
◆ set_unlv_suspects()
void tesseract::Tesseract::set_unlv_suspects |
( |
WERD_RES * |
word | ) |
|
Definition at line 272 of file output.cpp.
274 int len = word_res->reject_map.length();
275 const WERD_CHOICE &word = *(word_res->best_choice);
281 for (i = 0; i < len; i++) {
282 if (word_res->reject_map[i].rejected())
283 word_res->reject_map[i].setrej_minimal_rej_accept();
296 for (i = 0; i < len; ++i) {
297 if (word_res->reject_map[i].rejected() &&
299 word_res->reject_map[i].setrej_minimal_rej_accept();
303 rating_per_ch = word.
rating() / word_res->reject_map.length();
310 for (i = 0; i < len; ++i) {
311 if (word_res->reject_map[i].rejected() &&
313 word_res->reject_map[i].setrej_minimal_rej_accept();
317 for (i = 0; i < len; i++) {
318 if (word_res->reject_map[i].rejected()) {
319 if (word_res->reject_map[i].flag(
R_DOC_REJ))
320 word_res->reject_map[i].setrej_minimal_rej_accept();
322 word_res->reject_map[i].setrej_minimal_rej_accept();
323 if (word_res->reject_map[i].flag(
R_ROW_REJ))
324 word_res->reject_map[i].setrej_minimal_rej_accept();
333 for (i = 0; i < len; i++) {
334 if (word_res->reject_map[i].rejected()) {
337 word_res->reject_map[i].setrej_minimal_rej_accept();
341 word_res->reject_map[i].setrej_minimal_rej_accept();
353 for (i = 0; i < len; i++) {
354 if (word_res->reject_map[i].rejected() &&
355 (!word_res->reject_map[i].perm_rejected() ||
359 word_res->reject_map[i].setrej_minimal_rej_accept();
◆ set_word_fonts()
void tesseract::Tesseract::set_word_fonts |
( |
WERD_RES * |
word | ) |
|
set_word_fonts
Get the fonts for the word.
Definition at line 1961 of file control.cpp.
1967 #ifndef DISABLED_LEGACY_ENGINE
1969 if (fontinfo_size == 0)
return;
1975 tprintf(
"Examining fonts in %s\n",
1980 if (choice ==
nullptr)
continue;
1982 for (
int f = 0; f < fonts.
size(); ++f) {
1983 const int fontinfo_id = fonts[f].fontinfo_id;
1984 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1985 font_total_score[fontinfo_id] += fonts[f].score;
1990 int score1 = 0, score2 = 0;
1991 int16_t font_id1 = -1, font_id2 = -1;
1992 for (
int f = 0; f < fontinfo_size; ++f) {
1994 tprintf(
"Font %s, total score = %d\n",
1997 if (font_total_score[f] > score1) {
1999 font_id2 = font_id1;
2000 score1 = font_total_score[f];
2002 }
else if (font_total_score[f] > score2) {
2003 score2 = font_total_score[f];
2017 tprintf(
"Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2022 tprintf(
"Word modal font=%s, score=%d. No 2nd choice\n",
2027 #endif // ndef DISABLED_LEGACY_ENGINE
◆ SetBlackAndWhitelist()
void tesseract::Tesseract::SetBlackAndWhitelist |
( |
| ) |
|
Definition at line 614 of file tesseractclass.cpp.
619 if (lstm_recognizer_) {
626 for (
int i = 0; i < sub_langs_.size(); ++i) {
627 sub_langs_[i]->unicharset.set_black_and_whitelist(
630 if (sub_langs_[i]->lstm_recognizer_) {
631 UNICHARSET& lstm_unicharset = sub_langs_[i]->lstm_recognizer_->GetUnicharset();
◆ SetEquationDetect()
void tesseract::Tesseract::SetEquationDetect |
( |
EquationDetect * |
detector | ) |
|
◆ SetScaledColor()
void tesseract::Tesseract::SetScaledColor |
( |
int |
factor, |
|
|
Pix * |
color |
|
) |
| |
|
inline |
Definition at line 262 of file tesseractclass.h.
263 scaled_factor_ = factor;
264 scaled_color_ = color;
◆ SetupAllWordsPassN()
void tesseract::Tesseract::SetupAllWordsPassN |
( |
int |
pass_n, |
|
|
const TBOX * |
target_word_box, |
|
|
const char * |
word_config, |
|
|
PAGE_RES * |
page_res, |
|
|
GenericVector< WordData > * |
words |
|
) |
| |
If tesseract is to be run, sets the words up ready for it.
Definition at line 154 of file control.cpp.
161 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
162 page_res_it.forward()) {
163 if (target_word_box ==
nullptr ||
165 *target_word_box, word_config, 1)) {
170 for (
int w = 0; w < words->
size(); ++w) {
172 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
◆ SetupApplyBoxes()
Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.
Definition at line 206 of file applybox.cpp.
211 BLOCK_IT b_it(block_list);
212 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213 BLOCK* block = b_it.data();
215 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216 ROW* row = r_it.data();
218 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219 WERD* word = w_it.data();
221 delete w_it.extract();
229 auto* page_res =
new PAGE_RES(
false, block_list,
nullptr);
232 while ((word_res = pr_it.word()) !=
nullptr) {
234 pr_it.row()->row, word_res);
◆ SetupPageSegAndDetectOrientation()
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation |
( |
PageSegMode |
pageseg_mode, |
|
|
BLOCK_LIST * |
blocks, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr, |
|
|
TO_BLOCK_LIST * |
to_blocks, |
|
|
Pix ** |
photo_mask_pix, |
|
|
Pix ** |
music_mask_pix |
|
) |
| |
Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.
Definition at line 284 of file pagesegmain.cpp.
290 pixa_debug_.
AddPix(pix_binary_,
"NoLines");
295 pixa_debug_.
AddPix(pix_binary_,
"NoImages");
302 TO_BLOCK_IT to_block_it(to_blocks);
306 TO_BLOCK* to_block = to_block_it.data();
307 TBOX blkbox = to_block->block->pdblk.bounding_box();
308 ColumnFinder* finder =
nullptr;
309 int estimated_resolution = source_resolution_;
314 estimated_resolution = res;
315 tprintf(
"Estimating resolution as %d\n", estimated_resolution);
319 if (to_block->line_size >= 2) {
320 finder =
new ColumnFinder(static_cast<int>(to_block->line_size),
324 &h_lines, vertical_x, vertical_y);
326 finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
328 #ifndef DISABLED_LEGACY_ENGINE
334 BLOBNBOX_CLIST osd_blobs;
339 int osd_orientation = 0;
346 to_block, &osd_blobs);
348 if (
PSM_OSD_ENABLED(pageseg_mode) && osd_tess !=
nullptr && osr !=
nullptr) {
350 if (osd_tess !=
this) {
353 AddAllScriptsConverted(
unicharset, osd_tess->unicharset, &osd_scripts);
354 for (
int s = 0; s < sub_langs_.size(); ++s) {
355 AddAllScriptsConverted(sub_langs_[s]->
unicharset,
356 osd_tess->unicharset, &osd_scripts);
367 for (
int i = 0; i < 4; ++i) {
368 if (i != osd_orientation &&
374 const char* best_script_str =
375 osd_tess->unicharset.get_script_from_script_id(best_script_id);
376 bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
377 best_script_id == osd_tess->unicharset.hiragana_sid() ||
378 best_script_id == osd_tess->unicharset.katakana_sid() ||
379 strcmp(
"Japanese", best_script_str) == 0 ||
380 strcmp(
"Korean", best_script_str) == 0 ||
381 strcmp(
"Hangul", best_script_str) == 0;
383 finder->set_cjk_script(
true);
387 if (!cjk && !vertical_text && osd_orientation == 2) {
389 tprintf(
"OSD: Weak margin (%.2f), horiz textlines, not CJK: "
390 "Don't rotate.\n", osd_margin);
394 "OSD: Weak margin (%.2f) for %d blob text block, "
395 "but using orientation anyway: %d\n",
396 osd_margin, osd_blobs.length(), osd_orientation);
400 osd_blobs.shallow_clear();
401 finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
403 #endif // ndef DISABLED_LEGACY_ENGINE
◆ SetupUniversalFontIds()
void tesseract::Tesseract::SetupUniversalFontIds |
( |
| ) |
|
Definition at line 447 of file tessedit.cpp.
456 nullptr, 0,
nullptr,
nullptr,
false, mgr))
464 #endif // ndef DISABLED_LEGACY_ENGINE
◆ SetupWordPassN()
void tesseract::Tesseract::SetupWordPassN |
( |
int |
pass_n, |
|
|
WordData * |
word |
|
) |
| |
Definition at line 177 of file control.cpp.
178 if (pass_n == 1 || !word->word->done) {
185 word->row, word->block);
186 }
else if (pass_n == 2) {
188 word->word->caps_height = 0.0;
189 if (word->word->x_height == 0.0f)
190 word->word->x_height = word->row->x_height();
192 word->lang_words.truncate(0);
193 for (
int s = 0; s <= sub_langs_.size(); ++s) {
195 Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] :
this;
198 word->lang_words.push_back(word_res);
200 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode !=
OEM_LSTM_ONLY) {
201 word_res->SetupForRecognition(
202 lang_t->unicharset, lang_t,
BestPix(),
203 lang_t->tessedit_ocr_engine_mode,
nullptr,
204 lang_t->classify_bln_numeric_mode,
205 lang_t->textord_use_cjk_fp_model,
206 lang_t->poly_allow_detailed_fx, word->row, word->block);
◆ SetupWordScripts()
void tesseract::Tesseract::SetupWordScripts |
( |
BLOCK_LIST * |
blocks | ) |
|
◆ source_resolution()
int tesseract::Tesseract::source_resolution |
( |
| ) |
const |
|
inline |
◆ split_and_recog_word()
void tesseract::Tesseract::split_and_recog_word |
( |
WERD_RES * |
word | ) |
|
Definition at line 137 of file tfacepp.cpp.
139 int bestgap = -INT32_MAX;
144 int gap = blob_box.
left() - prev_box.
right();
154 split_word(word, split_index, &word2, &orig_bb);
◆ split_word()
Definition at line 174 of file tfacepp.cpp.
179 ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
190 auto *chopped2 =
new TWERD;
192 for (
int i = split_pt; i < chopped->
NumBlobs(); ++i) {
193 chopped2->blobs.push_back(chopped->
blobs[i]);
197 delete word2->chopped_word;
198 word2->chopped_word =
nullptr;
202 word2->ClearResults();
204 word2->chopped_word = chopped2;
206 word2->SetupBasicsFromChoppedWord(
unicharset);
209 if (orig_bb !=
nullptr) {
215 word2->chopped_word->blobs[0]->bounding_box().left(),
220 *right_piece = word2;
221 *orig_blamer_bundle = orig_bb;
◆ SubAndSuperscriptFix()
bool tesseract::Tesseract::SubAndSuperscriptFix |
( |
WERD_RES * |
word | ) |
|
Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.
This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.
- Returns
- Whether we modified the given word.
Definition at line 100 of file superscript.cpp.
106 int num_leading, num_trailing;
108 float leading_certainty, trailing_certainty;
109 float avg_certainty, unlikely_threshold;
113 word, &num_leading, &sp_leading, &leading_certainty,
114 &num_trailing, &sp_trailing, &trailing_certainty,
115 &avg_certainty, &unlikely_threshold);
117 const char *leading_pos = sp_leading ==
SP_SUBSCRIPT ?
"sub" :
"super";
118 const char *trailing_pos = sp_trailing ==
SP_SUBSCRIPT ?
"sub" :
"super";
126 int num_remainder_leading = 0, num_remainder_trailing = 0;
127 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
132 int last_word_char = num_blobs - 1 - num_trailing;
135 last_char_certainty <= unlikely_threshold) {
137 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
138 nullptr,
nullptr, &rpos, &num_remainder_trailing);
139 if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
140 if (num_remainder_trailing > 0 &&
141 last_char_certainty < trailing_certainty) {
142 trailing_certainty = last_char_certainty;
145 bool another_blob_available = (num_remainder_trailing == 0) ||
146 num_leading + num_trailing + 1 < num_blobs;
148 if (another_blob_available &&
150 first_char_certainty <= unlikely_threshold) {
152 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
153 &lpos, &num_remainder_leading,
nullptr,
nullptr);
154 if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
155 if (num_remainder_leading > 0 &&
156 first_char_certainty < leading_certainty) {
157 leading_certainty = first_char_certainty;
163 if (num_leading + num_trailing +
164 num_remainder_leading + num_remainder_trailing == 0) {
169 tprintf(
"Candidate for superscript detection: %s (",
171 if (num_leading || num_remainder_leading) {
172 tprintf(
"%d.%d %s-leading ", num_leading, num_remainder_leading,
175 if (num_trailing || num_remainder_trailing) {
176 tprintf(
"%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
185 tprintf(
" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
186 avg_certainty, unlikely_threshold);
188 tprintf(
"Orig. leading (min): %.2f ", leading_certainty);
190 tprintf(
"Orig. trailing (min): %.2f ", trailing_certainty);
197 int num_chopped_leading =
198 LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
199 int num_chopped_trailing =
200 TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202 int retry_leading = 0;
203 int retry_trailing = 0;
204 bool is_good =
false;
206 num_chopped_leading, leading_certainty, sp_leading,
207 num_chopped_trailing, trailing_certainty, sp_trailing,
208 word, &is_good, &retry_leading, &retry_trailing);
211 }
else if (retry_leading || retry_trailing) {
212 int retry_chopped_leading =
213 LeadingUnicharsToChopped(revised, retry_leading);
214 int retry_chopped_trailing =
215 TrailingUnicharsToChopped(revised, retry_trailing);
217 retry_chopped_leading, leading_certainty, sp_leading,
218 retry_chopped_trailing, trailing_certainty, sp_trailing,
219 revised, &is_good, &retry_leading, &retry_trailing);
◆ terrible_word_crunch()
Definition at line 482 of file docqual.cpp.
505 (garbage_level !=
G_OK))
508 (garbage_level !=
G_OK))
511 if (crunch_mode > 0) {
513 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
◆ tess_acceptable_word()
bool tesseract::Tesseract::tess_acceptable_word |
( |
WERD_RES * |
word | ) |
|
◆ tess_add_doc_word()
void tesseract::Tesseract::tess_add_doc_word |
( |
WERD_CHOICE * |
word_choice | ) |
|
◆ tess_segment_pass_n()
void tesseract::Tesseract::tess_segment_pass_n |
( |
int |
pass_n, |
|
|
WERD_RES * |
word |
|
) |
| |
Definition at line 31 of file tessbox.cpp.
33 int saved_enable_assoc = 0;
34 int saved_chop_enable = 0;
◆ TestNewNormalization()
bool tesseract::Tesseract::TestNewNormalization |
( |
int |
original_misfits, |
|
|
float |
baseline_shift, |
|
|
float |
new_x_ht, |
|
|
WERD_RES * |
word, |
|
|
BLOCK * |
block, |
|
|
ROW * |
row |
|
) |
| |
Definition at line 1518 of file control.cpp.
1521 bool accept_new_x_ht =
false;
1525 new_x_ht_word.blamer_bundle->CopyTruth(*(word->
blamer_bundle));
1527 new_x_ht_word.x_height = new_x_ht;
1528 new_x_ht_word.baseline_shift = baseline_shift;
1529 new_x_ht_word.caps_height = 0.0;
1530 new_x_ht_word.SetupForRecognition(
1535 if (!new_x_ht_word.tess_failed) {
1538 tprintf(
"Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1540 new_misfits, new_x_ht);
1541 tprintf(
"Old rating= %f, certainty=%f, new=%f, %f\n",
1543 new_x_ht_word.best_choice->rating(),
1544 new_x_ht_word.best_choice->certainty());
1547 accept_new_x_ht = new_misfits < original_misfits &&
1548 (new_x_ht_word.best_choice->certainty() >
1550 new_x_ht_word.best_choice->rating() <
1556 if (accept_new_x_ht) {
◆ textord()
const Textord& tesseract::Tesseract::textord |
( |
| ) |
const |
|
inline |
◆ TidyUp()
void tesseract::Tesseract::TidyUp |
( |
PAGE_RES * |
page_res | ) |
|
◆ tilde_crunch()
void tesseract::Tesseract::tilde_crunch |
( |
PAGE_RES_IT & |
page_res_it | ) |
|
Definition at line 396 of file docqual.cpp.
402 bool prev_potential_marked =
false;
403 bool found_terrible_word =
false;
407 while (page_res_it.
word() !=
nullptr) {
409 if (pb !=
nullptr && !pb->
IsText()) {
413 word = page_res_it.
word();
422 found_terrible_word =
false;
424 prev_potential_marked =
false;
433 tprintf (
"T CRUNCHING: \"%s\"\n",
437 if (prev_potential_marked) {
438 while (copy_it.
word () != word) {
440 tprintf (
"P1 CRUNCHING: \"%s\"\n",
446 prev_potential_marked =
false;
448 found_terrible_word =
true;
452 garbage_level, ok_dict_word))) {
453 if (found_terrible_word) {
455 tprintf (
"P2 CRUNCHING: \"%s\"\n",
460 else if (!prev_potential_marked) {
461 copy_it = page_res_it;
462 prev_potential_marked =
true;
464 tprintf (
"P3 CRUNCHING: \"%s\"\n",
470 found_terrible_word =
false;
472 prev_potential_marked =
false;
474 tprintf (
"NO CRUNCH: \"%s\"\n",
◆ tilde_delete()
void tesseract::Tesseract::tilde_delete |
( |
PAGE_RES_IT & |
page_res_it | ) |
|
Definition at line 568 of file docqual.cpp.
573 bool deleting_from_bol =
false;
574 bool marked_delete_point =
false;
575 int16_t debug_delete_mode;
577 int16_t x_debug_delete_mode;
581 while (page_res_it.
word() !=
nullptr) {
582 word = page_res_it.
word();
588 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
593 deleting_from_bol =
true;
595 if (marked_delete_point) {
596 while (copy_it.
word() != word) {
598 x_debug_delete_mode);
600 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
609 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
614 deleting_from_bol =
false;
615 marked_delete_point =
false;
618 if (!marked_delete_point) {
619 copy_it = page_res_it;
620 marked_delete_point =
true;
625 deleting_from_bol =
false;
627 marked_delete_point =
false;
◆ TrainedXheightFix()
bool tesseract::Tesseract::TrainedXheightFix |
( |
WERD_RES * |
word, |
|
|
BLOCK * |
block, |
|
|
ROW * |
row |
|
) |
| |
Definition at line 1484 of file control.cpp.
1486 if (original_misfits == 0)
1488 float baseline_shift = 0.0f;
1490 if (baseline_shift != 0.0f) {
1496 if (original_misfits > 0) {
1497 float new_baseline_shift;
◆ TrainFromBoxes()
Definition at line 80 of file linerec.cpp.
84 int box_count = boxes.
size();
89 while (end_box < texts.
size() && texts[end_box] ==
"\t") ++end_box;
90 for (
int start_box = end_box; start_box < box_count; start_box = end_box) {
92 TBOX line_box = boxes[start_box];
93 STRING line_str = texts[start_box];
94 for (end_box = start_box + 1; end_box < box_count && texts[end_box] !=
"\t";
96 line_box += boxes[end_box];
97 line_str += texts[end_box];
100 BLOCK* best_block =
nullptr;
101 int best_overlap = 0;
102 BLOCK_IT b_it(block_list);
103 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
104 BLOCK* block = b_it.data();
111 if (overlap_box.
area() > best_overlap) {
112 best_overlap = overlap_box.
area();
117 ImageData* imagedata =
nullptr;
118 if (best_block ==
nullptr) {
119 tprintf(
"No block overlapping textline: %s\n", line_str.
c_str());
121 imagedata =
GetLineData(line_box, boxes, texts, start_box, end_box,
124 if (imagedata !=
nullptr)
125 training_data->AddPageToDocument(imagedata);
128 while (end_box < texts.
size() && texts[end_box] ==
"\t") ++end_box;
◆ TrainLineRecognizer()
bool tesseract::Tesseract::TrainLineRecognizer |
( |
const STRING & |
input_imagename, |
|
|
const STRING & |
output_basename, |
|
|
BLOCK_LIST * |
block_list |
|
) |
| |
Definition at line 43 of file linerec.cpp.
46 STRING lstmf_name = output_basename +
".lstmf";
47 DocumentData images(lstmf_name);
50 if (!images.LoadDocument(lstmf_name.
c_str(), 0, 0,
nullptr)) {
51 tprintf(
"Failed to read training data from %s!\n", lstmf_name.
c_str());
61 tprintf(
"Failed to read boxes from %s\n", input_imagename.
c_str());
65 if (images.PagesSize() == 0) {
66 tprintf(
"Failed to read pages from %s\n", input_imagename.
c_str());
70 if (!images.SaveDocument(lstmf_name.
c_str(),
nullptr)) {
71 tprintf(
"Failed to write training data to %s!\n", lstmf_name.
c_str());
◆ TrySuperscriptSplits()
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits |
( |
int |
num_chopped_leading, |
|
|
float |
leading_certainty, |
|
|
ScriptPos |
leading_pos, |
|
|
int |
num_chopped_trailing, |
|
|
float |
trailing_certainty, |
|
|
ScriptPos |
trailing_pos, |
|
|
WERD_RES * |
word, |
|
|
bool * |
is_good, |
|
|
int * |
retry_rebuild_leading, |
|
|
int * |
retry_rebuild_trailing |
|
) |
| |
Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.
- Parameters
-
[in] | num_chopped_leading | how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | leading_certainty | the (minimum) certainty had by the characters in the original leading section. |
[in] | leading_pos | "super" or "sub" (for debugging) |
[in] | num_chopped_trailing | how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | trailing_certainty | the (minimum) certainty had by the characters in the original trailing section. |
[in] | trailing_pos | "super" or "sub" (for debugging) |
[in] | word | the word to try to chop up. |
[out] | is_good | do we believe our result? |
[out] | retry_rebuild_leading,retry_rebuild_trailing | If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars. |
- Returns
- A word which is the result of re-recognizing as asked.
Definition at line 381 of file superscript.cpp.
391 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
400 if (num_chopped_leading > 0) {
402 split_word(prefix, num_chopped_leading, &core, &bb0);
407 if (num_chopped_trailing > 0) {
408 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
422 tprintf(
" recognizing first %d chopped blobs\n", num_chopped_leading);
426 tprintf(
" The leading bits look like %s %s\n",
437 tprintf(
" recognizing middle %d chopped blobs\n",
438 num_chopped - num_chopped_leading - num_chopped_trailing);
447 tprintf(
" recognizing last %d chopped blobs\n", num_chopped_trailing);
451 tprintf(
" The trailing bits look like %s %s\n",
466 retry_rebuild_leading,
nullptr);
470 nullptr, retry_rebuild_trailing);
472 *is_good = good_prefix && good_suffix;
473 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
496 tprintf(
"%s superscript fix: %s\n", *is_good ?
"ACCEPT" :
"REJECT",
◆ unrej_good_chs()
void tesseract::Tesseract::unrej_good_chs |
( |
WERD_RES * |
word | ) |
|
◆ unrej_good_quality_words()
void tesseract::Tesseract::unrej_good_quality_words |
( |
PAGE_RES_IT & |
page_res_it | ) |
|
Definition at line 154 of file docqual.cpp.
155 word = page_res_it.
word ();
156 for (i = 0; i < word->reject_map.length (); i++) {
157 if (word->reject_map[i].accept_if_good_quality ())
166 word = page_res_it.
word ();
167 if (word->reject_map.quality_recoverable_rejects() &&
170 word->best_choice->unichar_string().c_str(),
171 word->best_choice->unichar_lengths().c_str())
179 current_row = page_res_it.
row ();
180 while ((page_res_it.
word () !=
nullptr) &&
181 (page_res_it.
row () == current_row))
189 current_block =
nullptr;
190 current_row =
nullptr;
191 while (page_res_it.
word () !=
nullptr) {
192 if (current_block != page_res_it.
block ()) {
193 current_block = page_res_it.
block ();
195 current_block->rej_count = 0;
197 if (current_row != page_res_it.
row ()) {
198 current_row = page_res_it.
row ();
200 current_row->rej_count = 0;
201 current_row->whole_word_rej_count = 0;
◆ word_adaptable()
bool tesseract::Tesseract::word_adaptable |
( |
WERD_RES * |
word, |
|
|
uint16_t |
mode |
|
) |
| |
Definition at line 50 of file adaptions.cpp.
64 if (flags.bit (ADAPTABLE_WERD)) {
67 tprintf(
"tess_would_adapt bit is false\n");
71 if (flags.bit (ACCEPTABLE_WERD)) {
74 tprintf(
"tess_accepted bit is false\n");
82 if (flags.bit (CHECK_DAWGS) &&
96 if (flags.bit (CHECK_SPACES) &&
102 if (flags.bit (CHECK_AMBIG_WERD) &&
109 tprintf(
"returning status %d\n", status);
◆ word_blank_and_set_display()
bool tesseract::Tesseract::word_blank_and_set_display |
( |
PAGE_RES_IT * |
pr_its | ) |
|
◆ word_bln_display()
bool tesseract::Tesseract::word_bln_display |
( |
PAGE_RES_IT * |
pr_it | ) |
|
◆ word_blob_quality()
int16_t tesseract::Tesseract::word_blob_quality |
( |
WERD_RES * |
word | ) |
|
◆ word_char_quality()
void tesseract::Tesseract::word_char_quality |
( |
WERD_RES * |
word, |
|
|
int16_t * |
match_count, |
|
|
int16_t * |
accepted_match_count |
|
) |
| |
Definition at line 95 of file docqual.cpp.
104 using namespace std::placeholders;
106 *word->
rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
◆ word_contains_non_1_digit()
bool tesseract::Tesseract::word_contains_non_1_digit |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 508 of file reject.cpp.
515 for (i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[i++]) {
517 (word_lengths[i] != 1 || word[offset] !=
'1'))
◆ word_deletable()
◆ word_display()
bool tesseract::Tesseract::word_display |
( |
PAGE_RES_IT * |
pr_it | ) |
|
word_display() Word Processor
Display a word according to its display modes
Definition at line 749 of file pgedit.cpp.
757 switch (color_mode) {
767 if (font_info.is_italic())
771 if (font_info.is_bold())
775 if (font_info.is_fixed_pitch())
779 if (font_info.is_serif())
783 if (word_res->small_caps)
787 if (best_choice->BlobPosition(i) ==
SP_DROPCAP)
795 image_win->
Pen(color);
796 TBOX box = box_word->BlobBox(i);
802 #endif // ndef DISABLED_LEGACY_ENGINE
809 if (word->display_flag(
DF_BOX)) {
810 word->bounding_box().plot(image_win,
811 static_cast<ScrollView::Color>((int32_t)
813 static_cast<ScrollView::Color>((int32_t)
819 C_BLOB_IT c_it(word->cblob_list());
820 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
821 c_it.data()->bounding_box().plot(image_win);
822 displayed_something =
true;
827 word->plot(image_win);
828 displayed_something =
true;
835 tword->
plot(image_win);
837 displayed_something =
true;
843 if (word->display_flag(
DF_TEXT) && word->text() !=
nullptr) {
847 !(word_res->blamer_bundle !=
nullptr &&
848 word_res->blamer_bundle->incorrect_result_reason() ==
IRR_CORRECT)) {
850 const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
851 if (blamer_bundle ==
nullptr) {
858 if (word_res->best_choice ==
nullptr) {
859 best_choice_str =
"NULL";
861 word_res->best_choice->string_and_lengths(&best_choice_str,
nullptr);
863 text += best_choice_str;
872 word_bb = word->bounding_box();
874 word_height = word_bb.height();
875 int text_height = 0.50 * word_height;
876 if (text_height > 20) text_height = 20;
877 image_win->
TextAttributes(
"Arial", text_height,
false,
false,
false);
878 shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
879 image_win->
Text(word_bb.left() + shift,
880 word_bb.bottom() + 0.25 * word_height, text.
c_str());
882 image_win->
Text(word_bb.left() + shift,
883 word_bb.bottom() + 0.25 * word_height - text_height,
887 displayed_something =
true;
890 if (!displayed_something)
891 word->bounding_box().plot(image_win,
893 static_cast<ScrollView::Color>((int32_t)
897 #endif // GRAPHICS_DISABLED
◆ word_dumper()
bool tesseract::Tesseract::word_dumper |
( |
PAGE_RES_IT * |
pr_it | ) |
|
word_dumper()
Dump members to the debug window
Definition at line 913 of file pgedit.cpp.
916 tprintf(
"Current blamer debug: %s\n",
917 word_res->blamer_bundle->debug().c_str());
922 #ifndef GRAPHICS_DISABLED
◆ word_outline_errs()
int16_t tesseract::Tesseract::word_outline_errs |
( |
WERD_RES * |
word | ) |
|
Definition at line 76 of file docqual.cpp.
86 *accepted_match_count = 0;
89 using namespace std::placeholders;
◆ word_set_display()
bool tesseract::Tesseract::word_set_display |
( |
PAGE_RES_IT * |
pr_it | ) |
|
word_set_display() Word processor
Display word according to current display mode settings
Definition at line 937 of file pgedit.cpp.
945 #ifndef DISABLED_LEGACY_ENGINE
◆ worst_noise_blob()
int16_t tesseract::Tesseract::worst_noise_blob |
( |
WERD_RES * |
word_res, |
|
|
float * |
worst_noise_score |
|
) |
| |
Definition at line 706 of file fixspace.cpp.
709 float noise_score[512];
731 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
738 noise_score[i] = non_noise_limit;
743 tprintf(
"%1.1f ", noise_score[i]);
752 if (noise_score[i] >= non_noise_limit) {
764 if (noise_score[i] >= non_noise_limit) {
773 if (min_noise_blob > max_noise_blob)
776 *worst_noise_score = small_limit;
778 for (i = min_noise_blob; i <= max_noise_blob; i++) {
779 if (noise_score[i] < *worst_noise_score) {
781 *worst_noise_score = noise_score[i];
◆ write_results()
void tesseract::Tesseract::write_results |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
char |
newline_type, |
|
|
bool |
force_eol |
|
) |
| |
Definition at line 96 of file output.cpp.
104 bool need_reject =
false;
182 tprintf (
"Dict word: \"%s\": %d\n",
191 word->
reject_map[i].setrej_minimal_rej_accept();
199 word->
reject_map[i].setrej_minimal_rej_accept();
◆ applybox_debug
int tesseract::Tesseract::applybox_debug = 1 |
◆ applybox_exposure_pattern
char* tesseract::Tesseract::applybox_exposure_pattern = ".exp" |
"Exposure value follows this pattern in the image" " filename. The names of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"
Definition at line 828 of file tesseractclass.h.
◆ applybox_learn_chars_and_char_frags_mode
bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false |
"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."
Definition at line 832 of file tesseractclass.h.
◆ applybox_learn_ngrams_mode
bool tesseract::Tesseract::applybox_learn_ngrams_mode = false |
"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."
Definition at line 835 of file tesseractclass.h.
◆ applybox_page
int tesseract::Tesseract::applybox_page = 0 |
◆ bidi_debug
int tesseract::Tesseract::bidi_debug = 0 |
◆ bland_unrej
bool tesseract::Tesseract::bland_unrej = false |
◆ chs_leading_punct
char* tesseract::Tesseract::chs_leading_punct = "('`\"" |
◆ chs_trailing_punct1
char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!" |
◆ chs_trailing_punct2
char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\"" |
◆ conflict_set_I_l_1
char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]" |
◆ crunch_accept_ok
bool tesseract::Tesseract::crunch_accept_ok = true |
◆ crunch_debug
int tesseract::Tesseract::crunch_debug = 0 |
◆ crunch_del_cert
double tesseract::Tesseract::crunch_del_cert = -10.0 |
◆ crunch_del_high_word
double tesseract::Tesseract::crunch_del_high_word = 1.5 |
◆ crunch_del_low_word
double tesseract::Tesseract::crunch_del_low_word = 0.5 |
◆ crunch_del_max_ht
double tesseract::Tesseract::crunch_del_max_ht = 3.0 |
◆ crunch_del_min_ht
double tesseract::Tesseract::crunch_del_min_ht = 0.7 |
◆ crunch_del_min_width
double tesseract::Tesseract::crunch_del_min_width = 3.0 |
◆ crunch_del_rating
double tesseract::Tesseract::crunch_del_rating = 60 |
◆ crunch_early_convert_bad_unlv_chs
bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false |
◆ crunch_early_merge_tess_fails
bool tesseract::Tesseract::crunch_early_merge_tess_fails = true |
◆ crunch_include_numerals
bool tesseract::Tesseract::crunch_include_numerals = false |
◆ crunch_leave_accept_strings
bool tesseract::Tesseract::crunch_leave_accept_strings = false |
◆ crunch_leave_lc_strings
int tesseract::Tesseract::crunch_leave_lc_strings = 4 |
"Don't crunch words with long lower case strings"
Definition at line 958 of file tesseractclass.h.
◆ crunch_leave_ok_strings
bool tesseract::Tesseract::crunch_leave_ok_strings = true |
◆ crunch_leave_uc_strings
int tesseract::Tesseract::crunch_leave_uc_strings = 4 |
"Don't crunch words with long upper case strings"
Definition at line 960 of file tesseractclass.h.
◆ crunch_long_repetitions
int tesseract::Tesseract::crunch_long_repetitions = 3 |
◆ crunch_poor_garbage_cert
double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0 |
◆ crunch_poor_garbage_rate
double tesseract::Tesseract::crunch_poor_garbage_rate = 60 |
◆ crunch_pot_indicators
int tesseract::Tesseract::crunch_pot_indicators = 1 |
◆ crunch_pot_poor_cert
double tesseract::Tesseract::crunch_pot_poor_cert = -8.0 |
◆ crunch_pot_poor_rate
double tesseract::Tesseract::crunch_pot_poor_rate = 40 |
◆ crunch_rating_max
int tesseract::Tesseract::crunch_rating_max = 10 |
◆ crunch_small_outlines_size
double tesseract::Tesseract::crunch_small_outlines_size = 0.6 |
◆ crunch_terrible_garbage
bool tesseract::Tesseract::crunch_terrible_garbage = true |
◆ crunch_terrible_rating
double tesseract::Tesseract::crunch_terrible_rating = 80.0 |
◆ debug_fix_space_level
int tesseract::Tesseract::debug_fix_space_level = 0 |
◆ debug_noise_removal
int tesseract::Tesseract::debug_noise_removal = 0 |
◆ debug_x_ht_level
int tesseract::Tesseract::debug_x_ht_level = 0 |
◆ enable_noise_removal
bool tesseract::Tesseract::enable_noise_removal = true |
"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"
Definition at line 856 of file tesseractclass.h.
◆ file_type
char* tesseract::Tesseract::file_type = ".tif" |
◆ fixsp_done_mode
int tesseract::Tesseract::fixsp_done_mode = 1 |
◆ fixsp_non_noise_limit
int tesseract::Tesseract::fixsp_non_noise_limit = 1 |
◆ fixsp_small_outlines_size
double tesseract::Tesseract::fixsp_small_outlines_size = 0.28 |
◆ hocr_char_boxes
bool tesseract::Tesseract::hocr_char_boxes = false |
"Add coordinates for each character to hocr output"
Definition at line 933 of file tesseractclass.h.
◆ hocr_font_info
bool tesseract::Tesseract::hocr_font_info = false |
◆ interactive_display_mode
bool tesseract::Tesseract::interactive_display_mode = false |
◆ jpg_quality
int tesseract::Tesseract::jpg_quality = 85 |
◆ lstm_choice_iterations
int tesseract::Tesseract::lstm_choice_iterations = 5 |
"Sets the number of cascading iterations for the Beamsearch in " "lstm_choice_mode. Note that lstm_choice_mode must be set to " "a value greater than 0 to produce results."
Definition at line 1090 of file tesseractclass.h.
◆ lstm_choice_mode
int tesseract::Tesseract::lstm_choice_mode = 0 |
"Allows including alternative symbol choices in the hOCR " "output. " "Valid input values are 0, 1 and 2. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " "With 2 the alternative symbol choices are extracted from the CTC " "process instead of the lattice. The choices are mapped per " "character."
Definition at line 1086 of file tesseractclass.h.
◆ lstm_rating_coefficient
double tesseract::Tesseract::lstm_rating_coefficient = 5 |
"Sets the rating coefficient for the lstm choices. The smaller " "the coefficient, the better are the ratings for each choice " "and less information is lost due to the cut off at 0. The " "standard value is 5."
Definition at line 1095 of file tesseractclass.h.
◆ lstm_use_matrix
bool tesseract::Tesseract::lstm_use_matrix = 1 |
◆ min_characters_to_try
int tesseract::Tesseract::min_characters_to_try = 50 |
◆ min_orientation_margin
double tesseract::Tesseract::min_orientation_margin = 7.0 |
◆ min_sane_x_ht_pixels
int tesseract::Tesseract::min_sane_x_ht_pixels = 8 |
◆ multilang_debug_level
int tesseract::Tesseract::multilang_debug_level = 0 |
◆ noise_cert_basechar
double tesseract::Tesseract::noise_cert_basechar = -8.0 |
◆ noise_cert_disjoint
double tesseract::Tesseract::noise_cert_disjoint = -2.5 |
◆ noise_cert_factor
double tesseract::Tesseract::noise_cert_factor = 0.375 |
◆ noise_cert_punc
double tesseract::Tesseract::noise_cert_punc = -2.5 |
◆ noise_maxperblob
int tesseract::Tesseract::noise_maxperblob = 8 |
◆ noise_maxperword
int tesseract::Tesseract::noise_maxperword = 16 |
◆ numeric_punctuation
char* tesseract::Tesseract::numeric_punctuation = ".," |
◆ ocr_devanagari_split_strategy
"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."
Definition at line 817 of file tesseractclass.h.
◆ ok_repeated_ch_non_alphanum_wds
char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075" |
◆ outlines_2
char* tesseract::Tesseract::outlines_2 = "ij!?%\":;" |
◆ outlines_odd
char* tesseract::Tesseract::outlines_odd = "%| " |
◆ page_separator
char* tesseract::Tesseract::page_separator = "\f" |
"Page separator (default is form feed control character)"
Definition at line 1078 of file tesseractclass.h.
◆ pageseg_apply_music_mask
bool tesseract::Tesseract::pageseg_apply_music_mask = true |
"Detect music staff and remove intersecting components"
Definition at line 1097 of file tesseractclass.h.
◆ pageseg_devanagari_split_strategy
"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."
Definition at line 813 of file tesseractclass.h.
◆ paragraph_debug_level
int tesseract::Tesseract::paragraph_debug_level = 0 |
◆ paragraph_text_based
bool tesseract::Tesseract::paragraph_text_based = true |
"Run paragraph detection on the post-text-recognition " "(more accurate)"
Definition at line 894 of file tesseractclass.h.
◆ poly_allow_detailed_fx
bool tesseract::Tesseract::poly_allow_detailed_fx = false |
"Allow feature extractors to see the original outline"
Definition at line 1061 of file tesseractclass.h.
◆ preserve_interword_spaces
bool tesseract::Tesseract::preserve_interword_spaces = false |
◆ quality_blob_pc
double tesseract::Tesseract::quality_blob_pc = 0.0 |
◆ quality_char_pc
double tesseract::Tesseract::quality_char_pc = 0.95 |
◆ quality_min_initial_alphas_reqd
int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2 |
◆ quality_outline_pc
double tesseract::Tesseract::quality_outline_pc = 1.0 |
◆ quality_rej_pc
double tesseract::Tesseract::quality_rej_pc = 0.08 |
◆ quality_rowrej_pc
double tesseract::Tesseract::quality_rowrej_pc = 1.1 |
◆ rej_1Il_trust_permuter_type
bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true |
◆ rej_1Il_use_dict_word
bool tesseract::Tesseract::rej_1Il_use_dict_word = false |
◆ rej_alphas_in_number_perm
bool tesseract::Tesseract::rej_alphas_in_number_perm = false |
◆ rej_trust_doc_dawg
bool tesseract::Tesseract::rej_trust_doc_dawg = false |
◆ rej_use_good_perm
bool tesseract::Tesseract::rej_use_good_perm = true |
◆ rej_use_sensible_wd
bool tesseract::Tesseract::rej_use_sensible_wd = false |
◆ rej_use_tess_accepted
bool tesseract::Tesseract::rej_use_tess_accepted = true |
◆ rej_use_tess_blanks
bool tesseract::Tesseract::rej_use_tess_blanks = true |
◆ rej_whole_of_mostly_reject_word_fract
double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85 |
◆ subscript_max_y_top
double tesseract::Tesseract::subscript_max_y_top = 0.5 |
"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."
Definition at line 989 of file tesseractclass.h.
◆ superscript_bettered_certainty
double tesseract::Tesseract::superscript_bettered_certainty = 0.97 |
"What reduction in badness do we think sufficient to choose a superscript over what we'd thought. For example, a value of 0.6 means we want to reduce badness of certainty by 40%"
Definition at line 981 of file tesseractclass.h.
◆ superscript_debug
int tesseract::Tesseract::superscript_debug = 0 |
◆ superscript_min_y_bottom
double tesseract::Tesseract::superscript_min_y_bottom = 0.3 |
"Minimum bottom of a character measured as a multiple of x-height above the baseline for us to reconsider whether it's a superscript."
Definition at line 993 of file tesseractclass.h.
◆ superscript_scaledown_ratio
double tesseract::Tesseract::superscript_scaledown_ratio = 0.4 |
"A superscript scaled down more than this is unbelievably small. For example, 0.3 means we expect the font size to be no smaller than 30% of the text line font size."
Definition at line 985 of file tesseractclass.h.
◆ superscript_worse_certainty
double tesseract::Tesseract::superscript_worse_certainty = 2.0 |
"How many times worse certainty does a superscript position glyph need to be for us to try classifying it as a char with a different baseline?"
Definition at line 976 of file tesseractclass.h.
◆ suspect_accept_rating
double tesseract::Tesseract::suspect_accept_rating = -999.9 |
◆ suspect_constrain_1Il
bool tesseract::Tesseract::suspect_constrain_1Il = false |
◆ suspect_level
int tesseract::Tesseract::suspect_level = 99 |
◆ suspect_rating_per_ch
double tesseract::Tesseract::suspect_rating_per_ch = 999.9 |
◆ suspect_short_words
int tesseract::Tesseract::suspect_short_words = 2 |
◆ tessedit_adaption_debug
bool tesseract::Tesseract::tessedit_adaption_debug = false |
"Generate and print debug information for adaption"
Definition at line 821 of file tesseractclass.h.
◆ tessedit_ambigs_training
bool tesseract::Tesseract::tessedit_ambigs_training = false |
◆ tessedit_bigram_debug
int tesseract::Tesseract::tessedit_bigram_debug = 0 |
"Amount of debug output for bigram correction."
Definition at line 853 of file tesseractclass.h.
◆ tessedit_char_blacklist
char* tesseract::Tesseract::tessedit_char_blacklist = "" |
◆ tessedit_char_unblacklist
char* tesseract::Tesseract::tessedit_char_unblacklist = "" |
"List of chars to override tessedit_char_blacklist"
Definition at line 807 of file tesseractclass.h.
◆ tessedit_char_whitelist
char* tesseract::Tesseract::tessedit_char_whitelist = "" |
◆ tessedit_create_alto
bool tesseract::Tesseract::tessedit_create_alto = false |
◆ tessedit_create_boxfile
bool tesseract::Tesseract::tessedit_create_boxfile = false |
◆ tessedit_create_hocr
bool tesseract::Tesseract::tessedit_create_hocr = false |
◆ tessedit_create_lstmbox
bool tesseract::Tesseract::tessedit_create_lstmbox = false |
◆ tessedit_create_pdf
bool tesseract::Tesseract::tessedit_create_pdf = false |
◆ tessedit_create_tsv
bool tesseract::Tesseract::tessedit_create_tsv = false |
◆ tessedit_create_txt
bool tesseract::Tesseract::tessedit_create_txt = false |
◆ tessedit_create_wordstrbox
bool tesseract::Tesseract::tessedit_create_wordstrbox = false |
◆ tessedit_debug_block_rejection
bool tesseract::Tesseract::tessedit_debug_block_rejection = false |
◆ tessedit_debug_doc_rejection
bool tesseract::Tesseract::tessedit_debug_doc_rejection = false |
◆ tessedit_debug_fonts
bool tesseract::Tesseract::tessedit_debug_fonts = false |
◆ tessedit_debug_quality_metrics
bool tesseract::Tesseract::tessedit_debug_quality_metrics = false |
◆ tessedit_display_outwords
bool tesseract::Tesseract::tessedit_display_outwords = false |
◆ tessedit_do_invert
bool tesseract::Tesseract::tessedit_do_invert = true |
"Try inverting the image in `LSTMRecognizeWord`"
Definition at line 795 of file tesseractclass.h.
◆ tessedit_dont_blkrej_good_wds
bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false |
◆ tessedit_dont_rowrej_good_wds
bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false |
◆ tessedit_dump_choices
bool tesseract::Tesseract::tessedit_dump_choices = false |
◆ tessedit_dump_pageseg_images
bool tesseract::Tesseract::tessedit_dump_pageseg_images = false |
"Dump intermediate images made during page segmentation"
Definition at line 793 of file tesseractclass.h.
◆ tessedit_enable_bigram_correction
bool tesseract::Tesseract::tessedit_enable_bigram_correction = true |
"Enable correction based on the word bigram dictionary."
Definition at line 848 of file tesseractclass.h.
◆ tessedit_enable_dict_correction
bool tesseract::Tesseract::tessedit_enable_dict_correction = false |
"Enable single word correction based on the dictionary."
Definition at line 850 of file tesseractclass.h.
◆ tessedit_enable_doc_dict
bool tesseract::Tesseract::tessedit_enable_doc_dict = true |
◆ tessedit_fix_fuzzy_spaces
bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true |
◆ tessedit_fix_hyphens
bool tesseract::Tesseract::tessedit_fix_hyphens = true |
◆ tessedit_flip_0O
bool tesseract::Tesseract::tessedit_flip_0O = true |
◆ tessedit_good_doc_still_rowrej_wd
double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1 |
"rej good doc wd if more than this fraction rejected"
Definition at line 923 of file tesseractclass.h.
◆ tessedit_good_quality_unrej
bool tesseract::Tesseract::tessedit_good_quality_unrej = true |
◆ tessedit_image_border
int tesseract::Tesseract::tessedit_image_border = 2 |
◆ tessedit_init_config_only
bool tesseract::Tesseract::tessedit_init_config_only = false |
"Only initialize with the config file. Useful if the instance is not going to be used for OCR but say only for layout analysis."
Definition at line 1064 of file tesseractclass.h.
◆ tessedit_load_sublangs
char* tesseract::Tesseract::tessedit_load_sublangs = "" |
◆ tessedit_lower_flip_hyphen
double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5 |
◆ tessedit_make_boxes_from_boxes
bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false |
◆ tessedit_minimal_rej_pass1
bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false |
◆ tessedit_minimal_rejection
bool tesseract::Tesseract::tessedit_minimal_rejection = false |
◆ tessedit_ocr_engine_mode
"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults to loading and running the most accurate available."
Definition at line 802 of file tesseractclass.h.
◆ tessedit_override_permuter
bool tesseract::Tesseract::tessedit_override_permuter = true |
◆ tessedit_page_number
int tesseract::Tesseract::tessedit_page_number = -1 |
◆ tessedit_pageseg_mode
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block, 5=line, 6=word, 7=char (Values from PageSegMode enum in tesseract/publictypes.h)"
Definition at line 799 of file tesseractclass.h.
◆ tessedit_parallelize
int tesseract::Tesseract::tessedit_parallelize = 0 |
◆ tessedit_prefer_joined_punct
bool tesseract::Tesseract::tessedit_prefer_joined_punct = false |
◆ tessedit_preserve_blk_rej_perfect_wds
bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true |
"Only rej partially rejected words in block rejection"
Definition at line 911 of file tesseractclass.h.
◆ tessedit_preserve_min_wd_len
int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2 |
◆ tessedit_preserve_row_rej_perfect_wds
bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true |
"Only rej partially rejected words in row rejection"
Definition at line 913 of file tesseractclass.h.
◆ tessedit_reject_bad_qual_wds
bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true |
◆ tessedit_reject_block_percent
double tesseract::Tesseract::tessedit_reject_block_percent = 45.00 |
◆ tessedit_reject_doc_percent
double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00 |
◆ tessedit_reject_mode
int tesseract::Tesseract::tessedit_reject_mode = 0 |
◆ tessedit_reject_row_percent
double tesseract::Tesseract::tessedit_reject_row_percent = 40.00 |
◆ tessedit_rejection_debug
bool tesseract::Tesseract::tessedit_rejection_debug = false |
◆ tessedit_resegment_from_boxes
bool tesseract::Tesseract::tessedit_resegment_from_boxes = false |
"Take segmentation and labeling from box file"
Definition at line 783 of file tesseractclass.h.
◆ tessedit_resegment_from_line_boxes
bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false |
"Conversion of word/line box file to char box file"
Definition at line 785 of file tesseractclass.h.
◆ tessedit_row_rej_good_docs
bool tesseract::Tesseract::tessedit_row_rej_good_docs = true |
◆ tessedit_tess_adaption_mode
int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27 |
◆ tessedit_test_adaption
bool tesseract::Tesseract::tessedit_test_adaption = false |
◆ tessedit_timing_debug
bool tesseract::Tesseract::tessedit_timing_debug = false |
◆ tessedit_train_from_boxes
bool tesseract::Tesseract::tessedit_train_from_boxes = false |
◆ tessedit_train_line_recognizer
bool tesseract::Tesseract::tessedit_train_line_recognizer = false |
"Break input into lines and remap boxes if present"
Definition at line 791 of file tesseractclass.h.
◆ tessedit_unrej_any_wd
bool tesseract::Tesseract::tessedit_unrej_any_wd = false |
◆ tessedit_upper_flip_hyphen
double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8 |
◆ tessedit_use_primary_params_model
bool tesseract::Tesseract::tessedit_use_primary_params_model = false |
"In multilingual mode use params model of the primary language"
Definition at line 1053 of file tesseractclass.h.
◆ tessedit_use_reject_spaces
bool tesseract::Tesseract::tessedit_use_reject_spaces = true |
◆ tessedit_whole_wd_rej_row_percent
double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00 |
"Number of row rejects in whole word rejects which prevents whole row rejection"
Definition at line 909 of file tesseractclass.h.
◆ tessedit_word_for_word
bool tesseract::Tesseract::tessedit_word_for_word = false |
◆ tessedit_write_block_separators
bool tesseract::Tesseract::tessedit_write_block_separators = false |
◆ tessedit_write_images
bool tesseract::Tesseract::tessedit_write_images = false |
◆ tessedit_write_params_to_file
char* tesseract::Tesseract::tessedit_write_params_to_file = "" |
◆ tessedit_write_rep_codes
bool tesseract::Tesseract::tessedit_write_rep_codes = false |
◆ tessedit_write_unlv
bool tesseract::Tesseract::tessedit_write_unlv = false |
◆ tessedit_zero_kelvin_rejection
bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false |
◆ tessedit_zero_rejection
bool tesseract::Tesseract::tessedit_zero_rejection = false |
◆ test_pt
bool tesseract::Tesseract::test_pt = false |
◆ test_pt_x
double tesseract::Tesseract::test_pt_x = 99999.99 |
◆ test_pt_y
double tesseract::Tesseract::test_pt_y = 99999.99 |
◆ textonly_pdf
bool tesseract::Tesseract::textonly_pdf = false |
◆ textord_equation_detect
bool tesseract::Tesseract::textord_equation_detect = false |
◆ textord_tabfind_aligned_gap_fraction
double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75 |
"Fraction of height used as a minimum gap for aligned blobs."
Definition at line 1073 of file tesseractclass.h.
◆ textord_tabfind_force_vertical_text
bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false |
◆ textord_tabfind_show_vlines
bool tesseract::Tesseract::textord_tabfind_show_vlines = false |
◆ textord_tabfind_vertical_text
bool tesseract::Tesseract::textord_tabfind_vertical_text = true |
◆ textord_tabfind_vertical_text_ratio
double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5 |
"Fraction of textlines deemed vertical to use vertical page mode"
Definition at line 1071 of file tesseractclass.h.
◆ textord_use_cjk_fp_model
bool tesseract::Tesseract::textord_use_cjk_fp_model = false |
◆ unlv_tilde_crunching
bool tesseract::Tesseract::unlv_tilde_crunching = false |
◆ unrecognised_char
char* tesseract::Tesseract::unrecognised_char = "|" |
◆ user_defined_dpi
int tesseract::Tesseract::user_defined_dpi = 0 |
◆ x_ht_acceptance_tolerance
int tesseract::Tesseract::x_ht_acceptance_tolerance = 8 |
"Max allowed deviation of blob top outside of font data"
Definition at line 970 of file tesseractclass.h.
◆ x_ht_min_change
int tesseract::Tesseract::x_ht_min_change = 8 |
The documentation for this class was generated from the following files:
bool load_from_file(const char *const filename, bool skip_fragments)
bool tessedit_dont_rowrej_good_wds
void full_print(FILE *fp)
double superscript_bettered_certainty
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
const STRING & unichar_string() const
void set_ocr_split_strategy(SplitStrategy strategy)
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
int16_t first_alphanum_index(const char *word, const char *word_lengths)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
char * applybox_exposure_pattern
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
void set_x(int16_t xin)
rewrite function
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool poly_allow_detailed_fx
void split_and_recog_word(WERD_RES *word)
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
STRING TruthString() const
TBOX bounding_box() const
void move(const ICOORD vec)
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
bool PSM_OSD_ENABLED(int pageseg_mode)
void set_use_cjk_fp_model(bool flag)
int quality_min_initial_alphas_reqd
double crunch_terrible_rating
bool use_ambigs_for_adaption
bool tessedit_use_reject_spaces
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
tesseract::BoxWord * box_word
void set_pageseg_split_strategy(SplitStrategy strategy)
WERD_CHOICE shallow_copy(int start, int end) const
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
bool tessedit_train_from_boxes
void turn_on_bit(uint8_t bit_num)
int pageseg_devanagari_split_strategy
bool get_islower(UNICHAR_ID unichar_id) const
int16_t failure_count(WERD_RES *word)
GenericVector< int > blob_widths
BLOCK_RES * next_block() const
void rej_word_block_rej()
TBOX intersection(const TBOX &box) const
void add_str_int(const char *str, int number)
float base_line(float xpos) const
double tessedit_good_doc_still_rowrej_wd
double crunch_poor_garbage_cert
bool flag(WERD_FLAGS mask) const
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
C_OUTLINE_LIST * out_list()
void set_word_fonts(WERD_RES *word)
void AddMessage(const char *format,...)
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
void set_orig_pix(Pix *pix)
void BestChoiceToCorrectText()
double superscript_worse_certainty
int x_ht_acceptance_tolerance
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
int editor_image_blob_bb_color
bool tessedit_display_outwords
bool textord_use_cjk_fp_model
char * chs_trailing_punct1
bool tessedit_create_wordstrbox
void SetVisible(bool visible)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
void reject_whole_page(PAGE_RES_IT &page_res_it)
UNICHAR_ID unichar_id(int index) const
static ScrollView::Color NextColor(ScrollView::Color colour)
bool tessedit_create_boxfile
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
bool noise_outlines(TWERD *word)
static const double kXHeightCapRatio
bool get_isdigit(UNICHAR_ID unichar_id) const
bool dangerous_ambig_found() const
UnicityTable< FontInfo > fontinfo_table_
void initialise(int16_t length)
bool get_isalpha(UNICHAR_ID unichar_id) const
int get_script_id_from_name(const char *script_name) const
double lstm_rating_coefficient
void recog_word(WERD_RES *word)
bool tessedit_debug_quality_metrics
int CountMisfitTops(WERD_RES *word_res)
tesseract::BoxWord * bln_boxes
TBOX bounding_box() const
#define INT_MEMBER(name, val, comment, vec)
void insert(const T &t, int index)
ROW_LIST * row_list()
get rows
SVMenuNode * AddChild(const char *txt)
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
void WritePDF(const char *filename)
bool tessedit_fix_fuzzy_spaces
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
void set_certainty(float new_val)
BLOCK_RES * block() const
bool applybox_learn_ngrams_mode
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
void tilde_crunch(PAGE_RES_IT &page_res_it)
bool crunch_terrible_garbage
bool tessedit_write_rep_codes
CRUNCH_MODE unlv_crunch_mode
const float kCertaintyScale
constexpr int kMaxCredibleResolution
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
double min_orientation_margin
void make_bad()
Set the fields in this choice to be default (bad) values.
void cc_recog(WERD_RES *word)
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
const char *const kBackUpConfigFile
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool AdaptableWord(WERD_RES *word)
int dict_word(const WERD_CHOICE &word)
WERD_RES * restart_page()
GenericVector< STRING > misadaption_log
double superscript_min_y_bottom
bool contains(const FCOORD pt) const
GenericVector< int > blame_reasons
int crunch_leave_lc_strings
bool recog_interactive(PAGE_RES_IT *pr_it)
void set_unlv_suspects(WERD_RES *word)
void SetLangTesseract(Tesseract *lang_tesseract)
const char * get_script_from_script_id(int id) const
bool crunch_include_numerals
void transform_to_next_perm(WERD_RES_LIST &words)
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
bool rej_1Il_trust_permuter_type
WERD_CHOICE * prev_word_best_choice_
const FontInfo * fontinfo
void Load(const STRING &lang, TessdataManager *data_file)
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
bool tessedit_zero_rejection
bool tessedit_train_line_recognizer
double suspect_rating_per_ch
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
void read_config_file(const char *filename, SetParamConstraint constraint)
void Image(struct Pix *image, int x_pos, int y_pos)
double classify_max_certainty_margin
Assume a single uniform block of text. (Default.)
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void ConsumeWordResults(WERD_RES *word)
void LearnWord(const char *fontname, WERD_RES *word)
ROW_RES * prev_row() const
int IntCastRounded(double x)
int classify_class_pruner_multiplier
static int SortByXMiddle(const void *v1, const void *v2)
bool write_results_empty_block
bool tessedit_enable_dict_correction
const UNICHARSET * unicharset() const
int tessedit_pageseg_mode
void tess_add_doc_word(WERD_CHOICE *word_choice)
GenericVector< int > best_state
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
bool tilde_crunch_written
int16_t count_alphas(const WERD_CHOICE &word)
bool rej_1Il_use_dict_word
int state(int index) const
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
void reject_I_1_L(WERD_RES *word)
void SetAllScriptPositions(tesseract::ScriptPos position)
BLOB_CHOICE * GetBlobChoice(int index) const
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
int tessedit_image_border
void set_segmentation_block_list(BLOCK_LIST *block_list)
static TESS_API DawgCache * GlobalDawgCache()
STRING language_data_path_prefix
char * ok_repeated_ch_non_alphanum_wds
void ZoomToRectangle(int x1, int y1, int x2, int y2)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
const UNICHARSET & GetUnicharset() const
double superscript_scaledown_ratio
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
int crunch_leave_uc_strings
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool textord_tabfind_force_vertical_text
int tessedit_bigram_debug
double fixsp_small_outlines_size
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
bool enable_noise_removal
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
double crunch_pot_poor_cert
void script_pos_pass(PAGE_RES *page_res)
void set_y(int16_t yin)
rewrite function
BLOCK_RES_LIST block_res_list
bool tessedit_row_rej_good_docs
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
void rotate(const FCOORD &vec)
void CloneChoppedToRebuild()
volatile int8_t ocr_alive
ocr sets to 1, HP 0
int tessedit_preserve_min_wd_len
int push_back(T object)
Add an element in the table.
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void PreenXHeights(BLOCK_LIST *block_list)
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Treat the image as a single word.
int lstm_choice_iterations
bool tessedit_adaption_debug
char * chs_trailing_punct2
const UNICHARSET * uch_set
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
C_BLOB_LIST * cblob_list()
bool stopper_no_acceptable_choices
BlamerBundle * blamer_bundle
#define INT_INIT_MEMBER(name, val, comment, vec)
void set_global_subloc_code(int loc_code)
bool AcceptableResult(WERD_RES *word) const
void InitForRetryRecognition(const WERD_RES &source)
void SetupWordPassN(int pass_n, WordData *word)
int crunch_pot_indicators
UnicityTable< FontInfo > & get_fontinfo_table()
void MakeCurrentWordFuzzy()
static const char * IncorrectReasonName(IncorrectResultReason irr)
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
bool wordrec_enable_assoc
bool crunch_early_convert_bad_unlv_chs
void set_rating(float newrat)
void AddPix(const Pix *pix, const char *caption)
bool textord_tabfind_vertical_text
SVMenuNode * build_menu_new()
bool tessedit_resegment_from_line_boxes
double crunch_small_outlines_size
bool tessedit_make_boxes_from_boxes
const double kMinRefitXHeightFraction
bool major_right_to_left() const
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
int tessedit_tess_adaption_mode
bool PSM_SPARSE(int pageseg_mode)
void rej_word_bad_quality()
bool crunch_leave_ok_strings
void extract_edges(Pix *pix, BLOCK *block)
bool tessedit_prefer_joined_punct
static C_BLOB * deep_copy(const C_BLOB *src)
int debug_fix_space_level
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
bool pageseg_apply_music_mask
static WERD_RES * deep_copy(const WERD_RES *src)
PDBLK pdblk
Page Description Block.
void TidyUp(PAGE_RES *page_res)
void Add(T value, int count)
void * cancel_this
this or other data for cancel
bool word_bln_display(PAGE_RES_IT *pr_it)
bool tessedit_enable_doc_dict
void set_unichar_id(UNICHAR_ID unichar_id, int index)
bool interactive_display_mode
#define STRING_MEMBER(name, val, comment, vec)
void PrintBestChoices() const
bool suspect_constrain_1Il
bool SubAndSuperscriptFix(WERD_RES *word_res)
void DebugWordChoices(bool debug, const char *word_to_debug)
float min_x_height() const
int16_t progress
percent complete increasing (0-100)
const TBOX & BlobBox(int index) const
bool last_char_was_newline
STRING debug_str(UNICHAR_ID id) const
bool tessedit_debug_fonts
WERD_CHOICE * best_choice
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
bool major_x_overlap(const TBOX &box) const
int16_t safe_dict_word(const WERD_RES *werd_res)
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
bool acceptable_number_string(const char *s, const char *lengths)
bool tessedit_test_adaption
bool tessedit_rejection_debug
bool tessedit_resegment_from_boxes
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool tessedit_reject_bad_qual_wds
bool fixspace_thinks_word_done(WERD_RES *word)
const char * c_str() const
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
void InsertSeam(int blob_number, SEAM *seam)
void set_certainty(float newrat)
char * tessedit_char_whitelist
void ProcessMatchedBlobs(const TWERD &other, std::function< void(int)> cb) const
void turn_off_bit(uint8_t bit_num)
POLY_BLOCK * poly_block() const
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool crunch_leave_accept_strings
double noise_cert_disjoint
const FontInfo * fontinfo2
void set_flag(WERD_FLAGS mask, bool value)
void PrerecAllWordsPar(const GenericVector< WordData > &words)
bool word_adaptable(WERD_RES *word, uint16_t mode)
bool tessedit_preserve_row_rej_perfect_wds
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
void StartBackupAdaptiveClassifier()
const float kWorstDictCertainty
char * tessedit_load_sublangs
void set_global_loc_code(int loc_code)
bool tessedit_dump_pageseg_images
bool tessedit_zero_kelvin_rejection
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
bool tessedit_override_permuter
GenericVector< TBLOB * > blobs
constexpr int kResolutionEstimationFactor
constexpr int kMinCredibleResolution
const int16_t kMaxBoxEdgeDiff
double crunch_del_low_word
double classify_max_rating_ratio
bool top_bottom_useful() const
double noise_cert_basechar
void SetupForLoad(DawgCache *dawg_cache)
bool script_has_xheight() const
#define double_MEMBER(name, val, comment, vec)
bool deadline_exceeded() const
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Dict & getDict() override
bool tessedit_ambigs_training
float blob_noise_score(TBLOB *blob)
double tessedit_whole_wd_rej_row_percent
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
int crunch_long_repetitions
bool tessedit_minimal_rej_pass1
double crunch_poor_garbage_rate
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
void AttachOnCorner(BandTriMatrix< T > *array2)
void GetNonSuperscriptSpan(int *start, int *end) const
bool tess_acceptable_word(WERD_RES *word)
bool tessedit_create_alto
const ICOORD & topright() const
bool HasDifferentSplitStrategies() const
bool tessedit_good_quality_unrej
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
const GenericVector< tesseract::ScoredFont > & fonts() const
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
double rej_whole_of_mostly_reject_word_fract
const T & get(int id) const
Return the object from an id.
bool tessedit_dont_blkrej_good_wds
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
void bigram_correction_pass(PAGE_RES *page_res)
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
void rej_word_tess_failure()
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
const STRING debug_string() const
int DivRounded(int a, int b)
bool crunch_early_merge_tess_fails
const double kMaxXHeightDeviationFraction
double tessedit_upper_flip_hyphen
bool tessedit_init_config_only
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
bool ContainsAnyNonSpaceDelimited() const
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
WERD_CHOICE_LIST best_choices
double tessedit_lower_flip_hyphen
void SetScriptPositions()
const ICOORD & botleft() const
bool applybox_learn_chars_and_char_frags_mode
bool word_display(PAGE_RES_IT *pr_it)
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
bool textord_tabfind_show_vlines
bool tessedit_debug_doc_rejection
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
bool paragraph_text_based
GenericVector< SEAM * > seam_array
int ocr_devanagari_split_strategy
bool tessedit_write_images
TBOX bounding_box() const
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
char * tessedit_char_blacklist
double crunch_del_min_width
int get_script_table_size() const
int LabelSpecialText(TO_BLOCK *to_block) override
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
void recog_word_recursive(WERD_RES *word)
int min_characters_to_try
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
int16_t fontinfo_id() const
UnicharAmbigs unichar_ambigs
bool get_isupper(UNICHAR_ID unichar_id) const
double suspect_accept_rating
int size() const
Return the size used.
void BuildMenu(ScrollView *sv, bool menu_bar=true)
bool x_overlap(const TBOX &box) const
SVEvent * AwaitEvent(SVEventType type)
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
bool tessedit_dump_choices
bool tessedit_create_lstmbox
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
char * tessedit_char_unblacklist
bool tessedit_timing_debug
bool wordrec_debug_blamer
bool rej_use_tess_accepted
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
int editor_image_word_bb_color
int16_t fontinfo_id2() const
bool contains(char c) const
void SetupFake(const UNICHARSET &uch)
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
bool tessedit_enable_bigram_correction
bool AdaptiveClassifierIsEmpty() const
void pad(int xpad, int ypad)
int16_t count_outline_errs(char c, int16_t outline_count)
#define LOC_WRITE_RESULTS
void SetupUniversalFontIds()
double subscript_max_y_top
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
int multilang_debug_level
tesseract::Tesseract * tesseract
double textord_tabfind_vertical_text_ratio
bool major_overlap(const TBOX &box) const
char * numeric_punctuation
void font_recognition_pass(PAGE_RES *page_res)
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
WERD_RES * next_word() const
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
double tessedit_reject_row_percent
int16_t doc_good_char_quality
void blamer_pass(PAGE_RES *page_res)
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
void convert_bad_unlv_chs(WERD_RES *word_res)
CANCEL_FUNC cancel
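Cancellation callback. (The extracted brief here reads "for errcode use", which appears to describe the adjacent err_code field rather than this member — verify against the header.)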
void ReSegmentByClassification(PAGE_RES *page_res)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
double tessedit_reject_doc_percent
double textord_tabfind_aligned_gap_fraction
void dictionary_correction_pass(PAGE_RES *page_res)
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
int8_t fontinfo_id2_count
void tilde_delete(PAGE_RES_IT &page_res_it)
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
bool assume_fixed_pitch_char_segment
tesseract::PSM_CIRCLE_WORD
Treat the image as a single word in a circle.
void init_to_size(int size, const T &t)
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
double crunch_del_high_word
uint32_t unsigned_size() const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
char * conflict_set_I_l_1
DLLSYM void tprintf(const char *format,...)
tesseract::PSM_OSD_ONLY
Orientation and script detection only.
GenericVector< int > blob_gaps
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
FCOORD classify_rotation() const
const Dict * GetDict() const
bool tessedit_fix_hyphens
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
void set_blanks(uint8_t new_blanks)
bool tessedit_word_for_word
void fix_rep_char(PAGE_RES_IT *page_res_it)
void SearchWords(PointerVector< WERD_RES > *words)
bool textord_equation_detect
int paragraph_debug_level
void unrej_good_chs(WERD_RES *word)
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
bool tessedit_unrej_any_wd
void reject_edge_blobs(WERD_RES *word)
void plot(ScrollView *window)
TBOX bounding_box() const
void tess_segment_pass_n(int pass_n, WERD_RES *word)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
int classify_integer_matcher_multiplier
void MergeAdjacentBlobs(int index)
IncorrectResultReason incorrect_result_reason() const
const char * id_to_unichar(UNICHAR_ID id) const
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
void ResetAdaptiveClassifierInternal()
void Text(int x, int y, const char *mystring)
bool unlv_tilde_crunching
double tessedit_reject_block_percent
void ReplaceBestChoice(WERD_CHOICE *choice)
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
std::unique_ptr< LanguageModel > language_model_
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
bool word_set_display(PAGE_RES_IT *pr_it)
bool rej_alphas_in_number_perm
bool tessedit_preserve_blk_rej_perfect_wds
void ResetAdaptiveClassifier()
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
bool classify_bln_numeric_mode
#define BOOL_MEMBER(name, val, comment, vec)
bool tessedit_debug_block_rejection
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)
FCOORD re_rotation() const
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
char * ShowInputDialog(const char *msg)
void Rectangle(int x1, int y1, int x2, int y2)
bool tessedit_minimal_rejection
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
bool word_dumper(PAGE_RES_IT *pr_it)
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
bool tessedit_write_block_separators
bool tessedit_use_primary_params_model
void ResetDocumentDictionary()
int tessedit_ocr_engine_mode
int textord_debug_tabfind
int fixsp_non_noise_limit
void set_permuter(uint8_t perm)
const STRING & unichar_lengths() const
bool preserve_interword_spaces
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
C_BLOB_LIST * rej_cblob_list()
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
void SwitchAdaptiveClassifier()
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
bool tessedit_create_hocr
const int kBlnBaselineOffset
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
char * tessedit_write_params_to_file
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
int16_t word_blob_quality(WERD_RES *word)
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
void CopyFrom(const UNICHARSET &src)
double crunch_pot_poor_rate
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
const char * ScriptPosToString(enum ScriptPos script_pos)
void set_x_height(float new_xheight)
int16_t alpha_count(const char *word, const char *word_lengths)
int16_t word_outline_errs(WERD_RES *word)
bool check_debug_pt(WERD_RES *word, int location)
bool right_to_left() const
double quality_outline_pc
float max_x_height() const
static void LastChanceBlame(bool debug, WERD_RES *word)
bool AdaptiveClassifierIsFull() const