tesseract  5.0.0-alpha-619-ge9db
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract () override
 
Dict & getDict () override
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * pix_original () const
 
void set_pix_original (Pix *original_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
bool TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
bool recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
bool check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
bool acceptable_number_string (const char *s, const char *lengths)
 
int16_t count_alphanums (const WERD_CHOICE &word)
 
int16_t count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
bool process_cmd_win_event (int32_t cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
bool word_display (PAGE_RES_IT *pr_it)
 
bool word_bln_display (PAGE_RES_IT *pr_it)
 
bool word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
bool word_set_display (PAGE_RES_IT *pr_it)
 
bool word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, int16_t pass)
 
bool one_ell_conflict (WERD_RES *word_res, bool update_map)
 
int16_t first_alphanum_index (const char *word, const char *word_lengths)
 
int16_t first_alphanum_offset (const char *word, const char *word_lengths)
 
int16_t alpha_count (const char *word, const char *word_lengths)
 
bool word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
int16_t count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
bool non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
bool repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, int16_t pass)
 
int16_t safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
bool word_adaptable (WERD_RES *word, uint16_t mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
int16_t fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
 
bool fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, bool ok_dict_word)
 
bool potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
int16_t word_blob_quality (WERD_RES *word)
 
void word_char_quality (WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word)
 
int16_t count_outline_errs (char c, int16_t outline_count)
 
int16_t word_outline_errs (WERD_RES *word)
 
bool terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, int16_t &delete_mode)
 
int16_t failure_count (WERD_RES *word)
 
bool noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e., words that are done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

bool digit_or_numeric_punct (WERD_RES *word, int char_position)
 
int16_t eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
int16_t worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
 ~Wordrec () override=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (int32_t elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
 ~Classify () override
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
 ~CCStruct () override
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_train_line_recognizer = false
 
bool tessedit_dump_pageseg_images = false
 
bool tessedit_do_invert = true
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int multilang_debug_level = 0
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
bool lstm_use_matrix = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = false
 
bool hocr_font_info = false
 
bool hocr_char_boxes = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = false
 
bool tessedit_create_hocr = false
 
bool tessedit_create_alto = false
 
bool tessedit_create_lstmbox = false
 
bool tessedit_create_tsv = false
 
bool tessedit_create_wordstrbox = false
 
bool tessedit_create_pdf = false
 
bool textonly_pdf = false
 
int jpg_quality = 85
 
int user_defined_dpi = 0
 
int min_characters_to_try = 50
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = false
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
char * page_separator = "\f"
 
int lstm_choice_mode = 0
 
int lstm_choice_iterations = 5
 
double lstm_rating_coefficient = 5
 
bool pageseg_apply_music_mask = true
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = true
 
bool wordrec_enable_assoc = true
 
bool force_word_assoc = false
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
bool assume_fixed_pitch_char_segment = false
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = false
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = true
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = false
 
bool matcher_debug_separate_windows = false
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
INT_TEMPLATES PreTrainedTemplates = nullptr
 
ADAPT_TEMPLATES AdaptedTemplates = nullptr
 
ADAPT_TEMPLATES BackupAdaptedTemplates = nullptr
 
BIT_VECTOR AllProtosOn = nullptr
 
BIT_VECTOR AllConfigsOn = nullptr
 
BIT_VECTOR AllConfigsOff = nullptr
 
BIT_VECTOR TempProtoMask = nullptr
 
NORM_PROTOS * NormProtos = nullptr
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning = true
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_ambigs_for_adaption = false
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_ = nullptr
 

Detailed Description

Definition at line 172 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 52 of file tesseractclass.cpp.

54  "Take segmentation and labeling from box file",
55  this->params()),
57  "Conversion of word/line box file to char box file",
58  this->params()),
60  "Generate training data from boxed chars", this->params()),
62  "Generate more boxes from boxed chars", this->params()),
64  "Break input into lines and remap boxes if present",
65  this->params()),
67  "Dump intermediate images made during page segmentation",
68  this->params()),
70  "Try inverting the image in `LSTMRecognizeWord`", this->params()),
71  // The default for pageseg_mode is the old behaviour, so as not to
72  // upset anything that relies on that.
73  INT_MEMBER(
75  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
76  " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
77  "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
78  " (Values from PageSegMode enum in tesseract/publictypes.h)",
79  this->params()),
81  "Which OCR engine(s) to run (Tesseract, LSTM, both)."
82  " Defaults to loading and running the most accurate"
83  " available.",
84  this->params()),
86  "Blacklist of chars not to recognize", this->params()),
88  "Whitelist of chars to recognize", this->params()),
90  "List of chars to override tessedit_char_blacklist",
91  this->params()),
93  "Perform training for ambiguities", this->params()),
96  "Whether to use the top-line splitting process for Devanagari "
97  "documents while performing page-segmentation.",
98  this->params()),
101  "Whether to use the top-line splitting process for Devanagari "
102  "documents while performing ocr.",
103  this->params()),
105  "Write all parameters to the given file.", this->params()),
107  "Generate and print debug"
108  " information for adaption",
109  this->params()),
110  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
111  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
112  INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
113  this->params()),
115  "Exposure value follows"
116  " this pattern in the image filename. The name of the image"
117  " files are expected to be in the form"
118  " [lang].[fontname].exp[num].tif",
119  this->params()),
121  "Learn both character fragments (as is done in the"
122  " special low exposure mode) as well as unfragmented"
123  " characters.",
124  this->params()),
126  "Each bounding box"
127  " is assumed to contain ngrams. Only learn the ngrams"
128  " whose outlines overlap horizontally.",
129  this->params()),
130  BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
131  this->params()),
132  BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
133  this->params()),
134  BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
135  this->params()),
137  "Try to improve fuzzy spaces", this->params()),
139  "Don't bother with word plausibility", this->params()),
140  BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
141  this->params()),
143  "Add words to the document dictionary", this->params()),
144  BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
145  this->params()),
146  BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
147  this->params()),
149  "Enable correction based on the word bigram dictionary.",
150  this->params()),
152  "Enable single word correction based on the dictionary.",
153  this->params()),
155  "Amount of debug output for bigram correction.",
156  this->params()),
158  "Remove and conditionally reassign small outlines when they"
159  " confuse layout analysis, determining diacritics vs noise",
160  this->params()),
161  INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
162  this->params()),
163  // Worst (min) certainty, for which a diacritic is allowed to make the
164  // base
165  // character worse and still be included.
167  "Hingepoint for base char certainty", this->params()),
168  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
169  // to make the base character worse and still be included.
171  "Hingepoint for disjoint certainty", this->params()),
172  // Worst (min) certainty, for which a diacritic is allowed to make a new
173  // stand-alone blob.
175  "Threshold for new punc char certainty", this->params()),
176  // Factor of certainty margin for adding diacritics to not count as worse.
178  "Scaling on certainty diff from Hingepoint",
179  this->params()),
180  INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
181  this->params()),
182  INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
183  this->params()),
184  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
185  STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
186  this->params()),
187  STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
188  this->params()),
189  STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
190  this->params()),
192  "good_quality_doc lte rejection limit", this->params()),
194  "good_quality_doc gte good blobs limit", this->params()),
196  "good_quality_doc lte outline error limit", this->params()),
198  "good_quality_doc gte good char limit", this->params()),
199  INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
200  this->params()),
202  "Adaptation decision algorithm for tess", this->params()),
204  "Do minimal rejection on pass 1 output", this->params()),
205  BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
206  this->params()),
207  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
208  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
209  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
210  INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
211  this->params()),
212  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
213  this->params()),
215  "Run paragraph detection on the post-text-recognition "
216  "(more accurate)",
217  this->params()),
219  "Use ratings matrix/beam search with lstm", this->params()),
220  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
221  this->params()),
222  STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
223  this->params()),
225  "Reduce rejection on good docs", this->params()),
226  BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
227  this->params()),
229  "%rej allowed before rej whole doc", this->params()),
231  "%rej allowed before rej whole block", this->params()),
233  "%rej allowed before rej whole row", this->params()),
235  "Number of row rejects in whole word rejects"
236  " which prevents whole row rejection",
237  this->params()),
239  "Only rej partially rejected words in block rejection",
240  this->params()),
242  "Only rej partially rejected words in row rejection",
243  this->params()),
245  "Use word segmentation quality metric", this->params()),
247  "Use word segmentation quality metric", this->params()),
249  "Only preserve wds longer than this", this->params()),
251  "Apply row rejection to good docs", this->params()),
253  "rej good doc wd if more than this fraction rejected",
254  this->params()),
256  "Reject all bad quality wds", this->params()),
257  BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
258  this->params()),
260  "Output data to debug file", this->params()),
261  BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
262  this->params()),
264  "good_quality_doc gte good char limit", this->params()),
266  "Mark v.bad words for tilde crunch", this->params()),
267  BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
268  this->params()),
269  BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
270  this->params()),
271  BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
272  this->params()),
274  "Take out ~^ early?", this->params()),
275  double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
276  this->params()),
277  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
279  "crunch garbage cert lt this", this->params()),
281  "crunch garbage rating lt this", this->params()),
282  double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
283  this->params()),
284  double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
285  this->params()),
286  double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
287  this->params()),
288  double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
289  this->params()),
290  double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
291  this->params()),
292  double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
293  this->params()),
295  "Del if word width lt xht x this", this->params()),
297  "Del if word gt xht x this above bl", this->params()),
299  "Del if word gt xht x this below bl", this->params()),
300  double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
301  this->params()),
302  INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
303  this->params()),
305  "How many potential indicators needed", this->params()),
306  BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
307  this->params()),
308  BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
309  this->params()),
311  "Don't pot crunch sensible strings", this->params()),
312  BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
313  this->params()),
315  "Don't crunch words with long lower case strings",
316  this->params()),
318  "Don't crunch words with long lower case strings",
319  this->params()),
321  "Crunch words with long repetitions", this->params()),
322  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
324  "How many non-noise blbs either side?", this->params()),
325  double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
326  this->params()),
328  "Reward punctuation joins", this->params()),
329  INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
330  this->params()),
331  INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
332  this->params()),
334  "Punct. chs expected WITHIN numbers", this->params()),
336  "Max allowed deviation of blob top outside of font data",
337  this->params()),
339  "Min change in xht before actually trying it", this->params()),
341  "Debug level for sub & superscript fixer", this->params()),
344  "How many times worse "
345  "certainty does a superscript position glyph need to be for "
346  "us to try classifying it as a char with a different "
347  "baseline?",
348  this->params()),
351  "What reduction in "
352  "badness do we think sufficient to choose a superscript "
353  "over what we'd thought. For example, a value of 0.6 means "
354  "we want to reduce badness of certainty by at least 40%",
355  this->params()),
357  "A superscript scaled down more than this is unbelievably "
358  "small. For example, 0.3 means we expect the font size to "
359  "be no smaller than 30% of the text line font size.",
360  this->params()),
362  "Maximum top of a character measured as a multiple of "
363  "x-height above the baseline for us to reconsider whether "
364  "it's a subscript.",
365  this->params()),
367  "Minimum bottom of a character measured as a multiple of "
368  "x-height above the baseline for us to reconsider whether "
369  "it's a superscript.",
370  this->params()),
372  "Write block separators in output", this->params()),
373  BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
374  this->params()),
375  BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
376  this->params()),
377  BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
378  this->params()),
379  BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
380  this->params()),
381  BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
382  this->params()),
383  BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
384  this->params()),
385  BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
386  this->params()),
387  BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
388  this->params()),
389  BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
390  this->params()),
391  BOOL_MEMBER(textonly_pdf, false,
392  "Create PDF with only one invisible text layer",
393  this->params()),
394  INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
395  INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
396  this->params()),
398  "Specify minimum characters to try during OSD",
399  this->params()),
401  "Output char for unidentified blobs", this->params()),
402  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
404  "Don't suspect dict wds longer than this", this->params()),
405  BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
406  this->params()),
408  "Don't touch bad rating limit", this->params()),
409  double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
410  this->params()),
412  "Only reject tess failures", this->params()),
413  BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
414  this->params()),
416  "Make output have exactly one word per WERD", this->params()),
418  "Don't reject ANYTHING AT ALL", this->params()),
419  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
420  this->params()),
421  BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
422  this->params()),
423  BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
424  this->params()),
426  "Aspect ratio dot/hyphen test", this->params()),
428  "Aspect ratio dot/hyphen test", this->params()),
430  "Use DOC dawg in 11l conf. detector", this->params()),
431  BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
432  this->params()),
433  BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
434  this->params()),
435  BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
436  this->params()),
437  BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
438  this->params()),
439  BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
440  this->params()),
441  BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
442  this->params()),
443  BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
444  this->params()),
446  "if >this fract", this->params()),
447  INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
448  this->params()),
450  "Allow NN to unrej", this->params()),
451  STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
452  this->params()),
453  INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
454  this->params()),
455  BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
456  this->params()),
458  "-1 -> All pages, else specific page to process",
459  this->params()),
461  "Capture the image from the IPE", this->params()),
462  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
463  this->params()),
464  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
465  BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
466  this->params()),
468  "List of languages to load with this one", this->params()),
470  "In multilingual mode use params model of the"
471  " primary language",
472  this->params()),
474  "Min acceptable orientation margin", this->params()),
475  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
476  this->params()),
477  BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model",
478  this->params()),
480  "Allow feature extractors to see the original outline",
481  this->params()),
483  "Only initialize with the config file. Useful if the "
484  "instance is not going to be used for OCR but say only "
485  "for layout analysis.",
486  this->params()),
487  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
488  this->params()),
490  "Enable vertical detection", this->params()),
492  "Force using vertical text page mode", this->params()),
495  "Fraction of textlines deemed vertical to use vertical page "
496  "mode",
497  this->params()),
500  "Fraction of height used as a minimum gap for aligned blobs.",
501  this->params()),
502  INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
503  this->params()),
505  "Preserve multiple interword spaces", this->params()),
507  "Page separator (default is form feed control character)",
508  this->params()),
510  "Allows to include alternative symbols choices in the hOCR output. "
511  "Valid input values are 0, 1 and 2. 0 is the default value. "
512  "With 1 the alternative symbol choices per timestep are included. "
513  "With 2 alternative symbol choices are extracted from the CTC "
514  "process instead of the lattice. The choices are mapped per "
515  "character.",
516  this->params()),
517  INT_MEMBER(
519  "Sets the number of cascading iterations for the Beamsearch in "
520  "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
521  "value greater than 0 to produce results.",
522  this->params()),
525  "Sets the rating coefficient for the lstm choices. The smaller the "
526  "coefficient, the better are the ratings for each choice and less "
527  "information is lost due to the cut off at 0. The standard value is "
528  "5", this->params()),
530  "Detect music staff and remove intersecting components", this->params()),
531 
532  backup_config_file_(nullptr),
533  pix_binary_(nullptr),
534  pix_grey_(nullptr),
535  pix_original_(nullptr),
536  pix_thresholds_(nullptr),
537  source_resolution_(0),
538  textord_(this),
539  right_to_left_(false),
540  scaled_color_(nullptr),
541  scaled_factor_(-1),
542  deskew_(1.0f, 0.0f),
543  reskew_(1.0f, 0.0f),
544  most_recently_used_(this),
545  font_table_size_(0),
546  equ_detect_(nullptr),
547 #ifndef ANDROID_BUILD
548  lstm_recognizer_(nullptr),
549 #endif
550  train_line_page_num_(0) {
551 }

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )
override

Definition at line 553 of file tesseractclass.cpp.

553  {
554  Clear();
555  pixDestroy(&pix_original_);
556  end_tesseract();
557  sub_langs_.delete_data_pointers();
558 #ifndef ANDROID_BUILD
559  delete lstm_recognizer_;
560  lstm_recognizer_ = nullptr;
561 #endif
562 }

Member Function Documentation

◆ acceptable_number_string()

bool tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 386 of file output.cpp.

388  {
389  bool prev_digit = false;
390 
391  if (*lengths == 1 && *s == '(')
392  s++;
393 
394  if (*lengths == 1 &&
395  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
396  s++;
397 
398  for (; *s != '\0'; s += *(lengths++)) {
399  if (unicharset.get_isdigit(s, *lengths))
400  prev_digit = true;
401  else if (prev_digit &&
402  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
403  prev_digit = false;
404  else if (prev_digit && *lengths == 1 &&
405  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
406  return true;
407  else if (prev_digit &&
408  *lengths == 1 && (*s == '%') &&
409  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
410  (*(s + *lengths + *(lengths + 1)) == '\0'))
411  return true;
412  else
413  return false;
414  }
415  return true;

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1744 of file control.cpp.

1745  {
1746  int i = 0;
1747  int offset = 0;
1748  int leading_punct_count;
1749  int upper_count = 0;
1750  int hyphen_pos = -1;
1751  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1752 
1753  if (strlen (lengths) > 20)
1754  return word_type;
1755 
1756  /* Single Leading punctuation char*/
1757 
1758  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1759  offset += lengths[i++];
1760  leading_punct_count = i;
1761 
1762  /* Initial cap */
1763  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1764  offset += lengths[i++];
1765  upper_count++;
1766  }
1767  if (upper_count > 1) {
1768  word_type = AC_UPPER_CASE;
1769  } else {
1770  /* Lower case word, possibly with an initial cap */
1771  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1772  offset += lengths[i++];
1773  }
1774  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1775  goto not_a_word;
1776  /*
1777  Allow a single hyphen in a lower case word
1778  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1779  */
1780  if (lengths[i] == 1 && s[offset] == '-') {
1781  hyphen_pos = i;
1782  offset += lengths[i++];
1783  if (s[offset] != '\0') {
1784  while ((s[offset] != '\0') &&
1785  char_set.get_islower(s + offset, lengths[i])) {
1786  offset += lengths[i++];
1787  }
1788  if (i < hyphen_pos + 3)
1789  goto not_a_word;
1790  }
1791  } else {
1792  /* Allow "'s" in NON hyphenated lower case words */
1793  if (lengths[i] == 1 && (s[offset] == '\'') &&
1794  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1795  offset += lengths[i++];
1796  offset += lengths[i++];
1797  }
1798  }
1799  if (upper_count > 0)
1800  word_type = AC_INITIAL_CAP;
1801  else
1802  word_type = AC_LOWER_CASE;
1803  }
1804 
1805  /* Up to two different, constrained trailing punctuation chars */
1806  if (lengths[i] == 1 && s[offset] != '\0' &&
1807  STRING(chs_trailing_punct1).contains(s[offset]))
1808  offset += lengths[i++];
1809  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1810  s[offset - lengths[i - 1]] != s[offset] &&
1811  STRING(chs_trailing_punct2).contains (s[offset]))
1812  offset += lengths[i++];
1813 
1814  if (s[offset] != '\0')
1815  word_type = AC_UNACCEPTABLE;
1816 
1817  not_a_word:
1818 
1819  if (word_type == AC_UNACCEPTABLE) {
1820  /* Look for abbreviation string */
1821  i = 0;
1822  offset = 0;
1823  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1824  word_type = AC_UC_ABBREV;
1825  while (s[offset] != '\0' &&
1826  char_set.get_isupper(s + offset, lengths[i]) &&
1827  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1828  offset += lengths[i++];
1829  offset += lengths[i++];
1830  }
1831  }
1832  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1833  word_type = AC_LC_ABBREV;
1834  while (s[offset] != '\0' &&
1835  char_set.get_islower(s + offset, lengths[i]) &&
1836  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1837  offset += lengths[i++];
1838  offset += lengths[i++];
1839  }
1840  }
1841  if (s[offset] != '\0')
1842  word_type = AC_UNACCEPTABLE;
1843  }
1844 
1845  return word_type;
1846 }

◆ alpha_count()

int16_t tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 494 of file reject.cpp.

497  {
498  int16_t i;
499  int16_t offset;
500  int16_t count = 0;
501 
502  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
503  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
504  count++;
505  }

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 211 of file recogtraining.cpp.

213  {
214  // Classify word.
215  fflush(stdout);
216  WordData word_data(*pr_it);
217  SetupWordPassN(1, &word_data);
218  classify_word_and_language(1, pr_it, &word_data);
219  WERD_RES* werd_res = word_data.word;
220  WERD_CHOICE* best_choice = werd_res->best_choice;
221  ASSERT_HOST(best_choice != nullptr);
222 
223  // Compute the number of unichars in the label.
224  GenericVector<UNICHAR_ID> encoding;
225  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
226  tprintf("Not outputting illegal unichar %s\n", label);
227  return;
228  }
229 
230  // Dump all paths through the ratings matrix (which is normally small).
231  int dim = werd_res->ratings->dimension();
232  const auto** blob_choices = new const BLOB_CHOICE*[dim];
233  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset,
234  label, output_file);
235  delete[] blob_choices;
236 }

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 293 of file tesseractclass.h.

293  {
295  return true;
296  for (int i = 0; i < sub_langs_.size(); ++i) {
297  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
298  return true;
299  }
300  }
301  return false;
302  }

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 283 of file tesseractclass.h.

283  {
285  return true;
286  for (int i = 0; i < sub_langs_.size(); ++i) {
287  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
288  return true;
289  }
290  return false;
291  }

◆ ApplyBoxes()

PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 108 of file applybox.cpp.

111  {
112  GenericVector<TBOX> boxes;
113  GenericVector<STRING> texts, full_texts;
114  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
115  nullptr)) {
116  return nullptr; // Can't do it.
117  }
118 
119  const int box_count = boxes.size();
120  int box_failures = 0;
121 
122  // In word mode, we use the boxes to make a word for each box, but
123  // in blob mode we use the existing words and maximally chop them first.
124  PAGE_RES* page_res = find_segmentation ?
125  nullptr : SetupApplyBoxes(boxes, block_list);
126  clear_any_old_text(block_list);
127 
128  for (int i = 0; i < box_count; i++) {
129  bool foundit = false;
130  if (page_res != nullptr) {
131  foundit = ResegmentCharBox(page_res,
132  (i == 0) ? nullptr : &boxes[i - 1],
133  boxes[i],
134  (i == box_count - 1) ? nullptr : &boxes[i + 1],
135  full_texts[i].c_str());
136  } else {
137  foundit = ResegmentWordBox(block_list, boxes[i],
138  (i == box_count - 1) ? nullptr : &boxes[i + 1],
139  texts[i].c_str());
140  }
141  if (!foundit) {
142  box_failures++;
143  ReportFailedBox(i, boxes[i], texts[i].c_str(),
144  "FAILURE! Couldn't find a matching blob");
145  }
146  }
147 
148  if (page_res == nullptr) {
149  // In word/line mode, we now maximally chop all the words and resegment
150  // them with the classifier.
151  page_res = SetupApplyBoxes(boxes, block_list);
152  ReSegmentByClassification(page_res);
153  }
154  if (applybox_debug > 0) {
155  tprintf("APPLY_BOXES:\n");
156  tprintf(" Boxes read from boxfile: %6d\n", box_count);
157  if (box_failures > 0)
158  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
159  }
160  TidyUp(page_res);
161  return page_res;

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining ( const STRING fontname,
PAGE_RES page_res 
)

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1063 of file control.cpp.

1066  {
1067  GenericVector<bool> blob_wanted;
1068  word_wanted->init_to_size(outlines.size(), false);
1069  target_blobs->init_to_size(outlines.size(), nullptr);
1070  // Check for outlines that need to be turned into stand-alone blobs.
1071  for (int i = 0; i < outlines.size(); ++i) {
1072  if (outlines[i] == nullptr) continue;
1073  // Get a set of adjacent outlines that don't overlap any existing blob.
1074  blob_wanted.init_to_size(outlines.size(), false);
1075  int num_blob_outlines = 0;
1076  TBOX total_ol_box(outlines[i]->bounding_box());
1077  while (i < outlines.size() && outlines[i] != nullptr) {
1078  blob_wanted[i] = true;
1079  total_ol_box += outlines[i]->bounding_box();
1080  ++i;
1081  ++num_blob_outlines;
1082  }
1083  // Find the insertion point.
1084  C_BLOB_IT blob_it(real_word->cblob_list());
1085  while (!blob_it.at_last() &&
1086  blob_it.data_relative(1)->bounding_box().left() <=
1087  total_ol_box.left()) {
1088  blob_it.forward();
1089  }
1090  // Choose which combination of them we actually want and where to put
1091  // them.
1092  if (debug_noise_removal)
1093  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1094  C_BLOB* left_blob = blob_it.data();
1095  TBOX left_box = left_blob->bounding_box();
1096  C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1097  if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1098  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1099  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1100  outlines, num_blob_outlines,
1101  &blob_wanted)) {
1102  if (debug_noise_removal) tprintf("Added to left blob\n");
1103  for (int j = 0; j < blob_wanted.size(); ++j) {
1104  if (blob_wanted[j]) {
1105  (*word_wanted)[j] = true;
1106  (*target_blobs)[j] = left_blob;
1107  }
1108  }
1109  } else if (right_blob != nullptr &&
1110  (!left_box.x_overlap(total_ol_box) ||
1111  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1113  right_blob, outlines,
1114  num_blob_outlines, &blob_wanted)) {
1115  if (debug_noise_removal) tprintf("Added to right blob\n");
1116  for (int j = 0; j < blob_wanted.size(); ++j) {
1117  if (blob_wanted[j]) {
1118  (*word_wanted)[j] = true;
1119  (*target_blobs)[j] = right_blob;
1120  }
1121  }
1122  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
1123  outlines, num_blob_outlines,
1124  &blob_wanted)) {
1125  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1126  for (int j = 0; j < blob_wanted.size(); ++j) {
1127  if (blob_wanted[j]) {
1128  (*word_wanted)[j] = true;
1129  (*target_blobs)[j] = nullptr;
1130  }
1131  }
1132  }
1133  }
1134 }

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE * > &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB * > *  target_blobs 
)

Definition at line 1010 of file control.cpp.

1014  {
1015  GenericVector<bool> blob_wanted;
1016  word_wanted->init_to_size(outlines.size(), false);
1017  overlapped_any_blob->init_to_size(outlines.size(), false);
1018  target_blobs->init_to_size(outlines.size(), nullptr);
1019  // For each real blob, find the outlines that seriously overlap it.
1020  // A single blob could be several merged characters, so there can be quite
1021  // a few outlines overlapping, and the full engine needs to be used to chop
1022  // and join to get a sensible result.
1023  C_BLOB_IT blob_it(real_word->cblob_list());
1024  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1025  C_BLOB* blob = blob_it.data();
1026  const TBOX blob_box = blob->bounding_box();
1027  blob_wanted.init_to_size(outlines.size(), false);
1028  int num_blob_outlines = 0;
1029  for (int i = 0; i < outlines.size(); ++i) {
1030  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1031  !(*word_wanted)[i]) {
1032  blob_wanted[i] = true;
1033  (*overlapped_any_blob)[i] = true;
1034  ++num_blob_outlines;
1035  }
1036  }
1037  if (debug_noise_removal) {
1038  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1039  blob_box.print();
1040  }
1041  // If any outlines overlap the blob, and not too many, classify the blob
1042  // (using the full engine, languages and all), and choose the maximal
1043  // combination of outlines that doesn't hurt the end-result classification
1044  // by too much. Mark them as wanted.
1045  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1046  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1047  outlines, num_blob_outlines,
1048  &blob_wanted)) {
1049  for (int i = 0; i < blob_wanted.size(); ++i) {
1050  if (blob_wanted[i]) {
1051  // Claim the outline and record where it is going.
1052  (*word_wanted)[i] = true;
1053  (*target_blobs)[i] = blob;
1054  }
1055  }
1056  }
1057  }
1058  }
1059 }

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 214 of file pagesegmain.cpp.

214  {
215  TO_BLOCK_IT to_block_it(&temp_blocks);
216  TO_BLOCK* to_block = to_block_it.data();
217  if (musicmask_pix != nullptr) {
218  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
219  // blocks separately. For now combine with photomask_pix.
220  pixOr(photomask_pix, photomask_pix, musicmask_pix);
221  }
222  if (equ_detect_) {
223  finder->SetEquationDetect(equ_detect_);
224  }
225  result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
226  to_block, photomask_pix, pix_thresholds_,
227  pix_grey_, &pixa_debug_, &found_blocks,
228  diacritic_blobs, to_blocks);
229  if (result >= 0)
230  finder->GetDeskewVectors(&deskew_, &reskew_);
231  delete finder;
232  }
233  pixDestroy(&photomask_pix);
234  pixDestroy(&musicmask_pix);
235  if (result < 0) return result;
236 
237  blocks->clear();
238  BLOCK_IT block_it(blocks);
239  // Move the found blocks to the input/output blocks.
240  block_it.add_list_after(&found_blocks);
241  return result;
242 }
243 
244 // Helper adds all the scripts from sid_set converted to ids from osd_set to
245 // allowed_ids.
246 static void AddAllScriptsConverted(const UNICHARSET& sid_set,
247  const UNICHARSET& osd_set,
248  GenericVector<int>* allowed_ids) {
249  for (int i = 0; i < sid_set.get_script_table_size(); ++i) {
250  if (i != sid_set.null_sid()) {
251  const char* script = sid_set.get_script_from_script_id(i);
252  allowed_ids->push_back(osd_set.get_script_id_from_name(script));
253  }
254  }
255 }
256 

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES &  word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output.
[in] word: The word whose best_choice we're evaluating.
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 520 of file superscript.cpp.

525  {
526  int initial_ok_run_count = 0;
527  int ok_run_count = 0;
528  float worst_certainty = 0.0f;
529  const WERD_CHOICE &wc = *word.best_choice;
530 
531  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
532  for (int i = 0; i < wc.length(); i++) {
533  TBLOB *blob = word.rebuild_word->blobs[i];
534  UNICHAR_ID unichar_id = wc.unichar_id(i);
535  float char_certainty = wc.certainty(i);
536  bool bad_certainty = char_certainty < certainty_threshold;
537  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
538  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
539  BLOB_CHOICE *choice = word.GetBlobChoice(i);
540  if (choice && fontinfo_table.size() > 0) {
541  // Get better information from the specific choice, if available.
542  int font_id1 = choice->fontinfo_id();
543  bool font1_is_italic = font_id1 >= 0
544  ? fontinfo_table.get(font_id1).is_italic() : false;
545  int font_id2 = choice->fontinfo_id2();
546  is_italic = font1_is_italic &&
547  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
548  }
549 
550  float height_fraction = 1.0f;
551  float char_height = blob->bounding_box().height();
552  float normal_height = char_height;
553  if (wc.unicharset()->top_bottom_useful()) {
554  int min_bot, max_bot, min_top, max_top;
555  wc.unicharset()->get_top_bottom(unichar_id,
556  &min_bot, &max_bot,
557  &min_top, &max_top);
558  float hi_height = max_top - max_bot;
559  float lo_height = min_top - min_bot;
560  normal_height = (hi_height + lo_height) / 2;
561  if (normal_height >= kBlnXHeight) {
562  // Only ding characters that we have decent information for because
563  // they're supposed to be normal sized, not tiny specks or dashes.
564  height_fraction = char_height / normal_height;
565  }
566  }
567  bool bad_height = height_fraction < superscript_scaledown_ratio;
568 
569  if (debug) {
570  if (is_italic) {
571  tprintf(" Rejecting: superscript is italic.\n");
572  }
573  if (is_punc) {
574  tprintf(" Rejecting: punctuation present.\n");
575  }
576  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
577  if (bad_certainty) {
578  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
579  "which is less than threshold %.2f\n", char_str,
580  char_certainty, certainty_threshold);
581  }
582  if (bad_height) {
583  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
584  "expected %.2f\n", char_str, char_height, normal_height);
585  }
586  }
587  if (bad_certainty || bad_height || is_punc || is_italic) {
588  if (ok_run_count == i) {
589  initial_ok_run_count = ok_run_count;
590  }
591  ok_run_count = 0;
592  } else {
593  ok_run_count++;
594  }
595  if (char_certainty < worst_certainty) {
596  worst_certainty = char_certainty;
597  }
598  }
599  bool all_ok = ok_run_count == wc.length();
600  if (all_ok && debug) {
601  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
602  }
603  if (!all_ok) {
604  if (left_ok) *left_ok = initial_ok_run_count;
605  if (right_ok) *right_ok = ok_run_count;
606  }
607  return all_ok;

◆ BestPix()

Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 231 of file tesseractclass.h.

231  {
232  if (pixGetWidth(pix_original_) == ImageWidth()) {
233  return pix_original_;
234  } else if (pix_grey_ != nullptr) {
235  return pix_grey_;
236  } else {
237  return pix_binary_;
238  }
239  }

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES *  page_res)

Definition at line 467 of file control.cpp.

467  {
468  PAGE_RES_IT word_it(page_res);
469 
470  WERD_RES *w_prev = nullptr;
471  WERD_RES *w = word_it.word();
472  while (true) {
473  w_prev = w;
474  while (word_it.forward() != nullptr &&
475  (!word_it.word() || word_it.word()->part_of_combo)) {
476  // advance word_it, skipping over parts of combos
477  }
478  if (!word_it.word()) break;
479  w = word_it.word();
480  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
481  continue;
482  }
483  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
484  if (tessedit_bigram_debug) {
485  tprintf("Skipping because one of the words is W_REP_CHAR\n");
486  }
487  continue;
488  }
489  // Two words sharing the same language model, excellent!
490  GenericVector<WERD_CHOICE *> overrides_word1;
491  GenericVector<WERD_CHOICE *> overrides_word2;
492 
493  const STRING orig_w1_str = w_prev->best_choice->unichar_string();
494  const STRING orig_w2_str = w->best_choice->unichar_string();
495  WERD_CHOICE prev_best(w->uch_set);
496  {
497  int w1start, w1end;
498  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
499  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
500  }
501  WERD_CHOICE this_best(w->uch_set);
502  {
503  int w2start, w2end;
504  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
505  this_best = w->best_choice->shallow_copy(w2start, w2end);
506  }
507 
508  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
509  if (tessedit_bigram_debug) {
510  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
511  orig_w1_str.c_str(), orig_w2_str.c_str());
512  }
513  continue;
514  }
515  if (tessedit_bigram_debug > 2) {
516  tprintf("Examining alt choices for \"%s %s\".\n",
517  orig_w1_str.c_str(), orig_w2_str.c_str());
518  }
519  if (tessedit_bigram_debug > 1) {
520  if (!w_prev->best_choices.singleton()) {
521  w_prev->PrintBestChoices();
522  }
523  if (!w->best_choices.singleton()) {
524  w->PrintBestChoices();
525  }
526  }
527  float best_rating = 0.0;
528  int best_idx = 0;
529  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
530  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
531  WERD_CHOICE *p1 = prev_it.data();
532  WERD_CHOICE strip1(w->uch_set);
533  {
534  int p1start, p1end;
535  p1->GetNonSuperscriptSpan(&p1start, &p1end);
536  strip1 = p1->shallow_copy(p1start, p1end);
537  }
538  WERD_CHOICE_IT w_it(&w->best_choices);
539  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
540  WERD_CHOICE *p2 = w_it.data();
541  WERD_CHOICE strip2(w->uch_set);
542  {
543  int p2start, p2end;
544  p2->GetNonSuperscriptSpan(&p2start, &p2end);
545  strip2 = p2->shallow_copy(p2start, p2end);
546  }
547  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
548  overrides_word1.push_back(p1);
549  overrides_word2.push_back(p2);
550  if (overrides_word1.size() == 1 ||
551  p1->rating() + p2->rating() < best_rating) {
552  best_rating = p1->rating() + p2->rating();
553  best_idx = overrides_word1.size() - 1;
554  }
555  }
556  }
557  }
558  if (!overrides_word1.empty()) {
559  // Excellent, we have some bigram matches.
561  *overrides_word1[best_idx]) &&
563  *overrides_word2[best_idx])) {
564  if (tessedit_bigram_debug > 1) {
565  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
566  "model.\n", orig_w1_str.c_str(), orig_w2_str.c_str());
567  }
568  continue;
569  }
570  const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
571  const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
572  if (new_w1_str != orig_w1_str) {
573  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
574  }
575  if (new_w2_str != orig_w2_str) {
576  w->ReplaceBestChoice(overrides_word2[best_idx]);
577  }
578  if (tessedit_bigram_debug > 0) {
579  STRING choices_description;
580  int num_bigram_choices
581  = overrides_word1.size() * overrides_word2.size();
582  if (num_bigram_choices == 1) {
583  choices_description = "This was the unique bigram choice.";
584  } else {
585  if (tessedit_bigram_debug > 1) {
586  STRING bigrams_list;
587  const int kMaxChoicesToPrint = 20;
588  for (int i = 0; i < overrides_word1.size() &&
589  i < kMaxChoicesToPrint; i++) {
590  if (i > 0) { bigrams_list += ", "; }
591  WERD_CHOICE *p1 = overrides_word1[i];
592  WERD_CHOICE *p2 = overrides_word2[i];
593  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
594  }
595  choices_description = "There were many choices: {";
596  choices_description += bigrams_list;
597  choices_description += "}";
598  } else {
599  choices_description.add_str_int("There were ", num_bigram_choices);
600  choices_description += " compatible bigrams.";
601  }
602  }
603  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
604  orig_w1_str.c_str(), orig_w2_str.c_str(),
605  new_w1_str.c_str(), new_w2_str.c_str(),
606  choices_description.c_str());
607  }
608  }
609  }
610 }

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES *  page_res)

Definition at line 709 of file control.cpp.

709  {
710  if (!wordrec_run_blamer) return;
711  PAGE_RES_IT page_res_it(page_res);
712  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
713  page_res_it.forward()) {
714  WERD_RES *word = page_res_it.word();
717  }
718  tprintf("Blame reasons:\n");
719  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
721  static_cast<IncorrectResultReason>(bl)),
722  page_res->blame_reasons[bl]);
723  }
724  if (page_res->misadaption_log.size() > 0) {
725  tprintf("Misadaption log:\n");
726  for (int i = 0; i < page_res->misadaption_log.size(); ++i) {
727  tprintf("%s\n", page_res->misadaption_log[i].c_str());
728  }
729  }
730 }

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display ( PAGE_RES page_res,
const TBOX selection_box 
)

Definition at line 952 of file pgedit.cpp.

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB *  blob)

Definition at line 786 of file fixspace.cpp.

787  {
788  TBOX box; // BB of outline
789  int16_t outline_count = 0;
790  int16_t max_dimension;
791  int16_t largest_outline_dimension = 0;
792 
793  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
794  outline_count++;
795  box = ol->bounding_box();
796  if (box.height() > box.width()) {
797  max_dimension = box.height();
798  } else {
799  max_dimension = box.width();
800  }
801 
802  if (largest_outline_dimension < max_dimension)
803  largest_outline_dimension = max_dimension;
804  }
805 
806  if (outline_count > 5) {
807  // penalise LOTS of blobs
808  largest_outline_dimension *= 2;
809  }
810 
811  box = blob->bounding_box();
812  if (box.bottom() > kBlnBaselineOffset * 4 ||
813  box.top() < kBlnBaselineOffset / 2) {
814  // Lax blob is if high or low
815  largest_outline_dimension /= 2;
816  }
817 
818  return largest_outline_dimension;

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word(): Find the word containing the blob that looks most like noise. Break that word into two, deleting the noise blob.

Definition at line 641 of file fixspace.cpp.

642  {
643  WERD_RES_IT word_it(&words);
644  WERD_RES_IT worst_word_it;
645  float worst_noise_score = 9999;
646  int worst_blob_index = -1; // Noisiest blob of noisiest wd
647  int blob_index; // of wds noisiest blob
648  float noise_score; // of wds noisiest blob
649  WERD_RES *word_res;
650  C_BLOB_IT blob_it;
651  C_BLOB_IT rej_cblob_it;
652  C_BLOB_LIST new_blob_list;
653  C_BLOB_IT new_blob_it;
654  C_BLOB_IT new_rej_cblob_it;
655  WERD *new_word;
656  int16_t start_of_noise_blob;
657  int16_t i;
658 
659  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
660  blob_index = worst_noise_blob(word_it.data(), &noise_score);
661  if (blob_index > -1 && worst_noise_score > noise_score) {
662  worst_noise_score = noise_score;
663  worst_blob_index = blob_index;
664  worst_word_it = word_it;
665  }
666  }
667  if (worst_blob_index < 0) {
668  words.clear(); // signal termination
669  return;
670  }
671 
672  /* Now split the worst_word_it */
673 
674  word_res = worst_word_it.data();
675 
676  /* Move blobs before noise blob to a new bloblist */
677 
678  new_blob_it.set_to_list(&new_blob_list);
679  blob_it.set_to_list(word_res->word->cblob_list());
680  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681  new_blob_it.add_after_then_move(blob_it.extract());
682  }
683  start_of_noise_blob = blob_it.data()->bounding_box().left();
684  delete blob_it.extract(); // throw out noise blob
685 
686  new_word = new WERD(&new_blob_list, word_res->word);
687  new_word->set_flag(W_EOL, false);
688  word_res->word->set_flag(W_BOL, false);
689  word_res->word->set_blanks(1); // After break
690 
691  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
692  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
693  for (;
694  (!rej_cblob_it.empty() &&
695  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696  rej_cblob_it.forward()) {
697  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
698  }
699 
700  auto* new_word_res = new WERD_RES(new_word);
701  new_word_res->combination = true;
702  worst_word_it.add_before_then_move(new_word_res);
703 
704  word_res->ClearResults();

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 298 of file pgedit.cpp.

298  {
299  SVMenuNode* parent_menu;
300  auto* root_menu_item = new SVMenuNode();
301 
302  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
303 
304  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
305  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
306  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
307  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
308  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
309  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
310  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
311  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
312 
313  parent_menu = root_menu_item->AddChild("DISPLAY");
314 
315  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
316  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
317  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
318  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
319  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
320  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
321  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
322  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
323  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
324  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
325  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
326  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
327  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
328  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
329  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
330 
331 
332  parent_menu = root_menu_item->AddChild("OTHER");
333 
334  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
335  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
336  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
337  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
338  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
339  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
340 
341  return root_menu_item;
342 }

◆ check_debug_pt()

bool tesseract::Tesseract::check_debug_pt ( WERD_RES *  word,
int  location 
)

Definition at line 1848 of file control.cpp.

1848  {
1849  bool show_map_detail = false;
1850  int16_t i;
1851 
1852  if (!test_pt)
1853  return false;
1854 
1855  tessedit_rejection_debug.set_value (false);
1856  debug_x_ht_level.set_value(0);
1857 
1858  if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
1859  if (location < 0)
1860  return true; // For breakpoint use
1861  tessedit_rejection_debug.set_value(true);
1862  debug_x_ht_level.set_value(2);
1863  tprintf ("\n\nTESTWD::");
1864  switch (location) {
1865  case 0:
1866  tprintf ("classify_word_pass1 start\n");
1867  word->word->print();
1868  break;
1869  case 10:
1870  tprintf ("make_reject_map: initial map");
1871  break;
1872  case 20:
1873  tprintf ("make_reject_map: after NN");
1874  break;
1875  case 30:
1876  tprintf ("classify_word_pass2 - START");
1877  break;
1878  case 40:
1879  tprintf ("classify_word_pass2 - Pre Xht");
1880  break;
1881  case 50:
1882  tprintf ("classify_word_pass2 - END");
1883  show_map_detail = true;
1884  break;
1885  case 60:
1886  tprintf ("fixspace");
1887  break;
1888  case 70:
1889  tprintf ("MM pass START");
1890  break;
1891  case 80:
1892  tprintf ("MM pass END");
1893  break;
1894  case 90:
1895  tprintf ("After Poor quality rejection");
1896  break;
1897  case 100:
1898  tprintf ("unrej_good_quality_words - START");
1899  break;
1900  case 110:
1901  tprintf ("unrej_good_quality_words - END");
1902  break;
1903  case 120:
1904  tprintf ("Write results pass");
1905  show_map_detail = true;
1906  break;
1907  }
1908  if (word->best_choice != nullptr) {
1909  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1910  word->reject_map.print(debug_fp);
1911  tprintf("\n");
1912  if (show_map_detail) {
1913  tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1914  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1915  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1916  word->reject_map[i].full_print(debug_fp);
1917  }
1918  }
1919  } else {
1920  tprintf("null best choice\n");
1921  }
1922  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1923  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1924  return true;
1925  } else {
1926  return false;
1927  }
1928 }

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT *  pr_it,
WordData *  word_data 
)

Definition at line 1318 of file control.cpp.

1319  {
1320 #ifdef DISABLED_LEGACY_ENGINE
1322 #else
1323  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1325 #endif // def DISABLED_LEGACY_ENGINE
1326 
1327  // Best result so far.
1328  PointerVector<WERD_RES> best_words;
1329  // Points to the best result. May be word or in lang_words.
1330  const WERD_RES* word = word_data->word;
1331  clock_t start_t = clock();
1332  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1333  if (debug) {
1334  tprintf("%s word with lang %s at:",
1335  word->done ? "Already done" : "Processing",
1336  most_recently_used_->lang.c_str());
1337  word->word->bounding_box().print();
1338  }
1339  if (word->done) {
1340  // If done on pass1, leave it as-is.
1341  if (!word->tess_failed)
1342  most_recently_used_ = word->tesseract;
1343  return;
1344  }
1345  int sub = sub_langs_.size();
1346  if (most_recently_used_ != this) {
1347  // Get the index of the most_recently_used_.
1348  for (sub = 0; sub < sub_langs_.size() &&
1349  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1350  }
1351  most_recently_used_->RetryWithLanguage(
1352  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1353  Tesseract* best_lang_tess = most_recently_used_;
1354  if (!WordsAcceptable(best_words)) {
1355  // Try all the other languages to see if they are any better.
1356  if (most_recently_used_ != this &&
1357  this->RetryWithLanguage(*word_data, recognizer, debug,
1358  &word_data->lang_words[sub_langs_.size()],
1359  &best_words) > 0) {
1360  best_lang_tess = this;
1361  }
1362  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1363  ++i) {
1364  if (most_recently_used_ != sub_langs_[i] &&
1365  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1366  &word_data->lang_words[i],
1367  &best_words) > 0) {
1368  best_lang_tess = sub_langs_[i];
1369  }
1370  }
1371  }
1372  most_recently_used_ = best_lang_tess;
1373  if (!best_words.empty()) {
1374  if (best_words.size() == 1 && !best_words[0]->combination) {
1375  // Move the best single result to the main word.
1376  word_data->word->ConsumeWordResults(best_words[0]);
1377  } else {
1378  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1379  word_data->word = best_words.back();
1380  pr_it->ReplaceCurrentWord(&best_words);
1381  }
1382  ASSERT_HOST(word_data->word->box_word != nullptr);
1383  } else {
1384  tprintf("no best words!!\n");
1385  }
1386  clock_t ocr_t = clock();
1387  if (tessedit_timing_debug) {
1388  tprintf("%s (ocr took %.2f sec)\n",
1389  word_data->word->best_choice->unichar_string().c_str(),
1390  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1391  }
1392 }

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1400 of file control.cpp.

1402  {
1403  ROW* row = word_data.row;
1404  BLOCK* block = word_data.block;
1405  prev_word_best_choice_ = word_data.prev_word != nullptr
1406  ? word_data.prev_word->word->best_choice : nullptr;
1407 #ifndef ANDROID_BUILD
1408 #ifdef DISABLED_LEGACY_ENGINE
1410 #else
1413 #endif // def DISABLED_LEGACY_ENGINE
1414  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1415  LSTMRecognizeWord(*block, row, *in_word, out_words);
1416  if (!out_words->empty())
1417  return; // Successful lstm recognition.
1418  }
1420  // No fallback allowed, so use a fake.
1421  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1422  return;
1423  }
1424 
1425  #ifndef DISABLED_LEGACY_ENGINE
1426  // Fall back to tesseract for failed words or odd words.
1427  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1428  OEM_TESSERACT_ONLY, nullptr,
1431  poly_allow_detailed_fx, row, block);
1432 #endif // ndef DISABLED_LEGACY_ENGINE
1433  }
1434 #endif // ndef ANDROID_BUILD
1435 
1436 #ifndef DISABLED_LEGACY_ENGINE
1437  WERD_RES* word = *in_word;
1438  match_word_pass_n(1, word, row, block);
1439  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1440  word->tess_would_adapt = AdaptableWord(word);
1441  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1442 
1443  if (adapt_ok) {
1444  // Send word to adaptive classifier for training.
1445  word->BestChoiceToCorrectText();
1446  LearnWord(nullptr, word);
1447  // Mark misadaptions if running blamer.
1448  if (word->blamer_bundle != nullptr) {
1451  }
1452  }
1453 
1454  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1456  }
1457 #endif // ndef DISABLED_LEGACY_ENGINE
1458 }

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2 ( const WordData &  word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1571 of file control.cpp.

1573  {
1574  // Return if we do not want to run Tesseract.
1576  return;
1577  }
1578 #ifndef DISABLED_LEGACY_ENGINE
1579  ROW* row = word_data.row;
1580  BLOCK* block = word_data.block;
1581  WERD_RES* word = *in_word;
1582  prev_word_best_choice_ = word_data.prev_word != nullptr
1583  ? word_data.prev_word->word->best_choice : nullptr;
1584 
1586  check_debug_pt(word, 30);
1587  if (!word->done) {
1588  word->caps_height = 0.0;
1589  if (word->x_height == 0.0f)
1590  word->x_height = row->x_height();
1591  match_word_pass_n(2, word, row, block);
1592  check_debug_pt(word, 40);
1593  }
1594 
1595  SubAndSuperscriptFix(word);
1596 
1597  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1599  block->classify_rotation().y() == 0.0f) {
1600  // Use the tops and bottoms since they are available.
1601  TrainedXheightFix(word, block, row);
1602  }
1603 
1605  }
1606 #ifndef GRAPHICS_DISABLED
1608  if (fx_win == nullptr)
1609  create_fx_win();
1610  clear_fx_win();
1611  word->rebuild_word->plot(fx_win);
1612  TBOX wbox = word->rebuild_word->bounding_box();
1613  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1614  wbox.right(), wbox.bottom());
1616  }
1617 #endif
1619  check_debug_pt(word, 50);
1620 #endif // ndef DISABLED_LEGACY_ENGINE
1621 }

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str,
float *  c2 
)

Definition at line 1269 of file control.cpp.

1270  {
1271  WERD* real_word = pr_it->word()->word;
1272  WERD* word = real_word->ConstructFromSingleBlob(
1273  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1274  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1275  // Get a new iterator that points to the new word.
1276  PAGE_RES_IT it(pr_it->page_res);
1277  while (it.word() != word_res && it.word() != nullptr) it.forward();
1278  ASSERT_HOST(it.word() == word_res);
1279  WordData wd(it);
1280  // Force full initialization.
1281  SetupWordPassN(1, &wd);
1282  classify_word_and_language(pass_n, &it, &wd);
1283  if (debug_noise_removal) {
1284  if (wd.word->raw_choice != nullptr) {
1285  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1286  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1287  wd.word->raw_choice->max_x_height());
1288  } else {
1289  tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1290  wd.row->x_height());
1291  }
1292  }
1293  float cert = 0.0f;
1294  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1295  cert = wd.word->raw_choice->certainty();
1296  float rat = wd.word->raw_choice->rating();
1297  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1298  *best_str = wd.word->raw_choice->unichar_string();
1299  } else {
1300  *c2 = 0.0f;
1301  *best_str = "";
1302  }
1303  it.DeleteCurrentWord();
1304  pr_it->ResetWordIterator();
1305  return cert;
1306 }

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE * > &  outlines,
int  pass_n,
PAGE_RES_IT *  pr_it,
C_BLOB *  blob,
STRING *  best_str 
)

Definition at line 1225 of file control.cpp.

1228  {
1229  C_OUTLINE_IT ol_it;
1230  C_OUTLINE* first_to_keep = nullptr;
1231  C_BLOB* local_blob = nullptr;
1232  if (blob != nullptr) {
1233  // Add the required outlines to the blob.
1234  ol_it.set_to_list(blob->out_list());
1235  first_to_keep = ol_it.data();
1236  }
1237  for (int i = 0; i < ok_outlines.size(); ++i) {
1238  if (ok_outlines[i]) {
1239  // This outline is to be added.
1240  if (blob == nullptr) {
1241  local_blob = new C_BLOB(outlines[i]);
1242  blob = local_blob;
1243  ol_it.set_to_list(blob->out_list());
1244  } else {
1245  ol_it.add_before_stay_put(outlines[i]);
1246  }
1247  }
1248  }
1249  float c2;
1250  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1251  ol_it.move_to_first();
1252  if (first_to_keep == nullptr) {
1253  // We created blob. Empty its outlines and delete it.
1254  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1255  delete local_blob;
1256  cert = -c2;
1257  } else {
1258  // Remove the outlines that we put in.
1259  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1260  ol_it.extract();
1261  }
1262  }
1263  return cert;
1264 }

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 574 of file tesseractclass.cpp.

574  {
575  STRING debug_name = imagebasename + "_debug.pdf";
576  pixa_debug_.WritePDF(debug_name.c_str());
577  pixDestroy(&pix_binary_);
578  pixDestroy(&pix_grey_);
579  pixDestroy(&pix_thresholds_);
580  pixDestroy(&scaled_color_);
581  deskew_ = FCOORD(1.0f, 0.0f);
582  reskew_ = FCOORD(1.0f, 0.0f);
583  splitter_.Clear();
584  scaled_factor_ = -1;
585  for (int i = 0; i < sub_langs_.size(); ++i)
586  sub_langs_[i]->Clear();
587 }

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES *  word_res,
float *  baseline_shift 
)

Definition at line 117 of file fixxht.cpp.

130  {
131  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
132  unicharset.id_to_unichar(class_id),
133  height, min_bottom, max_bottom, min_top, max_top,
134  bottom, top);
135  }
136  // Use only chars that fit in the expected bottom range, and where
137  // the range of tops is sensibly near the xheight.
138  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
139  bottom - x_ht_acceptance_tolerance <= max_bottom &&
140  min_top > kBlnBaselineOffset &&
141  max_top - kBlnBaselineOffset >= kBlnXHeight &&
142  misfit_dist > 0) {
143  // Compute the x-height position using proportionality between the
144  // actual height and expected height.
145  int min_xht = DivRounded(height * kBlnXHeight,
146  max_top - kBlnBaselineOffset);
147  int max_xht = DivRounded(height * kBlnXHeight,
148  min_top - kBlnBaselineOffset);
149  if (debug_x_ht_level >= 2) {
150  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
151  }
152  // The range of expected heights gets a vote equal to the distance
153  // of the actual top from the expected top.
154  for (int y = min_xht; y <= max_xht; ++y)
155  top_stats.add(y, misfit_dist);
156  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
157  bottom - x_ht_acceptance_tolerance > max_bottom) &&
158  bottom_shift == 0) {
159  // Get the range of required bottom shift.
160  int min_shift = min_bottom - bottom;
161  int max_shift = max_bottom - bottom;
162  if (debug_x_ht_level >= 2) {
163  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
164  }
165  // The range of expected shifts gets a vote equal to the min distance
166  // of the actual bottom from the expected bottom, spread over the
167  // range of its acceptance.
168  int misfit_weight = abs(min_shift);
169  if (max_shift > min_shift)
170  misfit_weight /= max_shift - min_shift;
171  for (int y = min_shift; y <= max_shift; ++y)
172  shift_stats.add(y, misfit_weight);
173  } else {
174  if (bottom_shift == 0) {
175  // Things with bottoms that are already ok need to say so, on the
176  // 1st iteration only.
177  shift_stats.add(0, kBlnBaselineOffset);
178  }
179  if (debug_x_ht_level >= 2) {
180  tprintf(" already OK\n");
181  }
182  }
183  }
184  }
185  if (shift_stats.get_total() > top_stats.get_total()) {
186  bottom_shift = IntCastRounded(shift_stats.median());
187  if (debug_x_ht_level >= 2) {
188  tprintf("Applying bottom shift=%d\n", bottom_shift);
189  }
190  }
191  } while (bottom_shift != 0 &&
192  top_stats.get_total() < shift_stats.get_total());
193  // Baseline shift is opposite sign to the bottom shift.
194  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
195  if (debug_x_ht_level >= 2) {
196  tprintf("baseline shift=%g\n", *baseline_shift);
197  }
198  if (top_stats.get_total() == 0)
199  return bottom_shift != 0 ? word_res->x_height : 0.0f;
200  // The new xheight is just the median vote, which is then scaled out
201  // of BLN space back to pixel space to get the x-height in pixel space.
202  float new_xht = top_stats.median();
203  if (debug_x_ht_level >= 2) {
204  tprintf("Median xht=%f\n", new_xht);
205  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
206  new_xht, new_xht / word_res->denorm.y_scale());
207  }
208  // The xheight must change by at least x_ht_min_change to be used.
209  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
210  return new_xht / word_res->denorm.y_scale();
211  else
212  return bottom_shift != 0 ? word_res->x_height : 0.0f;
213 }
214 
215 } // namespace tesseract

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES *  word_res)

Definition at line 638 of file docqual.cpp.

640  {
641  int i;
642  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
643  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
644  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
645  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
646  for (i = 0; i < word_res->reject_map.length(); ++i) {
647  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
648  word_res->best_choice->set_unichar_id(unichar_dash, i);
649  if (word_res->reject_map[i].accepted ())
650  word_res->reject_map[i].setrej_unlv_rej ();
651  }
652  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
653  word_res->best_choice->set_unichar_id(unichar_space, i);
654  if (word_res->reject_map[i].accepted ())
655  word_res->reject_map[i].setrej_unlv_rej ();
656  }

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES page_res)

◆ count_alphanums() [1/2]

int16_t tesseract::Tesseract::count_alphanums ( const WERD_CHOICE &  word)

Definition at line 375 of file output.cpp.

376  {
377  int count = 0;
378  for (int i = 0; i < word.length(); ++i) {
379  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
380  word.unicharset()->get_isdigit(word.unichar_id(i)))
381  count++;
382  }
383  return count;

◆ count_alphanums() [2/2]

int16_t tesseract::Tesseract::count_alphanums ( WERD_RES *  word)

Definition at line 556 of file reject.cpp.

559  {
560  int count = 0;
561  const WERD_CHOICE *best_choice = word_res->best_choice;
562  for (int i = 0; i < word_res->reject_map.length(); ++i) {
563  if ((word_res->reject_map[i].accepted()) &&
564  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
565  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
566  count++;
567  }

◆ count_alphas()

int16_t tesseract::Tesseract::count_alphas ( const WERD_CHOICE &  word)

Definition at line 365 of file output.cpp.

366  {
367  int count = 0;
368  for (int i = 0; i < word.length(); ++i) {
369  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
370  count++;
371  }
372  return count;

◆ count_outline_errs()

int16_t tesseract::Tesseract::count_outline_errs ( char  c,
int16_t  outline_count 
)

Definition at line 121 of file docqual.cpp.

123  {
124  if ((tessedit_good_quality_unrej && good_quality_doc))
125  unrej_good_quality_words(page_res_it);
126  doc_and_block_rejection(page_res_it, good_quality_doc);
127  if (unlv_tilde_crunching) {
128  tilde_crunch(page_res_it);
129  tilde_delete(page_res_it);
130  }
131 }

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES word_res)

Definition at line 85 of file fixxht.cpp.

89  {
90  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
91  unicharset.id_to_unichar(class_id),
92  bad ? "Misfit" : "OK", top, min_top, max_top,
93  static_cast<int>(x_ht_acceptance_tolerance));
94  }
95  }
96  }
97  return bad_blobs;
98 }
99 
100 // Returns a new x-height maximally compatible with the result in word_res.
101 // See comment above for overall algorithm.
103  float* baseline_shift) {
104  STATS top_stats(0, UINT8_MAX);
105  STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106  int bottom_shift = 0;
107  int num_blobs = word_res->rebuild_word->NumBlobs();
108  do {
109  top_stats.clear();
110  shift_stats.clear();
111  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
112  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
113  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);

◆ debug_word()

void tesseract::Tesseract::debug_word ( PAGE_RES page_res,
const TBOX selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 665 of file pgedit.cpp.

665  {
666 #ifndef DISABLED_LEGACY_ENGINE
668 #endif
669  recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);
670 }

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES *  page_res)

Definition at line 2092 of file control.cpp.

2092  {
2093  PAGE_RES_IT word_it(page_res);
2094  for (WERD_RES* word = word_it.word(); word != nullptr;
2095  word = word_it.forward()) {
2096  if (word->best_choices.singleton())
2097  continue; // There are no alternates.
2098 
2099  const WERD_CHOICE* best = word->best_choice;
2100  if (word->tesseract->getDict().valid_word(*best) != 0)
2101  continue; // The best choice is in the dictionary.
2102 
2103  WERD_CHOICE_IT choice_it(&word->best_choices);
2104  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2105  choice_it.forward()) {
2106  WERD_CHOICE* alternate = choice_it.data();
2107  if (word->tesseract->getDict().valid_word(*alternate)) {
2108  // The alternate choice is in the dictionary.
2109  if (tessedit_bigram_debug) {
2110  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2111  best->unichar_string().c_str(),
2112  alternate->unichar_string().c_str());
2113  }
2114  // Replace the 'best' choice with a better choice.
2115  word->ReplaceBestChoice(alternate);
2116  break;
2117  }
2118  }
2119  }
2120 }

◆ digit_or_numeric_punct()

bool tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES *  word,
int  char_position 
)

Definition at line 369 of file fixspace.cpp.

370  {
371  int i;
372  int offset;
373 
374  for (i = 0, offset = 0; i < char_position;
375  offset += word->best_choice->unichar_lengths()[i++]);
376  return (
377  word->uch_set->get_isdigit(
378  word->best_choice->unichar_string().c_str() + offset,
379  word->best_choice->unichar_lengths()[i]) ||
380  (word->best_choice->permuter() == NUMBER_PERM &&
382  word->best_choice->unichar_string().c_str()[offset])));

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 349 of file pgedit.cpp.

350  {
351  int block_count = 1;
352 
353  image_win->Clear();
354  if (display_image) {
355  image_win->Image(pix_binary_, 0, 0);
356  }
357 
358  image_win->Brush(ScrollView::NONE);
359  PAGE_RES_IT pr_it(current_page_res);
360  for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
361  (this->*word_painter)(&pr_it);
362  if (display_baselines && pr_it.row() != pr_it.prev_row())
363  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
364  if (display_blocks && pr_it.block() != pr_it.prev_block())
365  pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
366  }
367  image_win->Update();
368 }

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT &  page_res_it,
bool  good_quality_doc 
)

Definition at line 225 of file docqual.cpp.

230  {
231  reject_whole_page(page_res_it);
233  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
234  page_res_it.page_res->char_count,
235  page_res_it.page_res->rej_count);
236  }
237  } else {
239  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
240  page_res_it.page_res->char_count,
241  page_res_it.page_res->rej_count);
242  }
243 
244  /* Walk blocks testing for block rejection */
245 
246  page_res_it.restart_page();
247  WERD_RES* word;
248  while ((word = page_res_it.word()) != nullptr) {
249  current_block = page_res_it.block();
250  block_no = current_block->block->pdblk.index();
251  if (current_block->char_count > 0 &&
252  (current_block->rej_count * 100.0 / current_block->char_count) >
255  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
256  block_no, current_block->char_count,
257  current_block->rej_count);
258  }
259  prev_word_rejected = false;
260  while ((word = page_res_it.word()) != nullptr &&
261  (page_res_it.block() == current_block)) {
263  rej_word = word->reject_map.reject_count() > 0 ||
265  if (rej_word && tessedit_dont_blkrej_good_wds &&
268  *word->uch_set,
269  word->best_choice->unichar_string().c_str(),
270  word->best_choice->unichar_lengths().c_str()) !=
271  AC_UNACCEPTABLE) {
272  word_char_quality(word, &char_quality, &accepted_char_quality);
273  rej_word = char_quality != word->reject_map.length();
274  }
275  } else {
276  rej_word = true;
277  }
278  if (rej_word) {
279  /*
280  Reject spacing if both current and prev words are rejected.
281  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
282  generated more space errors.
283  */
285  prev_word_rejected &&
286  page_res_it.prev_row() == page_res_it.row() &&
287  word->word->space() == 1)
288  word->reject_spaces = true;
290  }
291  prev_word_rejected = rej_word;
292  page_res_it.forward();
293  }
294  } else {
296  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
297  block_no, page_res_it.block()->char_count,
298  page_res_it.block()->rej_count);
299  }
300 
301  /* Walk rows in block testing for row rejection */
302  row_no = 0;
303  while (page_res_it.word() != nullptr &&
304  page_res_it.block() == current_block) {
305  current_row = page_res_it.row();
306  row_no++;
307  /* Reject whole row if:
308  fraction of chars on row which are rejected exceed a limit AND
309  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
310  limit
311  */
312  if (current_row->char_count > 0 &&
313  (current_row->rej_count * 100.0 / current_row->char_count) >
315  (current_row->whole_word_rej_count * 100.0 /
316  current_row->rej_count) <
319  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
320  row_no, current_row->char_count,
321  current_row->rej_count);
322  }
323  prev_word_rejected = false;
324  while ((word = page_res_it.word()) != nullptr &&
325  page_res_it.row () == current_row) {
326  /* Preserve words on good docs unless they are mostly rejected*/
327  if (!tessedit_row_rej_good_docs && good_quality_doc) {
328  rej_word = word->reject_map.reject_count() /
329  static_cast<float>(word->reject_map.length()) >
332  /* Preserve perfect words anyway */
333  rej_word = word->reject_map.reject_count() > 0 ||
335  if (rej_word && tessedit_dont_rowrej_good_wds &&
338  word->best_choice->unichar_string().c_str(),
339  word->best_choice->unichar_lengths().c_str()) !=
340  AC_UNACCEPTABLE) {
341  word_char_quality(word, &char_quality,
342  &accepted_char_quality);
343  rej_word = char_quality != word->reject_map.length();
344  }
345  } else {
346  rej_word = true;
347  }
348  if (rej_word) {
349  /*
350  Reject spacing if both current and prev words are rejected.
351  NOTE - this is NOT restricted to FUZZY spaces. - When tried
352  this generated more space errors.
353  */
355  prev_word_rejected &&
356  page_res_it.prev_row() == page_res_it.row() &&
357  word->word->space () == 1)
358  word->reject_spaces = true;
360  }
361  prev_word_rejected = rej_word;
362  page_res_it.forward();
363  }
364  } else {
366  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
367  row_no, current_row->char_count, current_row->rej_count);
368  }
369  while (page_res_it.word() != nullptr &&
370  page_res_it.row() == current_row)
371  page_res_it.forward();
372  }
373  }
374  }
375  }
376  }
377 }
378 
379 } // namespace tesseract
380 
381 /*************************************************************************
382  * reject_whole_page()
383  * Don't believe any of it - set the reject map to 00..00 in all words
384  *
385  *************************************************************************/
386 

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES *  word)

Definition at line 524 of file reject.cpp.

527  {
528  int i = 0;
529  int offset;
530  int word_len = word->reject_map.length();
531  const char *s = word->best_choice->unichar_string().c_str();
532  const char *lengths = word->best_choice->unichar_lengths().c_str();
533  bool accepted_1Il = false;
534 
535  for (i = 0, offset = 0; i < word_len;
536  offset += word->best_choice->unichar_lengths()[i++]) {
537  if (word->reject_map[i].accepted()) {
538  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
539  accepted_1Il = true;
540  } else {
541  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
542  word->uch_set->get_isdigit(s + offset, lengths[i]))
543  return; // >=1 non 1Il ch accepted
544  }
545  }
546  }
547  if (!accepted_1Il)
548  return; //Nothing to worry about
549 
550  for (i = 0, offset = 0; i < word_len;
551  offset += word->best_choice->unichar_lengths()[i++]) {
552  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
553  word->reject_map[i].accepted())

◆ dump_words()

void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
int16_t  score,
int16_t  mode,
bool  improved 
)

Definition at line 475 of file fixspace.cpp.

475  {
476 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
477  int16_t mode, bool improved) {
478  WERD_RES_IT word_res_it(&perm);
479 
480  if (debug_fix_space_level > 0) {
481  if (mode == 1) {
482  stats_.dump_words_str = "";
483  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484  word_res_it.forward()) {
485  if (!word_res_it.data()->part_of_combo) {
486  stats_.dump_words_str +=
487  word_res_it.data()->best_choice->unichar_string();
488  stats_.dump_words_str += ' ';
489  }
490  }
491  }
492 
493  if (debug_fix_space_level > 1) {
494  switch (mode) {
495  case 1:
496  tprintf("EXTRACTED (%d): \"", score);
497  break;
498  case 2:
499  tprintf("TESTED (%d): \"", score);
500  break;
501  case 3:
502  tprintf("RETURNED (%d): \"", score);
503  break;
504  }
505 
506  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507  word_res_it.forward()) {
508  if (!word_res_it.data()->part_of_combo) {
509  tprintf("%s/%1d ",
510  word_res_it.data()->best_choice->unichar_string().c_str(),
511  static_cast<int>(word_res_it.data()->best_choice->permuter()));
512  }
513  }
514  tprintf("\"\n");
515  } else if (improved) {
516  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
517  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518  word_res_it.forward()) {
519  if (!word_res_it.data()->part_of_combo) {
520  tprintf("%s/%1d ",
521  word_res_it.data()->best_choice->unichar_string().c_str(),
522  static_cast<int>(word_res_it.data()->best_choice->permuter()));
523  }
524  }
525  tprintf("\"\n");
526  }
527  }

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 482 of file tessedit.cpp.

◆ eval_word_spacing()

int16_t tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 265 of file fixspace.cpp.

266  {
267  WERD_RES_IT word_res_it(&word_res_list);
268  int16_t total_score = 0;
269  int16_t word_count = 0;
270  int16_t done_word_count = 0;
271  int16_t word_len;
272  int16_t i;
273  int16_t offset;
274  WERD_RES *word; // current word
275  int16_t prev_word_score = 0;
276  bool prev_word_done = false;
277  bool prev_char_1 = false; // prev ch a "1/I/l"?
278  bool prev_char_digit = false; // prev ch 2..9 or 0
279  bool current_char_1 = false;
280  bool current_word_ok_so_far;
281  STRING punct_chars = "!\"`',.:;";
282  bool prev_char_punct = false;
283  bool current_char_punct = false;
284  bool word_done = false;
285 
286  do {
287  word = word_res_it.data();
288  word_done = fixspace_thinks_word_done(word);
289  word_count++;
290  if (word->tess_failed) {
291  total_score += prev_word_score;
292  if (prev_word_done)
293  done_word_count++;
294  prev_word_score = 0;
295  prev_char_1 = false;
296  prev_char_digit = false;
297  prev_word_done = false;
298  } else {
299  /*
300  Can we add the prev word score and potentially count this word?
301  Yes IF it didn't end in a 1 when the first char of this word is a digit
302  AND it didn't end in a digit when the first char of this word is a 1
303  */
304  word_len = word->reject_map.length();
305  current_word_ok_so_far = false;
306  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
307  (prev_char_digit && (
308  (word_done &&
309  word->best_choice->unichar_lengths().c_str()[0] == 1 &&
310  word->best_choice->unichar_string()[0] == '1') ||
311  (!word_done && STRING(conflict_set_I_l_1).contains(
312  word->best_choice->unichar_string()[0])))))) {
313  total_score += prev_word_score;
314  if (prev_word_done)
315  done_word_count++;
316  current_word_ok_so_far = word_done;
317  }
318 
319  if (current_word_ok_so_far) {
320  prev_word_done = true;
321  prev_word_score = word_len;
322  } else {
323  prev_word_done = false;
324  prev_word_score = 0;
325  }
326 
327  /* Add 1 to total score for every joined 1 regardless of context and
328  rejtn */
329  for (i = 0, prev_char_1 = false; i < word_len; i++) {
330  current_char_1 = word->best_choice->unichar_string()[i] == '1';
331  if (prev_char_1 || (current_char_1 && (i > 0)))
332  total_score++;
333  prev_char_1 = current_char_1;
334  }
335 
336  /* Add 1 to total score for every joined punctuation regardless of context
337  and rejtn */
339  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
340  offset += word->best_choice->unichar_lengths()[i++]) {
341  current_char_punct =
342  punct_chars.contains(word->best_choice->unichar_string()[offset]);
343  if (prev_char_punct || (current_char_punct && i > 0))
344  total_score++;
345  prev_char_punct = current_char_punct;
346  }
347  }
348  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
349  for (i = 0, offset = 0; i < word_len - 1;
350  offset += word->best_choice->unichar_lengths()[i++]);
351  prev_char_1 =
352  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
353  || (!word_done && STRING(conflict_set_I_l_1).contains(
354  word->best_choice->unichar_string()[offset])));
355  }
356  /* Find next word */
357  do {
358  word_res_it.forward();
359  } while (word_res_it.data()->part_of_combo);
360  } while (!word_res_it.at_first());
361  total_score += prev_word_score;
362  if (prev_word_done)
363  done_word_count++;
364  if (done_word_count == word_count)
365  return PERFECT_WERDS;
366  else
367  return total_score;

◆ failure_count()

int16_t tesseract::Tesseract::failure_count ( WERD_RES *  word)

Definition at line 946 of file docqual.cpp.

949  {
950  const char *str = word->best_choice->unichar_string().c_str();
951  int tess_rejs = 0;
952 
953  for (; *str != '\0'; str++) {
954  if (*str == ' ')
955  tess_rejs++;

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES word_res 
)

◆ first_alphanum_index()

int16_t tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 468 of file reject.cpp.

471  {
472  int16_t i;
473  int16_t offset;
474 
475  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
476  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
477  unicharset.get_isdigit(word + offset, word_lengths[i]))
478  return i;
479  }

◆ first_alphanum_offset()

int16_t tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 481 of file reject.cpp.

484  {
485  int16_t i;
486  int16_t offset;
487 
488  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
490  unicharset.get_isdigit(word + offset, word_lengths[i]))
491  return offset;
492  }

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW row,
BLOCK block 
)

Definition at line 170 of file fixspace.cpp.

174  {
175  int16_t best_score;
176  WERD_RES_LIST current_perm;
177  int16_t current_score;
178  bool improved = false;
179 
180  best_score = eval_word_spacing(best_perm); // default score
181  dump_words(best_perm, best_score, 1, improved);
182 
183  if (best_score != PERFECT_WERDS)
184  initialise_search(best_perm, current_perm);
185 
186  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
187  match_current_words(current_perm, row, block);
188  current_score = eval_word_spacing(current_perm);
189  dump_words(current_perm, current_score, 2, improved);
190  if (current_score > best_score) {
191  best_perm.clear();
192  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
193  best_score = current_score;
194  improved = true;
195  }
196  if (current_score < PERFECT_WERDS)
197  transform_to_next_perm(current_perm);
198  }

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
int32_t  word_count,
PAGE_RES * page_res 
)

Definition at line 73 of file fixspace.cpp.

77  {
78  BLOCK_RES_IT block_res_it;
79  ROW_RES_IT row_res_it;
80  WERD_RES_IT word_res_it_from;
81  WERD_RES_IT word_res_it_to;
82  WERD_RES *word_res;
83  WERD_RES_LIST fuzzy_space_words;
84  int16_t new_length;
85  bool prevent_null_wd_fixsp; // DON'T process blobless wds
86  int32_t word_index; // current word
87 
88  block_res_it.set_to_list(&page_res->block_res_list);
89  word_index = 0;
90  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91  block_res_it.forward()) {
92  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94  row_res_it.forward()) {
95  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96  while (!word_res_it_from.at_last()) {
97  word_res = word_res_it_from.data();
98  while (!word_res_it_from.at_last() &&
99  !(word_res->combination ||
100  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
101  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
102  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
103  block_res_it.data()->block);
104  word_res = word_res_it_from.forward();
105  word_index++;
106  if (monitor != nullptr) {
107  monitor->ocr_alive = true;
108  monitor->progress = 90 + 5 * word_index / word_count;
109  if (monitor->deadline_exceeded() ||
110  (monitor->cancel != nullptr &&
111  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
112  return;
113  }
114  }
115 
116  if (!word_res_it_from.at_last()) {
117  word_res_it_to = word_res_it_from;
118  prevent_null_wd_fixsp =
119  word_res->word->cblob_list()->empty();
120  if (check_debug_pt(word_res, 60))
121  debug_fix_space_level.set_value(10);
122  word_res_it_to.forward();
123  word_index++;
124  if (monitor != nullptr) {
125  monitor->ocr_alive = true;
126  monitor->progress = 90 + 5 * word_index / word_count;
127  if (monitor->deadline_exceeded() ||
128  (monitor->cancel != nullptr &&
129  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
130  return;
131  }
132  while (!word_res_it_to.at_last () &&
133  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
134  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
135  if (check_debug_pt(word_res, 60))
136  debug_fix_space_level.set_value(10);
137  if (word_res->word->cblob_list()->empty())
138  prevent_null_wd_fixsp = true;
139  word_res = word_res_it_to.forward();
140  }
141  if (check_debug_pt(word_res, 60))
142  debug_fix_space_level.set_value(10);
143  if (word_res->word->cblob_list()->empty())
144  prevent_null_wd_fixsp = true;
145  if (prevent_null_wd_fixsp) {
146  word_res_it_from = word_res_it_to;
147  } else {
148  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
149  &word_res_it_to);
150  fix_fuzzy_space_list(fuzzy_space_words,
151  row_res_it.data()->row,
152  block_res_it.data()->block);
153  new_length = fuzzy_space_words.length();
154  word_res_it_from.add_list_before(&fuzzy_space_words);
155  for (;
156  !word_res_it_from.at_last() && new_length > 0;
157  new_length--) {
158  word_res_it_from.forward();
159  }
160  }
161  if (test_pt)
162  debug_fix_space_level.set_value(0);
163  }
164  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
165  block_res_it.data()->block);
166  // Last word in row
167  }
168  }

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW row,
BLOCK block 
)

Definition at line 595 of file fixspace.cpp.

597  {
598  int16_t best_score;
599  WERD_RES_IT best_perm_it(&best_perm);
600  WERD_RES_LIST current_perm;
601  WERD_RES_IT current_perm_it(&current_perm);
602  WERD_RES *old_word_res;
603  int16_t current_score;
604  bool improved = false;
605 
606  best_score = fp_eval_word_spacing(best_perm); // default score
607 
608  dump_words(best_perm, best_score, 1, improved);
609 
610  old_word_res = best_perm_it.data();
611  // Even deep_copy doesn't copy the underlying WERD unless its combination
612  // flag is true!.
613  old_word_res->combination = true; // Kludge to force deep copy
614  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
615  old_word_res->combination = false; // Undo kludge
616 
617  break_noisiest_blob_word(current_perm);
618 
619  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
620  match_current_words(current_perm, row, block);
621  current_score = fp_eval_word_spacing(current_perm);
622  dump_words(current_perm, current_score, 2, improved);
623  if (current_score > best_score) {
624  best_perm.clear();
625  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
626  best_score = current_score;
627  improved = true;
628  }
629  if (current_score < PERFECT_WERDS) {
630  break_noisiest_blob_word(current_perm);
631  }
632  }
633  dump_words(best_perm, best_score, 3, improved);

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT * page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces in between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1705 of file control.cpp.

1705  {
1706  WERD_RES *word_res = page_res_it->word();
1707  const WERD_CHOICE &word = *(word_res->best_choice);
1708 
1709  // Find the frequency of each unique character in the word.
1710  SortHelper<UNICHAR_ID> rep_ch(word.length());
1711  for (int i = 0; i < word.length(); ++i) {
1712  rep_ch.Add(word.unichar_id(i), 1);
1713  }
1714 
1715  // Find the most frequent result.
1716  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1717  int max_count = rep_ch.MaxCount(&maxch_id);
1718  // Find the best exemplar of a classifier result for maxch_id.
1719  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1720  if (best_choice == nullptr) {
1721  tprintf("Failed to find a choice for %s, occurring %d times\n",
1722  word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1723  return;
1724  }
1725  word_res->done = true;
1726 
1727  // Measure the mean space.
1728  int gap_count = 0;
1729  WERD* werd = word_res->word;
1730  C_BLOB_IT blob_it(werd->cblob_list());
1731  C_BLOB* prev_blob = blob_it.data();
1732  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1733  C_BLOB* blob = blob_it.data();
1734  int gap = blob->bounding_box().left();
1735  gap -= prev_blob->bounding_box().right();
1736  ++gap_count;
1737  prev_blob = blob;
1738  }
1739  // Just correct existing classification.
1740  CorrectRepcharChoices(best_choice, word_res);
1741  word_res->reject_map.initialise(word.length());
1742 }

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW row,
BLOCK block 
)

Definition at line 561 of file fixspace.cpp.

563  {
564  WERD_RES *word_res;
565  WERD_RES_LIST sub_word_list;
566  WERD_RES_IT sub_word_list_it(&sub_word_list);
567  int16_t blob_index;
568  int16_t new_length;
569  float junk;
570 
571  word_res = word_res_it.data();
572  if (word_res->word->flag(W_REP_CHAR) ||
573  word_res->combination ||
574  word_res->part_of_combo ||
575  !word_res->word->flag(W_DONT_CHOP))
576  return;
577 
578  blob_index = worst_noise_blob(word_res, &junk);
579  if (blob_index < 0)
580  return;
581 
582  if (debug_fix_space_level > 1) {
583  tprintf("FP fixspace working on \"%s\"\n",
584  word_res->best_choice->unichar_string().c_str());
585  }
586  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
587  sub_word_list_it.add_after_stay_put(word_res_it.extract());
588  fix_noisy_space_list(sub_word_list, row, block);
589  new_length = sub_word_list.length();
590  word_res_it.add_list_before(&sub_word_list);
591  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592  word_res_it.forward();
593  }

◆ fixspace_thinks_word_done()

bool tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES * word)

Definition at line 529 of file fixspace.cpp.

530  {
531  if (word->done)
532  return true;
533 
534  /*
535  Use all the standard pass 2 conditions for mode 5 in set_done() in
536  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
537  CARE WHETHER WE HAVE of/at on/an etc.
538  */
539  if (fixsp_done_mode > 0 &&
540  (word->tess_accepted ||
541  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
542  fixsp_done_mode == 3) &&
543  (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
544  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
545  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
546  (word->best_choice->permuter() == USER_DAWG_PERM) ||
547  (word->best_choice->permuter() == NUMBER_PERM))) {
548  return true;
549  } else {
550  return false;
551  }

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES word)

Definition at line 671 of file reject.cpp.

674  {
675  WERD_CHOICE *best_choice = word_res->best_choice;
676  int i;
677  TBOX out_box;
678 
679  if (!tessedit_flip_0O)
680  return;
681 
682  int num_blobs = word_res->rebuild_word->NumBlobs();
683  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
684  TBLOB* blob = word_res->rebuild_word->blobs[i];
685  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
687  out_box = blob->bounding_box();
688  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
689  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
690  return; //Beware words with sub/superscripts
691  }
692  }
693  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
694  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
695  if (unichar_0 == INVALID_UNICHAR_ID ||
696  !word_res->uch_set->get_enabled(unichar_0) ||
697  unichar_O == INVALID_UNICHAR_ID ||
698  !word_res->uch_set->get_enabled(unichar_O)) {
699  return; // 0 or O are not present/enabled in unicharset
700  }
701  for (i = 1; i < best_choice->length(); ++i) {
702  if (best_choice->unichar_id(i) == unichar_0 ||
703  best_choice->unichar_id(i) == unichar_O) {
704  /* A0A */
705  if ((i+1) < best_choice->length() &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708  best_choice->set_unichar_id(unichar_O, i);
709  }
710  /* A00A */
711  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712  (i+1) < best_choice->length() &&
713  (best_choice->unichar_id(i+1) == unichar_0 ||
714  best_choice->unichar_id(i+1) == unichar_O) &&
715  (i+2) < best_choice->length() &&
716  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717  best_choice->set_unichar_id(unichar_O, i);
718  i++;
719  }
720  /* AA0<non digit or end of word> */
721  if ((i > 1) &&
722  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724  (((i+1) < best_choice->length() &&
725  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
727  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
728  (i == best_choice->length() - 1))) {
729  best_choice->set_unichar_id(unichar_O, i);
730  }
731  /* 9O9 */
732  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733  (i+1) < best_choice->length() &&
734  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735  best_choice->set_unichar_id(unichar_0, i);
736  }
737  /* 9OOO */
738  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739  (i+2) < best_choice->length() &&
740  (best_choice->unichar_id(i+1) == unichar_0 ||
741  best_choice->unichar_id(i+1) == unichar_O) &&
742  (best_choice->unichar_id(i+2) == unichar_0 ||
743  best_choice->unichar_id(i+2) == unichar_O)) {
744  best_choice->set_unichar_id(unichar_0, i);
745  best_choice->set_unichar_id(unichar_0, i+1);
746  best_choice->set_unichar_id(unichar_0, i+2);
747  i += 2;
748  }
749  /* 9OO<non upper> */
750  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751  (i+2) < best_choice->length() &&
752  (best_choice->unichar_id(i+1) == unichar_0 ||
753  best_choice->unichar_id(i+1) == unichar_O) &&
754  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755  best_choice->set_unichar_id(unichar_0, i);
756  best_choice->set_unichar_id(unichar_0, i+1);
757  i++;
758  }
759  /* 9O<non upper> */
760  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761  (i+1) < best_choice->length() &&
762  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763  best_choice->set_unichar_id(unichar_0, i);
764  }
765  /* 9[.,]OOO.. */
766  if ((i > 1) &&
767  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
768  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
769  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770  best_choice->unichar_id(i-2) == unichar_O)) {
771  if (best_choice->unichar_id(i-2) == unichar_O) {
772  best_choice->set_unichar_id(unichar_0, i-2);
773  }
774  while (i < best_choice->length() &&
775  (best_choice->unichar_id(i) == unichar_O ||
776  best_choice->unichar_id(i) == unichar_0)) {
777  best_choice->set_unichar_id(unichar_0, i);
778  i++;
779  }
780  i--;
781  }

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES word)

Definition at line 614 of file reject.cpp.

617  {
618  WERD_CHOICE *best_choice = word_res->best_choice;
619  int i;
620  int prev_right = -9999;
621  int next_left;
622  TBOX out_box;
623  float aspect_ratio;
624 
626  return;
627 
628  int num_blobs = word_res->rebuild_word->NumBlobs();
629  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
630  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
631  TBLOB* blob = word_res->rebuild_word->blobs[i];
632  out_box = blob->bounding_box();
633  if (i + 1 == num_blobs)
634  next_left = 9999;
635  else
636  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
637  // Don't touch small or touching blobs - it is too dangerous.
638  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640  aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
641  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
642  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
643  word_res->uch_set->contains_unichar_id(unichar_dash) &&
644  word_res->uch_set->get_enabled(unichar_dash)) {
645  /* Certain HYPHEN */
646  best_choice->set_unichar_id(unichar_dash, i);
647  if (word_res->reject_map[i].rejected())
648  word_res->reject_map[i].setrej_hyphen_accept();
649  }
650  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
651  word_res->reject_map[i].accepted())
652  //Suspected HYPHEN
653  word_res->reject_map[i].setrej_hyphen ();
654  }
655  else if (best_choice->unichar_id(i) == unichar_dash) {
656  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
657  (word_res->reject_map[i].rejected()))
658  word_res->reject_map[i].setrej_hyphen_accept();
659  //Certain HYPHEN
660 
661  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
662  (word_res->reject_map[i].accepted()))
663  //Suspected HYPHEN
664  word_res->reject_map[i].setrej_hyphen();
665  }
666  }

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 2036 of file control.cpp.

2036  {
2037  PAGE_RES_IT page_res_it(page_res);
2038  WERD_RES *word; // current word
2039  STATS doc_fonts(0, font_table_size_); // font counters
2040 
2041  // Gather font id statistics.
2042  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2043  page_res_it.forward()) {
2044  word = page_res_it.word();
2045  if (word->fontinfo != nullptr) {
2046  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2047  }
2048  if (word->fontinfo2 != nullptr) {
2049  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2050  }
2051  }
2052  int16_t doc_font; // modal font
2053  int8_t doc_font_count; // modal font
2054  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2055  if (doc_font_count == 0)
2056  return;
2057  // Get the modal font pointer.
2058  const FontInfo* modal_font = nullptr;
2059  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2060  page_res_it.forward()) {
2061  word = page_res_it.word();
2062  if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2063  modal_font = word->fontinfo;
2064  break;
2065  }
2066  if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2067  modal_font = word->fontinfo2;
2068  break;
2069  }
2070  }
2071  ASSERT_HOST(modal_font != nullptr);
2072 
2073  // Assign modal font to weak words.
2074  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
2075  page_res_it.forward()) {
2076  word = page_res_it.word();
2077  const int length = word->best_choice->length();
2078 
2079  const int count = word->fontinfo_id_count;
2080  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2081  word->fontinfo = modal_font;
2082  // Counts only get 1 as it came from the doc.
2083  word->fontinfo_id_count = 1;
2084  }
2085  }
2086 }

◆ fp_eval_word_spacing()

int16_t tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 856 of file fixspace.cpp.

856  {
857 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
858  WERD_RES_IT word_it(&word_res_list);
859  WERD_RES *word;
860  int16_t score = 0;
861  int16_t i;
862  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
863 
864  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865  word = word_it.data();
866  if (word->rebuild_word == nullptr)
867  continue; // Can't handle cube words.
868  if (word->done ||
869  word->tess_accepted ||
870  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
871  word->best_choice->permuter() == FREQ_DAWG_PERM ||
872  word->best_choice->permuter() == USER_DAWG_PERM ||
873  safe_dict_word(word) > 0) {
874  int num_blobs = word->rebuild_word->NumBlobs();
875  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
876  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
877  TBLOB* blob = word->rebuild_word->blobs[i];
878  if (word->best_choice->unichar_id(i) == space ||
879  blob_noise_score(blob) < small_limit) {
880  score -= 1; // penalise possibly erroneous non-space
881  } else if (word->reject_map[i].accepted()) {
882  score++;
883  }
884  }
885  }
886  }
887  if (score < 0)
888  score = 0;
889  return score;

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES * word,
bool  ok_dict_word 
)

Definition at line 658 of file docqual.cpp.

660  {
661  enum STATES
662  {
663  JUNK,
664  FIRST_UPPER,
665  FIRST_LOWER,
666  FIRST_NUM,
667  SUBSEQUENT_UPPER,
668  SUBSEQUENT_LOWER,
669  SUBSEQUENT_NUM
670  };
671  const char *str = word->best_choice->unichar_string().c_str();
672  const char *lengths = word->best_choice->unichar_lengths().c_str();
673  STATES state = JUNK;
674  int len = 0;
675  int isolated_digits = 0;
676  int isolated_alphas = 0;
677  int bad_char_count = 0;
678  int tess_rejs = 0;
679  int dodgy_chars = 0;
680  int ok_chars;
681  UNICHAR_ID last_char = -1;
682  int alpha_repetition_count = 0;
683  int longest_alpha_repetition_count = 0;
684  int longest_lower_run_len = 0;
685  int lower_string_count = 0;
686  int longest_upper_run_len = 0;
687  int upper_string_count = 0;
688  int total_alpha_count = 0;
689  int total_digit_count = 0;
690 
691  for (; *str != '\0'; str += *(lengths++)) {
692  len++;
693  if (word->uch_set->get_isupper (str, *lengths)) {
694  total_alpha_count++;
695  switch (state) {
696  case SUBSEQUENT_UPPER:
697  case FIRST_UPPER:
698  state = SUBSEQUENT_UPPER;
699  upper_string_count++;
700  if (longest_upper_run_len < upper_string_count)
701  longest_upper_run_len = upper_string_count;
702  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
703  alpha_repetition_count++;
704  if (longest_alpha_repetition_count < alpha_repetition_count) {
705  longest_alpha_repetition_count = alpha_repetition_count;
706  }
707  }
708  else {
709  last_char = word->uch_set->unichar_to_id(str, *lengths);
710  alpha_repetition_count = 1;
711  }
712  break;
713  case FIRST_NUM:
714  isolated_digits++;
715  // Fall through.
716  default:
717  state = FIRST_UPPER;
718  last_char = word->uch_set->unichar_to_id(str, *lengths);
719  alpha_repetition_count = 1;
720  upper_string_count = 1;
721  break;
722  }
723  }
724  else if (word->uch_set->get_islower (str, *lengths)) {
725  total_alpha_count++;
726  switch (state) {
727  case SUBSEQUENT_LOWER:
728  case FIRST_LOWER:
729  state = SUBSEQUENT_LOWER;
730  lower_string_count++;
731  if (longest_lower_run_len < lower_string_count)
732  longest_lower_run_len = lower_string_count;
733  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
734  alpha_repetition_count++;
735  if (longest_alpha_repetition_count < alpha_repetition_count) {
736  longest_alpha_repetition_count = alpha_repetition_count;
737  }
738  }
739  else {
740  last_char = word->uch_set->unichar_to_id(str, *lengths);
741  alpha_repetition_count = 1;
742  }
743  break;
744  case FIRST_NUM:
745  isolated_digits++;
746  // Fall through.
747  default:
748  state = FIRST_LOWER;
749  last_char = word->uch_set->unichar_to_id(str, *lengths);
750  alpha_repetition_count = 1;
751  lower_string_count = 1;
752  break;
753  }
754  }
755  else if (word->uch_set->get_isdigit (str, *lengths)) {
756  total_digit_count++;
757  switch (state) {
758  case FIRST_NUM:
759  state = SUBSEQUENT_NUM;
760  case SUBSEQUENT_NUM:
761  break;
762  case FIRST_UPPER:
763  case FIRST_LOWER:
764  isolated_alphas++;
765  // Fall through.
766  default:
767  state = FIRST_NUM;
768  break;
769  }
770  }
771  else {
772  if (*lengths == 1 && *str == ' ')
773  tess_rejs++;
774  else
775  bad_char_count++;
776  switch (state) {
777  case FIRST_NUM:
778  isolated_digits++;
779  break;
780  case FIRST_UPPER:
781  case FIRST_LOWER:
782  isolated_alphas++;
783  default:
784  break;
785  }
786  state = JUNK;
787  }
788  }
789 
790  switch (state) {
791  case FIRST_NUM:
792  isolated_digits++;
793  break;
794  case FIRST_UPPER:
795  case FIRST_LOWER:
796  isolated_alphas++;
797  default:
798  break;
799  }
800 
802  total_alpha_count += total_digit_count - isolated_digits;
803  }
804 
805  if (crunch_leave_ok_strings && len >= 4 &&
806  2 * (total_alpha_count - isolated_alphas) > len &&
807  longest_alpha_repetition_count < crunch_long_repetitions) {
808  if ((crunch_accept_ok &&
809  acceptable_word_string(*word->uch_set, str, lengths) !=
810  AC_UNACCEPTABLE) ||
811  longest_lower_run_len > crunch_leave_lc_strings ||
812  longest_upper_run_len > crunch_leave_uc_strings)
813  return G_NEVER_CRUNCH;
814  }
815  if (word->reject_map.length() > 1 &&
816  strpbrk(str, " ") == nullptr &&
817  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
818  word->best_choice->permuter() == FREQ_DAWG_PERM ||
819  word->best_choice->permuter() == USER_DAWG_PERM ||
820  word->best_choice->permuter() == NUMBER_PERM ||
821  acceptable_word_string(*word->uch_set, str, lengths) !=
822  AC_UNACCEPTABLE || ok_dict_word))
823  return G_OK;
824 
825  ok_chars = len - bad_char_count - isolated_digits -
826  isolated_alphas - tess_rejs;
827 
828  if (crunch_debug > 3) {
829  tprintf("garbage_word: \"%s\"\n",
830  word->best_choice->unichar_string().c_str());
831  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
832  len,
833  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
834  }
835  if (bad_char_count == 0 &&
836  tess_rejs == 0 &&
837  (len > isolated_digits + isolated_alphas || len <= 2))
838  return G_OK;
839 
840  if (tess_rejs > ok_chars ||
841  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
842  return G_TERRIBLE;
843 
844  if (len > 4) {
845  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
846  isolated_alphas;
847  if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
848  return G_DODGY;
849  else
850  return G_OK;
851  } else {
852  dodgy_chars = 2 * tess_rejs + bad_char_count;
853  if ((len == 4 && dodgy_chars > 2) ||
854  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
855  return G_DODGY;
856  else
857  return G_OK;

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES * word)

Definition at line 251 of file output.cpp.

251  { // what char is repeated?
252  int i;
253  for (i = 0; ((i < word->reject_map.length()) &&
254  (word->reject_map[i].rejected())); ++i);
255 
256  if (i < word->reject_map.length()) {
257  return word->best_choice->unichar_id(i);
258  } else {
259  return word->uch_set->unichar_to_id(unrecognised_char.c_str());
260  }
261 }

◆ get_sub_lang()

Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 279 of file tesseractclass.h.

279  {
280  return sub_langs_[index];
281  }

◆ getDict()

Dict & tesseract::Tesseract::getDict ( )
overridevirtual

Reimplemented from tesseract::Classify.

Definition at line 564 of file tesseractclass.cpp.

564  {
565  if (0 == Classify::getDict().NumDawgs() && AnyLSTMLang()) {
566  if (lstm_recognizer_ && lstm_recognizer_->GetDict()) {
567  return *lstm_recognizer_->GetDict();
568  }
569  }
570  return Classify::getDict();
571 }

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData ( const TBOX line_box,
const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
int  start_box,
int  end_box,
const BLOCK block 
)

Definition at line 135 of file linerec.cpp.

139  {
140  TBOX revised_box;
141  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
142  &revised_box);
143  if (image_data == nullptr) return nullptr;
144  image_data->set_page_number(applybox_page);
145  // Copy the boxes and shift them so they are relative to the image.
146  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
147  ICOORD shift = -revised_box.botleft();
148  GenericVector<TBOX> line_boxes;
149  GenericVector<STRING> line_texts;
150  for (int b = start_box; b < end_box; ++b) {
151  TBOX box = boxes[b];
152  box.rotate(block_rotation);
153  box.move(shift);
154  line_boxes.push_back(box);
155  line_texts.push_back(texts[b]);
156  }
157  GenericVector<int> page_numbers;
158  page_numbers.init_to_size(line_boxes.size(), applybox_page);
159  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
160  return image_data;
161 }

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage ( const TBOX box,
const BLOCK block,
int  padding,
TBOX * revised_box 
) const

Definition at line 169 of file linerec.cpp.

170  {
171  TBOX wbox = box;
172  wbox.pad(padding, padding);
173  *revised_box = wbox;
174  // Number of clockwise 90 degree rotations needed to get back to tesseract
175  // coords from the clipped image.
176  int num_rotations = 0;
177  if (block.re_rotation().y() > 0.0f)
178  num_rotations = 1;
179  else if (block.re_rotation().x() < 0.0f)
180  num_rotations = 2;
181  else if (block.re_rotation().y() < 0.0f)
182  num_rotations = 3;
183  // Handle two cases automatically: 1 the box came from the block, 2 the box
184  // came from a box file, and refers to the image, which the block may not.
185  if (block.pdblk.bounding_box().major_overlap(*revised_box))
186  revised_box->rotate(block.re_rotation());
187  // Now revised_box always refers to the image.
188  // BestPix is never colormapped, but may be of any depth.
189  Pix* pix = BestPix();
190  int width = pixGetWidth(pix);
191  int height = pixGetHeight(pix);
192  TBOX image_box(0, 0, width, height);
193  // Clip to image bounds;
194  *revised_box &= image_box;
195  if (revised_box->null_box()) return nullptr;
196  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
197  revised_box->width(), revised_box->height());
198  Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
199  if (box_pix == nullptr) return nullptr;
200  boxDestroy(&clip_box);
201  if (num_rotations > 0) {
202  Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
203  pixDestroy(&box_pix);
204  box_pix = rot_pix;
205  }
206  // Convert sub-8-bit images to 8 bit.
207  int depth = pixGetDepth(box_pix);
208  if (depth < 8) {
209  Pix* grey;
210  grey = pixConvertTo8(box_pix, false);
211  pixDestroy(&box_pix);
212  box_pix = grey;
213  }
214  bool vertical_text = false;
215  if (num_rotations > 0) {
216  // Rotated the clipped revised box back to internal coordinates.
217  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
218  revised_box->rotate(rotation);
219  if (num_rotations != 2)
220  vertical_text = true;
221  }
222  return new ImageData(vertical_text, box_pix);
223 }

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES * word,
int *  num_rebuilt_leading,
ScriptPos * leading_pos,
float *  leading_certainty,
int *  num_rebuilt_trailing,
ScriptPos * trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging)
[out] leading_certainty: the worst certainty in the leading blobs.
[out] num_rebuilt_trailing: the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging)
[out] trailing_certainty: the worst certainty in the trailing blobs.
[out] avg_certainty: the average certainty of "normal" blobs in the word.
[out] unlikely_threshold: the threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 252 of file superscript.cpp.

261  {
262  *avg_certainty = *unlikely_threshold = 0.0f;
263  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
264  *leading_certainty = *trailing_certainty = 0.0f;
265 
266  int super_y_bottom =
268  int sub_y_top =
270 
271  // Step one: Get an average certainty for "normally placed" characters.
272 
273  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
274  *leading_pos = *trailing_pos = SP_NORMAL;
275  int leading_outliers = 0;
276  int trailing_outliers = 0;
277  int num_normal = 0;
278  float normal_certainty_total = 0.0f;
279  float worst_normal_certainty = 0.0f;
280  ScriptPos last_pos = SP_NORMAL;
281  int num_blobs = word->rebuild_word->NumBlobs();
282  for (int b = 0; b < num_blobs; ++b) {
283  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
284  ScriptPos pos = SP_NORMAL;
285  if (box.bottom() >= super_y_bottom) {
286  pos = SP_SUPERSCRIPT;
287  } else if (box.top() <= sub_y_top) {
288  pos = SP_SUBSCRIPT;
289  }
290  if (pos == SP_NORMAL) {
291  if (word->best_choice->unichar_id(b) != 0) {
292  float char_certainty = word->best_choice->certainty(b);
293  if (char_certainty < worst_normal_certainty) {
294  worst_normal_certainty = char_certainty;
295  }
296  num_normal++;
297  normal_certainty_total += char_certainty;
298  }
299  if (trailing_outliers == b) {
300  leading_outliers = trailing_outliers;
301  *leading_pos = last_pos;
302  }
303  trailing_outliers = 0;
304  } else {
305  if (last_pos == pos) {
306  trailing_outliers++;
307  } else {
308  trailing_outliers = 1;
309  }
310  }
311  last_pos = pos;
312  }
313  *trailing_pos = last_pos;
314  if (num_normal >= 3) { // throw out the worst as an outlier.
315  num_normal--;
316  normal_certainty_total -= worst_normal_certainty;
317  }
318  if (num_normal > 0) {
319  *avg_certainty = normal_certainty_total / num_normal;
320  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
321  }
322  if (num_normal == 0 ||
323  (leading_outliers == 0 && trailing_outliers == 0)) {
324  return;
325  }
326 
327  // Step two: Try to split off bits of the word that are both outliers
328  // and have much lower certainty than average
329  // Calculate num_leading and leading_certainty.
330  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
331  *num_rebuilt_leading < leading_outliers;
332  (*num_rebuilt_leading)++) {
333  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
334  if (char_certainty > *unlikely_threshold) {
335  break;
336  }
337  if (char_certainty < *leading_certainty) {
338  *leading_certainty = char_certainty;
339  }
340  }
341 
342  // Calculate num_trailing and trailing_certainty.
343  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
344  *num_rebuilt_trailing < trailing_outliers;
345  (*num_rebuilt_trailing)++) {
346  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
347  float char_certainty = word->best_choice->certainty(blob_idx);
348  if (char_certainty > *unlikely_threshold) {
349  break;
350  }
351  if (char_certainty < *trailing_certainty) {
352  *trailing_certainty = char_certainty;
353  }
354  }

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 253 of file tesseractclass.h.

253  {
254  return pixGetHeight(pix_binary_);
255  }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 250 of file tesseractclass.h.

250  {
251  return pixGetWidth(pix_binary_);
252  }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 36 of file recogtraining.cpp.

36  {
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40  // Explore all segmentations.
42  }
43 
44  STRING output_fname = fname;
45  const char* lastdot = strrchr(output_fname.c_str(), '.');
46  if (lastdot != nullptr)
47  output_fname[lastdot - output_fname.c_str()] = '\0';
48  output_fname += ".txt";
49  FILE* output_file = fopen(output_fname.c_str(), "a+");
50  if (output_file == nullptr) {
51  tprintf("Error: Could not open file %s\n", output_fname.c_str());
52  ASSERT_HOST(output_file);
53  }
54  return output_file;
55 }

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr
)

Definition at line 302 of file tessedit.cpp.

303  {
304  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
305  const char* lang_str = langs_to_load[lang_index].c_str();
306  Tesseract* tess_to_init;
307  if (!loaded_primary) {
308  tess_to_init = this;
309  } else {
310  tess_to_init = new Tesseract;
311  }
312 
313  int result = tess_to_init->init_tesseract_internal(
314  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
315  vars_values, set_only_non_debug_params, mgr);
316  // Forget that language, but keep any reader we were given.
317  mgr->Clear();
318 
319  if (!loaded_primary) {
320  if (result < 0) {
321  tprintf("Failed loading language '%s'\n", lang_str);
322  } else {
323  ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(),
324  &langs_to_load, &langs_not_to_load);
325  loaded_primary = true;
326  }
327  } else {
328  if (result < 0) {
329  tprintf("Failed loading language '%s'\n", lang_str);
330  delete tess_to_init;
331  } else {
332  sub_langs_.push_back(tess_to_init);
333  // Add any languages that this language requires
334  ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(),
335  &langs_to_load, &langs_not_to_load);
336  }
337  }
338  }
339  }
340  if (!loaded_primary) {
341  tprintf("Tesseract couldn't load any languages!\n");
342  return -1; // Couldn't load any language!
343  }
344 #ifndef DISABLED_LEGACY_ENGINE
345  if (!sub_langs_.empty()) {
346  // In multilingual mode word ratings have to be directly comparable,
347  // so use the same language model weights for all languages:
348  // use the primary language's params model if
349  // tessedit_use_primary_params_model is set,
350  // otherwise use default language model weights.
352  for (int s = 0; s < sub_langs_.size(); ++s) {
353  sub_langs_[s]->language_model_->getParamsModel().Copy(
354  this->language_model_->getParamsModel());
355  }
356  tprintf("Using params model of the primary language\n");
357  } else {
358  this->language_model_->getParamsModel().Clear();
359  for (int s = 0; s < sub_langs_.size(); ++s) {
360  sub_langs_[s]->language_model_->getParamsModel().Clear();
361  }
362  }
363  }
364 
366 #endif // ndef DISABLED_LEGACY_ENGINE
367  return 0;
368 }
369 
370 // Common initialization for a single language.
371 // arg0 is the datapath for the tessdata directory, which could be the
372 // path of the tessdata directory with no trailing /, or (if tessdata
373 // lives in the same directory as the executable) the path of the executable,
374 // hence the name arg0.
375 // textbase is an optional output file basename (used only for training)
376 // language is the language code to load.
377 // oem controls which engine(s) will operate on the image
378 // configs (argv) is an array of config filenames to load variables from.
379 // May be nullptr.
380 // configs_size (argc) is the number of elements in configs.
381 // vars_vec is an optional vector of variables to set.
382 // vars_values is an optional corresponding vector of values for the variables
383 // in vars_vec.
384 // If set_only_init_params is true, then only the initialization variables

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 510 of file tesseractclass.h.

511  {
512  TessdataManager mgr;
513  return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr,
514  nullptr, false, &mgr);
515  }

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr
)

Definition at line 402 of file tessedit.cpp.

404  : nullptr,
405  init_tesseract ? mgr : nullptr);
406  return 0; // Normal exit
407 }
408 
409 #ifndef DISABLED_LEGACY_ENGINE
410 
411 // Helper builds the all_fonts table by adding new fonts from new_fonts.
412 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
413  UnicityTable<FontInfo>* all_fonts) {
414  for (int i = 0; i < new_fonts.size(); ++i) {
415  // UnicityTable uniques as we go.
416  all_fonts->push_back(new_fonts.get(i));
417  }
418 }
419 
420 // Helper assigns an id to lang_fonts using the index in all_fonts table.
421 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
422  UnicityTable<FontInfo>* lang_fonts) {
423  for (int i = 0; i < lang_fonts->size(); ++i) {

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager *  mgr
)

Definition at line 95 of file tessedit.cpp.

96  {
97  tprintf("Error opening data file %s\n", tessdata_path.c_str());
98  tprintf(
99  "Please make sure the TESSDATA_PREFIX environment variable is set"
100  " to your \"tessdata\" directory.\n");
101  return false;
102  }
103 #ifndef DISABLED_LEGACY_ENGINE
104  if (oem == OEM_DEFAULT) {
105  // Set the engine mode from availability, which can then be overridden by
106  // the config file when we read it below.
107  if (!mgr->IsLSTMAvailable()) {
109  } else if (!mgr->IsBaseAvailable()) {
111  } else {
113  }
114  }
115 #endif // ndef DISABLED_LEGACY_ENGINE
116 
117  // If a language specific config file (lang.config) exists, load it in.
118  TFile fp;
119  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
121  this->params());
122  }
123 
124  SetParamConstraint set_params_constraint =
125  set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
127  // Load tesseract variables from config files. This is done after loading
128  // language-specific variables from [lang].traineddata file, so that custom
129  // config files can override values in [lang].traineddata file.
130  for (int i = 0; i < configs_size; ++i) {
131  read_config_file(configs[i], set_params_constraint);
132  }
133 
134  // Set params specified in vars_vec (done after setting params from config
135  // files, so that params in vars_vec can override those from files).
136  if (vars_vec != nullptr && vars_values != nullptr) {
137  for (int i = 0; i < vars_vec->size(); ++i) {
138  if (!ParamUtils::SetParam((*vars_vec)[i].c_str(),
139  (*vars_values)[i].c_str(),
140  set_params_constraint, this->params())) {
141  tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
142  }
143  }
144  }
145 
146  if (!tessedit_write_params_to_file.empty()) {
147  FILE* params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
148  if (params_file != nullptr) {
149  ParamUtils::PrintParams(params_file, this->params());
150  fclose(params_file);
151  } else {
152  tprintf("Failed to open %s for writing params.\n",
154  }
155  }
156 
157  // Determine which ocr engine(s) should be loaded and used for recognition.
158  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
159 
160  // If we are only loading the config file (and so not planning on doing any
161 // recognition) then there's nothing else to do here.
163  return true;
164  }
165 
166 // The various OcrEngineMode settings (see tesseract/publictypes.h) determine which
167 // engine-specific data files need to be loaded.
168 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
169 #ifndef ANDROID_BUILD
170 # ifdef DISABLED_LEGACY_ENGINE
172 # else
175 # endif // ndef DISABLED_LEGACY_ENGINE
176  if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
177  lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix);
178  ASSERT_HOST(lstm_recognizer_->Load(
179  this->params(), lstm_use_matrix ? language : nullptr, mgr));
180  } else {
181  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
183  }
184  }
185 #endif // ndef ANDROID_BUILD
186 
187  // Load the unicharset
189  // Avoid requiring a unicharset when we aren't running base tesseract.
190 #ifndef ANDROID_BUILD
191  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
192 #endif // ndef ANDROID_BUILD
193  }
194 #ifndef DISABLED_LEGACY_ENGINE
195  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
196  !unicharset.load_from_file(&fp, false)) {
197  tprintf("Error: Tesseract (legacy) engine requested, but components are "
198  "not present in %s!!\n", tessdata_path.c_str());
199  return false;
200  }
201 #endif // ndef DISABLED_LEGACY_ENGINE
202  if (unicharset.size() > MAX_NUM_CLASSES) {
203  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
204  return false;
205  }
206  right_to_left_ = unicharset.major_right_to_left();
207 
208 #ifndef DISABLED_LEGACY_ENGINE
209 
210  // Setup initial unichar ambigs table and read universal ambigs.
211  UNICHARSET encoder_unicharset;
212  encoder_unicharset.CopyFrom(unicharset);
214  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
215 
216  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
217  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
220  }
221 
222  // Init ParamsModel.
223  // Load pass1 and pass2 weights (for now these two sets are the same, but in
224  // the future separate sets of weights can be generated).
226  ++p) {
227  language_model_->getParamsModel().SetPass(
228  static_cast<ParamsModel::PassEnum>(p));
229  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
230  if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
231  return false;
232  }
233  }
234  }
235 #endif // ndef DISABLED_LEGACY_ENGINE
236 
237  return true;
238 }
239 
240 // Helper returns true if the given string is in the vector of strings.
241 static bool IsStrInList(const STRING& str,
242  const GenericVector<STRING>& str_list) {
243  for (int i = 0; i < str_list.size(); ++i) {
244  if (str_list[i] == str) return true;
245  }
246  return false;
247 }
248 
249 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
250 // Langs with no prefix get appended to to_load, provided they
251 // are not in there already.
252 // Langs with ~ prefix get appended to not_to_load, provided they are not in
253 // there already.
254 void Tesseract::ParseLanguageString(const char* lang_str,

◆ init_tesseract_lm()

int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language,
TessdataManager *  mgr
)

Definition at line 469 of file tessedit.cpp.

470  {
472  RECOG_WERDS,
473  RECOG_PSEUDO,
475 };
476 } // namespace tesseract

◆ join_words()

void tesseract::Tesseract::join_words ( WERD_RES *  word,
WERD_RES *  word2,
BlamerBundle *  orig_bb
) const

Definition at line 231 of file tfacepp.cpp.

235  {
236  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
237  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
238  // Tack the word2 outputs onto the end of the word outputs.
239  word->chopped_word->blobs += word2->chopped_word->blobs;
240  word->rebuild_word->blobs += word2->rebuild_word->blobs;
241  word2->chopped_word->blobs.clear();
242  word2->rebuild_word->blobs.clear();
243  TPOINT split_pt;
244  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
245  split_pt.y = (prev_box.top() + prev_box.bottom() +
246  blob_box.top() + blob_box.bottom()) / 4;
247  // Move the word2 seams onto the end of the word1 seam_array.
248  // Since the seam list is one element short, an empty seam marking the
249  // end of the last blob in the first word is needed first.
250  word->seam_array.push_back(new SEAM(0.0f, split_pt));
251  word->seam_array += word2->seam_array;
252  word2->seam_array.truncate(0);
253  // Fix widths and gaps.
254  word->blob_widths += word2->blob_widths;
255  word->blob_gaps += word2->blob_gaps;
256  // Fix the ratings matrix.
257  int rat1 = word->ratings->dimension();
258  int rat2 = word2->ratings->dimension();
259  word->ratings->AttachOnCorner(word2->ratings);
260  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
261  word->best_state += word2->best_state;
262  // Append the word choices.
263  *word->raw_choice += *word2->raw_choice;
264 
265  // How many alt choices from each should we try to get?
266  const int kAltsPerPiece = 2;
267  // When do we start throwing away extra alt choices?
268  const int kTooManyAltChoices = 100;
269 
270  // Construct the cartesian product of the best_choices of word(1) and word2.
271  WERD_CHOICE_LIST joined_choices;
272  WERD_CHOICE_IT jc_it(&joined_choices);
273  WERD_CHOICE_IT bc1_it(&word->best_choices);
274  WERD_CHOICE_IT bc2_it(&word2->best_choices);
275  int num_word1_choices = word->best_choices.length();
276  int total_joined_choices = num_word1_choices;
277  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
278  // word2 choices, and put them in the joined_choices list. The 1st word2
279  // choice gets added to the original word1 choices in-place after we have
280  // finished with them.
281  int bc2_index = 1;
282  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
283  if (total_joined_choices >= kTooManyAltChoices &&
284  bc2_index > kAltsPerPiece)
285  break;
286  int bc1_index = 0;
287  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
288  ++bc1_index, bc1_it.forward()) {
289  if (total_joined_choices >= kTooManyAltChoices &&
290  bc1_index > kAltsPerPiece)
291  break;
292  auto *wc = new WERD_CHOICE(*bc1_it.data());
293  *wc += *bc2_it.data();
294  jc_it.add_after_then_move(wc);
295  ++total_joined_choices;
296  }
297  }
298  // Now that we've filled in as many alternates as we want, paste the best
299  // choice for word2 onto the original word alt_choices.
300  bc1_it.move_to_first();
301  bc2_it.move_to_first();
302  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
303  *bc1_it.data() += *bc2_it.data();
304  }
305  bc1_it.move_to_last();
306  bc1_it.add_list_after(&joined_choices);
307 
308  // Restore the pointer to original blamer bundle and combine blamer
309  // information recorded in the splits.
310  if (orig_bb != nullptr) {
311  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
313  delete word->blamer_bundle;
314  word->blamer_bundle = orig_bb;
315  }
316  word->SetupBoxWord();
317  word->reject_map.initialise(word->box_word->length());

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK &  block,
ROW *  row,
WERD_RES *  word,
PointerVector< WERD_RES > *  words 
)

Definition at line 228 of file linerec.cpp.

229  {
230  TBOX word_box = word->word->bounding_box();
231  // Get the word image - no frills.
234  // In single word mode, use the whole image without any other row/word
235  // interpretation.
236  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
237  } else {
238  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
239  if (baseline + row->descenders() < word_box.bottom())
240  word_box.set_bottom(baseline + row->descenders());
241  if (baseline + row->x_height() + row->ascenders() > word_box.top())
242  word_box.set_top(baseline + row->x_height() + row->ascenders());
243  }
244  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
245  if (im_data == nullptr) return;
246 
247  bool do_invert = tessedit_do_invert;
248  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
250  word_box, words, lstm_choice_mode,
252  delete im_data;
253  SearchWords(words);
254 }

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map ( WERD_RES *  word,
ROW *  row,
int16_t  pass 
)

◆ match_current_words()

void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW *  row,
BLOCK *  block
)

Definition at line 222 of file fixspace.cpp.

222  {
223 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
224  BLOCK* block) {
225  WERD_RES_IT word_it(&words);
226  WERD_RES *word;
227  // Since we are not using PAGE_RES to iterate over words, we need to update
228  // prev_word_best_choice_ before calling classify_word_pass2().
229  prev_word_best_choice_ = nullptr;
230  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231  word = word_it.data();
232  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
233  WordData word_data(block, row, word);
234  SetupWordPassN(2, &word_data);
235  classify_word_and_language(2, nullptr, &word_data);
236  }
238  }

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES *  word,
ROW *  row,
BLOCK *  block
)

match_word_pass_n

Baseline normalize the word and pass it to Tess.

Definition at line 1629 of file control.cpp.

1630  {
1631  if (word->tess_failed) return;
1632  tess_segment_pass_n(pass_n, word);
1633 
1634  if (!word->tess_failed) {
1635  if (!word->word->flag (W_REP_CHAR)) {
1636  word->fix_quotes();
1638  word->fix_hyphens();
1639  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1640  if (word->best_choice->length() != word->box_word->length()) {
1641  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1642  " #Blobs=%d\n",
1643  word->best_choice->debug_string().c_str(),
1644  word->best_choice->length(),
1645  word->box_word->length());
1646 
1647  }
1648  word->tess_accepted = tess_acceptable_word(word);
1649 
1650  // Also sets word->done flag
1651  make_reject_map(word, row, pass_n);
1652  }
1653  }
1654  set_word_fonts(word);
1655 
1656  ASSERT_HOST(word->raw_choice != nullptr);
1657 }

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK *  block,
ROW *  row,
WERD_RES *  word_res
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 242 of file applybox.cpp.

245  {
246  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
247  tessedit_ocr_engine_mode, nullptr,
251  row, block)) {
252  word_res->CloneChoppedToRebuild();
253  return;
254  }
255  if (chop_debug) {
256  tprintf("Maximally chopping word at:");
257  word_res->word->bounding_box().print();
258  }
259  GenericVector<BLOB_CHOICE*> blob_choices;
260  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
261  auto rating = static_cast<float>(INT8_MAX);
262  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
263  // The rating and certainty are not quite arbitrary. Since
264  // select_blob_to_chop uses the worst certainty to choose, they all have
265  // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
266  // in here, and then divide by e each time they are chopped, which
267  // should guarantee a set of unequal values for the whole tree of blobs
268  // produced, however much chopping is required. The chops are thus only
269  // limited by the ability of the chopper to find suitable chop points,
270  // and not by the value of the certainties.
271  auto* choice =
272  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
273  blob_choices.push_back(choice);
274  rating -= 0.125f;
275  }
276  const double e = exp(1.0); // The base of natural logs.
277  int blob_number;
278  int right_chop_index = 0;
280  // We only chop if the language is not fixed pitch like CJK.
281  SEAM* seam = nullptr;
282  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
283  &blob_number)) != nullptr) {
284  word_res->InsertSeam(blob_number, seam);
285  BLOB_CHOICE* left_choice = blob_choices[blob_number];
286  rating = left_choice->rating() / e;
287  left_choice->set_rating(rating);
288  left_choice->set_certainty(-rating);
289  // combine confidence w/ serial #
290  auto* right_choice = new BLOB_CHOICE(++right_chop_index,
291  rating - 0.125f, -rating, -1,
292  0.0f, 0.0f, 0.0f, BCC_FAKE);
293  blob_choices.insert(right_choice, blob_number + 1);
294  }
295  }
296  word_res->CloneChoppedToRebuild();
297  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);

◆ mutable_pix_binary()

Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 196 of file tesseractclass.h.

196  {
197  pixDestroy(&pix_binary_);
198  return &pix_binary_;
199  }

◆ mutable_textord()

Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 269 of file tesseractclass.h.

269  {
270  return &textord_;
271  }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)

◆ noise_outlines()

bool tesseract::Tesseract::noise_outlines ( TWERD *  word)

Definition at line 958 of file docqual.cpp.

961  {
962  TBOX box; // BB of outline
963  int16_t outline_count = 0;
964  int16_t small_outline_count = 0;
965  int16_t max_dimension;
966  float small_limit = kBlnXHeight * crunch_small_outlines_size;
967 
968  for (int b = 0; b < word->NumBlobs(); ++b) {
969  TBLOB* blob = word->blobs[b];
970  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
971  outline_count++;
972  box = ol->bounding_box();
973  if (box.height() > box.width())
974  max_dimension = box.height();
975  else
976  max_dimension = box.width();
977  if (max_dimension < small_limit)
978  small_outline_count++;
979  }

◆ non_0_digit()

bool tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 787 of file reject.cpp.

◆ non_O_upper()

bool tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 783 of file reject.cpp.

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 276 of file tesseractclass.h.

276  {
277  return sub_langs_.size();
278  }

◆ one_ell_conflict()

bool tesseract::Tesseract::one_ell_conflict ( WERD_RES *  word_res,
bool  update_map 
)

Definition at line 291 of file reject.cpp.

293  {
294  const char *word;
295  const char *lengths;
296  int16_t word_len; //its length
297  int16_t first_alphanum_index_;
298  int16_t first_alphanum_offset_;
299  int16_t i;
300  int16_t offset;
301  bool non_conflict_set_char; //non conf set a/n?
302  bool conflict = false;
303  bool allow_1s;
304  ACCEPTABLE_WERD_TYPE word_type;
305  bool dict_perm_type;
306  bool dict_word_ok;
307  int dict_word_type;
308 
309  word = word_res->best_choice->unichar_string().c_str();
310  lengths = word_res->best_choice->unichar_lengths().c_str();
311  word_len = strlen(lengths);
312  /*
313  If there are no occurrences of the conflict set characters then the word
314  is OK.
315  */
316  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr)
317  return false;
318 
319  /*
320  There is a conflict if there are NO other (confirmed) alphanumerics apart
321  from those in the conflict set.
322  */
323 
324  for (i = 0, offset = 0, non_conflict_set_char = false;
325  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
326  non_conflict_set_char =
327  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
328  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
329  !STRING (conflict_set_I_l_1).contains (word[offset]);
330  if (!non_conflict_set_char) {
331  if (update_map)
332  reject_I_1_L(word_res);
333  return true;
334  }
335 
336  /*
337  If the word is accepted by a dawg permuter, and the first alpha character
338  is "I" or "l", check to see if the alternative is also a dawg word. If it
339  is, then there is a potential error otherwise the word is ok.
340  */
341 
342  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
343  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
345  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
346  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
347  dict_word_type = dict_word(*(word_res->best_choice));
348  dict_word_ok = (dict_word_type > 0) &&
349  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
350 
351  if ((rej_1Il_use_dict_word && dict_word_ok) ||
352  (rej_1Il_trust_permuter_type && dict_perm_type) ||
353  (dict_perm_type && dict_word_ok)) {
354  first_alphanum_index_ = first_alphanum_index (word, lengths);
355  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
356  if (lengths[first_alphanum_index_] == 1 &&
357  word[first_alphanum_offset_] == 'I') {
358  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
359  if (safe_dict_word(word_res) > 0) {
360  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
361  if (update_map)
362  word_res->reject_map[first_alphanum_index_].
363  setrej_1Il_conflict();
364  return true;
365  }
366  else {
367  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
368  return false;
369  }
370  }
371 
372  if (lengths[first_alphanum_index_] == 1 &&
373  word[first_alphanum_offset_] == 'l') {
374  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
375  if (safe_dict_word(word_res) > 0) {
376  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
377  if (update_map)
378  word_res->reject_map[first_alphanum_index_].
379  setrej_1Il_conflict();
380  return true;
381  }
382  else {
383  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
384  return false;
385  }
386  }
387  return false;
388  }
389 
390  /*
391  NEW 1Il code. The old code relied on permuter types too much. In fact,
392  tess will use TOP_CHOICE permute for good things like "palette".
393  In this code the string is examined independently to see if it looks like
394  a well formed word.
395  */
396 
397  /*
398  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
399  dictionary word.
400  */
401  first_alphanum_index_ = first_alphanum_index (word, lengths);
402  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
403  if (lengths[first_alphanum_index_] == 1 &&
404  word[first_alphanum_offset_] == 'l') {
405  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
406  if (safe_dict_word(word_res) > 0)
407  return false;
408  else
409  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
410  }
411  else if (lengths[first_alphanum_index_] == 1 &&
412  word[first_alphanum_offset_] == 'I') {
413  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
414  if (safe_dict_word(word_res) > 0)
415  return false;
416  else
417  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
418  }
419  /*
420  For strings containing digits:
421  If there are no alphas OR the numeric permuter liked the word,
422  reject any non 1 conflict chs
423  Else reject all conflict chs
424  */
425  if (word_contains_non_1_digit (word, lengths)) {
426  allow_1s = (alpha_count (word, lengths) == 0) ||
427  (word_res->best_choice->permuter () == NUMBER_PERM);
428 
429  int16_t offset;
430  conflict = false;
431  for (i = 0, offset = 0; word[offset] != '\0';
432  offset += word_res->best_choice->unichar_lengths()[i++]) {
433  if ((!allow_1s || (word[offset] != '1')) &&
434  STRING (conflict_set_I_l_1).contains (word[offset])) {
435  if (update_map)
436  word_res->reject_map[i].setrej_1Il_conflict ();
437  conflict = true;
438  }
439  }
440  return conflict;
441  }
442  /*
443  For anything else. See if it conforms to an acceptable word type. If so,
444  treat accordingly.
445  */
446  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
447  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
448  first_alphanum_index_ = first_alphanum_index (word, lengths);
449  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
450  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
451  if (update_map)
452  word_res->reject_map[first_alphanum_index_].
453  setrej_1Il_conflict ();
454  return true;
455  }
456  else
457  return false;
458  }
459  else if (word_type == AC_UPPER_CASE) {
460  return false;
461  }
462  else {
463  if (update_map)
464  reject_I_1_L(word_res);
465  return true;

◆ output_pass()

void tesseract::Tesseract::output_pass ( PAGE_RES_IT &  page_res_it,
const TBOX *  target_word_box 
)

Definition at line 35 of file output.cpp.

35  {
36 void Tesseract::output_pass( //Tess output pass //send to api
37  PAGE_RES_IT &page_res_it,
38  const TBOX *target_word_box) {
39  BLOCK_RES *block_of_last_word;
40  bool force_eol; //During output
41  BLOCK *nextblock; //block of next word
42  WERD *nextword; //next word
43 
44  page_res_it.restart_page ();
45  block_of_last_word = nullptr;
46  while (page_res_it.word () != nullptr) {
47  check_debug_pt (page_res_it.word (), 120);
48 
49  if (target_word_box) {
50  TBOX current_word_box = page_res_it.word()->word->bounding_box();
51  FCOORD center_pt(
52  (current_word_box.right() + current_word_box.left()) / 2,
53  (current_word_box.bottom() + current_word_box.top()) / 2);
54  if (!target_word_box->contains(center_pt)) {
55  page_res_it.forward();
56  continue;
57  }
58  }
60  block_of_last_word != page_res_it.block ()) {
61  block_of_last_word = page_res_it.block ();
62  }
63 
64  force_eol = (tessedit_write_block_separators &&
65  (page_res_it.block () != page_res_it.next_block ())) ||
66  (page_res_it.next_word () == nullptr);
67 
68  if (page_res_it.next_word () != nullptr)
69  nextword = page_res_it.next_word ()->word;
70  else
71  nextword = nullptr;
72  if (page_res_it.next_block () != nullptr)
73  nextblock = page_res_it.next_block ()->block;
74  else
75  nextblock = nullptr;
76  //regardless of tilde crunching
77  write_results(page_res_it,
78  determine_newline_type(page_res_it.word()->word,
79  page_res_it.block()->block,
80  nextword, nextblock), force_eol);
81  page_res_it.forward();
82  }

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 270 of file tessedit.cpp.

276  {
277  target->push_back(lang_code);
278  }
279  }
280 }
281 
282 // Initialize for potentially a set of languages defined by the language
283 // string and recursively any additional languages required by any language
284 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
285 // See init_tesseract_internal for args.
286 int Tesseract::init_tesseract(const char* arg0, const char* textbase,
287  const char* language, OcrEngineMode oem,
288  char** configs, int configs_size,
289  const GenericVector<STRING>* vars_vec,
290  const GenericVector<STRING>* vars_values,
291  bool set_only_non_debug_params,
292  TessdataManager* mgr) {
293  GenericVector<STRING> langs_to_load;
294  GenericVector<STRING> langs_not_to_load;
295  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296 

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES *  page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 378 of file pgedit.cpp.

378  {
379  current_page_res = page_res;
380  if (current_page_res->block_res_list.empty())
381  return;
382 
383  recog_done = false;
384  stillRunning = true;
385 
386  build_image_window(width, height);
387  word_display_mode.turn_on_bit(DF_EDGE_STEP);
389 #ifndef GRAPHICS_DISABLED
390  pe = new ParamsEditor(this, image_win);
391 #endif
392  PGEventHandler pgEventHandler(this);
393 
394  image_win->AddEventHandler(&pgEventHandler);
395  image_win->AddMessageBox();
396 
397  SVMenuNode* svMenuRoot = build_menu_new();
398 
399  svMenuRoot->BuildMenu(image_win);
400  image_win->SetVisible(true);
401 
402  image_win->AwaitEvent(SVET_DESTROY);
403  image_win->AddEventHandler(nullptr);
404 }

◆ pix_binary()

Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 200 of file tesseractclass.h.

200  {
201  return pix_binary_;
202  }

◆ pix_grey()

Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 203 of file tesseractclass.h.

203  {
204  return pix_grey_;
205  }

◆ pix_original()

Pix* tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 210 of file tesseractclass.h.

210  {
211  return pix_original_;
212  }

◆ potential_word_crunch()

bool tesseract::Tesseract::potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
bool  ok_dict_word 
)

Definition at line 520 of file docqual.cpp.

524  {
525  float rating_per_ch;
526  int adjusted_len;
527  const char *str = word->best_choice->unichar_string().c_str();
528  const char *lengths = word->best_choice->unichar_lengths().c_str();
529  bool word_crunchable;
530  int poor_indicator_count = 0;
531 
532  word_crunchable = !crunch_leave_accept_strings ||
533  word->reject_map.length() < 3 ||
535  str, lengths) == AC_UNACCEPTABLE &&
536  !ok_dict_word);
537 
538  adjusted_len = word->reject_map.length();
539  if (adjusted_len > 10)
540  adjusted_len = 10;
541  rating_per_ch = word->best_choice->rating() / adjusted_len;
542 
543  if (rating_per_ch > crunch_pot_poor_rate) {
544  if (crunch_debug > 2) {
545  tprintf("Potential poor rating on \"%s\"\n",
546  word->best_choice->unichar_string().c_str());
547  }
548  poor_indicator_count++;
549  }
550 
551  if (word_crunchable &&
553  if (crunch_debug > 2) {
554  tprintf("Potential poor cert on \"%s\"\n",
555  word->best_choice->unichar_string().c_str());
556  }
557  poor_indicator_count++;
558  }
559 
560  if (garbage_level != G_OK) {
561  if (crunch_debug > 2) {
562  tprintf("Potential garbage on \"%s\"\n",
563  word->best_choice->unichar_string().c_str());
564  }
565  poor_indicator_count++;
566  }

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 180 of file applybox.cpp.

181  {
182  const double median_xheight = MedianXHeight(block_list);
183  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
184  // Strip all fuzzy space markers to simplify the PAGE_RES.
185  BLOCK_IT b_it(block_list);
186  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187  BLOCK* block = b_it.data();
188  ROW_IT r_it(block->row_list());
189  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190  ROW* row = r_it.data();
191  const double diff = fabs(row->x_height() - median_xheight);
192  if (diff > max_deviation) {
193  if (applybox_debug) {
194  tprintf("row xheight=%g, but median xheight = %g\n",
195  row->x_height(), median_xheight);
196  }
197  row->set_x_height(static_cast<float>(median_xheight));
198  }
199  }
200  }

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 641 of file tesseractclass.cpp.

641  {
643  // Find the max splitter strategy over all langs.
644  auto max_pageseg_strategy =
645  static_cast<ShiroRekhaSplitter::SplitStrategy>(
646  static_cast<int32_t>(pageseg_devanagari_split_strategy));
647  for (int i = 0; i < sub_langs_.size(); ++i) {
648  auto pageseg_strategy =
649  static_cast<ShiroRekhaSplitter::SplitStrategy>(
650  static_cast<int32_t>(sub_langs_[i]->pageseg_devanagari_split_strategy));
651  if (pageseg_strategy > max_pageseg_strategy)
652  max_pageseg_strategy = pageseg_strategy;
653  pixDestroy(&sub_langs_[i]->pix_binary_);
654  sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
655  }
656  // Perform shiro-rekha (top-line) splitting and replace the current image by
657  // the newly split image.
658  splitter_.set_orig_pix(pix_binary());
659  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
660  if (splitter_.Split(true, &pixa_debug_)) {
661  ASSERT_HOST(splitter_.splitted_image());
662  pixDestroy(&pix_binary_);
663  pix_binary_ = pixClone(splitter_.splitted_image());
664  }
665 }

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract *  osd_tess,
OSResults *  osr 
)

Definition at line 672 of file tesseractclass.cpp.

673  {
674  // Find the max splitter strategy over all langs.
675  auto max_ocr_strategy =
676  static_cast<ShiroRekhaSplitter::SplitStrategy>(
677  static_cast<int32_t>(ocr_devanagari_split_strategy));
678  for (int i = 0; i < sub_langs_.size(); ++i) {
679  auto ocr_strategy =
680  static_cast<ShiroRekhaSplitter::SplitStrategy>(
681  static_cast<int32_t>(sub_langs_[i]->ocr_devanagari_split_strategy));
682  if (ocr_strategy > max_ocr_strategy)
683  max_ocr_strategy = ocr_strategy;
684  }
685  // Utilize the segmentation information available.
686  splitter_.set_segmentation_block_list(block_list);
687  splitter_.set_ocr_split_strategy(max_ocr_strategy);
688  // Run the splitter for OCR
689  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
690  // Restore pix_binary to the binarized original pix for future reference.
691  ASSERT_HOST(splitter_.orig_pix());
692  pixDestroy(&pix_binary_);
693  pix_binary_ = pixClone(splitter_.orig_pix());
694  // If the pageseg and ocr strategies are different, refresh the block list
695  // (from the last SegmentImage call) with blobs from the real image to be used
696  // for OCR.
697  if (splitter_.HasDifferentSplitStrategies()) {
698  BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
699  pixGetHeight(pix_binary_));
700  Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
701  splitter_.orig_pix();
702  extract_edges(pix_for_ocr, &block);
703  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
704  }
705  // The splitter isn't needed any more after this, so save memory by clearing.
706  splitter_.Clear();
707 }

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > &  words)

Definition at line 38 of file par_control.cpp.

38  {
39  // Prepare all the blobs.
41  for (int w = 0; w < words.size(); ++w) {
42  if (words[w].word->ratings != nullptr &&
43  words[w].word->ratings->get(0, 0) == nullptr) {
44  for (int s = 0; s < words[w].lang_words.size(); ++s) {
45  Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
46  const WERD_RES& word = *words[w].lang_words[s];
47  for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
48  blobs.push_back(BlobData(b, sub, word));
49  }
50  }
51  }
52  }
53  // Pre-classify all the blobs.
54  if (tessedit_parallelize > 1) {
55 #ifdef _OPENMP
56 #pragma omp parallel for num_threads(10)
57 #endif // _OPENMP
58  for (int b = 0; b < blobs.size(); ++b) {
59  *blobs[b].choices =
60  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
61  }
62  } else {
63  // TODO(AMD) parallelize this.
64  for (int b = 0; b < blobs.size(); ++b) {
65  *blobs[b].choices =
66  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, nullptr);
67  }
68  }
69 }

◆ process_cmd_win_event()

bool tesseract::Tesseract::process_cmd_win_event ( int32_t  cmd_event,
char *  new_value 
)

Definition at line 415 of file pgedit.cpp.

418  {
419  char msg[160];
420  bool exit = false;
421 
422  color_mode = CM_RAINBOW;
423 
424  // Run recognition on the full page if needed.
425  switch (cmd_event) {
426  case BLAMER_CMD_EVENT:
430  case SHOW_BOLD_CMD_EVENT:
436  if (!recog_done) {
437  recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
438  recog_done = true;
439  }
440  break;
441  default:
442  break;
443  }
444 
445  char* parameter;
446 
447  switch (cmd_event) {
448  case NULL_CMD_EVENT:
449  break;
450 
452  case DUMP_WERD_CMD_EVENT:
455  case RECOG_WERDS:
456  case RECOG_PSEUDO:
457  case SHOW_BLOB_FEATURES:
458  mode =static_cast<CMD_EVENTS>(cmd_event);
459  break;
461  mode = DEBUG_WERD_CMD_EVENT;
462  parameter = image_win->ShowInputDialog("Config File Name");
463  word_config_ = parameter;
464  delete[] parameter;
465  break;
467  if (new_value[0] == 'T')
468  word_display_mode.turn_on_bit(DF_BOX);
469  else
470  word_display_mode.turn_off_bit(DF_BOX);
471  mode = CHANGE_DISP_CMD_EVENT;
472  break;
473  case BLAMER_CMD_EVENT:
474  if (new_value[0] == 'T')
475  word_display_mode.turn_on_bit(DF_BLAMER);
476  else
477  word_display_mode.turn_off_bit(DF_BLAMER);
479  mode = CHANGE_DISP_CMD_EVENT;
480  break;
482  if (new_value[0] == 'T')
483  word_display_mode.turn_on_bit(DF_TEXT);
484  else
485  word_display_mode.turn_off_bit(DF_TEXT);
486  mode = CHANGE_DISP_CMD_EVENT;
487  break;
488  case POLYGONAL_CMD_EVENT:
489  if (new_value[0] == 'T')
490  word_display_mode.turn_on_bit(DF_POLYGONAL);
491  else
492  word_display_mode.turn_off_bit(DF_POLYGONAL);
493  mode = CHANGE_DISP_CMD_EVENT;
494  break;
495  case BL_NORM_CMD_EVENT:
496  if (new_value[0] == 'T')
497  word_display_mode.turn_on_bit(DF_BN_POLYGONAL);
498  else
499  word_display_mode.turn_off_bit(DF_BN_POLYGONAL);
500  mode = CHANGE_DISP_CMD_EVENT;
501  break;
502  case BITMAP_CMD_EVENT:
503  if (new_value[0] == 'T')
504  word_display_mode.turn_on_bit(DF_EDGE_STEP);
505  else
506  word_display_mode.turn_off_bit(DF_EDGE_STEP);
507  mode = CHANGE_DISP_CMD_EVENT;
508  break;
511  break;
512  case IMAGE_CMD_EVENT:
513  display_image =(new_value[0] == 'T');
515  break;
516  case BLOCKS_CMD_EVENT:
517  display_blocks =(new_value[0] == 'T');
519  break;
520  case BASELINES_CMD_EVENT:
521  display_baselines =(new_value[0] == 'T');
523  break;
525  color_mode = CM_SUBSCRIPT;
527  break;
529  color_mode = CM_SUPERSCRIPT;
531  break;
533  color_mode = CM_ITALIC;
535  break;
536  case SHOW_BOLD_CMD_EVENT:
537  color_mode = CM_BOLD;
539  break;
541  color_mode = CM_UNDERLINE;
543  break;
545  color_mode = CM_FIXEDPITCH;
547  break;
549  color_mode = CM_SERIF;
551  break;
553  color_mode = CM_SMALLCAPS;
555  break;
557  color_mode = CM_DROPCAPS;
559  break;
560  case REFRESH_CMD_EVENT:
562  break;
563  case QUIT_CMD_EVENT:
564  exit = true;
566  break;
567 
568  default:
569  snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
570  cmd_event, new_value);
571  image_win->AddMessage(msg);
572  break;
573  }
574  return exit;
575 }

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

The user has done something in the image window (mouse down or up). Work out what it was and act on it. On DOWN, just remember where it was. On UP, perform the operation defined by the current mode on each word in the selected area.

Definition at line 587 of file pgedit.cpp.

588  {
589  // The following variable should remain static, since it is used by
590  // debug editor, which uses a single Tesseract instance.
591  static ICOORD down;
592  ICOORD up;
593  TBOX selection_box;
594  char msg[80];
595 
596  switch(event.type) {
597 
598  case SVET_SELECTION:
599  if (event.type == SVET_SELECTION) {
600  down.set_x(event.x + event.x_size);
601  down.set_y(event.y + event.y_size);
602  if (mode == SHOW_POINT_CMD_EVENT)
603  show_point(current_page_res, event.x, event.y);
604  }
605 
606  up.set_x(event.x);
607  up.set_y(event.y);
608 
609  selection_box = TBOX(down, up);
610 
611  switch(mode) {
614  current_page_res,
615  selection_box,
617  break;
618  case DUMP_WERD_CMD_EVENT:
619  process_selected_words(current_page_res,
620  selection_box,
622  break;
624  process_selected_words(current_page_res,
625  selection_box,
627  break;
629  debug_word(current_page_res, selection_box);
630  break;
632  break; // ignore up event
633 
634  case RECOG_WERDS:
635  #ifndef DISABLED_LEGACY_ENGINE
636  image_win->AddMessage("Recogging selected words");
637  this->process_selected_words(current_page_res,
638  selection_box,
640  #endif // ndef DISABLED_LEGACY_ENGINE
641  break;
642  case RECOG_PSEUDO:
643  image_win->AddMessage("Recogging selected blobs");
644  recog_pseudo_word(current_page_res, selection_box);
645  break;
646  case SHOW_BLOB_FEATURES:
647  blob_feature_display(current_page_res, selection_box);
648  break;
649 
650  default:
651  sprintf(msg, "Mode %d not yet implemented", mode);
652  image_win->AddMessage(msg);
653  break;
654  }
655  default:
656  break;
657  }
658 }

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words ( PAGE_RES page_res,
TBOX selection_box,
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 45 of file pagewalk.cpp.

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 120 of file control.cpp.

123  {
124  if (word_config != nullptr) {
125  if (word_box.major_overlap(target_word_box)) {
126  if (backup_config_file_ == nullptr) {
127  backup_config_file_ = kBackUpConfigFile;
128  FILE* config_fp = fopen(backup_config_file_, "wb");
129  if (config_fp == nullptr) {
130  tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
131  } else {
132  ParamUtils::PrintParams(config_fp, params());
133  fclose(config_fp);
134  }
135  ParamUtils::ReadParamsFile(word_config,
137  params());
138  }
139  } else {
140  if (backup_config_file_ != nullptr) {
141  ParamUtils::ReadParamsFile(backup_config_file_,
143  params());
144  backup_config_file_ = nullptr;
145  }
146  }
147  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
148  return false;
149  }
150  return true;
151 }

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
bool  good_quality_doc 
)

Definition at line 133 of file docqual.cpp.

◆ read_config_file()

void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 64 of file tessedit.cpp.

84  {

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT *  pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 944 of file control.cpp.

945  {
946  *make_next_word_fuzzy = false;
947  WERD* real_word = pr_it->word()->word;
948  if (real_word->rej_cblob_list()->empty() ||
949  real_word->cblob_list()->empty() ||
950  real_word->rej_cblob_list()->length() > noise_maxperword)
951  return false;
952  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
953  // Get the noise outlines into a vector with matching bool map.
954  GenericVector<C_OUTLINE*> outlines;
955  real_word->GetNoiseOutlines(&outlines);
956  GenericVector<bool> word_wanted;
957  GenericVector<bool> overlapped_any_blob;
958  GenericVector<C_BLOB*> target_blobs;
959  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
960  &word_wanted, &overlapped_any_blob,
961  &target_blobs);
962  // Filter the outlines that overlapped any blob and put them into the word
963  // now. This simplifies the remaining task and also makes it more accurate
964  // as it has more completed blobs to work on.
965  GenericVector<bool> wanted;
966  GenericVector<C_BLOB*> wanted_blobs;
967  GenericVector<C_OUTLINE*> wanted_outlines;
968  int num_overlapped = 0;
969  int num_overlapped_used = 0;
970  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
971  if (overlapped_any_blob[i]) {
972  ++num_overlapped;
973  if (word_wanted[i]) ++num_overlapped_used;
974  wanted.push_back(word_wanted[i]);
975  wanted_blobs.push_back(target_blobs[i]);
976  wanted_outlines.push_back(outlines[i]);
977  outlines[i] = nullptr;
978  }
979  }
980  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
981  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
982  &target_blobs);
983  int non_overlapped = 0;
984  int non_overlapped_used = 0;
985  for (int i = 0; i < word_wanted.size(); ++i) {
986  if (word_wanted[i]) ++non_overlapped_used;
987  if (outlines[i] != nullptr) ++non_overlapped_used;
988  }
989  if (debug_noise_removal) {
990  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
991  num_overlapped_used, num_overlapped, non_overlapped_used,
992  non_overlapped);
993  real_word->bounding_box().print();
994  }
995  // Now we have decided which outlines we want, put them into the real_word.
996  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
997  make_next_word_fuzzy)) {
998  pr_it->MakeCurrentWordFuzzy();
999  }
1000  // TODO(rays) Parts of combos have a deep copy of the real word, and need
1001  // to have their noise outlines moved/assigned in the same way!!
1002  return num_overlapped_used != 0 || non_overlapped_used != 0;
1003 }

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies a rectangle to restrict extraction to
dopasses: 0 - all, 1 - just pass 1, 2 - passes 2 and higher

Definition at line 302 of file control.cpp.

306  {
307  PAGE_RES_IT page_res_it(page_res);
308 
310  tessedit_test_adaption.set_value (true);
311  tessedit_minimal_rejection.set_value (true);
312  }
313 
314  if (dopasses==0 || dopasses==1) {
315  page_res_it.restart_page();
316  // ****************** Pass 1 *******************
317 
318  #ifndef DISABLED_LEGACY_ENGINE
319  // If the adaptive classifier is full switch to one we prepared earlier,
320  // ie on the previous page. If the current adaptive classifier is non-empty,
321  // prepare a backup starting at this page, in case it fills up. Do all this
322  // independently for each language.
323  if (AdaptiveClassifierIsFull()) {
325  } else if (!AdaptiveClassifierIsEmpty()) {
327  }
328  // Now check the sub-langs as well.
329  for (int i = 0; i < sub_langs_.size(); ++i) {
330  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
331  sub_langs_[i]->SwitchAdaptiveClassifier();
332  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
333  sub_langs_[i]->StartBackupAdaptiveClassifier();
334  }
335  }
336 
337  #endif // ndef DISABLED_LEGACY_ENGINE
338 
339  // Set up all words ready for recognition, so that if parallelism is on
340  // all the input and output classes are ready to run the classifier.
342  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
343  #ifndef DISABLED_LEGACY_ENGINE
344  if (tessedit_parallelize) {
345  PrerecAllWordsPar(words);
346  }
347  #endif // ndef DISABLED_LEGACY_ENGINE
348 
349  stats_.word_count = words.size();
350 
351  stats_.dict_words = 0;
352  stats_.doc_blob_quality = 0;
353  stats_.doc_outline_errs = 0;
354  stats_.doc_char_quality = 0;
355  stats_.good_char_count = 0;
356  stats_.doc_good_char_quality = 0;
357 
358  most_recently_used_ = this;
359  // Run pass 1 word recognition.
360  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
361  // Pass 1 post-processing.
362  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
363  page_res_it.forward()) {
364  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
365  fix_rep_char(&page_res_it);
366  continue;
367  }
368 
369  // Count dict words.
370  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
371  ++(stats_.dict_words);
372 
373  // Update misadaption log (we only need to do it on pass 1, since
374  // adaption only happens on this pass).
375  if (page_res_it.word()->blamer_bundle != nullptr &&
376  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
377  page_res->misadaption_log.push_back(
378  page_res_it.word()->blamer_bundle->misadaption_debug());
379  }
380  }
381  }
382 
383  if (dopasses == 1) return true;
384 
385  #ifndef DISABLED_LEGACY_ENGINE
386 
387  // ****************** Pass 2 *******************
389  AnyTessLang()) {
390  page_res_it.restart_page();
392  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
393  if (tessedit_parallelize) {
394  PrerecAllWordsPar(words);
395  }
396  most_recently_used_ = this;
397  // Run pass 2 word recognition.
398  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
399  }
400 
401  // The next passes are only required for Tess-only.
402  if (AnyTessLang() && !AnyLSTMLang()) {
403  // ****************** Pass 3 *******************
404  // Fix fuzzy spaces.
406 
409  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
410 
411  // ****************** Pass 4 *******************
414 
415  // ****************** Pass 5,6 *******************
416  rejection_passes(page_res, monitor, target_word_box, word_config);
417 
418  // ****************** Pass 8 *******************
419  font_recognition_pass(page_res);
420 
421  // ****************** Pass 9 *******************
422  // Check the correctness of the final results.
423  blamer_pass(page_res);
424  script_pos_pass(page_res);
425  }
426 
427  #endif // ndef DISABLED_LEGACY_ENGINE
428 
429  // Write results pass.
431  // This is now redundant, but retained commented so show how to obtain
432  // bounding boxes and style information.
433 
434  #ifndef DISABLED_LEGACY_ENGINE
435  // changed by jetsoft
436  // needed for dll to output memory structure
437  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
438  output_pass(page_res_it, target_word_box);
439  // end jetsoft
440  #endif //ndef DISABLED_LEGACY_ENGINE
441 
442  const auto pageseg_mode = static_cast<PageSegMode>(
443  static_cast<int>(tessedit_pageseg_mode));
444  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
445 
446  // Remove empty words, as these mess up the result iterators.
447  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
448  page_res_it.forward()) {
449  const WERD_RES* word = page_res_it.word();
450  const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
451  ? page_res_it.block()->block->pdblk.poly_block()
452  : nullptr;
453  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
454  (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
455  page_res_it.DeleteCurrentWord();
456  }
457  }
458 
459  if (monitor != nullptr) {
460  monitor->progress = 100;
461  }
462  return true;
463 }

◆ recog_interactive()

bool tesseract::Tesseract::recog_interactive ( PAGE_RES_IT *  pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 77 of file control.cpp.

77  {
78  int16_t char_qual;
79  int16_t good_char_qual;
80 
81  WordData word_data(*pr_it);
82  SetupWordPassN(2, &word_data);
83  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
84  if (lstm_recognizer_ == nullptr) {
85 #ifndef DISABLED_LEGACY_ENGINE
86  classify_word_and_language(2, pr_it, &word_data);
87 #endif // ndef DISABLED_LEGACY_ENGINE
88  } else {
89  classify_word_and_language(1, pr_it, &word_data);
90  }
91 #ifndef DISABLED_LEGACY_ENGINE
93  WERD_RES* word_res = pr_it->word();
94  word_char_quality(word_res, &char_qual, &good_char_qual);
95  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96  "char_quality: %d; good_char_quality: %d\n",
97  word_res->reject_map.length(),
98  word_blob_quality(word_res),
99  word_outline_errs(word_res), char_qual, good_char_qual);
100  }
101 #endif // ndef DISABLED_LEGACY_ENGINE
102  return true;
103 }

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES page_res,
TBOX selection_box 
)

Definition at line 62 of file control.cpp.

63  {
64  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
65  if (it != nullptr) {
67  it->DeleteCurrentWord();
68  delete it;
69  }
70 }

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 84 of file recogtraining.cpp.

87  {
88  STRING box_fname = fname;
89  const char* lastdot = strrchr(box_fname.c_str(), '.');
90  if (lastdot != nullptr)
91  box_fname[lastdot - box_fname.c_str()] = '\0';
92  box_fname += ".box";
93  // ReadNextBox() will close box_file
94  FILE* box_file = fopen(box_fname.c_str(), "r");
95  if (box_file == nullptr) {
96  tprintf("Error: Could not open file %s\n", box_fname.c_str());
97  ASSERT_HOST(box_file);
98  }
99 
100  PAGE_RES_IT page_res_it;
101  page_res_it.page_res = page_res;
102  page_res_it.restart_page();
103  STRING label;
104 
105  // Process all the words on this page.
106  TBOX tbox; // tesseract-identified box
107  TBOX bbox; // box from the box file
108  bool keep_going;
109  int line_number = 0;
110  int examined_words = 0;
111  do {
112  keep_going = read_t(&page_res_it, &tbox);
113  keep_going &=
114  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
115  // Align bottom left points of the TBOXes.
116  while (keep_going &&
117  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
118  if (bbox.bottom() < tbox.bottom()) {
119  page_res_it.forward();
120  keep_going = read_t(&page_res_it, &tbox);
121  } else {
122  keep_going =
123  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
124  }
125  }
126  while (keep_going &&
127  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
128  if (bbox.left() > tbox.left()) {
129  page_res_it.forward();
130  keep_going = read_t(&page_res_it, &tbox);
131  } else {
132  keep_going =
133  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
134  }
135  }
136  // OCR the word if top right points of the TBOXes are similar.
137  if (keep_going &&
138  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
139  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
140  ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
141  examined_words++;
142  }
143  page_res_it.forward();
144  } while (keep_going);
145 
146  // Set up scripts on all of the words that did not get sent to
147  // ambigs_classify_and_output. They all should have, but if all the
148  // werd_res's don't get uch_sets, tesseract will crash when you try
149  // to iterate over them. :-(
150  int total_words = 0;
151  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
152  page_res_it.forward()) {
153  if (page_res_it.word()) {
154  if (page_res_it.word()->uch_set == nullptr)
155  page_res_it.word()->SetupFake(unicharset);
156  total_words++;
157  }
158  }
159  if (examined_words < 0.85 * total_words) {
160  tprintf(
161  "TODO(antonova): clean up recog_training_segmented; "
162  " It examined only a small fraction of the ambigs image.\n");
163  }
164  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
165  total_words);
166 }

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES * word)

Definition at line 41 of file tfacepp.cpp.

41  {
42  if (classify_debug_level) tprintf("No truth for word - skipping\n");
43  word->tess_failed = true;
44  return;
45  }
48  word->SetupBoxWord();
49  if (word->best_choice->length() != word->box_word->length()) {
50  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
51  "Strlen=%d; #Blobs=%d\n",
52  word->best_choice->debug_string().c_str(),
53  word->best_choice->length(), word->box_word->length());
54  }
55  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
56  // Check that the ratings matrix size matches the sum of all the
57  // segmentation states.
58  if (!word->StatesAllValid()) {
59  tprintf("Not all words have valid states relative to ratings matrix!!");
60  word->DebugWordChoices(true, nullptr);
61  ASSERT_HOST(word->StatesAllValid());
62  }
64  /* Override the permuter type if a straight dictionary check disagrees. */
65  uint8_t perm_type = word->best_choice->permuter();
66  if ((perm_type != SYSTEM_DAWG_PERM) &&
67  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
68  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
69  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
70  (real_dict_perm_type == FREQ_DAWG_PERM) ||
71  (real_dict_perm_type == USER_DAWG_PERM)) &&
73  word->best_choice->unichar_lengths().c_str()) > 0)) {
74  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
75  }
76  }
78  perm_type != word->best_choice->permuter()) {
79  tprintf("Permuter Type Flipped from %d to %d\n",
80  perm_type, word->best_choice->permuter());
81  }
82  }
83  // Factored out from control.cpp
84  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
85  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
86  static_cast<int>(strspn(word->best_choice->unichar_string().c_str(),
87  " ")) == word->best_choice->length()) {
88  word->tess_failed = true;
89  word->reject_map.initialise(word->box_word->length());
91  } else {
92  word->tess_failed = false;
93  }
94 }
95 
96 

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word)

Definition at line 104 of file tfacepp.cpp.

105  {
106  return split_and_recog_word(word);
107  }
108  cc_recog(word);
109  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
110 
111  // Do sanity checks and minor fixes on best_choice.
112  if (word->best_choice->length() > word_length) {
113  word->best_choice->make_bad(); // should never happen
114  tprintf("recog_word: Discarded long string \"%s\""
115  " (%d characters vs %d blobs)\n",
116  word->best_choice->unichar_string().c_str(),
117  word->best_choice->length(), word_length);
118  tprintf("Word is at:");
119  word->word->bounding_box().print();
120  }
121  if (word->best_choice->length() < word_length) {
122  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
123  while (word->best_choice->length() < word_length) {
124  word->best_choice->append_unichar_id(space_id, 1, 0.0,
125  word->best_choice->certainty());
126  }
127  }
128 }
129 

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC * monitor,
PAGE_RES_IT * pr_it,
GenericVector< WordData > *  words 
)

Definition at line 213 of file control.cpp.

215  {
216  // TODO(rays) Before this loop can be parallelized (it would yield a massive
217  // speed-up) all remaining member globals need to be converted to local/heap
218  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
219  // added. The results will be significantly different with adaption on, and
220  // deterioration will need investigation.
221  pr_it->restart_page();
222  for (int w = 0; w < words->size(); ++w) {
223  WordData* word = &(*words)[w];
224  if (w > 0) word->prev_word = &(*words)[w - 1];
225  if (monitor != nullptr) {
226  monitor->ocr_alive = true;
227  if (pass_n == 1) {
228  monitor->progress = 70 * w / words->size();
229  } else {
230  monitor->progress = 70 + 30 * w / words->size();
231  }
232  if (monitor->progress_callback2 != nullptr) {
233  TBOX box = pr_it->word()->word->bounding_box();
234  (*monitor->progress_callback2)(monitor, box.left(),
235  box.right(), box.top(), box.bottom());
236  }
237  if (monitor->deadline_exceeded() ||
238  (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
239  words->size()))) {
240  // Timeout. Fake out the rest of the words.
241  for (; w < words->size(); ++w) {
242  (*words)[w].word->SetupFake(unicharset);
243  }
244  return false;
245  }
246  }
247  if (word->word->tess_failed) {
248  int s;
249  for (s = 0; s < word->lang_words.size() &&
250  word->lang_words[s]->tess_failed; ++s) {}
251  // If all are failed, skip it. Image words are skipped by this test.
252  if (s > word->lang_words.size()) continue;
253  }
254  // Sync pr_it with the wth WordData.
255  while (pr_it->word() != nullptr && pr_it->word() != word->word)
256  pr_it->forward();
257  ASSERT_HOST(pr_it->word() != nullptr);
258  bool make_next_word_fuzzy = false;
259  #ifndef DISABLED_LEGACY_ENGINE
260  if (!AnyLSTMLang() &&
261  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
262  // Needs to be setup again to see the new outlines in the chopped_word.
263  SetupWordPassN(pass_n, word);
264  }
265  #endif // ndef DISABLED_LEGACY_ENGINE
266 
267  classify_word_and_language(pass_n, pr_it, word);
269  tprintf("Pass%d: %s [%s]\n", pass_n,
270  word->word->best_choice->unichar_string().c_str(),
271  word->word->best_choice->debug_string().c_str());
272  }
273  pr_it->forward();
274  if (make_next_word_fuzzy && pr_it->word() != nullptr) {
275  pr_it->MakeCurrentWordFuzzy();
276  }
277  }
278  return true;
279 }

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( STRING & image_name)

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES * word)

Definition at line 263 of file reject.cpp.

263  {
265  TBOX word_box = word->word->bounding_box();
266  // Use the box_word as it is already denormed back to image coordinates.
267  int blobcount = word->box_word->length();
268 
269  if (word_box.left() < tessedit_image_border ||
270  word_box.bottom() < tessedit_image_border ||
271  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
272  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
273  ASSERT_HOST(word->reject_map.length() == blobcount);
274  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
275  TBOX blob_box = word->box_word->BlobBox(blobindex);
276  if (blob_box.left() < tessedit_image_border ||
277  blob_box.bottom() < tessedit_image_border ||
278  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
279  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
280  word->reject_map[blobindex].setrej_edge_char();
281  // Close to edge
282  }
283  }
284  }

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES * word)

Definition at line 191 of file reject.cpp.

193  {
194 void Tesseract::reject_I_1_L(WERD_RES *word) {
195  int16_t i;
196  int16_t offset;
197 
198  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
199  offset += word->best_choice->unichar_lengths()[i], i += 1) {
201  contains (word->best_choice->unichar_string()[offset])) {
202  //rej 1Il conflict
203  word->reject_map[i].setrej_1Il_conflict ();

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES * word)

Definition at line 571 of file reject.cpp.

574  {
575  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
576 
577  if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes ( PAGE_RES * page_res,
ETEXT_DESC * monitor,
const TBOX * target_word_box,
const char *  word_config 
)

Definition at line 612 of file control.cpp.

615  {
616  PAGE_RES_IT page_res_it(page_res);
617  // ****************** Pass 5 *******************
618  // Gather statistics on rejects.
619  int word_index = 0;
620  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
622  WERD_RES* word = page_res_it.word();
623  word_index++;
624  if (monitor != nullptr) {
625  monitor->ocr_alive = true;
626  monitor->progress = 95 + 5 * word_index / stats_.word_count;
627  }
628  if (word->rebuild_word == nullptr) {
629  // Word was not processed by tesseract.
630  page_res_it.forward();
631  continue;
632  }
633  check_debug_pt(word, 70);
634 
635  // changed by jetsoft
636  // specific to its needs to extract one word when need
637  if (target_word_box &&
639  *target_word_box, word_config, 4)) {
640  page_res_it.forward();
641  continue;
642  }
643  // end jetsoft
644 
645  page_res_it.rej_stat_word();
646  const int chars_in_word = word->reject_map.length();
647  const int rejects_in_word = word->reject_map.reject_count();
648 
649  const int blob_quality = word_blob_quality(word);
650  stats_.doc_blob_quality += blob_quality;
651  const int outline_errs = word_outline_errs(word);
652  stats_.doc_outline_errs += outline_errs;
653  int16_t all_char_quality;
654  int16_t accepted_all_char_quality;
655  word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
656  stats_.doc_char_quality += all_char_quality;
657  const uint8_t permuter_type = word->best_choice->permuter();
658  if ((permuter_type == SYSTEM_DAWG_PERM) ||
659  (permuter_type == FREQ_DAWG_PERM) ||
660  (permuter_type == USER_DAWG_PERM)) {
661  stats_.good_char_count += chars_in_word - rejects_in_word;
662  stats_.doc_good_char_quality += accepted_all_char_quality;
663  }
664  check_debug_pt(word, 80);
666  (blob_quality == 0) && (outline_errs >= chars_in_word))
668  check_debug_pt(word, 90);
669  page_res_it.forward();
670  }
671 
673  tprintf
674  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
675  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
676  page_res->char_count, page_res->rej_count,
677  page_res->rej_count / static_cast<float>(page_res->char_count),
678  stats_.doc_blob_quality,
679  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
680  stats_.doc_outline_errs,
681  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
682  stats_.doc_char_quality,
683  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
684  stats_.doc_good_char_quality,
685  (stats_.good_char_count > 0) ?
686  (stats_.doc_good_char_quality /
687  static_cast<float>(stats_.good_char_count)) : 0.0);
688  }
689  bool good_quality_doc =
690  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
691  quality_rej_pc) &&
692  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
693  quality_blob_pc) &&
694  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
696  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
698 
699  // ****************** Pass 6 *******************
700  // Do whole document or whole block rejection pass
701  if (!tessedit_test_adaption) {
703  quality_based_rejection(page_res_it, good_quality_doc);
704  }
705 }

◆ repeated_nonalphanum_wd()

bool tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES * word,
ROW * row 
)

Definition at line 580 of file reject.cpp.

583  {
584  int16_t char_quality;
585  int16_t accepted_char_quality;
586 
587  if (word->best_choice->unichar_lengths().length() <= 1)
588  return false;
589 
591  contains(word->best_choice->unichar_string()[0]))
592  return false;
593 
594  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
595  for (int i = 1; i < word->best_choice->length(); ++i) {
596  if (word->best_choice->unichar_id(i) != uch_id) return false;
597  }
598 
599  word_char_quality(word, &char_quality, &accepted_char_quality);
600 
601  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
602  (char_quality == accepted_char_quality))
603  return true;

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES * word,
WERD_RES * new_word 
)

Definition at line 1461 of file control.cpp.

// Debug trace for an x-height fix attempt. Prints, via tprintf and
// reject_map.print(debug_fp), the original word's best choice and reject map,
// then the same for new_word, then a summary line. Nothing is returned and
// neither word is modified by the statements shown here.
1462  {
1463  tprintf("New XHT Match:%s = %s ",
1464  word->best_choice->unichar_string().c_str(),
1465  word->best_choice->debug_string().c_str());
1466  word->reject_map.print(debug_fp);
1467  tprintf(" -> %s = %s ",
1468  new_word->best_choice->unichar_string().c_str(),
1469  new_word->best_choice->debug_string().c_str());
1470  new_word->reject_map.print(debug_fp);
      // Summary: guessed/certain x-height flag of each word, whether new_x_ht
      // exceeds the literal 0.1 cut-off, and the accept decision.
      // NOTE(review): the meaning/units of the 0.1 threshold are not visible
      // in this listing - confirm against the caller.
1471  tprintf(" %s->%s %s %s\n",
1472  word->guessed_x_ht ? "GUESS" : "CERT",
1473  new_word->guessed_x_ht ? "GUESS" : "CERT",
1474  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1475  accept_new_word ? "ACCEPTED" : "");
1476 }

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES * page_res)

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES * page_res,
const TBOX * prev_box,
const TBOX & box,
const TBOX * next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 328 of file applybox.cpp.

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX & box,
const TBOX * next_box,
const char *  correct_text 
)

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 597 of file tesseractclass.cpp.

597  {
599  for (int i = 0; i < sub_langs_.size(); ++i) {
600  sub_langs_[i]->ResetAdaptiveClassifierInternal();
601  }
602 }

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 607 of file tesseractclass.cpp.

607  {
609  for (int i = 0; i < sub_langs_.size(); ++i) {
610  sub_langs_[i]->getDict().ResetDocumentDictionary();
611  }
612 }

◆ reskew()

const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 192 of file tesseractclass.h.

// Accessor: returns a const reference to the reskew_ member; no copy is made.
192  {
193  return reskew_;
194  }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage ( const WordData & word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 903 of file control.cpp.

906  {
907  if (debug) {
908  tprintf("Trying word using lang %s, oem %d\n",
909  lang.c_str(), static_cast<int>(tessedit_ocr_engine_mode));
910  }
911  // Run the recognizer on the word.
912  PointerVector<WERD_RES> new_words;
913  (this->*recognizer)(word_data, in_word, &new_words);
914  if (new_words.empty()) {
915  // Transfer input word to new_words, as the classifier must have put
916  // the result back in the input.
917  new_words.push_back(*in_word);
918  *in_word = nullptr;
919  }
920  if (debug) {
921  for (int i = 0; i < new_words.size(); ++i)
922  new_words[i]->DebugTopChoice("Lang result");
923  }
924  // Initial version is a bit of a hack based on better certainty and rating
925  // or a dictionary vs non-dictionary word.
926  return SelectBestWords(classify_max_rating_ratio,
928  debug, &new_words, best_words);
929 }

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 273 of file tesseractclass.h.

// Accessor: returns the current value of the right_to_left_ member.
273  {
274  return right_to_left_;
275  }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht ( WERD_RES * word,
BLOCK * block,
ROW * row 
)

◆ safe_dict_word()

int16_t tesseract::Tesseract::safe_dict_word ( const WERD_RES * werd_res)

Definition at line 605 of file reject.cpp.

608  {
609  const WERD_CHOICE &word = *werd_res->best_choice;

◆ scaled_color()

Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 256 of file tesseractclass.h.

// Accessor: returns the scaled_color_ pointer as stored (no clone or copy is
// made here, so the caller receives the same Pix the object holds).
256  {
257  return scaled_color_;
258  }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 259 of file tesseractclass.h.

// Accessor: returns the scaled_factor_ member.
259  {
260  return scaled_factor_;
261  }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES * page_res)

Definition at line 733 of file control.cpp.

// Walks every word of page_res, marks words that look like small caps
// (word->small_caps = true) and then calls SetScriptPositions() on the word.
// Words flagged W_REP_CHAR are not processed.
733  {
734  PAGE_RES_IT page_res_it(page_res);
735  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
736  page_res_it.forward()) {
737  WERD_RES* word = page_res_it.word();
      // NOTE(review): the loop header already calls forward() on `continue`,
      // so the explicit forward() below advances the iterator twice. As
      // written, the word following a W_REP_CHAR word is never examined, and
      // forward() is called again after the iterator may already be past the
      // last word. Confirm whether the extra forward() is intended.
738  if (word->word->flag(W_REP_CHAR)) {
739  page_res_it.forward();
740  continue;
741  }
742  const float x_height = page_res_it.block()->block->x_height();
743  float word_x_height = word->x_height;
      // If the word's own x-height lies outside the range reported by its
      // best choice, substitute the midpoint of that range.
744  if (word_x_height < word->best_choice->min_x_height() ||
745  word_x_height > word->best_choice->max_x_height()) {
746  word_x_height = (word->best_choice->min_x_height() +
747  word->best_choice->max_x_height()) / 2.0f;
748  }
749  // Test for small caps. Word capheight must be close to block xheight,
750  // and word must contain no lower case letters, and at least one upper case.
      // Accepted band is small_cap_xheight +/- small_cap_delta, where the
      // delta is half the gap between the block x-height and
      // x_height * kXHeightCapRatio.
751  const double small_cap_xheight = x_height * kXHeightCapRatio;
752  const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
753  if (word->uch_set->script_has_xheight() &&
754  small_cap_xheight - small_cap_delta <= word_x_height &&
755  word_x_height <= small_cap_xheight + small_cap_delta) {
756  // Scan for upper/lower.
757  int num_upper = 0;
758  int num_lower = 0;
759  for (int i = 0; i < word->best_choice->length(); ++i) {
760  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
761  ++num_upper;
762  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
763  ++num_lower;
764  }
765  if (num_upper > 0 && num_lower == 0)
766  word->small_caps = true;
767  }
      // Runs for every processed word, whether or not small_caps was set.
768  word->SetScriptPositions();
769  }
770 }

◆ SearchForText()

void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 259 of file linerec.cpp.

259  {
260  // Run the segmentation search on the network outputs and make a BoxWord
261  // for each of the output words.
262  // If we drop a word as junk, then there is always a space in front of the
263  // next.
264  const Dict* stopper_dict = lstm_recognizer_->GetDict();
265  if (stopper_dict == nullptr) stopper_dict = &getDict();
266  bool any_nonspace_delimited = false;
267  for (int w = 0; w < words->size(); ++w) {
268  WERD_RES* word = (*words)[w];
269  if (word->best_choice != nullptr &&
271  any_nonspace_delimited = true;
272  break;
273  }
274  }
275  for (int w = 0; w < words->size(); ++w) {
276  WERD_RES* word = (*words)[w];
277  if (word->best_choice == nullptr) {
278  // It is a dud.
279  word->SetupFake(lstm_recognizer_->GetUnicharset());
280  } else {
281  // Set the best state.
282  for (int i = 0; i < word->best_choice->length(); ++i) {
283  int length = word->best_choice->state(i);
284  word->best_state.push_back(length);
285  }
286  word->reject_map.initialise(word->best_choice->length());
287  word->tess_failed = false;
288  word->tess_accepted = true;
289  word->tess_would_adapt = false;
290  word->done = true;
291  word->tesseract = this;
292  float word_certainty = std::min(word->space_certainty,
293  word->best_choice->certainty());
294  word_certainty *= kCertaintyScale;
295  if (getDict().stopper_debug_level >= 1) {
296  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
297  word->best_choice->certainty(), word->space_certainty,
298  std::min(word->space_certainty, word->best_choice->certainty()) *
300  word_certainty);
301  word->best_choice->print();
302  }
303  word->best_choice->set_certainty(word_certainty);
304 
305  word->tess_accepted = stopper_dict->AcceptableResult(word);
306  }
307  }
308 }

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage ( const STRING * input_file,
BLOCK_LIST *  blocks,
Tesseract * osd_tess,
OSResults * osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.

Definition at line 113 of file pagesegmain.cpp.

116  {
117  // No UNLV file present. Work according to the PageSegMode.
118  // First make a single block covering the whole image.
119  BLOCK_IT block_it(blocks);
120  auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
121  block->set_right_to_left(right_to_left());
122  block_it.add_to_end(block);
123  } else {
124  // UNLV file present. Use PSM_SINGLE_BLOCK.
125  pageseg_mode = PSM_SINGLE_BLOCK;
126  }
127  // The diacritic_blobs holds noise blobs that may be diacritics. They
128  // are separated out on areas of the image that seem noisy and short-circuit
129  // the layout process, going straight from the initial partition creation
130  // right through to after word segmentation, where they are added to the
131  // rej_cblobs list of the most appropriate word. From there classification
132  // will determine whether they are used.
133  BLOBNBOX_LIST diacritic_blobs;
134  int auto_page_seg_ret_val = 0;
135  TO_BLOCK_LIST to_blocks;
136  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
137  PSM_SPARSE(pageseg_mode)) {
138  auto_page_seg_ret_val = AutoPageSeg(
139  pageseg_mode, blocks, &to_blocks,
140  enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
141  if (pageseg_mode == PSM_OSD_ONLY)
142  return auto_page_seg_ret_val;
143  // To create blobs from the image region bounds uncomment this line:
144  // to_blocks.clear(); // Uncomment to go back to the old mode.
145  } else {
146  deskew_ = FCOORD(1.0f, 0.0f);
147  reskew_ = FCOORD(1.0f, 0.0f);
148  if (pageseg_mode == PSM_CIRCLE_WORD) {
149  Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
150  if (pixcleaned != nullptr) {
151  pixDestroy(&pix_binary_);
152  pix_binary_ = pixcleaned;
153  }
154  }
155  }
156 
157  if (auto_page_seg_ret_val < 0) {
158  return -1;
159  }
160 
161  if (blocks->empty()) {
163  tprintf("Empty page\n");
164  return 0; // AutoPageSeg found an empty page.
165  }
166  bool splitting =
168  bool cjk_mode = textord_use_cjk_fp_model;
169 
170  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
171  pix_thresholds_, pix_grey_, splitting || cjk_mode,
172  &diacritic_blobs, blocks, &to_blocks);
173  return auto_page_seg_ret_val;
174 }
175 

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
const GenericVector< C_OUTLINE * > &  outlines,
int  num_outlines,
GenericVector< bool > *  ok_outlines 
)

Definition at line 1139 of file control.cpp.

// Greedy search over the outline subset flagged in *ok_outlines: starting
// with that subset, repeatedly drops the single outline whose removal raises
// the classification certainty the most. On success (best_cert >=
// target_cert) writes the best subset back to *ok_outlines and returns true;
// otherwise leaves *ok_outlines untouched and returns false.
// blob may be nullptr; in that case target_cert stays at
// certainty_threshold.
1142  {
1143  STRING best_str;
1144  float target_cert = certainty_threshold;
1145  if (blob != nullptr) {
1146  float target_c2;
1147  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1148  if (debug_noise_removal) {
1149  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(),
1150  target_cert, target_c2);
1151  blob->bounding_box().print();
1152  }
       // Pull the blob-only certainty towards certainty_threshold by the
       // fraction noise_cert_factor.
1153  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1154  }
1155  GenericVector<bool> test_outlines = *ok_outlines;
1156  // Start with all the outlines in.
1157  STRING all_str;
1158  GenericVector<bool> best_outlines = *ok_outlines;
1159  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1160  pr_it, blob, &all_str);
1161  if (debug_noise_removal) {
1162  TBOX ol_box;
1163  for (int i = 0; i < test_outlines.size(); ++i) {
1164  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1165  }
1166  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1167  all_str.c_str(), best_cert, best_cert - target_cert);
1168  ol_box.print();
1169  }
1170  // Iteratively zero out the bit that improves the certainty the most, until
1171  // we get past the threshold, have zero bits, or fail to improve.
1172  int best_index = 0; // To zero out.
       // NOTE(review): `blob == nullptr || ... || blob != nullptr` is always
       // true, so `best_cert < target_cert` never ends the loop. As written
       // the loop stops only when num_outlines reaches 1 or no removal
       // improves the certainty, which differs from the "until we get past
       // the threshold" wording above. Confirm which is intended.
1173  while (num_outlines > 1 && best_index >= 0 &&
1174  (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1175  // Find the best bit to zero out.
1176  best_index = -1;
1177  for (int i = 0; i < outlines.size(); ++i) {
1178  if (test_outlines[i]) {
       // Trial: switch outline i off, classify, then switch it back on below.
1179  test_outlines[i] = false;
1180  STRING str;
1181  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1182  pr_it, blob, &str);
1183  if (debug_noise_removal) {
1184  TBOX ol_box;
1185  for (int j = 0; j < outlines.size(); ++j) {
1186  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1187  tprintf("%d", test_outlines[j]);
1188  }
1189  tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(),
1190  cert, cert - target_cert);
1191  ol_box.print();
1192  }
1193  if (cert > best_cert) {
1194  best_cert = cert;
1195  best_index = i;
       // Snapshot taken while outline i is still switched off.
1196  best_outlines = test_outlines;
1197  }
1198  test_outlines[i] = true;
1199  }
1200  }
       // Commit the best removal found in this round, if any.
1201  if (best_index >= 0) {
1202  test_outlines[best_index] = false;
1203  --num_outlines;
1204  }
1205  }
1206  if (best_cert >= target_cert) {
1207  // Save the best combination.
1208  *ok_outlines = best_outlines;
1209  if (debug_noise_removal) {
1210  tprintf("%s noise combination ", blob ? "Adding" : "New");
1211  for (int i = 0; i < best_outlines.size(); ++i) {
1212  tprintf("%d", best_outlines[i]);
1213  }
1214  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1215  target_cert);
1216  }
1217  return true;
1218  }
1219 
1220  return false;
1221 }

◆ set_done()

void tesseract::Tesseract::set_done ( WERD_RES * word,
int16_t  pass 
)

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 206 of file tesseractclass.h.

// Replaces the stored grey image: calls pixDestroy on the current pix_grey_
// and then stores grey_pix as given (no clone is made here).
// NOTE(review): presumably the object takes over the caller's reference to
// grey_pix - confirm against callers.
206  {
207  pixDestroy(&pix_grey_);
208  pix_grey_ = grey_pix;
209  }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Pix *  original_pix)
inline

Definition at line 214 of file tesseractclass.h.

// Replaces the stored original image: calls pixDestroy on the current
// pix_original_, stores original_pix as given, then hands every entry of
// sub_langs_ its own pixClone(original_pix), or nullptr when original_pix
// is null.
214  {
215  pixDestroy(&pix_original_);
216  pix_original_ = original_pix;
217  // Clone to sublangs as well.
218  for (int i = 0; i < sub_langs_.size(); ++i) {
     // The null check keeps pixClone from being called on a null Pix.
219  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
220  : nullptr);
221  }
222  }

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Pix *  thresholds)
inline

Definition at line 240 of file tesseractclass.h.

// Replaces the stored thresholds image: calls pixDestroy on the current
// pix_thresholds_ and then stores the given pointer (no clone is made here).
240  {
241  pixDestroy(&pix_thresholds_);
242  pix_thresholds_ = thresholds;
243  }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 247 of file tesseractclass.h.

// Setter: stores ppi in source_resolution_. No validation is performed here.
247  {
248  source_resolution_ = ppi;
249  }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES * word)

Definition at line 272 of file output.cpp.

273  {
274  int len = word_res->reject_map.length();
275  const WERD_CHOICE &word = *(word_res->best_choice);
276  const UNICHARSET &uchset = *word.unicharset();
277  int i;
278  float rating_per_ch;
279 
280  if (suspect_level == 0) {
281  for (i = 0; i < len; i++) {
282  if (word_res->reject_map[i].rejected())
283  word_res->reject_map[i].setrej_minimal_rej_accept();
284  }
285  return;
286  }
287 
288  if (suspect_level >= 3)
289  return; //Use defaults
290 
291  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
292 
293  if (safe_dict_word(word_res) &&
294  (count_alphas(word) > suspect_short_words)) {
295  /* Unreject alphas in dictionary words */
296  for (i = 0; i < len; ++i) {
297  if (word_res->reject_map[i].rejected() &&
298  uchset.get_isalpha(word.unichar_id(i)))
299  word_res->reject_map[i].setrej_minimal_rej_accept();
300  }
301  }
302 
303  rating_per_ch = word.rating() / word_res->reject_map.length();
304 
305  if (rating_per_ch >= suspect_rating_per_ch)
306  return; // Don't touch bad ratings
307 
308  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
309  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
310  for (i = 0; i < len; ++i) {
311  if (word_res->reject_map[i].rejected() &&
312  (!uchset.eq(word.unichar_id(i), " ")))
313  word_res->reject_map[i].setrej_minimal_rej_accept();
314  }
315  }
316 
317  for (i = 0; i < len; i++) {
318  if (word_res->reject_map[i].rejected()) {
319  if (word_res->reject_map[i].flag(R_DOC_REJ))
320  word_res->reject_map[i].setrej_minimal_rej_accept();
321  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
322  word_res->reject_map[i].setrej_minimal_rej_accept();
323  if (word_res->reject_map[i].flag(R_ROW_REJ))
324  word_res->reject_map[i].setrej_minimal_rej_accept();
325  }
326  }
327 
328  if (suspect_level == 2)
329  return;
330 
331  if (!suspect_constrain_1Il ||
332  (word_res->reject_map.length() <= suspect_short_words)) {
333  for (i = 0; i < len; i++) {
334  if (word_res->reject_map[i].rejected()) {
335  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
336  word_res->reject_map[i].flag(R_POSTNN_1IL)))
337  word_res->reject_map[i].setrej_minimal_rej_accept();
338 
339  if (!suspect_constrain_1Il &&
340  word_res->reject_map[i].flag(R_MM_REJECT))
341  word_res->reject_map[i].setrej_minimal_rej_accept();
342  }
343  }
344  }
345 
346  if (acceptable_word_string(*word_res->uch_set,
347  word.unichar_string().c_str(),
348  word.unichar_lengths().c_str()) !=
349  AC_UNACCEPTABLE ||
351  word.unichar_lengths().c_str())) {
352  if (word_res->reject_map.length() > suspect_short_words) {
353  for (i = 0; i < len; i++) {
354  if (word_res->reject_map[i].rejected() &&
355  (!word_res->reject_map[i].perm_rejected() ||
356  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
357  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
358  word_res->reject_map[i].flag (R_MM_REJECT))) {
359  word_res->reject_map[i].setrej_minimal_rej_accept();
360  }
361  }
362  }
363  }

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES * word)

set_word_fonts

Get the fonts for the word.

Definition at line 1961 of file control.cpp.

// Chooses the word's first and second font by summing, per font id, the
// scores of the fonts attached to each blob choice of the best choice.
// Writes word->fontinfo, word->fontinfo2, word->fontinfo_id_count and
// word->fontinfo_id2_count. Does nothing when the word has no chopped_word,
// when the font table is empty, or when DISABLED_LEGACY_ENGINE is defined.
1961  {
1962  // Don't try to set the word fonts for an lstm word, as the configs
1963  // will be meaningless.
1964  if (word->chopped_word == nullptr) return;
1965  ASSERT_HOST(word->best_choice != nullptr);
1966 
1967 #ifndef DISABLED_LEGACY_ENGINE
1968  const int fontinfo_size = get_fontinfo_table().size();
1969  if (fontinfo_size == 0) return;
1970  GenericVector<int> font_total_score;
1971  font_total_score.init_to_size(fontinfo_size, 0);
1972 
1973  // Compute the font scores for the word
1974  if (tessedit_debug_fonts) {
1975  tprintf("Examining fonts in %s\n",
1976  word->best_choice->debug_string().c_str());
1977  }
1978  for (int b = 0; b < word->best_choice->length(); ++b) {
1979  const BLOB_CHOICE* choice = word->GetBlobChoice(b);
1980  if (choice == nullptr) continue;
1981  const GenericVector<ScoredFont>& fonts = choice->fonts();
1982  for (int f = 0; f < fonts.size(); ++f) {
1983  const int fontinfo_id = fonts[f].fontinfo_id;
       // Ids outside [0, fontinfo_size) are ignored rather than indexed.
1984  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1985  font_total_score[fontinfo_id] += fonts[f].score;
1986  }
1987  }
1988  }
1989  // Find the top and 2nd choice for the word.
       // Only strictly positive totals can win: both scores start at 0 and
       // the comparisons are strict, so ids stay -1 when nothing scored.
1990  int score1 = 0, score2 = 0;
1991  int16_t font_id1 = -1, font_id2 = -1;
1992  for (int f = 0; f < fontinfo_size; ++f) {
1993  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1994  tprintf("Font %s, total score = %d\n",
1995  fontinfo_table_.get(f).name, font_total_score[f]);
1996  }
1997  if (font_total_score[f] > score1) {
1998  score2 = score1;
1999  font_id2 = font_id1;
2000  score1 = font_total_score[f];
2001  font_id1 = f;
2002  } else if (font_total_score[f] > score2) {
2003  score2 = font_total_score[f];
2004  font_id2 = f;
2005  }
2006  }
2007  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
2008  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
2009  // Each score has a limit of UINT16_MAX, so divide by that to get the number
2010  // of "votes" for that font, ie number of perfect scores.
       // NOTE(review): the lower clip bound is 1 for the first choice but 0
       // for the second, so fontinfo_id_count is presumably at least 1 even
       // when no font scored and word->fontinfo is nullptr - confirm intended.
2011  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
2012  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
2013  if (score1 > 0) {
       // fi is declared by value and is read only inside the
       // tessedit_debug_fonts branch below.
2014  const FontInfo fi = fontinfo_table_.get(font_id1);
2015  if (tessedit_debug_fonts) {
2016  if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
2017  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2018  fi.name, word->fontinfo_id_count,
2019  fontinfo_table_.get(font_id2).name,
2020  word->fontinfo_id2_count);
2021  } else {
2022  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
2023  fi.name, word->fontinfo_id_count);
2024  }
2025  }
2026  }
2027 #endif // ndef DISABLED_LEGACY_ENGINE
2028 }

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 614 of file tesseractclass.cpp.

614  {
615  // Set the white and blacklists (if any)
617  tessedit_char_whitelist.c_str(),
618  tessedit_char_unblacklist.c_str());
619  if (lstm_recognizer_) {
620  UNICHARSET& lstm_unicharset = lstm_recognizer_->GetUnicharset();
621  lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
622  tessedit_char_whitelist.c_str(),
623  tessedit_char_unblacklist.c_str());
624  }
625  // Black and white lists should apply to all loaded classifiers.
626  for (int i = 0; i < sub_langs_.size(); ++i) {
627  sub_langs_[i]->unicharset.set_black_and_whitelist(
629  tessedit_char_unblacklist.c_str());
630  if (sub_langs_[i]->lstm_recognizer_) {
631  UNICHARSET& lstm_unicharset = sub_langs_[i]->lstm_recognizer_->GetUnicharset();
632  lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
633  tessedit_char_whitelist.c_str(),
634  tessedit_char_unblacklist.c_str());
635  }
636  }
637 }

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect *  detector)

Definition at line 591 of file tesseractclass.cpp.

// Installs `detector` as this instance's equation detector (equ_detect_)
// and hands the detector a pointer back to this Tesseract instance via
// SetLangTesseract(this).
// NOTE(review): `detector` is dereferenced unconditionally on line 593, so a
// nullptr argument would crash here -- confirm callers never pass nullptr.
591  {
592  equ_detect_ = detector;
593  equ_detect_->SetLangTesseract(this);
594 }

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 262 of file tesseractclass.h.

// Stores the scale factor and the scaled color image pointer in
// scaled_factor_ and scaled_color_. Only the pointer is stored; nothing is
// copied or released here.
// NOTE(review): ownership of `color` is not visible from this setter -- confirm.
262  {
263  scaled_factor_ = factor;
264  scaled_color_ = color;
265  }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX *  target_word_box,
const char *  word_config,
PAGE_RES *  page_res,
GenericVector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 154 of file control.cpp.

// Collects the words of page_res into *words and prepares each of them for
// recognition pass pass_n.
//   pass_n          - pass number forwarded to SetupWordPassN.
//   target_word_box - optional filter; when nullptr every word is taken.
//   word_config     - forwarded to ProcessTargetWord for the filter.
//   page_res        - page results to iterate over.
//   words           - output vector; WordData entries are appended to it.
158  {
159  // Prepare all the words.
160  PAGE_RES_IT page_res_it(page_res);
161  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
162  page_res_it.forward()) {
// With a target box, only words for which ProcessTargetWord() returns true
// are kept. NOTE(review): the pass argument given to ProcessTargetWord is the
// literal 1, not pass_n -- confirm that is intended.
163  if (target_word_box == nullptr ||
164  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
165  *target_word_box, word_config, 1)) {
166  words->push_back(WordData(page_res_it));
167  }
168  }
169  // Setup all the words for recognition with polygonal approximation.
// This runs as a second loop, after every push_back above has finished, and
// links each entry to its predecessor through prev_word (entry 0 gets none).
170  for (int w = 0; w < words->size(); ++w) {
171  SetupWordPassN(pass_n, &(*words)[w]);
172  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
173  }
174 }

◆ SetupApplyBoxes()

PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 206 of file applybox.cpp.

208  {
209  PreenXHeights(block_list);
210  // Strip all fuzzy space markers to simplify the PAGE_RES.
211  BLOCK_IT b_it(block_list);
212  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213  BLOCK* block = b_it.data();
214  ROW_IT r_it(block->row_list());
215  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216  ROW* row = r_it.data();
217  WERD_IT w_it(row->word_list());
218  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219  WERD* word = w_it.data();
220  if (word->cblob_list()->empty()) {
221  delete w_it.extract();
222  } else {
223  word->set_flag(W_FUZZY_SP, false);
224  word->set_flag(W_FUZZY_NON, false);
225  }
226  }
227  }
228  }
229  auto* page_res = new PAGE_RES(false, block_list, nullptr);
230  PAGE_RES_IT pr_it(page_res);
231  WERD_RES* word_res;
232  while ((word_res = pr_it.word()) != nullptr) {
233  MaximallyChopWord(boxes, pr_it.block()->block,
234  pr_it.row()->row, word_res);
235  pr_it.forward();
236  }
237  return page_res;

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. This is a somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a nullptr pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 284 of file pagesegmain.cpp.

289  {
290  pixa_debug_.AddPix(pix_binary_, "NoLines");
291  }
292  // Leptonica is used to find a mask of the photo regions in the input.
293  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
295  pixa_debug_.AddPix(pix_binary_, "NoImages");
296  }
297  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
298 
299  // The rest of the algorithm uses the usual connected components.
300  textord_.find_components(pix_binary_, blocks, to_blocks);
301 
302  TO_BLOCK_IT to_block_it(to_blocks);
303  // There must be exactly one input block.
304  // TODO(rays) handle new textline finding with a UNLV zone file.
305  ASSERT_HOST(to_blocks->singleton());
306  TO_BLOCK* to_block = to_block_it.data();
307  TBOX blkbox = to_block->block->pdblk.bounding_box();
308  ColumnFinder* finder = nullptr;
309  int estimated_resolution = source_resolution_;
310  if (source_resolution_ == kMinCredibleResolution) {
311  // Try to estimate resolution from typical body text size.
312  int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
313  if (res > estimated_resolution && res < kMaxCredibleResolution) {
314  estimated_resolution = res;
315  tprintf("Estimating resolution as %d\n", estimated_resolution);
316  }
317  }
318 
319  if (to_block->line_size >= 2) {
320  finder = new ColumnFinder(static_cast<int>(to_block->line_size),
321  blkbox.botleft(), blkbox.topright(),
322  estimated_resolution, textord_use_cjk_fp_model,
324  &h_lines, vertical_x, vertical_y);
325 
326  finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
327 
328 #ifndef DISABLED_LEGACY_ENGINE
329 
330  if (equ_detect_) {
331  equ_detect_->LabelSpecialText(to_block);
332  }
333 
334  BLOBNBOX_CLIST osd_blobs;
335  // osd_orientation is the number of 90 degree rotations to make the
336  // characters upright. (See tesseract/osdetect.h for precise definition.)
337  // We want the text lines horizontal, (vertical text indicates vertical
338  // textlines) which may conflict (eg vertically written CJK).
339  int osd_orientation = 0;
340  bool vertical_text = textord_tabfind_force_vertical_text ||
341  pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
342  if (!vertical_text && textord_tabfind_vertical_text &&
343  PSM_ORIENTATION_ENABLED(pageseg_mode)) {
344  vertical_text =
345  finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
346  to_block, &osd_blobs);
347  }
348  if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
349  GenericVector<int> osd_scripts;
350  if (osd_tess != this) {
351  // We are running osd as part of layout analysis, so constrain the
352  // scripts to those allowed by *this.
353  AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
354  for (int s = 0; s < sub_langs_.size(); ++s) {
355  AddAllScriptsConverted(sub_langs_[s]->unicharset,
356  osd_tess->unicharset, &osd_scripts);
357  }
358  }
359  os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
360  if (pageseg_mode == PSM_OSD_ONLY) {
361  delete finder;
362  return nullptr;
363  }
364  osd_orientation = osr->best_result.orientation_id;
365  double osd_score = osr->orientations[osd_orientation];
366  double osd_margin = min_orientation_margin * 2;
367  for (int i = 0; i < 4; ++i) {
368  if (i != osd_orientation &&
369  osd_score - osr->orientations[i] < osd_margin) {
370  osd_margin = osd_score - osr->orientations[i];
371  }
372  }
373  int best_script_id = osr->best_result.script_id;
374  const char* best_script_str =
375  osd_tess->unicharset.get_script_from_script_id(best_script_id);
376  bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
377  best_script_id == osd_tess->unicharset.hiragana_sid() ||
378  best_script_id == osd_tess->unicharset.katakana_sid() ||
379  strcmp("Japanese", best_script_str) == 0 ||
380  strcmp("Korean", best_script_str) == 0 ||
381  strcmp("Hangul", best_script_str) == 0;
382  if (cjk) {
383  finder->set_cjk_script(true);
384  }
385  if (osd_margin < min_orientation_margin) {
386  // The margin is weak.
387  if (!cjk && !vertical_text && osd_orientation == 2) {
388  // upside down latin text is improbable with such a weak margin.
389  tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
390  "Don't rotate.\n", osd_margin);
391  osd_orientation = 0;
392  } else {
393  tprintf(
394  "OSD: Weak margin (%.2f) for %d blob text block, "
395  "but using orientation anyway: %d\n",
396  osd_margin, osd_blobs.length(), osd_orientation);
397  }
398  }
399  }
400  osd_blobs.shallow_clear();
401  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
402 
403 #endif // ndef DISABLED_LEGACY_ENGINE
404  }
405 
406  return finder;
407 }
408 
409 } // namespace tesseract.

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 447 of file tessedit.cpp.

454  {
455  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
456  nullptr, 0, nullptr, nullptr, false, mgr))
457  return -1;
459  getDict().Load(lang, mgr);
460  getDict().FinishLoad();
461  return 0;
462 }
463 
464 #endif // ndef DISABLED_LEGACY_ENGINE
465 

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData *  word 
)

Definition at line 177 of file control.cpp.

177  {
178  if (pass_n == 1 || !word->word->done) {
179  if (pass_n == 1) {
180  word->word->SetupForRecognition(unicharset, this, BestPix(),
181  tessedit_ocr_engine_mode, nullptr,
185  word->row, word->block);
186  } else if (pass_n == 2) {
187  // TODO(rays) Should we do this on pass1 too?
188  word->word->caps_height = 0.0;
189  if (word->word->x_height == 0.0f)
190  word->word->x_height = word->row->x_height();
191  }
192  word->lang_words.truncate(0);
193  for (int s = 0; s <= sub_langs_.size(); ++s) {
194  // The sub_langs_.size() entry is for the master language.
195  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
196  auto* word_res = new WERD_RES;
197  word_res->InitForRetryRecognition(*word->word);
198  word->lang_words.push_back(word_res);
199  // LSTM doesn't get setup for pass2.
200  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
201  word_res->SetupForRecognition(
202  lang_t->unicharset, lang_t, BestPix(),
203  lang_t->tessedit_ocr_engine_mode, nullptr,
204  lang_t->classify_bln_numeric_mode,
205  lang_t->textord_use_cjk_fp_model,
206  lang_t->poly_allow_detailed_fx, word->row, word->block);
207  }
208  }
209  }
210 }

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 244 of file tesseractclass.h.

// Read-only accessor: returns the stored source_resolution_ value.
244  {
245  return source_resolution_;
246  }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES *  word)

Definition at line 137 of file tfacepp.cpp.

// Splits `word` at the widest horizontal gap between adjacent blobs of its
// chopped_word, recognizes the two halves separately with
// recog_word_recursive(), and then joins the results back into `word` via
// join_words().
137  {
138  // Find the biggest blob gap in the chopped_word.
// bestgap starts at -INT32_MAX so any measured gap replaces it.
139  int bestgap = -INT32_MAX;
140  int split_index = 0;
141  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
142  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
143  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
// Gap = left edge of this blob minus right edge of the previous one.
144  int gap = blob_box.left() - prev_box.right();
// Strict '>' means the earliest of several equally wide gaps is chosen.
145  if (gap > bestgap) {
146  bestgap = gap;
147  split_index = b;
148  }
149  }
// split_index is still 0 when the loop never ran (fewer than two blobs), so
// this assertion rejects words that cannot be split.
150  ASSERT_HOST(split_index > 0);
151 
// split_word() fills word2 with the right-hand piece and orig_bb with a saved
// blamer bundle; both are handed to join_words() below.
152  WERD_RES *word2 = nullptr;
153  BlamerBundle *orig_bb = nullptr;
154  split_word(word, split_index, &word2, &orig_bb);
155 
156  // Recognize the first part of the word.
157  recog_word_recursive(word);
158  // Recognize the second part of the word.
159  recog_word_recursive(word2);
160 
161  join_words(word, word2, orig_bb);
162 }

◆ split_word()

void tesseract::Tesseract::split_word ( WERD_RES *  word,
int  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 174 of file tfacepp.cpp.

178  {
179  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
180 
181  // Save a copy of the blamer bundle so we can try to reconstruct it below.
182  BlamerBundle *orig_bb =
183  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
184 
185  auto *word2 = new WERD_RES(*word);
186 
187  // blow away the copied chopped_word, as we want to work with
188  // the blobs from the input chopped_word so seam_arrays can be merged.
189  TWERD *chopped = word->chopped_word;
190  auto *chopped2 = new TWERD;
191  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
192  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
193  chopped2->blobs.push_back(chopped->blobs[i]);
194  }
195  chopped->blobs.truncate(split_pt);
196  word->chopped_word = nullptr;
197  delete word2->chopped_word;
198  word2->chopped_word = nullptr;
199 
200  const UNICHARSET &unicharset = *word->uch_set;
201  word->ClearResults();
202  word2->ClearResults();
203  word->chopped_word = chopped;
204  word2->chopped_word = chopped2;
206  word2->SetupBasicsFromChoppedWord(unicharset);
207 
208  // Try to adjust the blamer bundle.
209  if (orig_bb != nullptr) {
210  // TODO(rays) Looks like a leak to me.
211  // orig_bb should take, rather than copy.
212  word->blamer_bundle = new BlamerBundle();
213  word2->blamer_bundle = new BlamerBundle();
214  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
215  word2->chopped_word->blobs[0]->bounding_box().left(),
217  word->blamer_bundle, word2->blamer_bundle);
218  }
219 
220  *right_piece = word2;
221  *orig_blamer_bundle = orig_bb;

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES *  word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 100 of file superscript.cpp.

101  {
102  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
103  !word->best_choice) {
104  return false;
105  }
106  int num_leading, num_trailing;
107  ScriptPos sp_leading, sp_trailing;
108  float leading_certainty, trailing_certainty;
109  float avg_certainty, unlikely_threshold;
110 
111  // Calculate the number of whole suspicious characters at the edges.
113  word, &num_leading, &sp_leading, &leading_certainty,
114  &num_trailing, &sp_trailing, &trailing_certainty,
115  &avg_certainty, &unlikely_threshold);
116 
117  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
118  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
119 
120  int num_blobs = word->best_choice->length();
121 
122  // Calculate the remainder (partial characters) at the edges.
123  // This accounts for us having classified the best version of
124  // a word as [speaker?'] when it was instead [speaker.^{21}]
125  // (that is we accidentally thought the 2 was attached to the period).
126  int num_remainder_leading = 0, num_remainder_trailing = 0;
127  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
128  int super_y_bottom =
130  int sub_y_top =
132  int last_word_char = num_blobs - 1 - num_trailing;
133  float last_char_certainty = word->best_choice->certainty(last_word_char);
134  if (word->best_choice->unichar_id(last_word_char) != 0 &&
135  last_char_certainty <= unlikely_threshold) {
136  ScriptPos rpos;
137  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
138  nullptr, nullptr, &rpos, &num_remainder_trailing);
139  if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
140  if (num_remainder_trailing > 0 &&
141  last_char_certainty < trailing_certainty) {
142  trailing_certainty = last_char_certainty;
143  }
144  }
145  bool another_blob_available = (num_remainder_trailing == 0) ||
146  num_leading + num_trailing + 1 < num_blobs;
147  int first_char_certainty = word->best_choice->certainty(num_leading);
148  if (another_blob_available &&
149  word->best_choice->unichar_id(num_leading) != 0 &&
150  first_char_certainty <= unlikely_threshold) {
151  ScriptPos lpos;
152  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
153  &lpos, &num_remainder_leading, nullptr, nullptr);
154  if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
155  if (num_remainder_leading > 0 &&
156  first_char_certainty < leading_certainty) {
157  leading_certainty = first_char_certainty;
158  }
159  }
160  }
161 
162  // If nothing to do, bail now.
163  if (num_leading + num_trailing +
164  num_remainder_leading + num_remainder_trailing == 0) {
165  return false;
166  }
167 
168  if (superscript_debug >= 1) {
169  tprintf("Candidate for superscript detection: %s (",
170  word->best_choice->unichar_string().c_str());
171  if (num_leading || num_remainder_leading) {
172  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
173  leading_pos);
174  }
175  if (num_trailing || num_remainder_trailing) {
176  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
177  trailing_pos);
178  }
179  tprintf(")\n");
180  }
181  if (superscript_debug >= 3) {
182  word->best_choice->print();
183  }
184  if (superscript_debug >= 2) {
185  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
186  avg_certainty, unlikely_threshold);
187  if (num_leading)
188  tprintf("Orig. leading (min): %.2f ", leading_certainty);
189  if (num_trailing)
190  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
191  tprintf("\n");
192  }
193 
194  // We've now calculated the number of rebuilt blobs we want to carve off.
195  // However, split_word() works from TBLOBs in chopped_word, so we need to
196  // convert to those.
197  int num_chopped_leading =
198  LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
199  int num_chopped_trailing =
200  TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
201 
202  int retry_leading = 0;
203  int retry_trailing = 0;
204  bool is_good = false;
205  WERD_RES *revised = TrySuperscriptSplits(
206  num_chopped_leading, leading_certainty, sp_leading,
207  num_chopped_trailing, trailing_certainty, sp_trailing,
208  word, &is_good, &retry_leading, &retry_trailing);
209  if (is_good) {
210  word->ConsumeWordResults(revised);
211  } else if (retry_leading || retry_trailing) {
212  int retry_chopped_leading =
213  LeadingUnicharsToChopped(revised, retry_leading);
214  int retry_chopped_trailing =
215  TrailingUnicharsToChopped(revised, retry_trailing);
216  WERD_RES *revised2 = TrySuperscriptSplits(
217  retry_chopped_leading, leading_certainty, sp_leading,
218  retry_chopped_trailing, trailing_certainty, sp_trailing,
219  revised, &is_good, &retry_leading, &retry_trailing);
220  if (is_good) {
221  word->ConsumeWordResults(revised2);
222  }
223  delete revised2;
224  }
225  delete revised;
226  return is_good;

◆ terrible_word_crunch()

bool tesseract::Tesseract::terrible_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 482 of file docqual.cpp.

485  {
486  float rating_per_ch;
487  int adjusted_len;
488  int crunch_mode = 0;
489 
490  if ((word->best_choice->unichar_string().length() == 0) ||
491  (strspn(word->best_choice->unichar_string().c_str(), " ") ==
493  crunch_mode = 1;
494  else {
495  adjusted_len = word->reject_map.length ();
496  if (adjusted_len > crunch_rating_max)
497  adjusted_len = crunch_rating_max;
498  rating_per_ch = word->best_choice->rating () / adjusted_len;
499 
500  if (rating_per_ch > crunch_terrible_rating)
501  crunch_mode = 2;
502  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
503  crunch_mode = 3;
504  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
505  (garbage_level != G_OK))
506  crunch_mode = 4;
507  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
508  (garbage_level != G_OK))
509  crunch_mode = 5;
510  }
511  if (crunch_mode > 0) {
512  if (crunch_debug > 2) {
513  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
514  crunch_mode, word->best_choice->unichar_string().c_str());
515  }
516  return true;
517  }
518  else

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES word)

Definition at line 61 of file tessbox.cpp.

62  {
63  return getDict().AcceptableResult(word);

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE *  word_choice)

Definition at line 71 of file tessbox.cpp.

72  {
73  getDict().add_document_word(*word_choice);

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES *  word 
)

Definition at line 31 of file tessbox.cpp.

31  {
32 void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
33  int saved_enable_assoc = 0;
34  int saved_chop_enable = 0;
35 
36  if (word->word->flag(W_DONT_CHOP)) {
37  saved_enable_assoc = wordrec_enable_assoc;
38  saved_chop_enable = chop_enable;
39  wordrec_enable_assoc.set_value(0);
40  chop_enable.set_value(0);
41  }
42  if (pass_n == 1)
43  set_pass1();
44  else
45  set_pass2();
46  recog_word(word);
47  if (word->best_choice == nullptr)
48  word->SetupFake(*word->uch_set);
49  if (word->word->flag(W_DONT_CHOP)) {
50  wordrec_enable_assoc.set_value(saved_enable_assoc);
51  chop_enable.set_value(saved_chop_enable);
52  }

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES word,
BLOCK block,
ROW row 
)

Definition at line 1518 of file control.cpp.

1520  {
1521  bool accept_new_x_ht = false;
1522  WERD_RES new_x_ht_word(word->word);
1523  if (word->blamer_bundle != nullptr) {
1524  new_x_ht_word.blamer_bundle = new BlamerBundle();
1525  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1526  }
1527  new_x_ht_word.x_height = new_x_ht;
1528  new_x_ht_word.baseline_shift = baseline_shift;
1529  new_x_ht_word.caps_height = 0.0;
1530  new_x_ht_word.SetupForRecognition(
1531  unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1533  poly_allow_detailed_fx, row, block);
1534  match_word_pass_n(2, &new_x_ht_word, row, block);
1535  if (!new_x_ht_word.tess_failed) {
1536  int new_misfits = CountMisfitTops(&new_x_ht_word);
1537  if (debug_x_ht_level >= 1) {
1538  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1539  original_misfits, word->x_height,
1540  new_misfits, new_x_ht);
1541  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1542  word->best_choice->rating(), word->best_choice->certainty(),
1543  new_x_ht_word.best_choice->rating(),
1544  new_x_ht_word.best_choice->certainty());
1545  }
1546  // The misfits must improve and either the rating or certainty.
1547  accept_new_x_ht = new_misfits < original_misfits &&
1548  (new_x_ht_word.best_choice->certainty() >
1549  word->best_choice->certainty() ||
1550  new_x_ht_word.best_choice->rating() <
1551  word->best_choice->rating());
1552  if (debug_x_ht_level >= 1) {
1553  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1554  }
1555  }
1556  if (accept_new_x_ht) {
1557  word->ConsumeWordResults(&new_x_ht_word);
1558  return true;
1559  }
1560  return false;
1561 }

◆ textord()

const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 266 of file tesseractclass.h.

// Read-only accessor: returns a const reference to the textord_ member.
266  {
267  return textord_;
268  }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT &  page_res_it)

Definition at line 396 of file docqual.cpp.

397  {
398 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
399  WERD_RES *word;
400  GARBAGE_LEVEL garbage_level;
401  PAGE_RES_IT copy_it;
402  bool prev_potential_marked = false;
403  bool found_terrible_word = false;
404  bool ok_dict_word;
405 
406  page_res_it.restart_page();
407  while (page_res_it.word() != nullptr) {
408  POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
409  if (pb != nullptr && !pb->IsText()) {
410  page_res_it.forward();
411  continue;
412  }
413  word = page_res_it.word();
414 
416  convert_bad_unlv_chs(word);
417 
419  word->merge_tess_fails();
420 
421  if (word->reject_map.accept_count () != 0) {
422  found_terrible_word = false;
423  //Forget earlier potential crunches
424  prev_potential_marked = false;
425  }
426  else {
427  ok_dict_word = safe_dict_word(word);
428  garbage_level = garbage_word(word, ok_dict_word);
429 
430  if ((garbage_level != G_NEVER_CRUNCH) &&
431  (terrible_word_crunch (word, garbage_level))) {
432  if (crunch_debug > 0) {
433  tprintf ("T CRUNCHING: \"%s\"\n",
434  word->best_choice->unichar_string().c_str());
435  }
437  if (prev_potential_marked) {
438  while (copy_it.word () != word) {
439  if (crunch_debug > 0) {
440  tprintf ("P1 CRUNCHING: \"%s\"\n",
441  copy_it.word()->best_choice->unichar_string().c_str());
442  }
443  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
444  copy_it.forward ();
445  }
446  prev_potential_marked = false;
447  }
448  found_terrible_word = true;
449  }
450  else if ((garbage_level != G_NEVER_CRUNCH) &&
451  (potential_word_crunch (word,
452  garbage_level, ok_dict_word))) {
453  if (found_terrible_word) {
454  if (crunch_debug > 0) {
455  tprintf ("P2 CRUNCHING: \"%s\"\n",
456  word->best_choice->unichar_string().c_str());
457  }
459  }
460  else if (!prev_potential_marked) {
461  copy_it = page_res_it;
462  prev_potential_marked = true;
463  if (crunch_debug > 1) {
464  tprintf ("P3 CRUNCHING: \"%s\"\n",
465  word->best_choice->unichar_string().c_str());
466  }
467  }
468  }
469  else {
470  found_terrible_word = false;
471  //Forget earlier potential crunches
472  prev_potential_marked = false;
473  if (crunch_debug > 2) {
474  tprintf ("NO CRUNCH: \"%s\"\n",
475  word->best_choice->unichar_string().c_str());
476  }
477  }
478  }
479  page_res_it.forward ();

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 568 of file docqual.cpp.

570  {
571  WERD_RES *word;
572  PAGE_RES_IT copy_it;
573  bool deleting_from_bol = false;
574  bool marked_delete_point = false;
575  int16_t debug_delete_mode;
576  CRUNCH_MODE delete_mode;
577  int16_t x_debug_delete_mode;
578  CRUNCH_MODE x_delete_mode;
579 
580  page_res_it.restart_page();
581  while (page_res_it.word() != nullptr) {
582  word = page_res_it.word();
583 
584  delete_mode = word_deletable (word, debug_delete_mode);
585  if (delete_mode != CR_NONE) {
586  if (word->word->flag (W_BOL) || deleting_from_bol) {
587  if (crunch_debug > 0) {
588  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
589  debug_delete_mode,
590  word->best_choice->unichar_string().c_str());
591  }
592  word->unlv_crunch_mode = delete_mode;
593  deleting_from_bol = true;
594  } else if (word->word->flag(W_EOL)) {
595  if (marked_delete_point) {
596  while (copy_it.word() != word) {
597  x_delete_mode = word_deletable (copy_it.word (),
598  x_debug_delete_mode);
599  if (crunch_debug > 0) {
600  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
601  x_debug_delete_mode,
602  copy_it.word()->best_choice->unichar_string().c_str());
603  }
604  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
605  copy_it.forward ();
606  }
607  }
608  if (crunch_debug > 0) {
609  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
610  debug_delete_mode,
611  word->best_choice->unichar_string().c_str());
612  }
613  word->unlv_crunch_mode = delete_mode;
614  deleting_from_bol = false;
615  marked_delete_point = false;
616  }
617  else {
618  if (!marked_delete_point) {
619  copy_it = page_res_it;
620  marked_delete_point = true;
621  }
622  }
623  }
624  else {
625  deleting_from_bol = false;
626  //Forget earlier potential crunches
627  marked_delete_point = false;
628  }
629  /*
630  The following step has been left till now as the tess fails are used to
631  determine if the word is deletable.
632  */
634  word->merge_tess_fails();
635  page_res_it.forward ();

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES word,
BLOCK block,
ROW row 
)

Definition at line 1484 of file control.cpp.

// Tries to reduce the number of "misfit tops" reported by CountMisfitTops()
// for `word` by re-testing it with a different baseline shift and/or
// x-height through TestNewNormalization().
// Returns false when there are no misfits to fix, when the shift-only trial
// is rejected, or when the proposed x-height is below
// kMinRefitXHeightFraction * word->x_height. Returns true once a baseline
// shift has been accepted; with no shift, returns the result of the
// x-height-only trial.
1484  {
1485  int original_misfits = CountMisfitTops(word);
1486  if (original_misfits == 0)
1487  return false;
1488  float baseline_shift = 0.0f;
1489  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1490  if (baseline_shift != 0.0f) {
1491  // Try the shift on its own first.
1492  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1493  word, block, row))
1494  return false;
// The shift was accepted, so recount misfits on the updated word.
1495  original_misfits = CountMisfitTops(word);
1496  if (original_misfits > 0) {
// NOTE(review): new_baseline_shift is filled in by the call below but never
// read; the second trial reuses the original baseline_shift. Confirm intent.
1497  float new_baseline_shift;
1498  // Now recompute the new x_height.
1499  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1500  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1501  // No test of return value here, as we are definitely making a change
1502  // to the word by shifting the baseline.
1503  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1504  word, block, row);
1505  }
1506  }
1507  return true;
1508  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
// No baseline shift proposed: try the new x-height alone with a zero shift.
1509  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1510  word, block, row);
1511  } else {
1512  return false;
1513  }
1514 }

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes ( const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
BLOCK_LIST *  block_list,
DocumentData *  training_data 
)

Definition at line 80 of file linerec.cpp.

// Groups the parallel `boxes` / `texts` arrays into text lines (a "\t" text
// entry ends a line), finds the text block in block_list that best overlaps
// each line, builds line data with GetLineData() and adds every non-null
// result to training_data.
// NOTE(review): texts is indexed with box indices (texts[start_box],
// texts[end_box] for end_box < box_count), which assumes texts has at least
// boxes.size() entries -- confirm against the caller.
83  {
84  int box_count = boxes.size();
85  // Process all the text lines in this page, as defined by the boxes.
86  int end_box = 0;
87  // Don't let \t, which marks newlines in the box file, get into the line
88  // content, as that makes the line unusable in training.
89  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
// Each iteration handles one line [start_box, end_box); the next line starts
// where the trailing "\t" skip at the bottom of the loop leaves end_box.
90  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
91  // Find the textline of boxes starting at start and their bounding box.
92  TBOX line_box = boxes[start_box];
93  STRING line_str = texts[start_box];
94  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
95  ++end_box) {
96  line_box += boxes[end_box];
97  line_str += texts[end_box];
98  }
99  // Find the most overlapping block.
// best_overlap starts at 0, so a block is chosen only if its intersection
// with the line has a strictly positive area.
100  BLOCK* best_block = nullptr;
101  int best_overlap = 0;
102  BLOCK_IT b_it(block_list);
103  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
104  BLOCK* block = b_it.data();
105  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
106  continue; // Not a text block.
// The block box is rotated by re_rotation() before comparing with line_box;
// presumably this puts both in the same coordinate space -- TODO confirm.
107  TBOX block_box = block->pdblk.bounding_box();
108  block_box.rotate(block->re_rotation());
109  if (block_box.major_overlap(line_box)) {
110  TBOX overlap_box = line_box.intersection(block_box);
111  if (overlap_box.area() > best_overlap) {
112  best_overlap = overlap_box.area();
113  best_block = block;
114  }
115  }
116  }
// A line with no overlapping text block is reported and skipped.
117  ImageData* imagedata = nullptr;
118  if (best_block == nullptr) {
119  tprintf("No block overlapping textline: %s\n", line_str.c_str());
120  } else {
121  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
122  *best_block);
123  }
// GetLineData() may return nullptr; only non-null results are added.
124  if (imagedata != nullptr)
125  training_data->AddPageToDocument(imagedata);
126  // Don't let \t, which marks newlines in the box file, get into the line
127  // content, as that makes the line unusable in training.
128  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
129  }
130 }

◆ TrainLineRecognizer()

bool tesseract::Tesseract::TrainLineRecognizer ( const STRING & input_imagename,
const STRING & output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 43 of file linerec.cpp.

45  {
46  STRING lstmf_name = output_basename + ".lstmf";
47  DocumentData images(lstmf_name);
48  if (applybox_page > 0) {
49  // Load existing document for the previous pages.
50  if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
51  tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
52  return false;
53  }
54  }
55  GenericVector<TBOX> boxes;
57  // Get the boxes for this page, if there are any.
58  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
59  nullptr) ||
60  boxes.empty()) {
61  tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
62  return false;
63  }
64  TrainFromBoxes(boxes, texts, block_list, &images);
65  if (images.PagesSize() == 0) {
66  tprintf("Failed to read pages from %s\n", input_imagename.c_str());
67  return false;
68  }
69  images.Shuffle();
70  if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
71  tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
72  return false;
73  }
74  return true;
75 }

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES * word,
bool *  is_good,
int *  retry_rebuild_leading,
int *  retry_rebuild_trailing 
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in] leading_certainty: the (minimum) certainty of the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging)
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in] trailing_certainty: the (minimum) certainty of the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging)
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 381 of file superscript.cpp.

388  {
389  int num_chopped = word->chopped_word->NumBlobs();
390 
391  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
392 
393  // Chop apart the word into up to three pieces.
394 
395  BlamerBundle *bb0 = nullptr;
396  BlamerBundle *bb1 = nullptr;
397  WERD_RES *prefix = nullptr;
398  WERD_RES *core = nullptr;
399  WERD_RES *suffix = nullptr;
400  if (num_chopped_leading > 0) {
401  prefix = new WERD_RES(*word);
402  split_word(prefix, num_chopped_leading, &core, &bb0);
403  } else {
404  core = new WERD_RES(*word);
405  }
406 
407  if (num_chopped_trailing > 0) {
408  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
409  split_word(core, split_pt, &suffix, &bb1);
410  }
411 
412  // Recognize the pieces in turn.
413  int saved_cp_multiplier = classify_class_pruner_multiplier;
414  int saved_im_multiplier = classify_integer_matcher_multiplier;
415  if (prefix) {
416  // Turn off Tesseract's y-position penalties for the leading superscript.
419 
420  // Adjust our expectations about the baseline for this prefix.
421  if (superscript_debug >= 3) {
422  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
423  }
424  recog_word_recursive(prefix);
425  if (superscript_debug >= 2) {
426  tprintf(" The leading bits look like %s %s\n",
427  ScriptPosToString(leading_pos),
428  prefix->best_choice->unichar_string().c_str());
429  }
430 
431  // Restore the normal y-position penalties.
432  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
433  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
434  }
435 
436  if (superscript_debug >= 3) {
437  tprintf(" recognizing middle %d chopped blobs\n",
438  num_chopped - num_chopped_leading - num_chopped_trailing);
439  }
440 
441  if (suffix) {
442  // Turn off Tesseract's y-position penalties for the trailing superscript.
445 
446  if (superscript_debug >= 3) {
447  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
448  }
449  recog_word_recursive(suffix);
450  if (superscript_debug >= 2) {
451  tprintf(" The trailing bits look like %s %s\n",
452  ScriptPosToString(trailing_pos),
453  suffix->best_choice->unichar_string().c_str());
454  }
455 
456  // Restore the normal y-position penalties.
457  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
458  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
459  }
460 
461  // Evaluate whether we think the results are believably better
462  // than what we already had.
463  bool good_prefix = !prefix || BelievableSuperscript(
464  superscript_debug >= 1, *prefix,
465  superscript_bettered_certainty * leading_certainty,
466  retry_rebuild_leading, nullptr);
467  bool good_suffix = !suffix || BelievableSuperscript(
468  superscript_debug >= 1, *suffix,
469  superscript_bettered_certainty * trailing_certainty,
470  nullptr, retry_rebuild_trailing);
471 
472  *is_good = good_prefix && good_suffix;
473  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
474  // None of it is any good. Quit now.
475  delete core;
476  delete prefix;
477  delete suffix;
478  delete bb1;
479  return nullptr;
480  }
481  recog_word_recursive(core);
482 
483  // Now paste the results together into core.
484  if (suffix) {
485  suffix->SetAllScriptPositions(trailing_pos);
486  join_words(core, suffix, bb1);
487  }
488  if (prefix) {
489  prefix->SetAllScriptPositions(leading_pos);
490  join_words(prefix, core, bb0);
491  core = prefix;
492  prefix = nullptr;
493  }
494 
495  if (superscript_debug >= 1) {
496  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
497  core->best_choice->unichar_string().c_str());
498  }
499  return core;

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word)

Definition at line 112 of file docqual.cpp.

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT & page_res_it)

Definition at line 154 of file docqual.cpp.

154  {
155  word = page_res_it.word ();
156  for (i = 0; i < word->reject_map.length (); i++) {
157  if (word->reject_map[i].accept_if_good_quality ())
158  word->reject_map[i].setrej_quality_accept ();
159  }
160  page_res_it.forward ();
161  }
162  else if ((page_res_it.row ()->char_count > 0) &&
163  ((page_res_it.row ()->rej_count /
164  static_cast<float>(page_res_it.row ()->char_count)) <=
166  word = page_res_it.word ();
167  if (word->reject_map.quality_recoverable_rejects() &&
169  acceptable_word_string(*word->uch_set,
170  word->best_choice->unichar_string().c_str(),
171  word->best_choice->unichar_lengths().c_str())
172  != AC_UNACCEPTABLE)) {
173  unrej_good_chs(word);
174  }
175  page_res_it.forward ();
176  }
177  else {
178  // Skip to end of dodgy row.
179  current_row = page_res_it.row ();
180  while ((page_res_it.word () != nullptr) &&
181  (page_res_it.row () == current_row))
182  page_res_it.forward ();
183  }
184  check_debug_pt (page_res_it.word (), 110);
185  }
186  page_res_it.restart_page ();
187  page_res_it.page_res->char_count = 0;
188  page_res_it.page_res->rej_count = 0;
189  current_block = nullptr;
190  current_row = nullptr;
191  while (page_res_it.word () != nullptr) {
192  if (current_block != page_res_it.block ()) {
193  current_block = page_res_it.block ();
194  current_block->char_count = 0;
195  current_block->rej_count = 0;
196  }
197  if (current_row != page_res_it.row ()) {
198  current_row = page_res_it.row ();
199  current_row->char_count = 0;
200  current_row->rej_count = 0;
201  current_row->whole_word_rej_count = 0;
202  }
203  page_res_it.rej_stat_word ();
204  page_res_it.forward ();
205  }
206 }
207 
208 
209 /*************************************************************************
210  * doc_and_block_rejection()
211  *
212  * If the page has too many rejects - reject all of it.
213  * If any block has too many rejects - reject all words in the block
214  *************************************************************************/
215 
216 void Tesseract::doc_and_block_rejection( //reject big chunks

◆ word_adaptable()

bool tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uint16_t  mode 
)

Definition at line 50 of file adaptions.cpp.

59  {
60  if (tessedit_adaption_debug) tprintf("adaption disabled\n");
61  return false;
62  }
63 
64  if (flags.bit (ADAPTABLE_WERD)) {
65  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
66  if (tessedit_adaption_debug && !status) {
67  tprintf("tess_would_adapt bit is false\n");
68  }
69  }
70 
71  if (flags.bit (ACCEPTABLE_WERD)) {
72  status |= word->tess_accepted;
73  if (tessedit_adaption_debug && !status) {
74  tprintf("tess_accepted bit is false\n");
75  }
76  }
77 
78  if (!status) { // If not set then
79  return false; // ignore other checks
80  }
81 
82  if (flags.bit (CHECK_DAWGS) &&
83  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
84  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
85  (word->best_choice->permuter () != USER_DAWG_PERM) &&
86  (word->best_choice->permuter () != NUMBER_PERM)) {
87  if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
88  return false;
89  }
90 
91  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, false)) {
92  if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
93  return false;
94  }
95 
96  if (flags.bit (CHECK_SPACES) &&
97  (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
98  if (tessedit_adaption_debug) tprintf("word contains spaces\n");
99  return false;
100  }
101 
102  if (flags.bit (CHECK_AMBIG_WERD) &&
104  if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
105  return false;
106  }
107 
109  tprintf("returning status %d\n", status);
110  }
111  return status;
112 }
113 
114 } // namespace tesseract

◆ word_blank_and_set_display()

bool tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_its)

Definition at line 705 of file pgedit.cpp.

708  {
709  WERD_RES* word_res = pr_it->word();

◆ word_bln_display()

bool tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 717 of file pgedit.cpp.

724  {
725  it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
726  bln_word_window_handle());
727  color = WERD::NextColor(color);
728  }
729  bln_word_window_handle()->Update();
730  return true;
731 }
732 
733 
734 

◆ word_blob_quality()

int16_t tesseract::Tesseract::word_blob_quality ( WERD_RES * word)

Definition at line 64 of file docqual.cpp.

67  {
68  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
69  TBLOB* blob = word->rebuild_word->blobs[b];
70  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
71  blob->NumOutlines());
72  i++;
73  }
74  }

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality ( WERD_RES * word,
int16_t *  match_count,
int16_t *  accepted_match_count 
)

Definition at line 95 of file docqual.cpp.

101  {
102  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
103  word->rebuild_word->blobs.empty()) {
104  using namespace std::placeholders; // for _1
106  *word->rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
107  }

◆ word_contains_non_1_digit()

bool tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 508 of file reject.cpp.

511  {
512  int16_t i;
513  int16_t offset;
514 
515  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
516  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
517  (word_lengths[i] != 1 || word[offset] != '1'))
518  return true;
519  }

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES * word,
int16_t &  delete_mode 
)

Definition at line 875 of file docqual.cpp.

878  {
879  int word_len = word->reject_map.length ();
880  float rating_per_ch;
881  TBOX box; //BB of word
882 
883  if (word->unlv_crunch_mode == CR_NONE) {
884  delete_mode = 0;
885  return CR_NONE;
886  }
887 
888  if (word_len == 0) {
889  delete_mode = 1;
890  return CR_DELETE;
891  }
892 
893  if (word->rebuild_word != nullptr) {
894  // Cube leaves rebuild_word nullptr.
895  box = word->rebuild_word->bounding_box();
896  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
897  delete_mode = 4;
898  return CR_DELETE;
899  }
900 
901  if (noise_outlines(word->rebuild_word)) {
902  delete_mode = 5;
903  return CR_DELETE;
904  }
905  }
906 
907  if ((failure_count (word) * 1.5) > word_len) {
908  delete_mode = 2;
909  return CR_LOOSE_SPACE;
910  }
911 
912  if (word->best_choice->certainty () < crunch_del_cert) {
913  delete_mode = 7;
914  return CR_LOOSE_SPACE;
915  }
916 
917  rating_per_ch = word->best_choice->rating () / word_len;
918 
919  if (rating_per_ch > crunch_del_rating) {
920  delete_mode = 8;
921  return CR_LOOSE_SPACE;
922  }
923 
925  delete_mode = 9;
926  return CR_LOOSE_SPACE;
927  }
928 
929  if (box.bottom () >
931  delete_mode = 10;
932  return CR_LOOSE_SPACE;
933  }
934 
935  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
936  delete_mode = 11;
937  return CR_LOOSE_SPACE;
938  }
939 
940  if (box.width () < crunch_del_min_width * kBlnXHeight) {
941  delete_mode = 3;
942  return CR_LOOSE_SPACE;
943  }
944 

◆ word_display()

bool tesseract::Tesseract::word_display ( PAGE_RES_IT * pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 749 of file pgedit.cpp.

755  {
757  switch (color_mode) {
758  case CM_SUBSCRIPT:
759  if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
760  color = ScrollView::RED;
761  break;
762  case CM_SUPERSCRIPT:
763  if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
764  color = ScrollView::RED;
765  break;
766  case CM_ITALIC:
767  if (font_info.is_italic())
768  color = ScrollView::RED;
769  break;
770  case CM_BOLD:
771  if (font_info.is_bold())
772  color = ScrollView::RED;
773  break;
774  case CM_FIXEDPITCH:
775  if (font_info.is_fixed_pitch())
776  color = ScrollView::RED;
777  break;
778  case CM_SERIF:
779  if (font_info.is_serif())
780  color = ScrollView::RED;
781  break;
782  case CM_SMALLCAPS:
783  if (word_res->small_caps)
784  color = ScrollView::RED;
785  break;
786  case CM_DROPCAPS:
787  if (best_choice->BlobPosition(i) == SP_DROPCAP)
788  color = ScrollView::RED;
789  break;
790  // TODO(rays) underline is currently completely unsupported.
791  case CM_UNDERLINE:
792  default:
793  break;
794  }
795  image_win->Pen(color);
796  TBOX box = box_word->BlobBox(i);
797  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
798  }
799  return true;
800  #else
801  return false;
802  #endif // ndef DISABLED_LEGACY_ENGINE
803  }
804  /*
805  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
806  etc. are to keep the compiler happy.
807  */
808  // display bounding box
809  if (word->display_flag(DF_BOX)) {
810  word->bounding_box().plot(image_win,
811  static_cast<ScrollView::Color>((int32_t)
813  static_cast<ScrollView::Color>((int32_t)
815 
816  auto c = static_cast<ScrollView::Color>((int32_t) editor_image_blob_bb_color);
817  image_win->Pen(c);
818  // cblob iterator
819  C_BLOB_IT c_it(word->cblob_list());
820  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
821  c_it.data()->bounding_box().plot(image_win);
822  displayed_something = true;
823  }
824 
825  // display edge steps
826  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
827  word->plot(image_win); // rainbow colors
828  displayed_something = true;
829  }
830 
831  // display poly approx
832  if (word->display_flag(DF_POLYGONAL)) {
833  // need to convert
835  tword->plot(image_win);
836  delete tword;
837  displayed_something = true;
838  }
839 
840  // Display correct text and blamer information.
841  STRING text;
842  STRING blame;
843  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
844  text = word->text();
845  }
846  if (word->display_flag(DF_BLAMER) &&
847  !(word_res->blamer_bundle != nullptr &&
848  word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
849  text = "";
850  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
851  if (blamer_bundle == nullptr) {
852  text += "NULL";
853  } else {
854  text = blamer_bundle->TruthString();
855  }
856  text += " -> ";
857  STRING best_choice_str;
858  if (word_res->best_choice == nullptr) {
859  best_choice_str = "NULL";
860  } else {
861  word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
862  }
863  text += best_choice_str;
864  IncorrectResultReason reason = (blamer_bundle == nullptr) ?
865  IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
866  ASSERT_HOST(reason < IRR_NUM_REASONS);
867  blame += " [";
868  blame += BlamerBundle::IncorrectReasonName(reason);
869  blame += "]";
870  }
871  if (text.length() > 0) {
872  word_bb = word->bounding_box();
873  image_win->Pen(ScrollView::RED);
874  word_height = word_bb.height();
875  int text_height = 0.50 * word_height;
876  if (text_height > 20) text_height = 20;
877  image_win->TextAttributes("Arial", text_height, false, false, false);
878  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
879  image_win->Text(word_bb.left() + shift,
880  word_bb.bottom() + 0.25 * word_height, text.c_str());
881  if (blame.length() > 0) {
882  image_win->Text(word_bb.left() + shift,
883  word_bb.bottom() + 0.25 * word_height - text_height,
884  blame.c_str());
885  }
886 
887  displayed_something = true;
888  }
889 
890  if (!displayed_something) // display BBox anyway
891  word->bounding_box().plot(image_win,
892  static_cast<ScrollView::Color>((int32_t) editor_image_word_bb_color),
893  static_cast<ScrollView::Color>((int32_t)
895  return true;
896 }
897 #endif // GRAPHICS_DISABLED
898 
904 bool Tesseract::word_dumper(PAGE_RES_IT* pr_it) {
905  if (pr_it->block()->block != nullptr) {

◆ word_dumper()

bool tesseract::Tesseract::word_dumper ( PAGE_RES_IT * pr_it)

word_dumper()

Dump members to the debug window

Definition at line 913 of file pgedit.cpp.

915  {
916  tprintf("Current blamer debug: %s\n",
917  word_res->blamer_bundle->debug().c_str());
918  }
919  return true;
920 }
921 
922 #ifndef GRAPHICS_DISABLED
923 
929  WERD* word = pr_it->word()->word;

◆ word_outline_errs()

int16_t tesseract::Tesseract::word_outline_errs ( WERD_RES * word)

Definition at line 76 of file docqual.cpp.

84  {
85  *match_count = 0;
86  *accepted_match_count = 0;
87  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
88  !word->rebuild_word->blobs.empty()) {
89  using namespace std::placeholders; // for _1

◆ word_set_display()

bool tesseract::Tesseract::word_set_display ( PAGE_RES_IT * pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 937 of file pgedit.cpp.

944  {
945 #ifndef DISABLED_LEGACY_ENGINE
946  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
947  if (it != nullptr) {

◆ worst_noise_blob()

int16_t tesseract::Tesseract::worst_noise_blob ( WERD_RES * word_res,
float *  worst_noise_score 
)

Definition at line 706 of file fixspace.cpp.

708  {
709  float noise_score[512];
710  int i;
711  int min_noise_blob; // 1st contender
712  int max_noise_blob; // last contender
713  int non_noise_count;
714  int worst_noise_blob; // Worst blob
715  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
716  float non_noise_limit = kBlnXHeight * 0.8;
717 
718  if (word_res->rebuild_word == nullptr)
719  return -1; // Can't handle cube words.
720 
721  // Normalised.
722  int blob_count = word_res->box_word->length();
723  ASSERT_HOST(blob_count <= 512);
724  if (blob_count < 5)
725  return -1; // too short to split
726 
727  /* Get the noise scores for all blobs */
728 
729  #ifndef SECURE_NAMES
730  if (debug_fix_space_level > 5)
731  tprintf("FP fixspace Noise metrics for \"%s\": ",
732  word_res->best_choice->unichar_string().c_str());
733  #endif
734 
735  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
736  TBLOB* blob = word_res->rebuild_word->blobs[i];
737  if (word_res->reject_map[i].accepted())
738  noise_score[i] = non_noise_limit;
739  else
740  noise_score[i] = blob_noise_score(blob);
741 
742  if (debug_fix_space_level > 5)
743  tprintf("%1.1f ", noise_score[i]);
744  }
745  if (debug_fix_space_level > 5)
746  tprintf("\n");
747 
748  /* Now find the worst one which is far enough away from the end of the word */
749 
750  non_noise_count = 0;
751  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
752  if (noise_score[i] >= non_noise_limit) {
753  non_noise_count++;
754  }
755  }
756  if (non_noise_count < fixsp_non_noise_limit)
757  return -1;
758 
759  min_noise_blob = i;
760 
761  non_noise_count = 0;
762  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
763  i--) {
764  if (noise_score[i] >= non_noise_limit) {
765  non_noise_count++;
766  }
767  }
768  if (non_noise_count < fixsp_non_noise_limit)
769  return -1;
770 
771  max_noise_blob = i;
772 
773  if (min_noise_blob > max_noise_blob)
774  return -1;
775 
776  *worst_noise_score = small_limit;
777  worst_noise_blob = -1;
778  for (i = min_noise_blob; i <= max_noise_blob; i++) {
779  if (noise_score[i] < *worst_noise_score) {
780  worst_noise_blob = i;
781  *worst_noise_score = noise_score[i];
782  }
783  }
784  return worst_noise_blob;

◆ write_results()

void tesseract::Tesseract::write_results ( PAGE_RES_IT & page_res_it,
char  newline_type,
bool  force_eol 
)

Definition at line 96 of file output.cpp.

100  { // override tilde crunch?
101  WERD_RES *word = page_res_it.word();
102  const UNICHARSET &uchset = *word->uch_set;
103  int i;
104  bool need_reject = false;
105  UNICHAR_ID space = uchset.unichar_to_id(" ");
106 
107  if ((word->unlv_crunch_mode != CR_NONE ||
108  word->best_choice->length() == 0) &&
110  if ((word->unlv_crunch_mode != CR_DELETE) &&
111  (!stats_.tilde_crunch_written ||
112  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
113  (word->word->space () > 0) &&
114  !word->word->flag (W_FUZZY_NON) &&
115  !word->word->flag (W_FUZZY_SP)))) {
116  if (!word->word->flag (W_BOL) &&
117  (word->word->space () > 0) &&
118  !word->word->flag (W_FUZZY_NON) &&
119  !word->word->flag (W_FUZZY_SP)) {
120  stats_.last_char_was_tilde = false;
121  }
122  need_reject = true;
123  }
124  if ((need_reject && !stats_.last_char_was_tilde) ||
125  (force_eol && stats_.write_results_empty_block)) {
126  /* Write a reject char - mark as rejected unless zero_rejection mode */
127  stats_.last_char_was_tilde = true;
128  stats_.tilde_crunch_written = true;
129  stats_.last_char_was_newline = false;
130  stats_.write_results_empty_block = false;
131  }
132 
133  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
134  stats_.tilde_crunch_written = false;
135  stats_.last_char_was_newline = true;
136  stats_.last_char_was_tilde = false;
137  }
138 
139  if (force_eol)
140  stats_.write_results_empty_block = true;
141  return;
142  }
143 
144  /* NORMAL PROCESSING of non tilde crunched words */
145 
146  stats_.tilde_crunch_written = false;
147  if (newline_type)
148  stats_.last_char_was_newline = true;
149  else
150  stats_.last_char_was_newline = false;
151  stats_.write_results_empty_block = force_eol; // about to write a real word
152 
153  if (unlv_tilde_crunching &&
154  stats_.last_char_was_tilde &&
155  (word->word->space() == 0) &&
157  (word->best_choice->unichar_id(0) == space)) {
158  /* Prevent adjacent tilde across words - we know that adjacent tildes within
159  words have been removed */
160  word->MergeAdjacentBlobs(0);
161  }
162  if (newline_type ||
164  stats_.last_char_was_tilde = false;
165  else {
166  if (word->reject_map.length () > 0) {
167  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
168  stats_.last_char_was_tilde = true;
169  else
170  stats_.last_char_was_tilde = false;
171  }
172  else if (word->word->space () > 0)
173  stats_.last_char_was_tilde = false;
174  /* else it is unchanged as there are no output chars */
175  }
176 
177  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
178 
179  set_unlv_suspects(word);
180  check_debug_pt (word, 120);
182  tprintf ("Dict word: \"%s\": %d\n",
183  word->best_choice->debug_string().c_str(),
184  dict_word(*(word->best_choice)));
185  }
186  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
188  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
189  for (i = 0; i < word->best_choice->length(); ++i) {
190  if (word->reject_map[i].rejected())
191  word->reject_map[i].setrej_minimal_rej_accept();
192  }
193  }
195  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
196  for (i = 0; i < word->best_choice->length(); ++i) {
197  if ((word->best_choice->unichar_id(i) != space) &&
198  word->reject_map[i].rejected())
199  word->reject_map[i].setrej_minimal_rej_accept();
200  }
201  }

Member Data Documentation

◆ applybox_debug

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 823 of file tesseractclass.h.

◆ applybox_exposure_pattern

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image filename. The names of the image files are expected to be in the form [lang].[fontname].exp[num].tif"

Definition at line 828 of file tesseractclass.h.

◆ applybox_learn_chars_and_char_frags_mode

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the special low exposure mode) as well as unfragmented characters."

Definition at line 832 of file tesseractclass.h.

◆ applybox_learn_ngrams_mode

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only learn the ngrams whose outlines overlap horizontally."

Definition at line 835 of file tesseractclass.h.

◆ applybox_page

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 824 of file tesseractclass.h.

◆ bidi_debug

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 822 of file tesseractclass.h.

◆ bland_unrej

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 928 of file tesseractclass.h.

◆ chs_leading_punct

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 873 of file tesseractclass.h.

◆ chs_trailing_punct1

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 874 of file tesseractclass.h.

◆ chs_trailing_punct2

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 875 of file tesseractclass.h.

◆ conflict_set_I_l_1

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1041 of file tesseractclass.h.

◆ crunch_accept_ok

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 953 of file tesseractclass.h.

◆ crunch_debug

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 962 of file tesseractclass.h.

◆ crunch_del_cert

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 943 of file tesseractclass.h.

◆ crunch_del_high_word

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 947 of file tesseractclass.h.

◆ crunch_del_low_word

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 948 of file tesseractclass.h.

◆ crunch_del_max_ht

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 945 of file tesseractclass.h.

◆ crunch_del_min_ht

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 944 of file tesseractclass.h.

◆ crunch_del_min_width

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 946 of file tesseractclass.h.

◆ crunch_del_rating

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 942 of file tesseractclass.h.

◆ crunch_early_convert_bad_unlv_chs

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 935 of file tesseractclass.h.

◆ crunch_early_merge_tess_fails

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 934 of file tesseractclass.h.

◆ crunch_include_numerals

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 956 of file tesseractclass.h.

◆ crunch_leave_accept_strings

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Don't pot crunch sensible strings"

Definition at line 955 of file tesseractclass.h.

◆ crunch_leave_lc_strings

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Don't crunch words with long lower case strings"

Definition at line 958 of file tesseractclass.h.

◆ crunch_leave_ok_strings

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Don't touch sensible strings"

Definition at line 952 of file tesseractclass.h.

◆ crunch_leave_uc_strings

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Don't crunch words with long upper case strings"

Definition at line 960 of file tesseractclass.h.

◆ crunch_long_repetitions

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 961 of file tesseractclass.h.

◆ crunch_poor_garbage_cert

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 938 of file tesseractclass.h.

◆ crunch_poor_garbage_rate

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 939 of file tesseractclass.h.

◆ crunch_pot_indicators

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 951 of file tesseractclass.h.

◆ crunch_pot_poor_cert

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 941 of file tesseractclass.h.

◆ crunch_pot_poor_rate

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 940 of file tesseractclass.h.

◆ crunch_rating_max

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 950 of file tesseractclass.h.

◆ crunch_small_outlines_size

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 949 of file tesseractclass.h.

◆ crunch_terrible_garbage

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 937 of file tesseractclass.h.

◆ crunch_terrible_rating

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 936 of file tesseractclass.h.

◆ debug_fix_space_level

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 967 of file tesseractclass.h.

◆ debug_noise_removal

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 857 of file tesseractclass.h.

◆ debug_x_ht_level

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 872 of file tesseractclass.h.

◆ enable_noise_removal

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they confuse layout analysis, determining diacritics vs noise"

Definition at line 856 of file tesseractclass.h.

◆ file_type

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1048 of file tesseractclass.h.

◆ fixsp_done_mode

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 966 of file tesseractclass.h.

◆ fixsp_non_noise_limit

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 963 of file tesseractclass.h.

◆ fixsp_small_outlines_size

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 964 of file tesseractclass.h.

◆ hocr_char_boxes

bool tesseract::Tesseract::hocr_char_boxes = false

"Add coordinates for each character to hocr output"

Definition at line 933 of file tesseractclass.h.

◆ hocr_font_info

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 931 of file tesseractclass.h.

◆ interactive_display_mode

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1047 of file tesseractclass.h.

◆ jpg_quality

int tesseract::Tesseract::jpg_quality = 85

"Set JPEG quality level"

Definition at line 1009 of file tesseractclass.h.

◆ lstm_choice_iterations

int tesseract::Tesseract::lstm_choice_iterations = 5

"Sets the number of cascading iterations for the Beamsearch in " "lstm_choice_mode. Note that lstm_choice_mode must be set to " "a value greater than 0 to produce results."

Definition at line 1090 of file tesseractclass.h.

◆ lstm_choice_mode

int tesseract::Tesseract::lstm_choice_mode = 0

"Allows including alternative symbol choices in the hOCR output. Valid input values are 0, 1 and 2. 0 is the default value. With 1 the alternative symbol choices per timestep are included. With 2 the alternative symbol choices are extracted from the CTC process instead of the lattice. The choices are mapped per character."

Definition at line 1086 of file tesseractclass.h.

◆ lstm_rating_coefficient

double tesseract::Tesseract::lstm_rating_coefficient = 5

"Sets the rating coefficient for the lstm choices. The smaller " "the coefficient, the better are the ratings for each choice " "and less information is lost due to the cut off at 0. The " "standard value is 5."

Definition at line 1095 of file tesseractclass.h.

◆ lstm_use_matrix

bool tesseract::Tesseract::lstm_use_matrix = 1

"Use ratings matrix/beam search with lstm"

Definition at line 895 of file tesseractclass.h.

◆ min_characters_to_try

int tesseract::Tesseract::min_characters_to_try = 50

"Specify minimum characters to try during OSD"

Definition at line 1012 of file tesseractclass.h.

◆ min_orientation_margin

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1057 of file tesseractclass.h.

◆ min_sane_x_ht_pixels

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq to this"

Definition at line 1042 of file tesseractclass.h.

◆ multilang_debug_level

int tesseract::Tesseract::multilang_debug_level = 0

"Print multilang debug info."

Definition at line 890 of file tesseractclass.h.

◆ noise_cert_basechar

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 860 of file tesseractclass.h.

◆ noise_cert_disjoint

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 863 of file tesseractclass.h.

◆ noise_cert_factor

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 869 of file tesseractclass.h.

◆ noise_cert_punc

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 866 of file tesseractclass.h.

◆ noise_maxperblob

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 870 of file tesseractclass.h.

◆ noise_maxperword

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 871 of file tesseractclass.h.

◆ numeric_punctuation

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 968 of file tesseractclass.h.

◆ ocr_devanagari_split_strategy

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 817 of file tesseractclass.h.

◆ ok_repeated_ch_non_alphanum_wds

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1040 of file tesseractclass.h.

◆ outlines_2

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 897 of file tesseractclass.h.

◆ outlines_odd

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 896 of file tesseractclass.h.

◆ page_separator

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1078 of file tesseractclass.h.

◆ pageseg_apply_music_mask

bool tesseract::Tesseract::pageseg_apply_music_mask = true

"Detect music staff and remove intersecting components"

Definition at line 1097 of file tesseractclass.h.

◆ pageseg_devanagari_split_strategy

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 813 of file tesseractclass.h.

◆ paragraph_debug_level

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 891 of file tesseractclass.h.

◆ paragraph_text_based

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition " "(more accurate)"

Definition at line 894 of file tesseractclass.h.

◆ poly_allow_detailed_fx

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1061 of file tesseractclass.h.

◆ preserve_interword_spaces

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1076 of file tesseractclass.h.

◆ quality_blob_pc

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 877 of file tesseractclass.h.

◆ quality_char_pc

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 880 of file tesseractclass.h.

◆ quality_min_initial_alphas_reqd

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 881 of file tesseractclass.h.

◆ quality_outline_pc

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 879 of file tesseractclass.h.

◆ quality_rej_pc

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 876 of file tesseractclass.h.

◆ quality_rowrej_pc

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 929 of file tesseractclass.h.

◆ rej_1Il_trust_permuter_type

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 1032 of file tesseractclass.h.

◆ rej_1Il_use_dict_word

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1031 of file tesseractclass.h.

◆ rej_alphas_in_number_perm

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1037 of file tesseractclass.h.

◆ rej_trust_doc_dawg

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 11l conf. detector"

Definition at line 1030 of file tesseractclass.h.

◆ rej_use_good_perm

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1035 of file tesseractclass.h.

◆ rej_use_sensible_wd

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1036 of file tesseractclass.h.

◆ rej_use_tess_accepted

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1033 of file tesseractclass.h.

◆ rej_use_tess_blanks

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1034 of file tesseractclass.h.

◆ rej_whole_of_mostly_reject_word_fract

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1038 of file tesseractclass.h.

◆ subscript_max_y_top

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."

Definition at line 989 of file tesseractclass.h.

◆ superscript_bettered_certainty

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"

Definition at line 981 of file tesseractclass.h.

◆ superscript_debug

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 972 of file tesseractclass.h.

◆ superscript_min_y_bottom

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."

Definition at line 993 of file tesseractclass.h.

◆ superscript_scaledown_ratio

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."

Definition at line 985 of file tesseractclass.h.

◆ superscript_worse_certainty

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"

Definition at line 976 of file tesseractclass.h.

◆ suspect_accept_rating

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1018 of file tesseractclass.h.

◆ suspect_constrain_1Il

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1016 of file tesseractclass.h.

◆ suspect_level

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1014 of file tesseractclass.h.

◆ suspect_rating_per_ch

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 1017 of file tesseractclass.h.

◆ suspect_short_words

int tesseract::Tesseract::suspect_short_words = 2

"Don't Suspect dict wds longer than this"

Definition at line 1015 of file tesseractclass.h.

◆ tessedit_adaption_debug

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 821 of file tesseractclass.h.

◆ tessedit_ambigs_training

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 809 of file tesseractclass.h.

◆ tessedit_bigram_debug

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram " "correction."

Definition at line 853 of file tesseractclass.h.

◆ tessedit_char_blacklist

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 804 of file tesseractclass.h.

◆ tessedit_char_unblacklist

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 807 of file tesseractclass.h.

◆ tessedit_char_whitelist

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 805 of file tesseractclass.h.

◆ tessedit_create_alto

bool tesseract::Tesseract::tessedit_create_alto = false

"Write .xml ALTO output file"

Definition at line 1000 of file tesseractclass.h.

◆ tessedit_create_boxfile

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1043 of file tesseractclass.h.

◆ tessedit_create_hocr

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 999 of file tesseractclass.h.

◆ tessedit_create_lstmbox

bool tesseract::Tesseract::tessedit_create_lstmbox = false

"Write .box file for LSTM training"

Definition at line 1002 of file tesseractclass.h.

◆ tessedit_create_pdf

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1006 of file tesseractclass.h.

◆ tessedit_create_tsv

bool tesseract::Tesseract::tessedit_create_tsv = false

"Write .tsv output file"

Definition at line 1003 of file tesseractclass.h.

◆ tessedit_create_txt

bool tesseract::Tesseract::tessedit_create_txt = false

"Write .txt output file"

Definition at line 998 of file tesseractclass.h.

◆ tessedit_create_wordstrbox

bool tesseract::Tesseract::tessedit_create_wordstrbox = false

"Write WordStr format .box output file"

Definition at line 1005 of file tesseractclass.h.

◆ tessedit_debug_block_rejection

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 846 of file tesseractclass.h.

◆ tessedit_debug_doc_rejection

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 925 of file tesseractclass.h.

◆ tessedit_debug_fonts

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 845 of file tesseractclass.h.

◆ tessedit_debug_quality_metrics

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 927 of file tesseractclass.h.

◆ tessedit_display_outwords

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 836 of file tesseractclass.h.

◆ tessedit_do_invert

bool tesseract::Tesseract::tessedit_do_invert = true

"Try inverting the image in `LSTMRecognizeWord`"

Definition at line 795 of file tesseractclass.h.

◆ tessedit_dont_blkrej_good_wds

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 915 of file tesseractclass.h.

◆ tessedit_dont_rowrej_good_wds

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 917 of file tesseractclass.h.

◆ tessedit_dump_choices

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 837 of file tesseractclass.h.

◆ tessedit_dump_pageseg_images

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 793 of file tesseractclass.h.

◆ tessedit_enable_bigram_correction

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 848 of file tesseractclass.h.

◆ tessedit_enable_dict_correction

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 850 of file tesseractclass.h.

◆ tessedit_enable_doc_dict

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 844 of file tesseractclass.h.

◆ tessedit_fix_fuzzy_spaces

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 839 of file tesseractclass.h.

◆ tessedit_fix_hyphens

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 842 of file tesseractclass.h.

◆ tessedit_flip_0O

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1027 of file tesseractclass.h.

◆ tessedit_good_doc_still_rowrej_wd

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 923 of file tesseractclass.h.

◆ tessedit_good_quality_unrej

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 899 of file tesseractclass.h.

◆ tessedit_image_border

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1039 of file tesseractclass.h.

◆ tessedit_init_config_only

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."

Definition at line 1064 of file tesseractclass.h.

◆ tessedit_load_sublangs

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1051 of file tesseractclass.h.

◆ tessedit_lower_flip_hyphen

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1028 of file tesseractclass.h.

◆ tessedit_make_boxes_from_boxes

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 789 of file tesseractclass.h.

◆ tessedit_minimal_rej_pass1

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 885 of file tesseractclass.h.

◆ tessedit_minimal_rejection

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1019 of file tesseractclass.h.

◆ tessedit_ocr_engine_mode

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available."

Definition at line 802 of file tesseractclass.h.

◆ tessedit_override_permuter

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1049 of file tesseractclass.h.

◆ tessedit_page_number

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1045 of file tesseractclass.h.

◆ tessedit_pageseg_mode

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in tesseract/publictypes.h)"

Definition at line 799 of file tesseractclass.h.

◆ tessedit_parallelize

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1074 of file tesseractclass.h.

◆ tessedit_prefer_joined_punct

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 965 of file tesseractclass.h.

◆ tessedit_preserve_blk_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 911 of file tesseractclass.h.

◆ tessedit_preserve_min_wd_len

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 919 of file tesseractclass.h.

◆ tessedit_preserve_row_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 913 of file tesseractclass.h.

◆ tessedit_reject_bad_qual_wds

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 924 of file tesseractclass.h.

◆ tessedit_reject_block_percent

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 904 of file tesseractclass.h.

◆ tessedit_reject_doc_percent

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 902 of file tesseractclass.h.

◆ tessedit_reject_mode

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1025 of file tesseractclass.h.

◆ tessedit_reject_row_percent

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 906 of file tesseractclass.h.

◆ tessedit_rejection_debug

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Rejection debug"

Definition at line 1026 of file tesseractclass.h.

◆ tessedit_resegment_from_boxes

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 783 of file tesseractclass.h.

◆ tessedit_resegment_from_line_boxes

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 785 of file tesseractclass.h.

◆ tessedit_row_rej_good_docs

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 921 of file tesseractclass.h.

◆ tessedit_tess_adaption_mode

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 883 of file tesseractclass.h.

◆ tessedit_test_adaption

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 886 of file tesseractclass.h.

◆ tessedit_timing_debug

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 838 of file tesseractclass.h.

◆ tessedit_train_from_boxes

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 787 of file tesseractclass.h.

◆ tessedit_train_line_recognizer

bool tesseract::Tesseract::tessedit_train_line_recognizer = false

"Break input into lines and remap boxes if present"

Definition at line 791 of file tesseractclass.h.

◆ tessedit_unrej_any_wd

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 841 of file tesseractclass.h.

◆ tessedit_upper_flip_hyphen

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1029 of file tesseractclass.h.

◆ tessedit_use_primary_params_model

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1053 of file tesseractclass.h.

◆ tessedit_use_reject_spaces

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 900 of file tesseractclass.h.

◆ tessedit_whole_wd_rej_row_percent

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects which prevents whole row rejection"

Definition at line 909 of file tesseractclass.h.

◆ tessedit_word_for_word

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1022 of file tesseractclass.h.

◆ tessedit_write_block_separators

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 995 of file tesseractclass.h.

◆ tessedit_write_images

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1046 of file tesseractclass.h.

◆ tessedit_write_params_to_file

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 819 of file tesseractclass.h.

◆ tessedit_write_rep_codes

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 996 of file tesseractclass.h.

◆ tessedit_write_unlv

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 997 of file tesseractclass.h.

◆ tessedit_zero_kelvin_rejection

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1024 of file tesseractclass.h.

◆ tessedit_zero_rejection

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1020 of file tesseractclass.h.

◆ test_pt

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 887 of file tesseractclass.h.

◆ test_pt_x

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 888 of file tesseractclass.h.

◆ test_pt_y

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 889 of file tesseractclass.h.

◆ textonly_pdf

bool tesseract::Tesseract::textonly_pdf = false

"Create PDF with only one invisible text layer"

Definition at line 1008 of file tesseractclass.h.

◆ textord_equation_detect

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1065 of file tesseractclass.h.

◆ textord_tabfind_aligned_gap_fraction

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1073 of file tesseractclass.h.

◆ textord_tabfind_force_vertical_text

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1068 of file tesseractclass.h.

◆ textord_tabfind_show_vlines

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1058 of file tesseractclass.h.

◆ textord_tabfind_vertical_text

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1066 of file tesseractclass.h.

◆ textord_tabfind_vertical_text_ratio

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page " "mode"

Definition at line 1071 of file tesseractclass.h.

◆ textord_use_cjk_fp_model

bool tesseract::Tesseract::textord_use_cjk_fp_model = false

"Use CJK fixed pitch model"

Definition at line 1059 of file tesseractclass.h.

◆ unlv_tilde_crunching

bool tesseract::Tesseract::unlv_tilde_crunching = false

"Mark v.bad words for tilde crunch"

Definition at line 930 of file tesseractclass.h.

◆ unrecognised_char

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1013 of file tesseractclass.h.

◆ user_defined_dpi

int tesseract::Tesseract::user_defined_dpi = 0

"Specify DPI for input image"

Definition at line 1010 of file tesseractclass.h.

◆ x_ht_acceptance_tolerance

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 970 of file tesseractclass.h.

◆ x_ht_min_change

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 971 of file tesseractclass.h.


The documentation for this class was generated from the following files:
tesseract::Tesseract::chs_leading_punct
char * chs_leading_punct
Definition: tesseractclass.h:873
WERD_RES::done
bool done
Definition: pageres.h:299
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
CM_ITALIC
Definition: pgedit.cpp:84
tesseract::Tesseract::tessedit_dont_rowrej_good_wds
bool tessedit_dont_rowrej_good_wds
Definition: tesseractclass.h:917
TBOX
Definition: cleanapi_test.cc:19
REJMAP::full_print
void full_print(FILE *fp)
Definition: rejctmap.cpp:332
tesseract::Tesseract::superscript_bettered_certainty
double superscript_bettered_certainty
Definition: tesseractclass.h:981
tesseract::Tesseract::ProcessTargetWord
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:120
WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:845
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::ShiroRekhaSplitter::set_ocr_split_strategy
void set_ocr_split_strategy(SplitStrategy strategy)
Definition: devanagari_processing.h:138
tesseract::Tesseract::BelievableSuperscript
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
Definition: superscript.cpp:520
REJMAP::rej_word_row_rej
void rej_word_row_rej()
Definition: rejctmap.cpp:441
tesseract::Tesseract::dump_words
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:475
BLOCKS_CMD_EVENT
Definition: pgedit.cpp:61
tesseract::Tesseract::first_alphanum_index
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:468
SHOW_BOLD_CMD_EVENT
Definition: pgedit.cpp:72
SHOW_FIXEDPITCH_CMD_EVENT
Definition: pgedit.cpp:74
SHOW_UNDERLINE_CMD_EVENT
Definition: pgedit.cpp:73
G_NEVER_CRUNCH
Definition: docqual.h:29
tesseract::ParamUtils::ReadParamsFile
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:39
tesseract::Tesseract::applybox_exposure_pattern
char * applybox_exposure_pattern
Definition: tesseractclass.h:828
tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:133
ScrollView::Brush
void Brush(Color color)
Definition: scrollview.cpp:723
tesseract::Tesseract::hocr_font_info
bool hocr_font_info
Definition: tesseractclass.h:931
os_detect_blobs
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:278
ICOORD::set_x
void set_x(int16_t xin)
rewrite function
Definition: points.h:60
tesseract::Tesseract::break_noisiest_blob_word
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:641
tesseract::Tesseract::poly_allow_detailed_fx
bool poly_allow_detailed_fx
Definition: tesseractclass.h:1061
tesseract::Tesseract::split_and_recog_word
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:137
tesseract::Tesseract::init_tesseract
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:302
tesseract::Tesseract::fix_noisy_space_list
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:595
SVET_DESTROY
Definition: scrollview.h:45
BlamerBundle::TruthString
STRING TruthString() const
Definition: blamer.h:115
C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:247
WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1042
TBOX::move
void move(const ICOORD vec)
Definition: rect.h:156
tesseract::Tesseract::eval_word_spacing
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:265
tesseract::PSM_OSD_ENABLED
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:188
tesseract::Textord::set_use_cjk_fp_model
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:115
OSResults::best_result
OSBestResult best_result
Definition: osdetect.h:81
tesseract::Tesseract::quality_min_initial_alphas_reqd
int quality_min_initial_alphas_reqd
Definition: tesseractclass.h:881
tesseract::Tesseract::min_sane_x_ht_pixels
int min_sane_x_ht_pixels
Definition: tesseractclass.h:1042
clear_fx_win
void clear_fx_win()
Definition: drawfx.cpp:60
tesseract::Tesseract::crunch_terrible_rating
double crunch_terrible_rating
Definition: tesseractclass.h:936
PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:728
SVEvent::y_size
int y_size
Definition: scrollview.h:69
tesseract::CCUtil::use_ambigs_for_adaption
bool use_ambigs_for_adaption
Definition: ccutil.h:73
tesseract::Tesseract::tessedit_use_reject_spaces
bool tessedit_use_reject_spaces
Definition: tesseractclass.h:900
tesseract::Tesseract::output_pass
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:35
WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:266
tesseract::ShiroRekhaSplitter::set_pageseg_split_strategy
void set_pageseg_split_strategy(SplitStrategy strategy)
Definition: devanagari_processing.h:146
AC_LC_ABBREV
a.b.c.
Definition: control.h:33
WERD_CHOICE::shallow_copy
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:416
SUBLOC_NORM
#define SUBLOC_NORM
Definition: errcode.h:57
tesseract::Tesseract::recog_pseudo_word
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
tesseract::Tesseract::tessedit_train_from_boxes
bool tessedit_train_from_boxes
Definition: tesseractclass.h:787
tesseract::Tesseract::crunch_del_rating
double crunch_del_rating
Definition: tesseractclass.h:942
BITS16::turn_on_bit
void turn_on_bit(uint8_t bit_num)
Definition: bits16.h:46
tesseract::Tesseract::pageseg_devanagari_split_strategy
int pageseg_devanagari_split_strategy
Definition: tesseractclass.h:813
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
tesseract::TesseractStats::doc_blob_quality
int16_t doc_blob_quality
Definition: tesseractclass.h:128
tesseract::Tesseract::failure_count
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:946
WERD_RES::blob_widths
GenericVector< int > blob_widths
Definition: pageres.h:210
BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:114
PAGE_RES_IT::next_block
BLOCK_RES * next_block() const
Definition: pageres.h:763
REJMAP::rej_word_block_rej
void rej_word_block_rej()
Definition: rejctmap.cpp:432
TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:83
tesseract::Tesseract::quality_char_pc
double quality_char_pc
Definition: tesseractclass.h:880
CR_DELETE
Definition: pageres.h:156
STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:370
ROW::base_line
float base_line(float xpos) const
Definition: ocrrow.h:58
tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd
double tessedit_good_doc_still_rowrej_wd
Definition: tesseractclass.h:923
tesseract::Tesseract::crunch_poor_garbage_cert
double crunch_poor_garbage_cert
Definition: tesseractclass.h:938
tesseract::Tesseract::file_type
char * file_type
Definition: tesseractclass.h:1048
tesseract::Tesseract::tessedit_create_pdf
bool tessedit_create_pdf
Definition: tesseractclass.h:1006
ROW::descenders
float descenders() const
Definition: ocrrow.h:84
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:138
tesseract::Textord::CleanupSingleRowResult
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:318
tesseract::Tesseract::crunch_del_cert
double crunch_del_cert
Definition: tesseractclass.h:943
C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:69
tesseract::Tesseract::crunch_rating_max
int crunch_rating_max
Definition: tesseractclass.h:950
tesseract::Tesseract::set_word_fonts
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1961
ScrollView::AddMessage
void AddMessage(const char *format,...)
Definition: scrollview.cpp:560
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:58
tesseract::ShiroRekhaSplitter::set_orig_pix
void set_orig_pix(Pix *pix)
Definition: devanagari_processing.cpp:68
WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:920
tesseract::Tesseract::superscript_worse_certainty
double superscript_worse_certainty
Definition: tesseractclass.h:976
TPOINT
Definition: blobs.h:49
SHOW_SMALLCAPS_CMD_EVENT
Definition: pgedit.cpp:76
tesseract::Tesseract::x_ht_acceptance_tolerance
int x_ht_acceptance_tolerance
Definition: tesseractclass.h:970
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
REJMAP::accept_count
int16_t accept_count()
Definition: rejctmap.cpp:278
tesseract::Tesseract::lstm_choice_mode
int lstm_choice_mode
Definition: tesseractclass.h:1086
editor_image_blob_bb_color
int editor_image_blob_bb_color
Definition: pgedit.cpp:127
ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:27
tesseract::Wordrec::end_recog
int end_recog()
Definition: tface.cpp:76
tesseract::Tesseract::tessedit_display_outwords
bool tessedit_display_outwords
Definition: tesseractclass.h:836
tesseract::Tesseract::textord_use_cjk_fp_model
bool textord_use_cjk_fp_model
Definition: tesseractclass.h:1059
tesseract::Tesseract::chs_trailing_punct1
char * chs_trailing_punct1
Definition: tesseractclass.h:874
BCC_FAKE
Definition: ratngs.h:46
CM_SERIF
Definition: pgedit.cpp:88
tesseract::Tesseract::tessedit_create_wordstrbox
bool tessedit_create_wordstrbox
Definition: tesseractclass.h:1005
W_REP_CHAR
repeated character
Definition: werd.h:52
ScrollView::SetVisible
void SetVisible(bool visible)
Definition: scrollview.cpp:548
tesseract::Tesseract::ComputeCompatibleXheight
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:117
reject_whole_page
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:385
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
WERD::NextColor
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:291
POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:62
tesseract::OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:268
tesseract::Tesseract::tessedit_create_boxfile
bool tessedit_create_boxfile
Definition: tesseractclass.h:1043
tesseract::PSM_COL_FIND_ENABLED
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:194
tesseract::Tesseract::noise_outlines
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:958
tesseract::CCStruct::kXHeightCapRatio
static const double kXHeightCapRatio
Definition: ccstruct.h:37
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:351
ScrollView::AddMessageBox
void AddMessageBox()
Definition: scrollview.cpp:577
G_TERRIBLE
Definition: docqual.h:32
tesseract::Classify::fontinfo_table_
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:272
W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:51
TWERD
Definition: blobs.h:416
tesseract::Tesseract::tessedit_do_invert
bool tessedit_do_invert
Definition: tesseractclass.h:795
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
TBLOB::NumOutlines
int NumOutlines() const
Definition: blobs.cpp:453
UNICHARSET::get_script_id_from_name
int get_script_id_from_name(const char *script_name) const
Definition: unicharset.cpp:1099
POLYGONAL_CMD_EVENT
Definition: pgedit.cpp:57
tesseract::Tesseract::lstm_rating_coefficient
double lstm_rating_coefficient
Definition: tesseractclass.h:1095
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
fx_win
ScrollView * fx_win
Definition: drawfx.cpp:40
tesseract::Tesseract::recog_word
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:41
tesseract::Tesseract::tessedit_debug_quality_metrics
bool tessedit_debug_quality_metrics
Definition: tesseractclass.h:927
SYSTEM_DAWG_PERM
Definition: ratngs.h:239
tesseract::Tesseract::CountMisfitTops
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:85
WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:189
WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:147
tesseract::Tesseract::applybox_debug
int applybox_debug
Definition: tesseractclass.h:823
INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:312
TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:398
tesseract::PSM_RAW_LINE
Definition: publictypes.h:176
tesseract::TESSDATA_PARAMS_MODEL
Definition: tessdatamanager.h:73
GenericVector::insert
void insert(const T &t, int index)
Definition: genericvector.h:750
BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:115
SVMenuNode::AddChild
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:58
baseline
Definition: mfoutline.h:62
WERD_RES::denorm
DENORM denorm
Definition: pageres.h:195
DF_TEXT
Correct ascii.
Definition: werd.h:46
tesseract::Textord::TextordPage
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:226
OSBestResult::script_id
int script_id
Definition: osdetect.h:45
tesseract::DebugPixa::WritePDF
void WritePDF(const char *filename)
Definition: debugpixa.h:36
DUMP_WERD_CMD_EVENT
Definition: pgedit.cpp:50
tesseract::Tesseract::tessedit_fix_fuzzy_spaces
bool tessedit_fix_fuzzy_spaces
Definition: tesseractclass.h:839
tesseract::Tesseract::debug_word
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:665
WERD_CHOICE::set_certainty
void set_certainty(float new_val)
Definition: ratngs.h:360
PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:754
tesseract::Tesseract::applybox_learn_ngrams_mode
bool applybox_learn_ngrams_mode
Definition: tesseractclass.h:835
tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1010
ETEXT_DESC::progress_callback2
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:108
tesseract::Tesseract::tilde_crunch
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:396
tesseract::Tesseract::crunch_terrible_garbage
bool crunch_terrible_garbage
Definition: tesseractclass.h:937
FCOORD::y
float y() const
Definition: points.h:209
tesseract::Tesseract::tessedit_write_rep_codes
bool tessedit_write_rep_codes
Definition: tesseractclass.h:996
PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:751
ICOORD
integer coordinate
Definition: points.h:30
WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:309
tesseract::kCertaintyScale
const float kCertaintyScale
Definition: linerec.cpp:35
kMaxCredibleResolution
constexpr int kMaxCredibleResolution
Definition: publictypes.h:39
tesseract::SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:52
CM_RAINBOW
Definition: pgedit.cpp:81
tesseract::RECOG_WERDS
Definition: tessedit.cpp:488
WERD::AddSelectedOutlines
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:523
tesseract::Tesseract::min_orientation_margin
double min_orientation_margin
Definition: tesseractclass.h:1057
WERD_CHOICE::make_bad
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:431
tesseract::Wordrec::cc_recog
void cc_recog(WERD_RES *word)
Definition: tface.cpp:139
tesseract::Wordrec::chop_debug
int chop_debug
Definition: wordrec.h:204
tesseract::Tesseract::RetryWithLanguage
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:903
kBackUpConfigFile
const char *const kBackUpConfigFile
Definition: control.cpp:48
TBOX::print
void print() const
Definition: rect.h:277
CR_NONE
Definition: pageres.h:153
tesseract::Tesseract::ReportXhtFixResult
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1461
tesseract::Classify::AdaptableWord
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:821
tesseract::Wordrec::set_pass1
void set_pass1()
Definition: tface.cpp:115
create_fx_win
void create_fx_win()
Definition: drawfx.cpp:48
tesseract::Wordrec::dict_word
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:103
TESSLINE
Definition: blobs.h:201
PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:695
PAGE_RES::misadaption_log
GenericVector< STRING > misadaption_log
Definition: pageres.h:89
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
FCOORD::x
float x() const
Definition: points.h:206
WERD_RES::combination
bool combination
Definition: pageres.h:333
tesseract::Tesseract::superscript_min_y_bottom
double superscript_min_y_bottom
Definition: tesseractclass.h:993
TBOX::top
int16_t top() const
Definition: rect.h:57
TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:330
PAGE_RES::blame_reasons
GenericVector< int > blame_reasons
Definition: pageres.h:84
tesseract::Tesseract::crunch_leave_lc_strings
int crunch_leave_lc_strings
Definition: tesseractclass.h:958
tesseract::Tesseract::recog_interactive
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:77
tesseract::Tesseract::set_unlv_suspects
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:272
tesseract::EquationDetect::SetLangTesseract
void SetLangTesseract(Tesseract *lang_tesseract)
Definition: equationdetect.cpp:123
tesseract::FontInfo::is_italic
bool is_italic() const
Definition: fontinfo.h:111
TBOX::area
int32_t area() const
Definition: rect.h:121
STRING
Definition: strngs.h:45
tesseract::TesseractStats::doc_outline_errs
int16_t doc_outline_errs
Definition: tesseractclass.h:129
UNICHARSET::get_script_from_script_id
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:844
WERD_RES::x_height
float x_height
Definition: pageres.h:310
ScrollView::Clear
void Clear()
Definition: scrollview.cpp:588
SHOW_ITALIC_CMD_EVENT
Definition: pgedit.cpp:71
tesseract::Dict::stopper_debug_level
int stopper_debug_level
Definition: dict.h:638
tesseract::SetParamConstraint
SetParamConstraint
Definition: params.h:49
SHOW_BLOB_FEATURES
Definition: pgedit.cpp:68
tesseract::ShiroRekhaSplitter::Clear
void Clear()
Definition: devanagari_processing.cpp:56
tesseract::Tesseract::crunch_include_numerals
bool crunch_include_numerals
Definition: tesseractclass.h:956
TO_BLOCK
Definition: blobbox.h:691
transform_to_next_perm
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:398
SHOW_SERIF_CMD_EVENT
Definition: pgedit.cpp:75
tesseract::UnicharAmbigs::LoadUnicharAmbigs
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:75
tesseract::Tesseract::noise_maxperword
int noise_maxperword
Definition: tesseractclass.h:871
TBOX::set_top
void set_top(int y)
Definition: rect.h:60
ScrollView::BROWN
Definition: scrollview.h:120
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
tesseract::RECOG_PSEUDO
Definition: tessedit.cpp:489
tesseract::Tesseract::ReportFailedBox
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
tesseract::Tesseract::rej_1Il_trust_permuter_type
bool rej_1Il_trust_permuter_type
Definition: tesseractclass.h:1032
tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:476
PDBLK::index
int index() const
Definition: pdblock.h:66
WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:303
WERD_RES
Definition: pageres.h:160
tesseract::Dict::Load
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:210
ScrollView::NONE
Definition: scrollview.h:101
tesseract::ParamUtils::PrintParams
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:168
tesseract::Tesseract::tessedit_zero_rejection
bool tessedit_zero_rejection
Definition: tesseractclass.h:1020
ScrollView::Pen
void Pen(Color color)
Definition: scrollview.cpp:717
tesseract::Tesseract::tessedit_train_line_recognizer
bool tessedit_train_line_recognizer
Definition: tesseractclass.h:791
tesseract::OEM_LSTM_ONLY
Definition: publictypes.h:267
tesseract::Tesseract::suspect_rating_per_ch
double suspect_rating_per_ch
Definition: tesseractclass.h:1017
tesseract::Dict::add_document_word
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:644
tesseract::Tesseract::read_config_file
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:64
ScrollView::Image
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:763
tesseract::Classify::classify_max_certainty_margin
double classify_max_certainty_margin
Definition: classify.h:440
tesseract::PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
tesseract::Tesseract::ambigs_classify_and_output
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
Definition: recogtraining.cpp:211
tesseract::Tesseract::classify_word_pass2
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1571
WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:761
SHOW_DROPCAPS_CMD_EVENT
Definition: pgedit.cpp:77
tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:250
WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1062
PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:742
IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:173
tesseract::TesseractStats::good_char_count
int16_t good_char_count
Definition: tesseractclass.h:131
tesseract::Classify::classify_class_pruner_multiplier
int classify_class_pruner_multiplier
Definition: classify.h:501
IncorrectResultReason
IncorrectResultReason
Definition: blamer.h:52
LOC_FUZZY_SPACE
#define LOC_FUZZY_SPACE
Definition: errcode.h:48
C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
tesseract::TesseractStats::write_results_empty_block
bool write_results_empty_block
Definition: tesseractclass.h:140
WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:231
tesseract::Tesseract::quality_rej_pc
double quality_rej_pc
Definition: tesseractclass.h:876
tesseract::Tesseract::tessedit_enable_dict_correction
bool tessedit_enable_dict_correction
Definition: tesseractclass.h:850
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
tesseract::Tesseract::bland_unrej
bool bland_unrej
Definition: tesseractclass.h:928
tesseract::Tesseract::debug_noise_removal
int debug_noise_removal
Definition: tesseractclass.h:857
tesseract::Tesseract::tessedit_pageseg_mode
int tessedit_pageseg_mode
Definition: tesseractclass.h:799
tesseract::Tesseract::tess_add_doc_word
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:71
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
BL_NORM_CMD_EVENT
Definition: pgedit.cpp:58
tesseract::Tesseract::LSTMRecognizeWord
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:228
tesseract::TesseractStats::tilde_crunch_written
bool tilde_crunch_written
Definition: tesseractclass.h:137
tesseract::Tesseract::count_alphas
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:365
tesseract::Tesseract::rej_1Il_use_dict_word
bool rej_1Il_use_dict_word
Definition: tesseractclass.h:1031
WERD_CHOICE::state
int state(int index) const
Definition: ratngs.h:307
EqualIgnoringCaseAndTerminalPunct
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:807
AC_UC_ABBREV
A.B.C.
Definition: control.h:34
tesseract::Tesseract::quality_rowrej_pc
double quality_rowrej_pc
Definition: tesseractclass.h:929
tesseract::Tesseract::reject_I_1_L
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:191
WERD_RES::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:861
WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:746
UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
tesseract::Tesseract::tessedit_image_border
int tessedit_image_border
Definition: tesseractclass.h:1039
tesseract::ShiroRekhaSplitter::set_segmentation_block_list
void set_segmentation_block_list(BLOCK_LIST *block_list)
Definition: devanagari_processing.h:104
tesseract::Tesseract::tessedit_flip_0O
bool tessedit_flip_0O
Definition: tesseractclass.h:1027
tesseract::Dict::GlobalDawgCache
static TESS_API DawgCache * GlobalDawgCache()
Definition: dict.cpp:184
tesseract::ShiroRekhaSplitter::splitted_image
Pix * splitted_image()
Definition: devanagari_processing.h:121
tesseract::CCUtil::language_data_path_prefix
STRING language_data_path_prefix
Definition: ccutil.h:56
tesseract::Tesseract::noise_maxperblob
int noise_maxperblob
Definition: tesseractclass.h:870
TESSLINE::next
TESSLINE * next
Definition: blobs.h:279
tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds
char * ok_repeated_ch_non_alphanum_wds
Definition: tesseractclass.h:1040
ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:755
tesseract::Tesseract::worst_noise_blob
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:706
FCOORD
Definition: points.h:187
SHOW_SUBSCRIPT_CMD_EVENT
Definition: pgedit.cpp:69
tesseract::LSTMRecognizer::GetUnicharset
const UNICHARSET & GetUnicharset() const
Definition: lstmrecognizer.h:132
CM_BOLD
Definition: pgedit.cpp:85
tesseract::Tesseract::superscript_scaledown_ratio
double superscript_scaledown_ratio
Definition: tesseractclass.h:985
tesseract::Tesseract::GetSubAndSuperscriptCandidates
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
Definition: superscript.cpp:252
WERD_RES::SetupBasicsFromChoppedWord
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:339
tesseract::Tesseract::crunch_leave_uc_strings
int crunch_leave_uc_strings
Definition: tesseractclass.h:960
tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1744
tesseract::Tesseract::textord_tabfind_force_vertical_text
bool textord_tabfind_force_vertical_text
Definition: tesseractclass.h:1068
tesseract::TesseractStats::word_count
int32_t word_count
Definition: tesseractclass.h:133
tesseract::Tesseract::tessedit_bigram_debug
int tessedit_bigram_debug
Definition: tesseractclass.h:853
tesseract::Tesseract::fixsp_small_outlines_size
double fixsp_small_outlines_size
Definition: tesseractclass.h:964
tesseract::ShiroRekhaSplitter::Split
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
Definition: devanagari_processing.cpp:80
SHOW_POINT_CMD_EVENT
Definition: pgedit.cpp:51
tesseract::Tesseract::enable_noise_removal
bool enable_noise_removal
Definition: tesseractclass.h:856
tesseract::Tesseract::rejection_passes
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:612
BLOCK_RES
Definition: pageres.h:110
tesseract::Tesseract::crunch_pot_poor_cert
double crunch_pot_poor_cert
Definition: tesseractclass.h:941
tesseract::FontInfo::universal_id
int32_t universal_id
Definition: fontinfo.h:123
tesseract::Tesseract::script_pos_pass
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:733
ICOORD::set_y
void set_y(int16_t yin)
Sets (overwrites) the y coordinate with yin.
Definition: points.h:64
PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:78
tesseract::Tesseract::tessedit_row_rej_good_docs
bool tessedit_row_rej_good_docs
Definition: tesseractclass.h:921
C_BLOB
Definition: stepblob.h:36
BlamerBundle::JoinBlames
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:231
ReadNextBox
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:134
TBOX::rotate
void rotate(const FCOORD &vec)
Definition: rect.h:196
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:831
tesseract::SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:51
ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:103
tesseract::Tesseract::tessedit_preserve_min_wd_len
int tessedit_preserve_min_wd_len
Definition: tesseractclass.h:919
UnicityTable::push_back
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:168
GenericVector::back
T & back() const
Definition: genericvector.h:728
C_OUTLINE
Definition: coutln.h:71
tesseract::Tesseract::AutoPageSeg
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
Definition: pagesegmain.cpp:214
BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:327
tesseract::Tesseract::PreenXHeights
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:180
WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:305
TBOX::height
int16_t height() const
Definition: rect.h:107
SEAM
Definition: seam.h:36
WERD_RES::space_certainty
float space_certainty
Definition: pageres.h:315
tesseract::UnicharAmbigs::LoadUniversal
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:68
tesseract::Tesseract::test_pt_x
double test_pt_x
Definition: tesseractclass.h:888
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
tesseract::PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:170
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
tesseract::Tesseract::lstm_choice_iterations
int lstm_choice_iterations
Definition: tesseractclass.h:1090
tesseract::Tesseract::tessedit_adaption_debug
bool tessedit_adaption_debug
Definition: tesseractclass.h:821
tesseract::Tesseract::chs_trailing_punct2
char * chs_trailing_punct2
Definition: tesseractclass.h:875
tesseract::Tesseract::BestPix
Pix * BestPix() const
Definition: tesseractclass.h:231
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:197
PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1209
tesseract::TESSDATA_UNICHARSET
Definition: tessdatamanager.h:58
PERFECT_WERDS
#define PERFECT_WERDS
Definition: fixspace.cpp:43
SVEvent::y
int y
Definition: scrollview.h:67
QUIT_CMD_EVENT
Definition: pgedit.cpp:65
WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:94
tesseract::Dict::stopper_no_acceptable_choices
bool stopper_no_acceptable_choices
Definition: dict.h:641
tesseract::Classify::getDict
virtual Dict & getDict()
Definition: classify.h:107
WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:246
INT_INIT_MEMBER
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:324
set_global_subloc_code
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:30
tesseract::Dict::AcceptableResult
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:116
WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:273
tesseract::SP_SUBSCRIPT
Definition: ratngs.h:252
tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:177
CM_SUPERSCRIPT
Definition: pgedit.cpp:83
tesseract::Tesseract::crunch_pot_indicators
int crunch_pot_indicators
Definition: tesseractclass.h:951
tesseract::Classify::get_fontinfo_table
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1469
tesseract::ShiroRekhaSplitter::orig_pix
Pix * orig_pix()
Definition: devanagari_processing.h:130
ROW_RES::char_count
int32_t char_count
Definition: pageres.h:137
BlamerBundle::IncorrectReasonName
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:62
tesseract::Tesseract::debug_x_ht_level
int debug_x_ht_level
Definition: tesseractclass.h:872
tesseract::Tesseract::TestNewNormalization
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1518
tesseract::Wordrec::wordrec_enable_assoc
bool wordrec_enable_assoc
Definition: wordrec.h:198
tesseract::Tesseract::crunch_early_convert_bad_unlv_chs
bool crunch_early_convert_bad_unlv_chs
Definition: tesseractclass.h:935
BLOB_CHOICE::set_rating
void set_rating(float newrat)
Definition: ratngs.h:142
tesseract::OcrEngineMode
OcrEngineMode
Definition: publictypes.h:265
tesseract::DebugPixa::AddPix
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
tesseract::Tesseract::textord_tabfind_vertical_text
bool textord_tabfind_vertical_text
Definition: tesseractclass.h:1066
tesseract::Tesseract::build_menu_new
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:298
tesseract::Tesseract::tessedit_resegment_from_line_boxes
bool tessedit_resegment_from_line_boxes
Definition: tesseractclass.h:785
tesseract::Tesseract::crunch_small_outlines_size
double crunch_small_outlines_size
Definition: tesseractclass.h:949
tesseract::Tesseract::tessedit_make_boxes_from_boxes
bool tessedit_make_boxes_from_boxes
Definition: tesseractclass.h:789
WERD::print
void print()
Definition: werd.cpp:252
tesseract::Tesseract::unrecognised_char
char * unrecognised_char
Definition: tesseractclass.h:1013
OSBestResult::orientation_id
int orientation_id
Definition: osdetect.h:44
kMinRefitXHeightFraction
const double kMinRefitXHeightFraction
Definition: control.cpp:51
UNICHARSET::major_right_to_left
bool major_right_to_left() const
Definition: unicharset.cpp:952
tesseract::Tesseract::crunch_del_max_ht
double crunch_del_max_ht
Definition: tesseractclass.h:945
tesseract::Tesseract::ResegmentWordBox
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
tesseract::Tesseract::TrainedXheightFix
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1484
tesseract::Textord::find_components
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:215
tesseract::Tesseract::tessedit_tess_adaption_mode
int tessedit_tess_adaption_mode
Definition: tesseractclass.h:883
tesseract::PSM_SPARSE
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:197
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:250
REJMAP::rej_word_bad_quality
void rej_word_bad_quality()
Definition: rejctmap.cpp:414
tesseract::Tesseract::crunch_leave_ok_strings
bool crunch_leave_ok_strings
Definition: tesseractclass.h:952
extract_edges
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:329
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
BLOCK
Definition: ocrblock.h:28
tesseract::Tesseract::tessedit_prefer_joined_punct
bool tessedit_prefer_joined_punct
Definition: tesseractclass.h:965
C_BLOB::deep_copy
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118
tesseract::Tesseract::debug_fix_space_level
int debug_fix_space_level
Definition: tesseractclass.h:967
tesseract::PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:166
tesseract::LSTMRecognizer::RecognizeLine
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)
Definition: lstmrecognizer.cpp:187
tesseract::ACTION_2_CMD_EVENT
Definition: tessedit.cpp:490
tesseract::Tesseract::fp_eval_word_spacing
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:856
BLAMER_CMD_EVENT
Definition: pgedit.cpp:54
tesseract::Tesseract::pageseg_apply_music_mask
bool pageseg_apply_music_mask
Definition: tesseractclass.h:1097
WERD_RES::deep_copy
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:643
BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:189
tesseract::Tesseract::TidyUp
void TidyUp(PAGE_RES *page_res)
R_DOC_REJ
Definition: rejctmap.h:113
SortHelper::Add
void Add(T value, int count)
Definition: sorthelper.h:65
ETEXT_DESC::cancel_this
void * cancel_this
this or other data passed to the cancel callback
Definition: ocrclass.h:109
CHANGE_DISP_CMD_EVENT
Definition: pgedit.cpp:49
REJMAP::length
int32_t length() const
Definition: rejctmap.h:222
WERD_CHOICE::IsAllSpaces
bool IsAllSpaces() const
Definition: ratngs.h:509
tesseract::Tesseract::word_bln_display
bool word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:717
W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:54
tesseract::Tesseract::tessedit_enable_doc_dict
bool tessedit_enable_doc_dict
Definition: tesseractclass.h:844
ROW::x_height
float x_height() const
Definition: ocrrow.h:63
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
tesseract::Tesseract::interactive_display_mode
bool interactive_display_mode
Definition: tesseractclass.h:1047
STRING_MEMBER
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:318
BITMAP_CMD_EVENT
Definition: pgedit.cpp:59
WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:713
LOC_DOC_BLK_REJ
#define LOC_DOC_BLK_REJ
Definition: errcode.h:51
PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:76
tesseract::kImagePadding
const int kImagePadding
Definition: imagedata.h:38
tesseract::Tesseract::suspect_constrain_1Il
bool suspect_constrain_1Il
Definition: tesseractclass.h:1016
tesseract::Tesseract::SubAndSuperscriptFix
bool SubAndSuperscriptFix(WERD_RES *word_res)
Definition: superscript.cpp:100
WERD_RES::DebugWordChoices
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:476
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:324
ETEXT_DESC::progress
int16_t progress
percent complete, increasing (0-100)
Definition: ocrclass.h:98
CORRECT_TEXT_CMD_EVENT
Definition: pgedit.cpp:56
tesseract::Tesseract::rej_use_tess_blanks
bool rej_use_tess_blanks
Definition: tesseractclass.h:1034
tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:83
tesseract::TesseractStats::last_char_was_newline
bool last_char_was_newline
Definition: tesseractclass.h:138
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
tesseract::Tesseract::tessedit_debug_fonts
bool tessedit_debug_fonts
Definition: tesseractclass.h:845
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:298
tesseract::Tesseract::test_pt_y
double test_pt_y
Definition: tesseractclass.h:889
TBOX::major_x_overlap
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:403
tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:605
tesseract::Tesseract::GetRectImage
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:169
tesseract::Tesseract::acceptable_number_string
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:386
tesseract::Tesseract::tessedit_test_adaption
bool tessedit_test_adaption
Definition: tesseractclass.h:886
WERD::space
uint8_t space()
Definition: werd.h:98
tesseract::Tesseract::tessedit_rejection_debug
bool tessedit_rejection_debug
Definition: tesseractclass.h:1026
tesseract::ParamsModel::PTRAIN_PASS1
Definition: params_model.h:35
tesseract::Tesseract::tessedit_resegment_from_boxes
bool tessedit_resegment_from_boxes
Definition: tesseractclass.h:783
UNICHARSET::set_black_and_whitelist
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:969
tesseract::Tesseract::fix_fuzzy_space_list
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:170
tesseract::Tesseract::tessedit_reject_bad_qual_wds
bool tessedit_reject_bad_qual_wds
Definition: tesseractclass.h:924
tesseract::Tesseract::fixspace_thinks_word_done
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:529
GARBAGE_LEVEL
GARBAGE_LEVEL
Definition: docqual.h:27
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
tesseract::ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
Definition: devanagari_processing.cpp:356
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:414
TPOINT::x
int16_t x
Definition: blobs.h:91
BLOB_CHOICE::set_certainty
void set_certainty(float newrat)
Definition: ratngs.h:145
tesseract::Tesseract::tessedit_char_whitelist
char * tessedit_char_whitelist
Definition: tesseractclass.h:805
tesseract::BoxWord::ProcessMatchedBlobs
void ProcessMatchedBlobs(const TWERD &other, std::function< void(int)> cb) const
Definition: boxword.cpp:190
MAX_NUM_CLASSES
#define MAX_NUM_CLASSES
Definition: matchdefs.h:29
BITS16::turn_off_bit
void turn_off_bit(uint8_t bit_num)
Definition: bits16.h:51
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:54
TBOX::null_box
bool null_box() const
Definition: rect.h:49
tesseract::Tesseract::non_O_upper
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:783
tesseract::Tesseract::crunch_leave_accept_strings
bool crunch_leave_accept_strings
Definition: tesseractclass.h:955
tesseract::Tesseract::rej_trust_doc_dawg
bool rej_trust_doc_dawg
Definition: tesseractclass.h:1030
tesseract::Tesseract::noise_cert_disjoint
double noise_cert_disjoint
Definition: tesseractclass.h:863
tesseract::CCUtil::ambigs_debug_level
int ambigs_debug_level
Definition: ccutil.h:71
tesseract::Tesseract::tessedit_reject_mode
int tessedit_reject_mode
Definition: tesseractclass.h:1025
WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:304
tesseract::TesseractStats::last_char_was_tilde
bool last_char_was_tilde
Definition: tesseractclass.h:139
tesseract::Wordrec::chop_enable
bool chop_enable
Definition: wordrec.h:205
WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:117
AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
tesseract::Tesseract::PrerecAllWordsPar
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:38
tesseract::Tesseract::word_adaptable
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:50
ScrollView::Exit
static void Exit()
Definition: scrollview.cpp:582
WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1100
tesseract::Tesseract::end_tesseract
void end_tesseract()
Definition: tessedit.cpp:482
tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds
bool tessedit_preserve_row_rej_perfect_wds
Definition: tesseractclass.h:913
tesseract::Tesseract::tessedit_write_unlv
bool tessedit_write_unlv
Definition: tesseractclass.h:997
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
tesseract::Tesseract::rej_use_good_perm
bool rej_use_good_perm
Definition: tesseractclass.h:1035
tesseract::PSM_BLOCK_FIND_ENABLED
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:200
tesseract::Classify::StartBackupAdaptiveClassifier
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:629
tesseract::kWorstDictCertainty
const float kWorstDictCertainty
Definition: linerec.cpp:37
tesseract::Tesseract::tessedit_load_sublangs
char * tessedit_load_sublangs
Definition: tesseractclass.h:1051
ROW::ascenders
float ascenders() const
Definition: ocrrow.h:81
set_global_loc_code
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:25
W_EOL
end of line
Definition: werd.h:47
tesseract::Tesseract::tessedit_dump_pageseg_images
bool tessedit_dump_pageseg_images
Definition: tesseractclass.h:793
tesseract::Tesseract::tessedit_zero_kelvin_rejection
bool tessedit_zero_kelvin_rejection
Definition: tesseractclass.h:1024
tesseract::OEM_DEFAULT
Definition: publictypes.h:271
DF_BLAMER
Blamer information.
Definition: werd.h:50
DENORM::y_scale
float y_scale() const
Definition: normalis.h:269
tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:95
tesseract::Wordrec::wordrec_run_blamer
bool wordrec_run_blamer
Definition: wordrec.h:232
TPOINT::y
int16_t y
Definition: blobs.h:92
tesseract::Tesseract::tessedit_override_permuter
bool tessedit_override_permuter
Definition: tesseractclass.h:1049
tesseract::Tesseract::outlines_2
char * outlines_2
Definition: tesseractclass.h:897
tesseract::Tesseract::quality_blob_pc
double quality_blob_pc
Definition: tesseractclass.h:877
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
kResolutionEstimationFactor
constexpr int kResolutionEstimationFactor
Definition: publictypes.h:44
ParamsEditor
Definition: paramsd.h:99
kMinCredibleResolution
constexpr int kMinCredibleResolution
Definition: publictypes.h:37
tesseract::kMaxBoxEdgeDiff
const int16_t kMaxBoxEdgeDiff
Definition: recogtraining.cpp:32
PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1436
tesseract::Tesseract::suspect_level
int suspect_level
Definition: tesseractclass.h:1014
DF_EDGE_STEP
Edge steps.
Definition: werd.h:48
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
TBOX::width
int16_t width() const
Definition: rect.h:114
tesseract::Tesseract::crunch_del_low_word
double crunch_del_low_word
Definition: tesseractclass.h:948
tesseract::Classify::classify_max_rating_ratio
double classify_max_rating_ratio
Definition: classify.h:438
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:527
tesseract::Tesseract::noise_cert_basechar
double noise_cert_basechar
Definition: tesseractclass.h:860
tesseract::Dict::SetupForLoad
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:192
CM_SMALLCAPS
Definition: pgedit.cpp:89
UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:894
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:297
double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:321
ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:129
TWERD::PolygonalCopy
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:774
tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:564
PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:77
tesseract::Tesseract::tessedit_ambigs_training
bool tessedit_ambigs_training
Definition: tesseractclass.h:809
tesseract::Tesseract::blob_noise_score
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:786
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::Tesseract::tessedit_whole_wd_rej_row_percent
double tessedit_whole_wd_rej_row_percent
Definition: tesseractclass.h:909
tesseract::Tesseract::RecogAllWordsPassN
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:213
tesseract::Tesseract::init_tesseract_lang_data
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:95
tesseract::Tesseract::AnyTessLang
bool AnyTessLang() const
Definition: tesseractclass.h:283
tesseract::Tesseract::crunch_long_repetitions
int crunch_long_repetitions
Definition: tesseractclass.h:961
tesseract::Tesseract::tessedit_minimal_rej_pass1
bool tessedit_minimal_rej_pass1
Definition: tesseractclass.h:885
tesseract::Tesseract::crunch_poor_garbage_rate
double crunch_poor_garbage_rate
Definition: tesseractclass.h:939
tesseract::Tesseract::do_re_display
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:349
tesseract::Tesseract::applybox_page
int applybox_page
Definition: tesseractclass.h:824
BandTriMatrix::AttachOnCorner
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:549
WERD_CHOICE::GetNonSuperscriptSpan
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:399
tesseract::Tesseract::tess_acceptable_word
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:61
tesseract::Tesseract::tessedit_create_alto
bool tessedit_create_alto
Definition: tesseractclass.h:1000
TBOX::topright
const ICOORD & topright() const
Definition: rect.h:103
tesseract::ShiroRekhaSplitter::HasDifferentSplitStrategies
bool HasDifferentSplitStrategies() const
Definition: devanagari_processing.h:97
tesseract::Tesseract::tessedit_good_quality_unrej
bool tessedit_good_quality_unrej
Definition: tesseractclass.h:899
tesseract::Tesseract::bidi_debug
int bidi_debug
Definition: tesseractclass.h:822
tesseract::Wordrec::chop_one_blob
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:369
tesseract::Tesseract::ImageHeight
int ImageHeight() const
Definition: tesseractclass.h:253
tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:55
BLOB_CHOICE::fonts
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91
tesseract::SP_NORMAL
Definition: ratngs.h:251
WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:240
make_pseudo_word
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:33
REJMAP::reject_count
int16_t reject_count()
Definition: rejctmap.h:228
tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract
double rej_whole_of_mostly_reject_word_fract
Definition: tesseractclass.h:1038
UnicityTable::get
const T & get(int id) const
Return the object from an id.
Definition: unicity_table.h:140
tesseract::Tesseract::tessedit_dont_blkrej_good_wds
bool tessedit_dont_blkrej_good_wds
Definition: tesseractclass.h:915
tesseract::Tesseract::non_0_digit
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:787
tesseract::Tesseract::bigram_correction_pass
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:467
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
tesseract::Tesseract::garbage_word
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:658
REJMAP::rej_word_tess_failure
void rej_word_tess_failure()
Definition: rejctmap.cpp:351
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:288
tesseract::Tesseract::classify_word_pass1
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1400
tesseract::TESSDATA_AMBIGS
Definition: tessdatamanager.h:59
IRR_CORRECT
Definition: blamer.h:54
WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:493
tesseract::TesseractStats::dict_words
int32_t dict_words
Definition: tesseractclass.h:134
tesseract::TesseractStats::dump_words_str
STRING dump_words_str
Definition: tesseractclass.h:135
DivRounded
int DivRounded(int a, int b)
Definition: helpers.h:165
tesseract::Tesseract::crunch_early_merge_tess_fails
bool crunch_early_merge_tess_fails
Definition: tesseractclass.h:934
tesseract::Tesseract::AnyLSTMLang
bool AnyLSTMLang() const
Definition: tesseractclass.h:293
tesseract::Tesseract::tessedit_parallelize
int tessedit_parallelize
Definition: tesseractclass.h:1074
PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:748
kMaxXHeightDeviationFraction
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:33
OSResults::orientations
float orientations[4]
Definition: osdetect.h:76
tesseract::Tesseract::tessedit_upper_flip_hyphen
double tessedit_upper_flip_hyphen
Definition: tesseractclass.h:1029
tesseract::Tesseract::x_ht_min_change
int x_ht_min_change
Definition: tesseractclass.h:971
tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:51
tesseract::Tesseract::tessedit_init_config_only
bool tessedit_init_config_only
Definition: tesseractclass.h:1064
R_1IL_CONFLICT
Definition: rejctmap.h:90
tesseract::Tesseract::ClassifyBlobAsWord
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1269
WERD_CHOICE::ContainsAnyNonSpaceDelimited
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:502
PAGE_RES
Definition: pageres.h:73
tesseract::Tesseract::MaximallyChopWord
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:242
tesseract::Tesseract::process_selected_words
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:45
SVEvent::type
SVEventType type
Definition: scrollview.h:63
WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
tesseract::Tesseract::tessedit_lower_flip_hyphen
double tessedit_lower_flip_hyphen
Definition: tesseractclass.h:1028
WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:854
TBOX::botleft
const ICOORD & botleft() const
Definition: rect.h:91
ScrollView::RED
Definition: scrollview.h:104
tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode
bool applybox_learn_chars_and_char_frags_mode
Definition: tesseractclass.h:832
tesseract::Tesseract::word_display
bool word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:749
STATS
Definition: statistc.h:30
tesseract::Tesseract::split_word
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:174
tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1318
tesseract::Tesseract::tessedit_page_number
int tessedit_page_number
Definition: tesseractclass.h:1045
tesseract::UnicharAmbigs::InitUnicharAmbigs
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:54
tesseract::Tesseract::AssignDiacriticsToNewBlobs
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1063
tesseract::Wordrec::set_pass2
void set_pass2()
Definition: tface.cpp:127
tesseract::Tesseract::textord_tabfind_show_vlines
bool textord_tabfind_show_vlines
Definition: tesseractclass.h:1058
tesseract::Tesseract::tessedit_debug_doc_rejection
bool tessedit_debug_doc_rejection
Definition: tesseractclass.h:925
tesseract::Tesseract::blob_feature_display
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:952
tesseract::Tesseract::paragraph_text_based
bool paragraph_text_based
Definition: tesseractclass.h:894
tesseract::SP_DROPCAP
Definition: ratngs.h:254
WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:208
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
tesseract::FontInfo
Definition: fontinfo.h:62
PGEventHandler
Definition: pgedit.h:34
AC_LOWER_CASE
ALL lower case.
Definition: control.h:30
PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1658
tesseract::Tesseract::ocr_devanagari_split_strategy
int ocr_devanagari_split_strategy
Definition: tesseractclass.h:817
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::Tesseract::tessedit_write_images
bool tessedit_write_images
Definition: tesseractclass.h:1046
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
tesseract::Tesseract::noise_cert_factor
double noise_cert_factor
Definition: tesseractclass.h:869
ReadAllBoxes
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:71
WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:448
GenericVector< UNICHAR_ID >
tesseract::Tesseract::potential_word_crunch
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:520
tesseract::Tesseract::tessedit_char_blacklist
char * tessedit_char_blacklist
Definition: tesseractclass.h:804
tesseract::FontInfo::name
char * name
Definition: fontinfo.h:117
tesseract::Tesseract::crunch_del_min_width
double crunch_del_min_width
Definition: tesseractclass.h:946
UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:839
IMAGE_CMD_EVENT
Definition: pgedit.cpp:60
tesseract::EquationDetect::LabelSpecialText
int LabelSpecialText(TO_BLOCK *to_block) override
Definition: equationdetect.cpp:131
tesseract::Tesseract::TrainFromBoxes
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:80
tesseract::Tesseract::crunch_del_min_ht
double crunch_del_min_ht
Definition: tesseractclass.h:944
GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:679
PAGE_RES_IT
Definition: pageres.h:668
tesseract::TESSDATA_LANG_CONFIG
Definition: tessdatamanager.h:57
UnicityTable
Definition: fontinfo.h:30
WERD_RES::caps_height
float caps_height
Definition: pageres.h:311
WERD_RES::StatesAllValid
bool StatesAllValid()
Definition: pageres.cpp:454
DF_BOX
Bounding box.
Definition: werd.h:45
tesseract::Tesseract::recog_word_recursive
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
tesseract::Tesseract::min_characters_to_try
int min_characters_to_try
Definition: tesseractclass.h:1012
tesseract::Tesseract::first_alphanum_offset
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:481
tesseract::Tesseract::terrible_word_crunch
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:482
BLOB_CHOICE::fontinfo_id
int16_t fontinfo_id() const
Definition: ratngs.h:84
tesseract::ParamsModel::PTRAIN_NUM_PASSES
Definition: params_model.h:38
tesseract::Tesseract::Clear
void Clear()
Definition: tesseractclass.cpp:574
IRR_NUM_REASONS
Definition: blamer.h:99
WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:298
WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1013
DF_BN_POLYGONAL
Baseline-normalised polygonal approximation.
Definition: werd.h:49
tesseract::CCUtil::unichar_ambigs
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:59
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
tesseract::Tesseract::textonly_pdf
bool textonly_pdf
Definition: tesseractclass.h:1008
tesseract::Tesseract::suspect_accept_rating
double suspect_accept_rating
Definition: tesseractclass.h:1018
UnicityTable::size
int size() const
Return the size used.
Definition: unicity_table.h:127
SVMenuNode::BuildMenu
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:120
tesseract::Tesseract::Tesseract
Tesseract()
Definition: tesseractclass.cpp:52
tesseract::Tesseract::hocr_char_boxes
bool hocr_char_boxes
Definition: tesseractclass.h:933
IRR_PAGE_LAYOUT
Definition: blamer.h:73
tesseract::Tesseract::jpg_quality
int jpg_quality
Definition: tesseractclass.h:1009
TBOX::x_overlap
bool x_overlap(const TBOX &box) const
Definition: rect.h:393
ScrollView::AwaitEvent
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:443
tesseract::Tesseract::SetupApplyBoxes
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:206
tesseract::Tesseract::tessedit_dump_choices
bool tessedit_dump_choices
Definition: tesseractclass.h:837
AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
tesseract::Tesseract::tessedit_create_lstmbox
bool tessedit_create_lstmbox
Definition: tesseractclass.h:1002
BlamerBundle::SetMisAdaptionDebug
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:585
tesseract::Tesseract::tessedit_char_unblacklist
char * tessedit_char_unblacklist
Definition: tesseractclass.h:807
tesseract::Dict::FinishLoad
bool FinishLoad()
Definition: dict.cpp:351
tesseract::Tesseract::tessedit_timing_debug
bool tessedit_timing_debug
Definition: tesseractclass.h:838
tesseract::Tesseract::outlines_odd
char * outlines_odd
Definition: tesseractclass.h:896
tesseract::Wordrec::wordrec_debug_blamer
bool wordrec_debug_blamer
Definition: wordrec.h:231
tesseract::Tesseract::rej_use_tess_accepted
bool rej_use_tess_accepted
Definition: tesseractclass.h:1033
STRING::length
int32_t length() const
Definition: strngs.cpp:187
tesseract::ParamUtils::ReadParamsFromFp
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:50
SVEvent::x_size
int x_size
Definition: scrollview.h:68
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
editor_image_word_bb_color
int editor_image_word_bb_color
Definition: pgedit.cpp:125
BLOB_CHOICE::fontinfo_id2
int16_t fontinfo_id2() const
Definition: ratngs.h:87
STRING::contains
bool contains(char c) const
Definition: strngs.cpp:183
WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:348
WERD::ConstructFromSingleBlob
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:124
tesseract::Tesseract::tessedit_enable_bigram_correction
bool tessedit_enable_bigram_correction
Definition: tesseractclass.h:848
tesseract::Classify::AdaptiveClassifierIsEmpty
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
TBOX::pad
void pad(int xpad, int ypad)
Definition: rect.h:130
tesseract::Tesseract::count_outline_errs
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:121
count
int count(LIST var_list)
Definition: oldlist.cpp:79
LOC_WRITE_RESULTS
#define LOC_WRITE_RESULTS
Definition: errcode.h:52
W_FUZZY_SP
fuzzy space
Definition: werd.h:53
BLOB_CHOICE
Definition: ratngs.h:49
tesseract::BoxWord::length
int length() const
Definition: boxword.h:82
tesseract::Tesseract::SetupUniversalFontIds
void SetupUniversalFontIds()
Definition: tessedit.cpp:447
TBLOB
Definition: blobs.h:282
tesseract::Tesseract::subscript_max_y_top
double subscript_max_y_top
Definition: tesseractclass.h:989
tesseract::Tesseract::match_word_pass_n
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1629
tesseract::Tesseract::multilang_debug_level
int multilang_debug_level
Definition: tesseractclass.h:890
SVMenuNode
Definition: svmnode.h:35
WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:274
tesseract::Tesseract::textord_tabfind_vertical_text_ratio
double textord_tabfind_vertical_text_ratio
Definition: tesseractclass.h:1071
TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:362
tesseract::Tesseract::numeric_punctuation
char * numeric_punctuation
Definition: tesseractclass.h:968
WERD
Definition: werd.h:55
tesseract::Tesseract::font_recognition_pass
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2036
ScrollView::TextAttributes
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:634
PAGE_RES_IT::next_word
WERD_RES * next_word() const
Definition: pageres.h:757
BLOCK_RES::block
BLOCK * block
Definition: pageres.h:113
tesseract::Tesseract::word_deletable
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:875
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
tesseract::Tesseract::tessedit_reject_row_percent
double tessedit_reject_row_percent
Definition: tesseractclass.h:906
tesseract::Tesseract::pix_binary
Pix * pix_binary() const
Definition: tesseractclass.h:200
TBOX::left
int16_t left() const
Definition: rect.h:71
tesseract::TesseractStats::doc_good_char_quality
int16_t doc_good_char_quality
Definition: tesseractclass.h:132
ROW
Definition: ocrrow.h:35
tesseract::Tesseract::blamer_pass
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:709
R_ROW_REJ
Definition: rejctmap.h:115
tesseract::SP_SUPERSCRIPT
Definition: ratngs.h:253
ScrollView::GREEN
Definition: scrollview.h:106
UNIFORM_DISP_CMD_EVENT
Definition: pgedit.cpp:63
tesseract::ImageFind::FindImages
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:62
tesseract::Tesseract::convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:638
ETEXT_DESC::cancel
CANCEL_FUNC cancel
Callback that returns true to cancel recognition.
Definition: ocrclass.h:105
tesseract::Tesseract::ReSegmentByClassification
void ReSegmentByClassification(PAGE_RES *page_res)
WERD_RES::FakeClassifyWord
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:873
GenericVector::clear
void clear()
Definition: genericvector.h:857
tesseract::Tesseract::tessedit_reject_doc_percent
double tessedit_reject_doc_percent
Definition: tesseractclass.h:902
tesseract::Tesseract::textord_tabfind_aligned_gap_fraction
double textord_tabfind_aligned_gap_fraction
Definition: tesseractclass.h:1073
debug_fp
FILE * debug_fp
Definition: tessvars.cpp:23
tesseract::Tesseract::dictionary_correction_pass
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2092
tesseract::Tesseract::one_ell_conflict
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:291
WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:306
White
Definition: callcpp.h:28
tesseract::Tesseract::tilde_delete
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:568
TBOX::right
int16_t right() const
Definition: rect.h:78
tesseract::PSM_ORIENTATION_ENABLED
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
tesseract::Wordrec::assume_fixed_pitch_char_segment
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:225
tesseract::PSM_CIRCLE_WORD
Treat the image as a single word in a circle.
Definition: publictypes.h:171
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1329
tesseract::Tesseract::SetupAllWordsPassN
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:154
tesseract::Tesseract::SelectGoodDiacriticOutlines
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1139
tesseract::Tesseract::crunch_del_high_word
double crunch_del_high_word
Definition: tesseractclass.h:947
WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:334
STRING::unsigned_size
uint32_t unsigned_size() const
Definition: strngs.h:72
tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:73
tesseract::Tesseract::suspect_short_words
int suspect_short_words
Definition: tesseractclass.h:1015
tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:813
tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1041
tesseract::TesseractStats::doc_char_quality
int16_t doc_char_quality
Definition: tesseractclass.h:130
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::PSM_OSD_ONLY
Orientation and script detection only.
Definition: publictypes.h:160
WERD_RES::blob_gaps
GenericVector< int > blob_gaps
Definition: pageres.h:213
ScrollView::AddEventHandler
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:415
BLOCK::classify_rotation
FCOORD classify_rotation() const
Definition: ocrblock.h:139
R_MM_REJECT
Definition: rejctmap.h:93
CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:150
tesseract::LSTMRecognizer::GetDict
const Dict * GetDict() const
Definition: lstmrecognizer.h:137
SVEvent::x
int x
Definition: scrollview.h:66
POLY_BLOCK
Definition: polyblk.h:26
tesseract::Tesseract::tessedit_fix_hyphens
bool tessedit_fix_hyphens
Definition: tesseractclass.h:842
tesseract::Tesseract::recog_all_words
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:302
WERD::set_blanks
void set_blanks(uint8_t new_blanks)
Definition: werd.h:101
tesseract::Tesseract::tessedit_word_for_word
bool tessedit_word_for_word
Definition: tesseractclass.h:1022
tesseract::Tesseract::fix_rep_char
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1705
tesseract::Tesseract::SearchWords
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:259
tesseract::Tesseract::textord_equation_detect
bool textord_equation_detect
Definition: tesseractclass.h:1065
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
tesseract::Tesseract::paragraph_debug_level
int paragraph_debug_level
Definition: tesseractclass.h:891
WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:335
tesseract::Tesseract::unrej_good_chs
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:112
CM_FIXEDPITCH
Definition: pgedit.cpp:87
tesseract::ParamUtils::SetParam
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:79
tesseract::Tesseract::ImageWidth
int ImageWidth() const
Definition: tesseractclass.h:250
tesseract::Tesseract::tessedit_unrej_any_wd
bool tessedit_unrej_any_wd
Definition: tesseractclass.h:841
CR_LOOSE_SPACE
Definition: pageres.h:155
tesseract::Tesseract::reject_edge_blobs
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:263
tesseract::Tesseract::superscript_debug
int superscript_debug
Definition: tesseractclass.h:972
TWERD::plot
void plot(ScrollView *window)
Definition: blobs.cpp:895
tesseract::Tesseract::rej_use_sensible_wd
bool rej_use_sensible_wd
Definition: tesseractclass.h:1036
WERD_RES::word
WERD * word
Definition: pageres.h:180
TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:859
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
CM_DROPCAPS
Definition: pgedit.cpp:90
tesseract::Tesseract::tess_segment_pass_n
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:31
tesseract::Tesseract::tessedit_create_txt
bool tessedit_create_txt
Definition: tesseractclass.h:998
R_BLOCK_REJ
Definition: rejctmap.h:114
tesseract::Tesseract::doc_and_block_rejection
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:225
tesseract::SET_PARAM_CONSTRAINT_NONE
Definition: params.h:50
tesseract::Classify::classify_integer_matcher_multiplier
int classify_integer_matcher_multiplier
Definition: classify.h:505
WERD_RES::MergeAdjacentBlobs
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:969
BlamerBundle::incorrect_result_reason
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:121
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::Tesseract::digit_or_numeric_punct
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:369
TBOX::set_bottom
void set_bottom(int y)
Definition: rect.h:67
tesseract::Classify::ResetAdaptiveClassifierInternal
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:598
G_OK
Definition: docqual.h:30
ScrollView::Text
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:651
tesseract::Tesseract::lstm_use_matrix
bool lstm_use_matrix
Definition: tesseractclass.h:895
tesseract::Tesseract::unlv_tilde_crunching
bool unlv_tilde_crunching
Definition: tesseractclass.h:930
tesseract::Tesseract::tessedit_reject_block_percent
double tessedit_reject_block_percent
Definition: tesseractclass.h:904
REJMAP::print
void print(FILE *fp)
Definition: rejctmap.cpp:320
REFRESH_CMD_EVENT
Definition: pgedit.cpp:64
R_POSTNN_1IL
Definition: rejctmap.h:91
WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:791
tesseract::Tesseract::unrej_good_quality_words
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:154
ScrollView::Color
Color
Definition: scrollview.h:100
tesseract::Wordrec::language_model_
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:471
tesseract::Tesseract::word_blank_and_set_display
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:705
tesseract::CCUtil::imagebasename
STRING imagebasename
Definition: ccutil.h:54
tesseract::Tesseract::word_set_display
bool word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:937
tesseract::Tesseract::rej_alphas_in_number_perm
bool rej_alphas_in_number_perm
Definition: tesseractclass.h:1037
tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds
bool tessedit_preserve_blk_rej_perfect_wds
Definition: tesseractclass.h:911
tesseract::Tesseract::ResetAdaptiveClassifier
void ResetAdaptiveClassifier()
Definition: tesseractclass.cpp:597
tesseract::Tesseract::GetLineData
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:135
LOC_MM_ADAPT
#define LOC_MM_ADAPT
Definition: errcode.h:50
G_DODGY
Definition: docqual.h:31
WERD::GetNoiseOutlines
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:505
tesseract::Classify::classify_bln_numeric_mode
bool classify_bln_numeric_mode
Definition: classify.h:508
BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:315
tesseract::Tesseract::tessedit_debug_block_rejection
bool tessedit_debug_block_rejection
Definition: tesseractclass.h:846
tesseract::LSTMRecognizer::Load
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)
Definition: lstmrecognizer.cpp:77
BLOCK::re_rotation
FCOORD re_rotation() const
Definition: ocrblock.h:133
tesseract::Tesseract::TrySuperscriptSplits
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
Definition: superscript.cpp:381
tesseract::TESSDATA_LSTM
Definition: tessdatamanager.h:74
DOC_DAWG_PERM
Definition: ratngs.h:240
CR_KEEP_SPACE
Definition: pageres.h:154
ScrollView::ShowInputDialog
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:731
AC_INITIAL_CAP
All letters lower case except the initial capital.
Definition: control.h:32
ScrollView::Rectangle
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:599
tesseract::Tesseract::tessedit_minimal_rejection
bool tessedit_minimal_rejection
Definition: tesseractclass.h:1019
initialise_search
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:203
BlamerBundle
Definition: blamer.h:103
DF_POLYGONAL
Polygonal approximation.
Definition: werd.h:47
tesseract::Tesseract::word_dumper
bool word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:913
tesseract::Tesseract::join_words
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:231
SHOW_SUPERSCRIPT_CMD_EVENT
Definition: pgedit.cpp:70
DEBUG_WERD_CMD_EVENT
Definition: pgedit.cpp:53
tesseract::Tesseract::tessedit_write_block_separators
bool tessedit_write_block_separators
Definition: tesseractclass.h:995
tesseract::Tesseract::tessedit_use_primary_params_model
bool tessedit_use_primary_params_model
Definition: tesseractclass.h:1053
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Dict::ResetDocumentDictionary
void ResetDocumentDictionary()
Definition: dict.h:326
tesseract::OEM_TESSERACT_ONLY
Definition: publictypes.h:266
tesseract::Tesseract::tessedit_ocr_engine_mode
int tessedit_ocr_engine_mode
Definition: tesseractclass.h:802
tesseract::Tesseract::noise_cert_punc
double noise_cert_punc
Definition: tesseractclass.h:866
ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:54
WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:307
SHOW_BLN_WERD_CMD_EVENT
Definition: pgedit.cpp:52
textord_debug_tabfind
int textord_debug_tabfind
Definition: alignedblob.cpp:27
tesseract::Tesseract::fixsp_non_noise_limit
int fixsp_non_noise_limit
Definition: tesseractclass.h:963
WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:363
tesseract::Tesseract::crunch_debug
int crunch_debug
Definition: tesseractclass.h:962
tesseract::Tesseract::tessedit_create_tsv
bool tessedit_create_tsv
Definition: tesseractclass.h:1003
tesseract::Tesseract::test_pt
bool test_pt
Definition: tesseractclass.h:887
CM_UNDERLINE
Definition: pgedit.cpp:86
BOUNDING_BOX_CMD_EVENT
Definition: pgedit.cpp:55
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:536
PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:671
FREQ_DAWG_PERM
Definition: ratngs.h:242
tesseract::Tesseract::crunch_accept_ok
bool crunch_accept_ok
Definition: tesseractclass.h:953
tesseract::Tesseract::preserve_interword_spaces
bool preserve_interword_spaces
Definition: tesseractclass.h:1076
tesseract::Tesseract::ResegmentCharBox
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:328
tesseract::Tesseract::ClassifyBlobPlusOutlines
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1225
tesseract::Tesseract::user_defined_dpi
int user_defined_dpi
Definition: tesseractclass.h:1010
SVET_SELECTION
Definition: scrollview.h:48
WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:89
tesseract::Tesseract::fix_sp_fp_word
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:561
tesseract::Tesseract::match_current_words
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:222
tesseract::Classify::SwitchAdaptiveClassifier
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:613
tesseract::WordRecognizer
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
Definition: tesseractclass.h:170
tesseract::Classify::classify_debug_level
int classify_debug_level
Definition: classify.h:430
CM_SUBSCRIPT
Definition: pgedit.cpp:82
tesseract::Tesseract::write_results
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:96
NULL_CMD_EVENT
Definition: pgedit.cpp:48
determine_newline_type
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:211
tesseract::Tesseract::tessedit_create_hocr
bool tessedit_create_hocr
Definition: tesseractclass.h:999
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1518
tesseract::Tesseract::tessedit_write_params_to_file
char * tessedit_write_params_to_file
Definition: tesseractclass.h:819
UNICHARSET::size
int size() const
Definition: unicharset.h:341
WERD_CHOICE::append_unichar_id
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:470
WERD_RES::small_caps
bool small_caps
Definition: pageres.h:300
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
tesseract::Tesseract::word_blob_quality
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:64
tesseract::Tesseract::word_contains_non_1_digit
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:508
UNICHARSET::CopyFrom
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:447
W_BOL
start of line
Definition: werd.h:46
NUMBER_PERM
Definition: ratngs.h:237
tesseract::Tesseract::crunch_pot_poor_rate
double crunch_pot_poor_rate
Definition: tesseractclass.h:940
tesseract::Tesseract::ParseLanguageString
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:270
tesseract::Tesseract::ReassignDiacritics
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:944
tesseract::Tesseract::page_separator
char * page_separator
Definition: tesseractclass.h:1078
USER_DAWG_PERM
Definition: ratngs.h:241
tesseract::Tesseract::fixsp_done_mode
int fixsp_done_mode
Definition: tesseractclass.h:966
tesseract::ScriptPosToString
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:202
BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:532
tesseract::ACTION_1_CMD_EVENT
Definition: tessedit.cpp:487
ROW::set_x_height
void set_x_height(float new_xheight)
Definition: ocrrow.h:66
BASELINES_CMD_EVENT
Definition: pgedit.cpp:62
tesseract::Tesseract::alpha_count
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:494
tesseract::Tesseract::word_outline_errs
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:76
tesseract::ShiroRekhaSplitter::NO_SPLIT
Definition: devanagari_processing.h:74
tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1848
tesseract::Tesseract::right_to_left
bool right_to_left() const
Definition: tesseractclass.h:273
TBOX
Definition: rect.h:33
tesseract::Tesseract::quality_outline_pc
double quality_outline_pc
Definition: tesseractclass.h:879
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:327
BlamerBundle::LastChanceBlame
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:558
tesseract::Classify::AdaptiveClassifierIsFull
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
SortHelper
Definition: sorthelper.h:36